In [6]:
import pandas as pd
import numpy as np
from tqdm import tqdm  # For progress bar
from collections import Counter

# Step 1: Load the CSV file
file_path = "../data/spotify_songs_with_popularity.csv"
df = pd.read_csv(file_path)

# Step 2: Drop unnecessary columns except 'track_name' and 'track_popularity'
columns_to_drop = [
    'track_id', 'track_artist', 'track_album_id',
       'track_album_name', 'track_album_release_date', 'playlist_name',
       'playlist_id', 'popularity',
       'Popularity_Category'
]

# print les colonnes catégorielles
print(df.select_dtypes(include=['object']).columns)

# Keep 'track_name' and 'track_popularity' for display and prediction
track_names = df['track_album_name']  # Extract track names
Popularity_Category = df['Popularity_Category']  # Extract popularity category
df = df.drop(columns=columns_to_drop)

# Step 3: Vectorize 'playlist_genre' and 'playlist_subgenre' (One-Hot Encoding)
def one_hot_encode(df, column):
    encoded = pd.get_dummies(df[column], prefix=column)
    return encoded

# Apply one-hot encoding
playlist_genre_encoded = one_hot_encode(df, "playlist_genre")
playlist_subgenre_encoded = one_hot_encode(df, "playlist_subgenre")

# Convert boolean values to integers (True -> 1, False -> 0)
playlist_genre_encoded = playlist_genre_encoded.astype(int)
playlist_subgenre_encoded = playlist_subgenre_encoded.astype(int)

# Combine the numerical features with the encoded categorical features
df = df.drop(columns=["playlist_genre", "playlist_subgenre"])
final_df = pd.concat([df, playlist_genre_encoded, playlist_subgenre_encoded], axis=1)

# Step 4: Normalize All Features (Min-Max Normalization)
def min_max_normalize(df):
    return (df - df.min()) / (df.max() - df.min())

numerical_columns = final_df.columns  # All remaining columns are numerical now
final_df[numerical_columns] = min_max_normalize(final_df[numerical_columns])

# Step 5: Define the Manual KNN Function
def euclidean_distance(a, b):
    """Compute the Euclidean distance between two points."""
    return np.sqrt(np.sum((a - b) ** 2))

def knn(data, query, k):
    """
    Perform manual K-Nearest Neighbors algorithm.

    Args:
        data (pd.DataFrame): The dataset without the query.
        query (pd.Series): A single row to classify.
        k (int): Number of neighbors to consider.

    Returns:
        list: Indices of k nearest neighbors.
    """
    distances = []
    for index, row in data.iterrows():
        distance = euclidean_distance(row.values, query.values)
        distances.append((index, distance))
    
    # Sort by distance and select the k-nearest neighbors
    distances.sort(key=lambda x: x[1])
    neighbors = [idx for idx, _ in distances[:k]]
    return neighbors

# Step 6: Test the KNN Algorithm
# Split the dataset: Use one row as a query and the rest as the dataset
query_row = final_df.iloc[0]  # Using the first row as a test query
remaining_data = final_df.drop(0)

k = 5  # Number of neighbors
neighbors_indices = knn(remaining_data, query_row, k)

# Step 7: Predict Popularity
# Fetch the popularity of the neighbors
neighbors_popularity = Popularity_Category.iloc[neighbors_indices]

# Predict the popularity of the query row
predicted_popularity = Counter(neighbors_popularity).most_common(1)[0][0]

# Print the results with track titles
print(f"The {k} nearest neighbors for the query row are:")
print(track_names.iloc[neighbors_indices])
print("\nPopularity of the nearest neighbors:")
print(neighbors_popularity)
print(f"\nPredicted Popularity: {predicted_popularity}")


Index(['track_id', 'track_name', 'track_artist', 'track_album_id',
       'track_album_name', 'track_album_release_date', 'playlist_name',
       'playlist_id', 'playlist_genre', 'playlist_subgenre', 'popularity',
       'Popularity_Category'],
      dtype='object')


  df = pd.read_csv(file_path)


TypeError: unsupported operand type(s) for -: 'str' and 'str'