# RECOMMENDER SYSTEM SPOTIFY

https://www.geeksforgeeks.org/music-recommendation-system-using-machine-learning/#



In [None]:
#pip install pandas openpyxl

In [None]:
# Load general utilities
# ----------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import ast

# Machine Learning Packages
# ----------------------
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import joblib
from scipy.sparse import csr_matrix

# ----------------------
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the artist table from disk
df_artists = pd.read_csv("artists.csv")

# Keep only artists whose 'genres' value is not the string "[]"
# (i.e. artists that carry at least one genre tag)
has_genres = df_artists['genres'] != "[]"
df_artists = df_artists[has_genres]
df_artists.head(5)

In [None]:
# # Load the Excel file
# df_music = pd.read_excel('tracks_converted.xlsx', engine='openpyxl')
# df_music.head()

In [None]:
# Load the track table from disk and preview its first rows
df_music = pd.read_csv("tracks.csv")
df_music.head()

#### Join artist genre info onto tracks by artist ID

In [None]:
# Attach to every track the union of the genres of all its artists.

# Parse the stringified artist-id lists into real Python lists. Values that
# are already lists are left untouched so this cell can be re-run safely.
df_music['id_artists'] = df_music['id_artists'].apply(
    lambda ids: ast.literal_eval(ids) if isinstance(ids, str) else ids
)

# Index df_artists by artist id; the merges in this notebook join on that
# index. Skipped when a previous run of this cell has already set it.
if 'id' in df_artists.columns:
    df_artists = df_artists.set_index('id')

# One row per (track, artist) pair; the track's index label is repeated on
# each exploded row. A 'genres' column left over from an earlier run is
# dropped first so the merge does not create suffixed duplicate columns.
df_music_exploded = df_music.drop(columns='genres', errors='ignore').explode('id_artists')

# Look up each artist's genre string; artists absent from df_artists get NaN
merged_df = df_music_exploded.merge(df_artists[['genres']],
                                    left_on='id_artists',
                                    right_index=True,
                                    how='left')

# Parse the genre strings into lists, using an empty list for unmatched artists
merged_df['genres'] = merged_df['genres'].apply(
    lambda x: [] if pd.isnull(x) else ast.literal_eval(x)
)

# Concatenate the genre lists of all artists that share a track label.
# GroupBy.sum() is what `agg(sum)` was silently dispatched to; calling it
# directly avoids relying on pandas' deprecated aliasing of the builtin `sum`.
final_genres = merged_df.groupby(merged_df.index)['genres'].sum()

# De-duplicate, and sort so the result is the same on every run
final_genres = final_genres.apply(lambda x: sorted(set(x)))

# Assign back to the original frame (aligned on the track index labels)
df_music['genres'] = final_genres

In [None]:
# Preview the track table now that the aggregated 'genres' column is attached
df_music.head()

In [None]:
# Derive integer release_year / release_month / release_day columns from
# 'release_date'. Month and day are -1 when the source value does not
# include them.

# Keep only the date part of each value (anything after a space is discarded)
df_music['release_date'] = df_music['release_date'].str.split(' ').str[0]

# Split "YYYY", "YYYY-MM" or "YYYY-MM-DD" into separate columns in one pass.
# reindex guarantees exactly three columns even when no row has a month or
# day; parts a row does not have come back as missing values.
date_parts = df_music['release_date'].str.split('-', expand=True).reindex(columns=range(3))

# Column assignment aligns on the index labels, so this is correct for any
# index (the previous row loop treated enumerate() positions as labels).
# Year is required: a row without one makes astype(int) raise.
df_music['release_year'] = pd.to_numeric(date_parts[0]).astype(int)
df_music['release_month'] = pd.to_numeric(date_parts[1]).fillna(-1).astype(int)
df_music['release_day'] = pd.to_numeric(date_parts[2]).fillna(-1).astype(int)

In [None]:
# (rows, columns) of the track table after adding the release-date columns
df_music.shape

In [None]:
# Preview the track table including the new release_year/month/day columns
df_music.head()

In [None]:
# Print column dtypes and non-null counts
df_music.info()

In [None]:
# Number of missing values per column (these rows are dropped in the next cell)
df_music.isnull().sum()

In [None]:
# Discard every row that still contains a missing value
df_music.dropna(inplace=True)

# Keep a row only when month, day and key all differ from -1.
# For month/day, -1 is the placeholder written when the release date lacked
# that part; for 'key' it is presumably the dataset's "unknown" marker
# (TODO confirm).
month_known = df_music['release_month'] != -1
day_known = df_music['release_day'] != -1
key_known = df_music['key'] != -1
df_music = df_music[month_known & day_known & key_known]

# Keep only tracks whose genre list is non-empty
df_music = df_music[df_music['genres'].apply(bool)]

In [None]:
# Order tracks newest-first by (year, month, day)
df_music = df_music.sort_values(
    by=['release_year', 'release_month', 'release_day'],
    ascending=False,
)

In [None]:
# Keep one row per track name. Because the frame was just sorted newest-first,
# keep='first' retains the most recent release of each name.
# NOTE(review): de-duplicating on 'name' alone also collapses different songs
# that merely share a title — confirm this is intended.
df_music.drop_duplicates(subset=['name'], keep='first', inplace=True)

In [None]:
# The raw date string is redundant now that year/month/day columns exist
df_music.drop(columns=['release_date'], inplace=True)

In [None]:
# (rows, columns) of the track table after filtering and de-duplication
df_music.shape

In [None]:
# Tally how many tracks fall in each release year
yearly_counts = df_music.groupby('release_year').size()

# Draw the tallies as a wide bar chart, one bar per year
plt.figure(figsize=(20, 6))
yearly_counts.plot.bar()
plt.xticks(rotation=45)
plt.xlabel('Year')
plt.ylabel('Number of Pieces')
plt.title('Number of Pieces of Music Released Each Year')
plt.tight_layout()
plt.show()

## Data Preprocessing

In [None]:
# Turn each track's genre list into one 0/1 indicator column per genre
mlb = MultiLabelBinarizer()

# pop() removes 'genres' from df_music, so the join below adds only the
# indicator columns next to the remaining track columns
genre_flags = pd.DataFrame(
    mlb.fit_transform(df_music.pop('genres')),
    index=df_music.index,
    columns=mlb.classes_,
)
df_music_genres = df_music.join(genre_flags)
del genre_flags  # only needed for the join; do not keep an extra reference

# Preview the frame with the genre indicator columns
df_music_genres.head()

## Feature Engineering

In [None]:
# Audio attributes used to describe a track to the recommender
recommendation_features = [
    'danceability', 'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo', 'time_signature',
]

# Track id + audio attributes + every genre indicator column
selected_columns = ['id', *recommendation_features, *mlb.classes_]
df_recommendation_features = df_music_genres[selected_columns]

# Preview the recommender's feature table
df_recommendation_features.head()

In [None]:
# Columns used by the popularity model: the target ('popularity'), the
# artist follower count, and the track's audio attributes
popularity_features = [
    'followers', 'popularity', 'danceability', 'energy', 'key', 'loudness',
    'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
    'valence', 'tempo', 'time_signature',
]

# One row per (track, artist) pair
df_music_exploded = df_music.explode('id_artists')

# Look up each artist's follower count via df_artists' index;
# artists missing from df_artists yield NaN
artist_followers = df_artists[['followers']]
df_music_with_followers = df_music_exploded.merge(
    artist_followers,
    how='left',
    left_on='id_artists',
    right_index=True,
)

# Collapse back to one row per track id. The first exploded row is the one
# kept, so a track gets the follower count of its first listed artist.
df_music_with_followers = df_music_with_followers.drop_duplicates(subset='id', keep='first')

# Restrict to the modelling columns and preview
df_popularity_features = df_music_with_followers[popularity_features]
df_popularity_features.head()

## Recommender Model and Popularity Prediction

### Step 1: Split the Data for Popularity Prediction Model

In [None]:
# Target is 'popularity'; every other selected column is a feature
y = df_popularity_features['popularity']
X = df_popularity_features.drop(columns='popularity')

# Hold out 20% of the rows as the test set
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# From the remaining 80%, hold out 10% for validation
# (overall: 72% train, 8% validation, 20% test)
X_train, X_validation, y_train, y_validation = train_test_split(
    X_temp, y_temp, test_size=0.1, random_state=42
)

In [None]:
# Replace +/-inf with NaN in every feature split, not only the training one:
# the SimpleImputer used in the next cell fills NaN but raises on infinities,
# so an inf left in the validation or test split would abort the pipeline.
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_validation = X_validation.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)
y_train = y_train.replace([np.inf, -np.inf], np.nan)

# Fill missing training values (including the former infinities) with the
# training-set column means. Validation/test NaNs are left for the imputer,
# which is fitted on the training split only.
X_train = X_train.fillna(X_train.mean())
y_train = y_train.fillna(y_train.mean())

In [None]:
# Impute remaining missing values with the per-column median.
# The imputer is fitted on the training split only and then applied to the
# other splits, so no statistics leak from validation/test data.
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)
X_validation_imputed = imputer.transform(X_validation)

# Standardise the features, again fitting on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)
X_validation_scaled = scaler.transform(X_validation_imputed)

# Fail fast if any split still contains NaN or inf. The validation split is
# checked too, since it is used for model evaluation in the next step.
for split_name, split_values in (
    ("X_train_scaled", X_train_scaled),
    ("X_test_scaled", X_test_scaled),
    ("X_validation_scaled", X_validation_scaled),
):
    if not np.all(np.isfinite(split_values)):
        raise ValueError(f"All values in {split_name} must be finite.")

### Step 2: Train Multiple ML Algorithms

In [None]:
# Tune three regressors with 5-fold grid search on the scaled training data,
# then report each tuned model's RMSE on the validation split.

# Define the parameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Define the parameter grid for Gradient Boosting
gb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# Define the parameter grid for Support Vector Regression
# NOTE(review): kernel SVR training time grows quickly with the number of
# rows; this grid may be very slow on the full training set — confirm.
svr_param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'kernel': ['rbf', 'linear']
}

# Map each model name to (unfitted estimator, parameter grid)
models = {
    'RandomForestRegressor': (RandomForestRegressor(random_state=42), rf_param_grid),
    'GradientBoostingRegressor': (GradientBoostingRegressor(random_state=42), gb_param_grid),
    'SVR': (SVR(), svr_param_grid)
}

# Per-algorithm results, keyed by model name
best_models = {}      # refitted best estimator
best_params = {}      # winning hyper-parameters
best_scores = {}      # mean cross-validated MSE of the winner (lower is better)
validation_rmse = {}  # RMSE of the winner on the held-out validation split

# Perform Grid Search with Cross-Validation
for model_name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5,
                               scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train_scaled, y_train)

    # Store the best model, parameters, and score.
    # The scorer returns negative MSE, so flip the sign back.
    best_models[model_name] = grid_search.best_estimator_
    best_params[model_name] = grid_search.best_params_
    best_scores[model_name] = -grid_search.best_score_

    # Validate the best model. RMSE is computed as sqrt(MSE) rather than via
    # mean_squared_error(..., squared=False), because the `squared` argument
    # has been removed from recent scikit-learn releases.
    y_validation_predictions = best_models[model_name].predict(X_validation_scaled)
    rmse = np.sqrt(mean_squared_error(y_validation, y_validation_predictions))
    validation_rmse[model_name] = rmse
    print(f"Validation RMSE for {model_name}: {rmse}")

In [None]:
# # Instantiate the model
# rf_model = RandomForestRegressor(random_state=42)

# # Fit the model on the training data
# rf_model.fit(X_train, y_train)

### Step 3: Select and Save the Best Model

In [None]:
# Pick the algorithm with the lowest mean cross-validated MSE.
# best_scores holds the grid search's CV MSE (not a validation RMSE), so the
# value is converted to an RMSE and labelled accordingly when reported.
best_model_name = min(best_scores, key=best_scores.get)
best_model = best_models[best_model_name]
best_cv_rmse = np.sqrt(best_scores[best_model_name])

print(f"The best model is {best_model_name} with a cross-validated RMSE of {best_cv_rmse}")

# Save the best model to a file
# NOTE(review): the model was fitted on imputed and scaled features; the
# fitted imputer and scaler are not saved here, so anyone loading this file
# must apply the same preprocessing before calling predict().
joblib.dump(best_model, 'best_model.joblib')

In [None]:
# # When making predictions, make sure to use the scaled validation data
# y_validation_predictions = rf_model.predict(X_validation_scaled)

# # Now calculate the RMSE using the predictions and the actual y_validation values
# rmse = mean_squared_error(y_validation, y_validation_predictions, squared=False)
# print(f"Validation RMSE: {rmse}")

### Step 4: Develop the Recommender System

In [None]:
# Build the CSR feature matrix for the recommender: one row per track, in the
# same row order as df_recommendation_features, with the 'id' column removed.
# The DataFrame `.sparse` accessor only works when every column has a sparse
# dtype; these columns are dense, so the matrix is built from the values.
features_for_csr = csr_matrix(
    df_recommendation_features.drop('id', axis=1).to_numpy(dtype=np.float64)
)

In [None]:
# Function to recommend songs based on a given song ID
def recommend_songs(song_id, df, csr_data, number_of_songs=5):
    """Return the ids of the tracks most similar to ``song_id``.

    Similarity is the cosine similarity between the query track's feature
    row and every row of ``csr_data``.

    Args:
        song_id: Value to look up in ``df['id']``. If it occurs more than
            once, the first occurrence is used as the query.
        df: DataFrame with an ``'id'`` column. Row ``i`` of ``df`` (by
            position) must describe the same track as row ``i`` of
            ``csr_data``; the index labels of ``df`` are not used for lookup.
        csr_data: 2-D feature matrix with one row per row of ``df``.
        number_of_songs: Maximum number of recommendations to return.
            Values below 1 yield an empty result.

    Returns:
        A Series of track ids taken from ``df['id']`` (keeping ``df``'s index
        labels), ordered from most to least similar. The query track itself
        is never included.

    Raises:
        IndexError: If ``song_id`` does not occur in ``df['id']``.
    """
    # Locate the query by *position*: csr_data is addressed positionally,
    # whereas df's index labels may be arbitrary after sorting/filtering.
    matches = np.flatnonzero((df['id'] == song_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"song_id {song_id!r} not found in df['id']")
    position = int(matches[0])

    # Similarity of the query row against every row (including itself)
    similarity_scores = cosine_similarity(
        csr_data[position:position + 1], csr_data
    ).flatten()

    # Rank all rows from most to least similar, then remove the query row
    # explicitly instead of assuming it is ranked first (ties with identical
    # feature rows could otherwise push it elsewhere).
    ranked_positions = np.argsort(similarity_scores)[::-1]
    ranked_positions = ranked_positions[ranked_positions != position]
    top_positions = ranked_positions[:max(number_of_songs, 0)]

    # Positional lookup, matching the positional ranking above
    return df['id'].iloc[top_positions]

In [None]:
# # Function to recommend songs based on a given song ID
# def recommend_songs(song_id, df, csr_data, number_of_songs=5):
#     # Find the row of the given song ID
#     index = df.index[df['id'] == song_id].tolist()[0]

#     # Calculate similarity scores for the specific song against all others
#     similarity_scores = cosine_similarity(csr_data[index], csr_data).flatten()

#     # Get the indices of the top songs, skipping the first one since it's the song itself
#     top_indices = similarity_scores.argsort()[-number_of_songs-1:-1][::-1]

#     # Get the song indices
#     song_indices = df.iloc[top_indices].index

#     # Return the most similar songs
#     return df['id'].iloc[song_indices]

In [None]:
# Example usage: recommend five tracks similar to the first track in the
# feature table, using the CSR matrix built from that same table
some_song_id = df_recommendation_features['id'].iloc[0]
recommended_song_ids = recommend_songs(some_song_id, df_recommendation_features, features_for_csr, 5)
print(recommended_song_ids)