In [70]:
#importing the libraries and reading the data
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

pd.read_csv('spotify_Song_Dataset/dataset.csv')

df = pd.read_csv('spotify_Song_Dataset/dataset.csv')


In [71]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [72]:
#removing the row with null values
df = df.dropna()

#dropping duration and explicit columns 
df = df.drop(['duration_ms','explicit','mode','liveness','loudness','time_signature','key'],axis=1)

df.rename(columns={'Unnamed: 0': 'song_id'}, inplace=True)


df.head(5)


Unnamed: 0,song_id,track_id,artists,album_name,track_name,popularity,danceability,energy,speechiness,acousticness,instrumentalness,valence,tempo,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,0.676,0.461,0.143,0.0322,1e-06,0.715,87.917,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,0.42,0.166,0.0763,0.924,6e-06,0.267,77.489,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,0.438,0.359,0.0557,0.21,0.0,0.12,76.332,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,0.266,0.0596,0.0363,0.905,7.1e-05,0.143,181.74,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,0.618,0.443,0.0526,0.469,0.0,0.167,119.949,acoustic


In [73]:
#user inputs from the website 
user_genre = 'acoustic' #temporary default input
user_valence = 0.7      #temporary default input
user_song_id = 0        #temporary default input


#fileter dataset based on user preferece from genre and valence dataset 
filtered_df = df[(df['track_genre'] == user_genre) & (df['valence'] > user_valence)]

#features for similarity calculation
features = ['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'tempo']

#normalizing numerical features using .loc
scaler = MinMaxScaler()
filtered_df.loc[:, features] = scaler.fit_transform(filtered_df[features])

#calculating similarity matrix 
similarity_matrix = cosine_similarity(filtered_df[features])

#getting the index of chosen song 
user_song_index = filtered_df.index[filtered_df['song_id']== user_song_id].tolist()[0]

#getting similarities for the chosen song
similarities = similarity_matrix[user_song_index]

#creating a dataframe with similarities and track ID 
similar_songs_df = pd.DataFrame({'song_id': filtered_df['song_id'], 'similarity': similarities})

#sorting by similarity in descending order
similar_songs_df = similar_songs_df.sort_values(by='similarity', ascending=False)

#display top N similar songs 
top_n = 5
recommended_songs_df = similar_songs_df.head(top_n)


#printing songs
print("Recommended Songs: ")
for index, row in recommended_songs_df.iterrows():
    print(f"Song ID: {row['song_id']}, Similarity: {row['similarity']:0.4f}")

Recommended Songs: 
Song ID: 0.0, Similarity: 1.0000
Song ID: 850.0, Similarity: 0.9963
Song ID: 357.0, Similarity: 0.9870
Song ID: 488.0, Similarity: 0.9453
Song ID: 332.0, Similarity: 0.8808


In [75]:
train_df, test_df = train_test_split(filtered_df, test_size = 0.2, random_state = 42)

#defining columns for collaborative filtering
collab_columns = ['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'tempo']

#creating a user-item matrix for collaborative filtering
user_item_matrix = train_df.pivot_table(index='track_name', columns='song_id', values='valence').fillna(0)

#calculating cosine similarity between tracks
user_similarity_matrix = cosine_similarity(user_item_matrix)

#function to predict valnece based on user preferences
def predict_valence(user_preferences):

    #calculating similarity between input user preferences and all tracks training set
    user_similarities = cosine_similarity([user_preferences], user_item_matrix.values)

    #weighted average of valence scores based on user siilatities
    predicted_valence = user_similarities.dot(user_item_matrix.values) / user_similarities.sum()

    return predicted_valence.flatten()

# getting the index of the chosen song 
user_song_index = test_df.index[test_df['song_id'] == user_song_id].tolist()[0]

# example : user preferences for the choseen song
user_preferences = test_df.loc[user_song_index, collab_columns].values.reshape(1,-1)

#predict valence for the chosen song 
predicted_valence = predict_valence(user_preferences)

#evaluate the model using root mean squared error RMSE
actual_valence = test_df.loc[user_song_index, 'valence']
rmse = sqrt(mean_squared_error(actual_valence, predicted_valence))

print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")


Error: Song ID 0 not found in test_df.


NameError: name 'user_preferences' is not defined