In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665


## NDCG Test Items:

In [3]:
def calculate_ndcg(recommended_sim_list):
    
    # Ideally our list is ranked from most to least relevent 
    ideal_rel = np.asarray([[4, 3, 2, 1, 0]])
    total_ndcg_score = 0
    
    for recommended_sim_songs in recommended_sim_list:
        scores = np.asarray([recommended_sim_songs['true_relevence']])
        total_ndcg_score += ndcg_score(ideal_rel, scores)
        
    return total_ndcg_score

In [4]:
# Test Data 1 - Input Song is Street Lights from Kanye West

inputSong1 = data.loc[data['id'] == "6j8gTlbhj9KJSeypNcNAS9"]
inputSong1 = inputSong1[['valence', 'acousticness', 'danceability', 'artists', 'energy', 'explicit', 'instrumentalness', 'liveness', 'mode', 'speechiness', 'name']]

# The "database" that the System will choose from
# Intentionally choosing 5 songs that are "similarish" to the input song

ModelPlaylist1 = data.loc[data['id'].isin(["6j8gTlbhj9KJSeypNcNAS9", 
                                          "7Cu2COdH93MnuireuKNiS3", 
                                          "02LAK7qT1wya0klSeNO96f", 
                                          "4jQqM4NI79HEcWHUJb8Hvf", 
                                          "4cAgkb0ifwn0FSHGXnr4F6"])]

ModelPlaylist1 = ModelPlaylist1[['valence', 'acousticness', 'danceability', 'artists', 'energy', 'explicit', 'instrumentalness', 'liveness', 'mode', 'speechiness', 'name']]
ModelPlaylist1['true_relevence'] = [1,3,4,0,2] #Higher score means more relevence

ModelPlaylist1.head()

Unnamed: 0,valence,acousticness,danceability,artists,energy,explicit,instrumentalness,liveness,mode,speechiness,name,true_relevence
17048,0.124,0.141,0.542,['Kanye West'],0.466,1,0.000445,0.125,1,0.0831,I Wonder,1
18725,0.142,0.721,0.41,['Daniel Caesar'],0.298,0,0.0,0.158,1,0.0473,Streetcar,3
36134,0.467,0.0287,0.644,['Kanye West'],0.487,0,0.0166,0.206,1,0.034,Street Lights,4
37189,0.359,0.165,0.506,['Kanye West'],0.59,1,1.1e-05,0.096,1,0.076,New Slaves,0
54820,0.407,0.157,0.698,['Kanye West'],0.447,0,0.00564,0.405,0,0.102,RoboCop,2


In [5]:
# Test Data 2 - Input Song is Street Lights from Kanye West
# This is just filler, need to obviously pick something different from Test Data 1

inputSong2 = data.loc[data['id'] == "6j8gTlbhj9KJSeypNcNAS9"]
inputSong2 = inputSong2[['valence', 'acousticness', 'danceability', 'artists', 'energy', 'explicit', 'instrumentalness', 'liveness', 'mode', 'speechiness', 'name']]

# The "database" that the System will choose from
# Intentionally choosing 5 songs that are "similarish" to the input song

ModelPlaylist2 = data.loc[data['id'].isin(["6j8gTlbhj9KJSeypNcNAS9", 
                                          "7Cu2COdH93MnuireuKNiS3", 
                                          "02LAK7qT1wya0klSeNO96f", 
                                          "4jQqM4NI79HEcWHUJb8Hvf", 
                                          "4cAgkb0ifwn0FSHGXnr4F6"])]

ModelPlaylist2 = ModelPlaylist2[['valence', 'acousticness', 'danceability', 'artists', 'energy', 'explicit', 'instrumentalness', 'liveness', 'mode', 'speechiness', 'name']]
ModelPlaylist2['true_relevence'] = [1,3,4,0,2] #Higher score means more relevence

ModelPlaylist2.head()

Unnamed: 0,valence,acousticness,danceability,artists,energy,explicit,instrumentalness,liveness,mode,speechiness,name,true_relevence
17048,0.124,0.141,0.542,['Kanye West'],0.466,1,0.000445,0.125,1,0.0831,I Wonder,1
18725,0.142,0.721,0.41,['Daniel Caesar'],0.298,0,0.0,0.158,1,0.0473,Streetcar,3
36134,0.467,0.0287,0.644,['Kanye West'],0.487,0,0.0166,0.206,1,0.034,Street Lights,4
37189,0.359,0.165,0.506,['Kanye West'],0.59,1,1.1e-05,0.096,1,0.076,New Slaves,0
54820,0.407,0.157,0.698,['Kanye West'],0.447,0,0.00564,0.405,0,0.102,RoboCop,2


## Getting Recommendations (Cosine Similarity)

In [6]:
def get_recommendations_basic(inputSong, ModelPlaylist):
    # Not changing the raw data. Just using most of the given numerical columns (look at the original "data" variable)

    recommended_sim = ModelPlaylist.copy()
    nparray_sim = cosine_similarity(inputSong.drop(['name', 'artists'], axis = 1).values, ModelPlaylist.drop(['name', 'artists', "true_relevence"], axis = 1).values)

    # Ordering the recommended songs from most to least
    recommended_sim['sim'] = nparray_sim.tolist()[0]
    recommended_sim_top5 = recommended_sim.sort_values('sim',ascending = False)
    
    return recommended_sim_top5

In [7]:
def get_recommendations_normalized(inputSong, ModelPlaylist):
    # Normalize the data
    
    return "idk"

In [8]:
# Get the recommended songs for our test data
song1_recommended_basic = get_recommendations_basic(inputSong1, ModelPlaylist1)
song2_recommended_basic = get_recommendations_basic(inputSong2, ModelPlaylist2)

# So I'm thinking we can make multiple "get_recommendation" functions, for different reasons (standardization, OHE, etc)
# This means we'll need to do pre-processing or feature engineering in those functions
# Then using our NDCG score, we can determine whats the best method

In [9]:
calculate_ndcg([song1_recommended_basic])

0.9691000967123548