In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd
import os
import numpy as np

In [5]:
os.getcwd()

'/Users/bipinkarki/Desktop/ML-ABKS-Music-Recommender-System/Notebooks'

In [6]:
tracks = pd.read_csv(r"datasets/data.csv")

In [7]:
tracks.tail(30)

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
170623,0.558,2020,0.175,['Brent Faiyaz'],0.743,215058,0.548,0,0dJhXJKZTpiaxTUc7uItIN,0.00018,6,0.088,-6.545,0,Let Me Know,65,2020-02-07,0.0947,85.017
170624,0.697,2020,0.0529,['Ozuna'],0.735,217240,0.791,0,5Jm4w8jmPEBTLjI9vH4fXo,0.000146,6,0.145,-4.462,1,Caramelo,80,2020-09-04,0.0679,168.087
170625,0.77,2020,0.0358,"['Quinn XCII', 'Logic']",0.822,212013,0.834,1,73681b7Hc0THHGZrSzl75y,0.0,6,0.0461,-2.879,0,A Letter To My Younger Self (feat. Logic),65,2020-07-10,0.0663,115.006
170626,0.155,2020,0.0627,['Cookiee Kawaii'],0.778,84000,0.576,1,4ZAQiu61otvHVveuTsPAUr,0.0044,10,0.116,-8.698,1,Vibe (If I Back It Up),68,2020-08-28,0.301,80.027
170627,0.629,2020,0.41,"['Rauw Alejandro', 'Myke Towers', 'Sky Rompien...",0.809,184817,0.624,1,10u2iOS0r6plfb9GknD7g4,0.0,8,0.108,-4.963,0,Ponte Pa' Mi,74,2020-04-16,0.155,85.011
170628,0.323,2020,0.000206,['Miley Cyrus'],0.407,290768,0.872,0,6IsiCdn42x5fGWTUqkyDwj,0.0451,7,0.127,-5.001,1,Zombie (Live from the NIVA Save Our Stages Fes...,71,2020-10-23,0.0354,164.0
170629,0.691,2020,0.0406,"['Rauw Alejandro', 'Camilo']",0.721,222680,0.637,0,4G3PTss3mU33Tau7t4KbwE,0.0,9,0.0525,-5.134,1,Tattoo - Remix with Camilo,71,2020-11-13,0.286,193.922
170630,0.659,2020,0.00248,"['DVBBS', 'Quinn XCII']",0.714,150819,0.798,0,4fCE1SCDulouoac9Q8XroU,0.000678,5,0.103,-4.239,0,West Coast (feat. Quinn XCII),67,2020-10-16,0.0612,119.017
170631,0.327,2020,0.217,['YoungBoy Never Broke Again'],0.485,156077,0.814,1,3qHgGyJY4GpXNOK4WL4NSo,0.0,9,0.112,-3.907,1,Red Eye,63,2020-02-21,0.138,159.894
170632,0.738,2020,0.16,"['Rauw Alejandro', 'Dalex', 'Lenny Tavárez', '...",0.788,325692,0.629,1,3oCghu20Kto5Xcu8Pg0cit,0.000828,0,0.0598,-4.727,1,"Elegí (feat. Farruko, Anuel AA, Sech, Dímelo F...",75,2020-08-28,0.0883,172.044


In [8]:
tfidf = TfidfVectorizer( stop_words='english')

#Replace NaN with an empty string
tracks['name'] = tracks['name'].fillna('')

tracks = tracks.head(25000)

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(tracks['name'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape


(25000, 15701)

In [9]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [10]:

indices = pd.Series(tracks.index, index=tracks['name']).drop_duplicates()


In [11]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]

    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]

    track_indices = [i[0] for i in sim_scores]
    print(tracks['name'].iloc[track_indices].tolist())
    return tracks['id'].iloc[track_indices].tolist()

In [12]:
get_recommendations("I Might Fall Back On You")

['Fall Back Down', 'Fall For You', 'I Might Fall Back On You', 'When I Fall In Love', 'When I Fall In Love', 'When I Fall In Love', 'When I Fall In Love', 'If I Ever Fall In Love', 'If I Ever Fall In Love', 'I Could Fall In Love']


['3pdHJCTk85ls2SGGXIJ7XH',
 '7lWF2mVr1KKbVnaT2nSlPo',
 '6MMF38qO03XJ7puIxVl8Gg',
 '6s6h2XK7Nl8lEcTzr7ezeB',
 '0w7mhiHnM0rhdu3l5mDDYl',
 '1Ame30MZ6yKnqaWFAch1m7',
 '1gMMY095TNREZ8ZZYsfM5L',
 '1jCsCYgzQQHk3bDJDuFbNi',
 '0uOPGU4CbYxzFxn6T7sblW',
 '3ACQkIMv6nGYHRpPGyS4BK']

In [13]:
# Returns the list top 3 elements or entire list; whichever is more.
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []


In [14]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [17]:
features = ['artists','popularity', 'instrumentalness']
#features = ['artists']
for feature in features:
    tracks[feature] = tracks[feature].apply(clean_data)
#     tracks[feature] = tracks[feature].apply(eval)

In [18]:
print(tracks.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   valence           25000 non-null  float64
 1   year              25000 non-null  int64  
 2   acousticness      25000 non-null  float64
 3   artists           25000 non-null  object 
 4   danceability      25000 non-null  float64
 5   duration_ms       25000 non-null  int64  
 6   energy            25000 non-null  float64
 7   explicit          25000 non-null  int64  
 8   id                25000 non-null  object 
 9   instrumentalness  25000 non-null  object 
 10  key               25000 non-null  int64  
 11  liveness          25000 non-null  float64
 12  loudness          25000 non-null  float64
 13  mode              25000 non-null  int64  
 14  name              25000 non-null  object 
 15  popularity        25000 non-null  object 
 16  release_date      25000 non-null  object

In [19]:
tracks.head(3)

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['sergeirachmaninoff','jameslevine','berlinerp...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",,1921,0.0366,80.954
1,0.963,1921,0.732,['dennisday'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,,7,0.16,-12.441,1,Clancy Lowered the Boom,,1921,0.415,60.936
2,0.0394,1921,0.961,['khpkridhamardawakaratonngayogyakartahadining...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,,3,0.101,-14.85,1,Gati Bali,,1921,0.0339,110.339


In [20]:
from numpy.linalg import linalg


def cos_sim(a,b):
    dot_product = np.dot(a,b)
    norm_a = linalg.norm(a)
    norm_b = linalg.norm(b)
    return dot_product/(norm_a*norm_b)

In [21]:
from collections import defaultdict
from scipy.spatial.distance import cdist
import difflib
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [22]:
song_cluster_pipeline = Pipeline([('scaler', StandardScaler()), 
                                  ('kmeans', KMeans(n_clusters=20, 
                                   verbose=2))],verbose=True)
X = tracks.select_dtypes(np.number)
number_cols = list(X.columns)
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
tracks['cluster_label'] = song_cluster_labels

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
Initialization complete
Iteration 0, inertia 176270.20650266414
Iteration 1, inertia 133552.76962467455
Iteration 2, inertia 128967.11992463979
Iteration 3, inertia 126675.46534269706
Iteration 4, inertia 125072.18414137364
Iteration 5, inertia 124172.42212293122
Iteration 6, inertia 123663.75042075673
Iteration 7, inertia 123354.05167036012
Iteration 8, inertia 123133.85349776459
Iteration 9, inertia 122936.51259587615
Iteration 10, inertia 122745.83870821742
Iteration 11, inertia 122576.93691775124
Iteration 12, inertia 122428.57257979031
Iteration 13, inertia 122312.7562775351
Iteration 14, inertia 122231.81524825124
Iteration 15, inertia 122183.51833543256
Iteration 16, inertia 122140.12358507517
Iteration 17, inertia 122106.27520855714
Iteration 18, inertia 122079.31449988749
Iteration 19, inertia 122059.68225036326
Iteration 20, inertia 122045.27967632553
Iteration 21, inertia 122035.35717308304
Iteration 22, 

In [23]:
tracks.to_csv('tracks_with_cluster.csv')

In [24]:
tracks.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,cluster_label
0,0.0594,1921,0.982,"['sergeirachmaninoff','jameslevine','berlinerp...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",,1921,0.0366,80.954,13
1,0.963,1921,0.732,['dennisday'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,,7,0.16,-12.441,1,Clancy Lowered the Boom,,1921,0.415,60.936,19
2,0.0394,1921,0.961,['khpkridhamardawakaratonngayogyakartahadining...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,,3,0.101,-14.85,1,Gati Bali,,1921,0.0339,110.339,13
3,0.165,1921,0.967,['frankparker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,,5,0.381,-9.316,1,Danny Boy,,1921,0.0354,100.109,6
4,0.253,1921,0.957,['philregan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,,3,0.229,-10.096,1,When Irish Eyes Are Smiling,,1921,0.038,101.665,6


In [25]:
from sklearn.decomposition import PCA

pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=2))])
song_embedding = pca_pipeline.fit_transform(X)
projection = pd.DataFrame(columns=['x', 'y'], data=song_embedding)
projection['title'] = tracks['name']
projection['cluster'] = tracks['cluster_label']

In [26]:
import plotly.express as px
fig = px.scatter(projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'title'])
fig.show()

In [27]:
number_cols = ['time_signature','duration_ms','popularity','valence', 'acousticness', 'danceability',  'energy', 'instrumentalness', 'explicit','key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo']

In [38]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id="677a46ed628944af94b5cdb96e3e25ee",
                                                           client_secret="dfc3a58da7e34b009001ff1794592060"))

def find_song(name, year):
    song_data = defaultdict()
    results = sp.search(q= 'track: {} year: {}'.format(name,year), limit=1)
    if results['tracks']['items'] == []:
        return None

    results = results['tracks']['items'][0]
    track_id = results['id']
    audio_features = sp.audio_features(track_id)[0]

    song_data['name'] = [name]
    song_data['year'] = [year]
    song_data['explicit'] = [int(results['explicit'])]
    song_data['duration_ms'] = [results['duration_ms']]
    song_data['popularity'] = [results['popularity']]

    for key, value in audio_features.items():
        song_data[key] = value

    return pd.DataFrame(song_data)


In [39]:
def get_song_data(song, spotify_data):
    
    """
    Gets the song data for a specific song. The song argument takes the form of a dictionary with 
    key-value pairs for the name and release year of the song.
    """
    
    try:
        song_data = spotify_data[(spotify_data['name'] == song['name']) ].iloc[0]
        return song_data
    
    except IndexError:
        return find_song(song['name'])

In [40]:
def get_mean_vector(song_list, spotify_data):
  
    """
    Gets the mean vector for a list of songs.
    """
    
    song_vectors = []
    
    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        song_vector = song_data[number_cols].values
        song_vectors.append(song_vector)  
    
    song_matrix = np.array(list(song_vectors))
    return np.mean(song_matrix, axis=0)

In [41]:
def flatten_dict_list(dict_list):
   
    """
    Utility function for flattening a list of dictionaries.
    """
    
    flattened_dict = defaultdict()
    for key in dict_list[0].keys():
        flattened_dict[key] = []
    
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)
            
    return flattened_dict

In [55]:
def recommend_songs(song_id, n_songs=10):
  
    """
    Recommends songs based on a list of previous songs that a user has listened to.
    """
    spotify_data = tracks
    song_name = tracks.loc[tracks.get("id") == song_id, 'name']
    print(song_name)
    song_list = [{'name': song_name}]
    print(song_list)
    song_dict = flatten_dict_list(song_list)
    
    song_center = get_mean_vector(song_list, spotify_data)
    scaler = song_cluster_pipeline.steps[0][1]
    print(scaler)
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')
    index = list(np.argsort(distances)[:, :n_songs][0])
    
    rec_songs = spotify_data.iloc[index]
    rec_songs = rec_songs[~rec_songs['name'].isin(song_dict['name'])]
    return rec_songs['id'].tolist()

In [56]:
recommend_songs("6MMF38qO03XJ7puIxVl8Gg")


6MMF38qO03XJ7puIxVl8Gg
       valence  year  acousticness  \
0       0.0594  1921         0.982   
1       0.9630  1921         0.732   
2       0.0394  1921         0.961   
3       0.1650  1921         0.967   
4       0.2530  1921         0.957   
...        ...   ...           ...   
24995   0.1720  1951         0.950   
24996   0.2740  1951         0.986   
24997   0.0540  1951         0.994   
24998   0.5380  1951         0.680   
24999   0.6230  1951         0.864   

                                                 artists  danceability  \
0      ['sergeirachmaninoff','jameslevine','berlinerp...         0.279   
1                                          ['dennisday']         0.819   
2      ['khpkridhamardawakaratonngayogyakartahadining...         0.328   
3                                        ['frankparker']         0.275   
4                                          ['philregan']         0.418   
...                                                  ...           ...   
24

ValueError: Can only compare identically-labeled Series objects

In [62]:
song_name = tracks.loc[tracks.get("name") == "I Might Fall Back On You"]
print(song_name)

       valence  year  acousticness                            artists  \
24989    0.877  1951         0.909  ['margechampion','gowerchampion']   

       danceability  duration_ms  energy  explicit                      id  \
24989         0.667       196973   0.323         0  6MMF38qO03XJ7puIxVl8Gg   

      instrumentalness  key  liveness  loudness  mode  \
24989                     1    0.0301   -12.993     1   

                           name popularity release_date  speechiness   tempo  \
24989  I Might Fall Back On You              1951-09-24        0.135  125.89   

       cluster_label  
24989              0  


In [None]:
df = pd.read_csv(r"C:/Users/shris/Desktop/profiles.csv")
print(df.gender)
df.columns = ['userid', 'gender', 'age', 'country']

In [None]:
df.head()

In [None]:
songs = df.gender.tolist()
output = {}
outputs = []
for song in songs:
    k = recommend_songs(song)
    output.update( {song : k})


In [None]:
output

In [None]:
df1 = pd.DataFrame.from_dict(output, orient='index')

In [None]:
df1.to_csv('content_based.csv')