# Cosine Similarity Algorithm 

## Business Understanding 

Cosine Similarity is an algorithm used to calculate how similar variables are to each other using metrics that describe each variable. What I hope to accomplish with this algorithm is to plug a song in and have the algorithm output 20 songs that are similar to the song plugged in. This will make it easy for users to discover new music and a fast way to create playlists!

In [1]:
import pandas as pd 
import numpy as np 
from numpy import dot 
import operator
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_validate

In [2]:
pd.set_option('display.max_columns', 500)

https://www.datasciencelearner.com/sklearn-cosine-similarity-implementation/

In [3]:
df = pd.read_csv('../data/final_df.csv', index_col=0)

In [4]:
df

Unnamed: 0,track_id,track_name,track_popularity,duration_ms,explicit,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artist_id,followers,genres,artist_name,artist_popularity
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,1922-02-22,0.645,0.44500,0,-13.338,1,0.4510,0.674,0.744000,0.1510,0.127,104.851,3,45tIt06XoI0Iio4LBEVpls,91.0,,Uli,4
1,0PH9AACae1f957JAavhOl2,Lazy Boi,0,157333,0,1922-02-22,0.298,0.46000,1,-18.645,1,0.4530,0.521,0.856000,0.4360,0.402,87.921,4,45tIt06XoI0Iio4LBEVpls,91.0,,Uli,4
2,2SiNuAZ6jIU9xhClRKXcST,Sketch,0,87040,0,1922-02-22,0.634,0.00399,5,-29.973,0,0.0377,0.926,0.919000,0.1050,0.396,79.895,4,45tIt06XoI0Iio4LBEVpls,91.0,,Uli,4
3,4vV7uBcF2AnjNTOejBS5oL,L'enfer,0,40000,0,1922-02-22,0.657,0.32500,10,-14.319,0,0.2540,0.199,0.856000,0.0931,0.105,81.944,5,45tIt06XoI0Iio4LBEVpls,91.0,,Uli,4
4,598LlBn6jpEpVbLjmZPsYV,Graphite,0,104400,0,1922-02-22,0.644,0.68400,7,-8.247,1,0.1990,0.144,0.802000,0.0847,0.138,100.031,4,45tIt06XoI0Iio4LBEVpls,91.0,,Uli,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
470033,0MmaEacabpK8Yp3Mdeo5uY,下雨天,50,265846,0,2020-02-25,0.528,0.67300,4,-3.639,1,0.0314,0.143,0.000000,0.0989,0.297,130.066,4,5VGgFE9nPgMfEnYiPT5J2B,929.0,chinese viral pop,芝麻,36
470034,1dKxf4Ht2SsKLyXfSDJAgy,The Cutest Puppy,67,82500,0,2020-10-30,0.609,0.01720,8,-28.573,1,0.1180,0.996,0.973000,0.1080,0.890,68.619,4,7vgGpuiXdNlCmc994PlMlz,23.0,instrumental lullaby,Laureen Conrad,52
470035,0SjsIzJkZfDU7wlcdklEFR,John Brown's Song,66,185250,0,2020-03-20,0.562,0.03310,1,-25.551,1,0.1030,0.996,0.961000,0.1110,0.386,63.696,3,4MxqhahGRT4BPz1PilXGeu,91.0,instrumental lullaby,Gregory Oberle,55
470036,5rgu12WBIHQtvej2MdHSH0,云与海,50,258267,0,2020-09-26,0.560,0.51800,0,-7.471,0,0.0292,0.785,0.000000,0.0648,0.211,131.896,4,1QLBXKM5GCpyQQSVMNZqrZ,896.0,chinese viral pop,阿YueYue,38


Getting rid of null values in genres

In [5]:
df = df.dropna(subset=['genres'])

In [6]:
df.isna().sum()

track_id             0
track_name           0
track_popularity     0
duration_ms          0
explicit             0
release_date         0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
time_signature       0
artist_id            0
followers            0
genres               0
artist_name          0
artist_popularity    0
dtype: int64

I am going to create a variable with all of the numeric columns to scale and have them easily accessible when I plug them into functions. 

In [7]:
num_cols = ['danceability', 'energy', 'loudness', 'key', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'track_popularity']

In [8]:
df['track_id']

56        07A5yehtSnoedViJAZkNnc
57        08FmqUhxtyLTn6pAh6bk45
58        0JV4iqw2lSKJaHBQZ0e5zK
59        0l3BQsVJ7F76wlN5QhJzaP
60        0xJCJ9XSNcdTIz0QKmhtEn
                   ...          
470033    0MmaEacabpK8Yp3Mdeo5uY
470034    1dKxf4Ht2SsKLyXfSDJAgy
470035    0SjsIzJkZfDU7wlcdklEFR
470036    5rgu12WBIHQtvej2MdHSH0
470037    0NuWgxEp51CutD2pJoF4OM
Name: track_id, Length: 432228, dtype: object

Scaling my numeric data so that it will run smoothly through the cosine similarity algorithm.

In [9]:
scaler = StandardScaler()

scaler.fit(df[num_cols])

StandardScaler()

In [10]:
scaled_data = pd.DataFrame(scaler.fit_transform(df[num_cols]), columns=num_cols)

In [11]:
scaled_data

Unnamed: 0,danceability,energy,loudness,key,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_popularity
0,-0.823764,-1.595823,-2.533133,-1.203130,0.712639,-0.265077,1.719770,-0.279239,-0.010532,-0.419442,0.367961,2.512874,-1.733620
1,-1.534985,-1.936469,-4.034035,0.501498,0.712639,-0.269892,1.722753,3.581239,-0.590591,-0.656883,1.703360,-1.992675,-1.733620
2,-0.106249,-2.165909,-2.986436,0.217393,0.712639,0.347572,1.725736,3.645853,-0.354271,0.193948,-1.482568,-1.992675,-1.733620
3,-1.226580,-0.789683,-1.638480,-0.919025,0.712639,-0.052636,1.692923,-0.370211,3.840421,0.751936,1.355317,0.260100,-1.733620
4,-1.421693,-1.931508,-3.808933,0.785602,0.712639,-0.254245,1.719770,3.800927,-0.488544,-1.361293,-1.121934,-1.992675,-1.733620
...,...,...,...,...,...,...,...,...,...,...,...,...,...
432223,-0.232129,0.454667,1.349378,-0.350816,0.712639,-0.384237,-0.818768,-0.373145,-0.617983,-1.052619,0.356079,0.260100,1.190996
432224,0.277684,-2.256445,-4.169494,0.785602,0.712639,0.136936,1.725736,3.818157,-0.569108,1.294093,-1.718040,0.260100,2.185366
432225,-0.018133,-2.190713,-3.500607,-1.203130,0.712639,0.046664,1.725736,3.766466,-0.552995,-0.700414,-1.884214,-1.992675,2.126874
432226,-0.030721,-0.186111,0.501206,-1.487234,-1.403235,-0.397477,1.096322,-0.373145,-0.801132,-1.392951,0.417850,0.260100,1.190996


Making a new data frame to have track_id, track_name, and artist_name to plug into the algorithm.

In [12]:
df_track_info = df[['track_id', 'track_name', 'artist_name']]
df_track_info

Unnamed: 0,track_id,track_name,artist_name
56,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,Ignacio Corsini
57,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,Ignacio Corsini
58,0JV4iqw2lSKJaHBQZ0e5zK,Martín Fierro - Remasterizado,Ignacio Corsini
59,0l3BQsVJ7F76wlN5QhJzaP,El Vendaval - Remasterizado,Ignacio Corsini
60,0xJCJ9XSNcdTIz0QKmhtEn,La Maleva - Remasterizado,Ignacio Corsini
...,...,...,...
470033,0MmaEacabpK8Yp3Mdeo5uY,下雨天,芝麻
470034,1dKxf4Ht2SsKLyXfSDJAgy,The Cutest Puppy,Laureen Conrad
470035,0SjsIzJkZfDU7wlcdklEFR,John Brown's Song,Gregory Oberle
470036,5rgu12WBIHQtvej2MdHSH0,云与海,阿YueYue


In [13]:
df_track_info.head()

Unnamed: 0,track_id,track_name,artist_name
56,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,Ignacio Corsini
57,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,Ignacio Corsini
58,0JV4iqw2lSKJaHBQZ0e5zK,Martín Fierro - Remasterizado,Ignacio Corsini
59,0l3BQsVJ7F76wlN5QhJzaP,El Vendaval - Remasterizado,Ignacio Corsini
60,0xJCJ9XSNcdTIz0QKmhtEn,La Maleva - Remasterizado,Ignacio Corsini


In [16]:
#df_track_info.join(scaled_data).isna().sum()

track_id                0
track_name              0
artist_name             0
danceability        30810
energy              30810
loudness            30810
key                 30810
mode                30810
speechiness         30810
acousticness        30810
instrumentalness    30810
liveness            30810
valence             30810
tempo               30810
time_signature      30810
track_popularity    30810
dtype: int64

The following code is from 
https://stackoverflow.com/questions/36538780/merging-dataframes-on-index-with-pandas

New dataframe with the scaled data and information on the track_id, track_name, and artist_name. 
- data is organized and ready to go through cosine similarity

In [42]:
df_mark = df_track_info.join(scaled_data, how='inner')
df_mark

Unnamed: 0,track_id,track_name,artist_name,danceability,energy,loudness,key,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_popularity
56,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,Ignacio Corsini,-1.446869,-1.930681,-3.926021,1.637916,-1.403235,-0.041803,1.725736,2.926482,-0.300562,0.379944,0.933857,2.512874,-1.733620
57,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,Ignacio Corsini,-1.062936,-1.310573,-2.526493,0.217393,0.712639,-0.233181,1.707838,3.473548,-0.262965,0.415560,-1.879927,0.260100,-1.733620
58,0JV4iqw2lSKJaHBQZ0e5zK,Martín Fierro - Remasterizado,Ignacio Corsini,-1.723805,-1.695040,-2.734995,-0.066712,0.712639,-0.220543,1.722753,-0.292162,-0.144805,0.003995,1.945178,0.260100,-1.733620
59,0l3BQsVJ7F76wlN5QhJzaP,El Vendaval - Remasterizado,Ignacio Corsini,-0.792294,-1.674370,-2.487980,0.217393,-1.403235,-0.132678,1.704855,3.340013,-0.563737,0.724234,1.656644,0.260100,-1.733620
60,0xJCJ9XSNcdTIz0QKmhtEn,La Maleva - Remasterizado,Ignacio Corsini,-1.465751,-1.178284,-2.248712,0.785602,0.712639,-0.123049,1.698889,3.775081,-0.198514,-0.063280,2.026054,-1.992675,-1.733620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
432223,65o7zOY79D5vqOJJNm1l3T,下雨的晚上,Dadado Huang,-0.232129,0.454667,1.349378,-0.350816,0.712639,-0.384237,-0.818768,-0.373145,-0.617983,-1.052619,0.356079,0.260100,1.190996
432224,7D9yBn5ivJUao1v4jmVdgG,25歲,Dadado Huang,0.277684,-2.256445,-4.169494,0.785602,0.712639,0.136936,1.725736,3.818157,-0.569108,1.294093,-1.718040,0.260100,2.185366
432225,6di4lDxW9XThds6gIHVRtL,跟你出去玩,Dadado Huang,-0.018133,-2.190713,-3.500607,-1.203130,0.712639,0.046664,1.725736,3.766466,-0.552995,-0.700414,-1.884214,-1.992675,2.126874
432226,4EoOSTT7iBjHxSfOfmB8Iq,香格里拉,Dadado Huang,-0.030721,-0.186111,0.501206,-1.487234,-1.403235,-0.397477,1.096322,-0.373145,-0.801132,-1.392951,0.417850,0.260100,1.190996


In [18]:
# refers to the columns to the right of artist name (all num cols)
df_mark.iloc[:,3:]

Unnamed: 0,danceability,energy,loudness,key,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_popularity
56,-1.446869,-1.930681,-3.926021,1.637916,-1.403235,-0.041803,1.725736,2.926482,-0.300562,0.379944,0.933857,2.512874,-1.733620
57,-1.062936,-1.310573,-2.526493,0.217393,0.712639,-0.233181,1.707838,3.473548,-0.262965,0.415560,-1.879927,0.260100,-1.733620
58,-1.723805,-1.695040,-2.734995,-0.066712,0.712639,-0.220543,1.722753,-0.292162,-0.144805,0.003995,1.945178,0.260100,-1.733620
59,-0.792294,-1.674370,-2.487980,0.217393,-1.403235,-0.132678,1.704855,3.340013,-0.563737,0.724234,1.656644,0.260100,-1.733620
60,-1.465751,-1.178284,-2.248712,0.785602,0.712639,-0.123049,1.698889,3.775081,-0.198514,-0.063280,2.026054,-1.992675,-1.733620
...,...,...,...,...,...,...,...,...,...,...,...,...,...
432223,-0.232129,0.454667,1.349378,-0.350816,0.712639,-0.384237,-0.818768,-0.373145,-0.617983,-1.052619,0.356079,0.260100,1.190996
432224,0.277684,-2.256445,-4.169494,0.785602,0.712639,0.136936,1.725736,3.818157,-0.569108,1.294093,-1.718040,0.260100,2.185366
432225,-0.018133,-2.190713,-3.500607,-1.203130,0.712639,0.046664,1.725736,3.766466,-0.552995,-0.700414,-1.884214,-1.992675,2.126874
432226,-0.030721,-0.186111,0.501206,-1.487234,-1.403235,-0.397477,1.096322,-0.373145,-0.801132,-1.392951,0.417850,0.260100,1.190996


In [19]:
# looks at just the first three track data columns
df_mark.iloc[:,:3]

Unnamed: 0,track_id,track_name,artist_name
56,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,Ignacio Corsini
57,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,Ignacio Corsini
58,0JV4iqw2lSKJaHBQZ0e5zK,Martín Fierro - Remasterizado,Ignacio Corsini
59,0l3BQsVJ7F76wlN5QhJzaP,El Vendaval - Remasterizado,Ignacio Corsini
60,0xJCJ9XSNcdTIz0QKmhtEn,La Maleva - Remasterizado,Ignacio Corsini
...,...,...,...
432223,65o7zOY79D5vqOJJNm1l3T,下雨的晚上,Dadado Huang
432224,7D9yBn5ivJUao1v4jmVdgG,25歲,Dadado Huang
432225,6di4lDxW9XThds6gIHVRtL,跟你出去玩,Dadado Huang
432226,4EoOSTT7iBjHxSfOfmB8Iq,香格里拉,Dadado Huang


In [20]:
# From a track_name to a track_id (Track must exactly match)
track = 'House of the Rising Sun'
df_mark[df_mark['track_name'] == track][['artist_name', 'track_id']]

Unnamed: 0,artist_name,track_id
39924,Pete Seeger,2bT8S1aNa6nAcMrDVfw3Xr
74229,The Animals,4mn2kNTqiGLwaUR8JdhJ1l
74234,The Animals,3XC7Jd6SfrQYKZJ6inyRHK
74238,The Animals,1D8MLfoyeJr3W7sEYsDoyv
74306,The Animals,0ev91hkSfD8mR15VBiAHts
119396,Idris Muhammad,0ggo2PQvf6xG7RWF5z1pHN
261078,Five Finger Death Punch,2SgQsZIWs0UC01ibNOBu7q
397123,Gildran,0jIDNggEsTbXOr8h8WBtTa


In [21]:
def find_track_id(track):
    return df_mark[df_mark['track_name'] == track][['artist_name', 'track_id']]

In [22]:
find_track_id("House of the Rising Sun")

Unnamed: 0,artist_name,track_id
39924,Pete Seeger,2bT8S1aNa6nAcMrDVfw3Xr
74229,The Animals,4mn2kNTqiGLwaUR8JdhJ1l
74234,The Animals,3XC7Jd6SfrQYKZJ6inyRHK
74238,The Animals,1D8MLfoyeJr3W7sEYsDoyv
74306,The Animals,0ev91hkSfD8mR15VBiAHts
119396,Idris Muhammad,0ggo2PQvf6xG7RWF5z1pHN
261078,Five Finger Death Punch,2SgQsZIWs0UC01ibNOBu7q
397123,Gildran,0jIDNggEsTbXOr8h8WBtTa


In [23]:
trackID = '2bT8S1aNa6nAcMrDVfw3Xr'

def cosSim(trackID):
    '''
    Takes in a trackID,
    returns top 10 most similar tracks
    '''
    track_num_cols = df_mark[df_mark['track_id'] == trackID].iloc[:,3:]
    cos_sim = cosine_similarity
    return 
    
    #return track_name

In [24]:
cosSim(trackID).values

AttributeError: 'NoneType' object has no attribute 'values'

In [61]:
df_mark.loc[df_mark['track_id'] == '2bT8S1aNa6nAcMrDVfw3Xr']

Unnamed: 0,track_id,track_name,artist_name,danceability,energy,loudness,key,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_popularity
39924,2bT8S1aNa6nAcMrDVfw3Xr,House of the Rising Sun,Pete Seeger,0.982611,-1.033592,-0.88814,-0.919025,-1.403235,-0.182629,1.182829,-0.373138,-0.268336,0.601556,0.1361,0.2601,0.196627


In [25]:
song_0 = df[num_cols][df['track_id'] == '2bT8S1aNa6nAcMrDVfw3Xr']
song_1 = df_mark.iloc[:,3:][df_mark['track_id'] == '1gqkRc9WtOpnGIqxf2Hvzr']

In [26]:
song_1

Unnamed: 0,danceability,energy,loudness,key,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_popularity
249853,-0.584592,-0.607785,0.266808,-0.919025,0.712639,-0.391459,1.275302,-0.373145,-0.714123,-0.922026,-0.349291,0.2601,1.190996


In [27]:
track_num_cols = df_mark[df_mark['track_id'] == trackID].iloc[:,3:]

In [52]:
for index, row in df_mark.iloc[:,3:][:5].iterrows():
    print(row)#cosine_similarity(track_num_cols, row)
    
# [row for index, row in df_mark.iterrows()]

SyntaxError: invalid syntax (<ipython-input-52-2a7ab5edb01b>, line 2)

In [29]:
song_1

Unnamed: 0,danceability,energy,loudness,key,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_popularity
249853,-0.584592,-0.607785,0.266808,-0.919025,0.712639,-0.391459,1.275302,-0.373145,-0.714123,-0.922026,-0.349291,0.2601,1.190996


In [46]:
cos_sim = cosine_similarity(song_0, song_1)
cos_sim

array([[0.38190865]])

In [31]:
num_cols

['danceability',
 'energy',
 'loudness',
 'key',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature',
 'track_popularity']

In [32]:
song_0_sc = scaler.transform(song_0)

In [33]:
df_mark.head()[num_cols]

Unnamed: 0,danceability,energy,loudness,key,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_popularity
56,-1.446869,-1.930681,-3.926021,1.637916,-1.403235,-0.041803,1.725736,2.926482,-0.300562,0.379944,0.933857,2.512874,-1.73362
57,-1.062936,-1.310573,-2.526493,0.217393,0.712639,-0.233181,1.707838,3.473548,-0.262965,0.41556,-1.879927,0.2601,-1.73362
58,-1.723805,-1.69504,-2.734995,-0.066712,0.712639,-0.220543,1.722753,-0.292162,-0.144805,0.003995,1.945178,0.2601,-1.73362
59,-0.792294,-1.67437,-2.48798,0.217393,-1.403235,-0.132678,1.704855,3.340013,-0.563737,0.724234,1.656644,0.2601,-1.73362
60,-1.465751,-1.178284,-2.248712,0.785602,0.712639,-0.123049,1.698889,3.775081,-0.198514,-0.06328,2.026054,-1.992675,-1.73362


In [34]:
df_mark['artist_name'].unique()

array(['Ignacio Corsini', 'Dick Haymes', 'Mistinguett', ...,
       'Green River Ordinance', 'Koala Liu', 'Dadado Huang'], dtype=object)

In [90]:
song_0_sc

array([[-0.55312249, -2.10555171, -3.20866072,  1.35381121,  0.71263911,
        -0.33669367,  1.62133114, -0.33933007, -0.76729484,  0.06731304,
        -0.086714  , -1.99267451, -1.32417391]])

In [89]:
df_mark[num_cols].iloc[0].values.reshape(1, -1)

array([[-1.44686919, -1.9306813 , -3.92602115,  1.63791577, -1.4032348 ,
        -0.04180339,  1.72573637,  2.9264823 , -0.30056155,  0.37994417,
         0.93385655,  2.51287429, -1.73362026]])

In [100]:
df_mark.head(10)[num_cols]

Unnamed: 0,danceability,energy,loudness,key,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_popularity
56,-1.446869,-1.930681,-3.926021,1.637916,-1.403235,-0.041803,1.725736,2.926482,-0.300562,0.379944,0.933857,2.512874,-1.73362
57,-1.062936,-1.310573,-2.526493,0.217393,0.712639,-0.233181,1.707838,3.473548,-0.262965,0.41556,-1.879927,0.2601,-1.73362
58,-1.723805,-1.69504,-2.734995,-0.066712,0.712639,-0.220543,1.722753,-0.292162,-0.144805,0.003995,1.945178,0.2601,-1.73362
59,-0.792294,-1.67437,-2.48798,0.217393,-1.403235,-0.132678,1.704855,3.340013,-0.563737,0.724234,1.656644,0.2601,-1.73362
60,-1.465751,-1.178284,-2.248712,0.785602,0.712639,-0.123049,1.698889,3.775081,-0.198514,-0.06328,2.026054,-1.992675,-1.73362
61,-0.987408,-1.509008,-2.976919,-0.350816,0.712639,-0.241005,1.678008,3.503702,-0.536882,-1.63435,0.043545,0.2601,-1.73362
62,-0.037015,-1.757051,-2.775279,-0.919025,0.712639,0.130918,1.722753,3.533855,0.129112,-0.27302,-0.276617,0.2601,-1.73362
63,-3.555356,-2.220892,-3.355187,-0.066712,0.712639,-0.573208,1.725736,2.935098,-0.702844,-2.227953,-4.034248,-8.750998,-1.675128
64,-0.767118,-2.09687,-2.78546,-0.634921,0.712639,0.142954,1.71977,3.628622,0.531932,0.063356,-1.522365,0.2601,-1.73362
65,-0.446125,-1.868671,-2.988871,-0.066712,0.712639,-0.180221,1.722753,2.573259,-0.203885,0.28101,0.276688,0.2601,-1.73362


In [106]:
df_mark.shape

(401418, 16)

In [107]:
test = df_mark.groupby('artist_name').first().reset_index()

In [109]:
test_series = test[num_cols].apply(lambda x: cosine_similarity(song_0_sc, x.values.reshape(1, -1)), axis=1)

In [114]:
test_series.str[0].str[0].sort_values().tail(10)

15374    0.905102
17547    0.906397
20864    0.909415
29647    0.913536
26880    0.921018
28406    0.921995
15229    0.927315
14110    0.931193
22585    0.932682
9996     0.957633
dtype: float64

In [67]:
similarity = {}
for t_id in df_mark['track_id']:
    song_1 = df_mark[num_cols][df_mark['track_id'] == t_id]
    sim = cosine_similarity(song_0_sc, song_1)[0][0]
    similarity[t_id] = sim

KeyboardInterrupt: 

In [57]:
sorted_dict = {k:v for k,v in sorted(similarity.items(), key=lambda k: k[1], reverse=True)}
top_10 = list(sorted_dict.keys())[:10]

In [64]:
for track_id in top_10:
    track = df_mark.loc[df_mark['track_id'] == track_id]
    display(track[['artist_name', 'track_name']])

Unnamed: 0,artist_name,track_name
9885,Carroll Gibbons,Dream Mother (Carroll Gibbons)


Unnamed: 0,artist_name,track_name
15512,Django Reinhardt,Twelfth Year


Unnamed: 0,artist_name,track_name
2885,Francisco Canaro,Burrerro - Instrumental (Remasterizado)


Unnamed: 0,artist_name,track_name
17554,Andy Williams,Try to Remember


Unnamed: 0,artist_name,track_name
15380,Glenn Miller,Moonlight Serenade / Running Wild - Live


Unnamed: 0,artist_name,track_name
10529,Stratos Pagioumtzis,Sampah Manes


Unnamed: 0,artist_name,track_name
2872,Francisco Canaro,Primavera de Amor - Instrumental (Remasterizado)


Unnamed: 0,artist_name,track_name
17547,Andy Williams,The Christmas Song (Chestnuts Roasting On an O...


Unnamed: 0,artist_name,track_name
4518,Billie Holiday,"Reading From ""Lady Sings The Blues"" 5 - Live A..."


Unnamed: 0,artist_name,track_name
249,Ignacio Corsini,La Espera - Remasterizado


In [44]:
list(similarity.values())[0]

0.5038497689969865

In [38]:
df_mark.loc[df_mark['track_id'] == list(similarity.keys())[0]]

Unnamed: 0,track_id,track_name,artist_name,danceability,energy,loudness,key,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_popularity
56,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,Ignacio Corsini,-1.446869,-1.930681,-3.926021,1.637916,-1.403235,-0.041803,1.725736,2.926482,-0.300562,0.379944,0.933857,2.512874,-1.73362
