In [32]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

In [33]:
# Data source: tracks.csv from the Kaggle Spotify dataset at
# https://www.kaggle.com/yamaerenay/spotify-dataset-19212020-160k-tracks?select=tracks.csv
# NOTE(review): '.tracks.csv' (dot-prefixed, no slash) may be a typo for './tracks.csv' — confirm.
tracks_csv = '.tracks.csv'
df = pd.read_csv(tracks_csv)
df.head()  # preview the first rows

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


In [34]:
# Derive each track's age in years from its release date, then drop the raw date column.
# release_date holds strings such as '1922-02-22' or just '1922', so the year is always
# the text before the first '-'.
REFERENCE_YEAR = 2021  # year the ages are measured against

# Guard on the column so that re-running this cell is a no-op instead of a KeyError
# (the column no longer exists after the first run).
if 'release_date' in df.columns:
    release_year = df['release_date'].str.split('-').str[0].astype(int)
    df['years_old'] = REFERENCE_YEAR - release_year
    df = df.drop(columns='release_date')

In [35]:
# Columns that make up the feature vector used for the nearest-neighbour search.
features = [
    'popularity',
    'danceability',
    'energy',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]

In [36]:
# Build the feature matrix and fit a ball-tree nearest-neighbour index over every track.
# NOTE(review): the features are not rescaled; popularity, loudness and tempo take much
# larger values than the 0-1 audio descriptors, so they likely dominate the distances —
# consider standardising the columns before fitting.
X = df[features]
# n_neighbors only sets the default for kneighbors(); radius_neighbors() queries ignore it.
nn = NearestNeighbors(n_neighbors=10, algorithm='ball_tree')
# Fit on the raw ndarray: the queries in this notebook pass plain NumPy arrays, and fitting
# on a DataFrame makes newer scikit-learn versions warn about missing feature names on
# every such query.
nn.fit(X.to_numpy())

NearestNeighbors(algorithm='ball_tree', n_neighbors=10)

In [37]:
# For "testing" this model, I'm going to use songs from my favorite artists first, then I'll use random ones.

In [38]:
# Query track: "You Belong With Me" by Taylor Swift (the original version), row label 81319.
query_row = df.loc[81319][features]
# Shape the feature values as a single-row 2-D array (one query, one column per feature).
y = np.array(query_row).reshape(1, -1)

In [39]:
# Find every track within a radius of 1 unit of the query in feature space.
# The result includes the query track itself, at distance 0.
distances, indexes = nn.radius_neighbors(y, radius=1)
distances, indexes

(array([array([0.89967786, 0.        , 0.46636364])], dtype=object),
 array([array([132266,  81319,  85145])], dtype=object))

In [40]:
# Print the names of the tracks found for each query row.
# radius_neighbors returns row positions in the fitted matrix, so look them up with .iloc
# (position-based) rather than .loc (label-based); the two only agree for a default RangeIndex.
for neighbor_positions in indexes:
    print(df['name'].iloc[neighbor_positions])

132266             Long Live
81319     You Belong With Me
85145             Part Of Me
Name: name, dtype: object


In [41]:
# Same search for "Borderline" by Tame Impala (row label 93322), with a wider radius of 2.
y2 = np.array(df.loc[93322][features]).reshape(1, -1)
d, i = nn.radius_neighbors(y2, radius=2)
# `i` holds row positions in the fitted matrix, hence .iloc (position-based) for the lookup;
# .loc would only give the same rows while df keeps its default RangeIndex.
for neighbor_positions in i:
    print(df['name'].iloc[neighbor_positions])

94080                                           Solo
93390                                       La Mitad
91077                                Te Boté - Remix
93322                                     Borderline
94032    Only Wanna Be With You - Pokémon 25 Version
Name: name, dtype: object


In [42]:
# Arbitrarily chosen track (row label 192047); search within a radius of 1.
seed_track = df.loc[192047][features]
y = np.array(seed_track).reshape(1, -1)
distances, indexes = nn.radius_neighbors(y, radius=1)
distances, indexes

(array([array([0.94898631, 0.67840822, 0.        ])], dtype=object),
 array([array([471615, 365372, 192047])], dtype=object))

In [43]:
# Names of the closest other track (365372) and of the query track itself (192047).
df.loc[365372, 'name'], df.loc[192047, 'name']

('İntizar', 'El Negrito Ñengere')

In [44]:
# Listening note on the previous pair: they sounded fairly similar, both very instrumental.
# Next arbitrarily chosen track (row label 77777); search within a radius of 1.
seed_track = df.loc[77777][features]
y = np.array(seed_track).reshape(1, -1)
distances, indexes = nn.radius_neighbors(y, radius=1)
distances, indexes

(array([array([0.9261112, 0.9415847, 0.       ])], dtype=object),
 array([array([138141, 354763,  77777])], dtype=object))

In [45]:
# Names of the query track (77777) and of one of the neighbours returned for it (354763).
df.loc[77777, 'name'], df.loc[354763, 'name']

('Never Too Late - Live at Buxton Opera House, 2004',
 'Golemiat Chas - The Big Hour')

In [46]:
# Listening note on the previous pair: they sounded fairly similar, both very instrumental.
# Next arbitrarily chosen track (row label 41820); search within a radius of 1.
seed_track = df.loc[41820][features]
y = np.array(seed_track).reshape(1, -1)
distances, indexes = nn.radius_neighbors(y, radius=1)
distances, indexes

(array([array([0.9664188 , 0.92794688, 0.57507739, 0.63525957, 0.91583807,
               0.        , 0.43182715])                                   ],
       dtype=object),
 array([array([146855, 574560, 377640, 528245, 217004,  41820, 274594])],
       dtype=object))

In [47]:
# Full rows for the query track (41820) and its closest match (274594), shown together.
query_track, closest_match = df.loc[41820], df.loc[274594]
query_track, closest_match

(id                                 608HuKHFh2N62z3Ja5vxXj
 name                I Love You More Than You'll Ever Know
 popularity                                             43
 duration_ms                                        356893
 explicit                                                0
 artists                          ['Blood, Sweat & Tears']
 id_artists                     ['24GaH9tRBgZjlvOhpFuKi2']
 danceability                                        0.456
 energy                                              0.391
 key                                                     0
 loudness                                           -9.687
 mode                                                    0
 speechiness                                        0.0419
 acousticness                                        0.665
 instrumentalness                                  0.00008
 liveness                                            0.209
 valence                                             0.4

In [48]:
# Arbitrarily chosen track (row label 327690); search within a radius of 1.
seed_track = df.loc[327690][features]
y = np.array(seed_track).reshape(1, -1)
distances, indexes = nn.radius_neighbors(y, radius=1)
distances, indexes

(array([array([0.        , 0.88651311])], dtype=object),
 array([array([327690, 457396])], dtype=object))

In [49]:
# Names of the query track (327690) and of the only other track within the radius (457396).
df.loc[327690, 'name'], df.loc[457396, 'name']

('วัยหวาน', 'La Machine')

## Exporting the model

In [53]:
import pickle

In [54]:
# Persist the fitted neighbour index to disk. The context manager closes (and flushes)
# the file handle even if pickling fails; a bare open(...) call leaves that to the
# garbage collector.
with open('nnmodel.pkl', 'wb') as model_file:
    pickle.dump(nn, model_file)

In [None]:
# Reload the pickled model; the context manager makes sure the file handle is closed.
# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
with open('nnmodel.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Sanity check: run the most recent query (radius 1) against the reloaded model.
loaded_model.radius_neighbors(y, radius=1)

In [None]:
# it works!