## Part 1: Cleaning Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

In [2]:
# Source data: tracks.csv from the Kaggle Spotify dataset at
# https://www.kaggle.com/yamaerenay/spotify-dataset-19212020-160k-tracks?select=tracks.csv
# The file is expected next to this notebook.
df = pd.read_csv('./tracks.csv')
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


In [3]:
# Track age in years, counted from 2021. release_date is either 'YYYY-MM-DD' or a
# bare 'YYYY', so only the text before the first '-' is parsed.
df['years_old'] = [2021 - int(released.split('-')[0]) for released in df['release_date']]

# These columns are not used by the rest of the notebook.
df = df.drop(columns=['duration_ms', 'id_artists', 'release_date'])

# The exported CSV omits the track name; df itself keeps it for the lookups below.
df.drop(columns='name').to_csv('./tracks_clean.csv')

In [4]:
# Numeric columns that make up the feature space for the neighbour search.
features = [
    'popularity',
    'danceability', 'energy', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo',
]

In [5]:
# Build a ball-tree index over the selected feature columns so that tracks close to a
# query track in feature space can be looked up.
# NOTE(review): the columns are fitted unscaled. In the preview rows tempo is ~100-170,
# loudness ~-13 to -28 and popularity an integer, while most other features lie in 0-1,
# so the wide-range columns probably dominate the distances — consider standardizing.
# NOTE(review): the later cells only call radius_neighbors(); n_neighbors=10 is
# presumably unused by those queries — confirm.
X = df[features]
nn = NearestNeighbors(n_neighbors=10,algorithm='ball_tree')
nn.fit(X)

NearestNeighbors(algorithm='ball_tree', n_neighbors=10)

## Part 2: "Testing"

In [6]:
# Query point: "You Belong With Me" by Taylor Swift (the original version), as a
# single-row 2-D array (1 x number of features).
y = np.array(df.loc[81319][features]).reshape(1, -1)

In [7]:
# All tracks within a radius of 1 unit of the query point, with their distances.
distances, indexes = nn.radius_neighbors(y, radius=1)
distances, indexes

(array([array([0.89967786, 0.        , 0.46636364])], dtype=object),
 array([array([132266,  81319,  85145])], dtype=object))

In [8]:
# One entry of `indexes` per query row; print the names of the tracks it points to.
for neighbor_ids in indexes:
    print(df['name'].loc[neighbor_ids])

132266             Long Live
81319     You Belong With Me
85145             Part Of Me
Name: name, dtype: object


In [9]:
# Query point: "Borderline" by Tame Impala, searched with a wider radius of 2.
y2 = np.array(df.loc[93322][features]).reshape(1, -1)
d, i = nn.radius_neighbors(y2, radius=2)

# Print the names of the tracks found for each query row.
for neighbor_ids in i:
    print(df['name'].loc[neighbor_ids])

94080                                           Solo
93390                                       La Mitad
91077                                Te Boté - Remix
93322                                     Borderline
94032    Only Wanna Be With You - Pokémon 25 Version
Name: name, dtype: object


In [10]:
# Randomly picked track, searched within a radius of 1.
y = np.array(df.loc[192047][features]).reshape(1, -1)
distances, indexes = nn.radius_neighbors(y, radius=1)
distances, indexes

(array([array([0.94898631, 0.67840822, 0.        ])], dtype=object),
 array([array([471615, 365372, 192047])], dtype=object))

In [11]:
# Names of one returned neighbour (365372) and of the query track (192047).
df.loc[365372, 'name'], df.loc[192047, 'name']

('İntizar', 'El Negrito Ñengere')

In [12]:
# Observation on the previous pair: they sounded fairly similar, both very instrumental.
# Next randomly picked track, searched within a radius of 1.
y = np.array(df.loc[77777][features]).reshape(1, -1)
distances, indexes = nn.radius_neighbors(y, radius=1)
distances, indexes

(array([array([0.9261112, 0.9415847, 0.       ])], dtype=object),
 array([array([138141, 354763,  77777])], dtype=object))

In [13]:
# Names of the query track (77777) and of one returned neighbour (354763).
df.loc[77777, 'name'], df.loc[354763, 'name']

('Never Too Late - Live at Buxton Opera House, 2004',
 'Golemiat Chas - The Big Hour')

In [14]:
# Randomly picked track, searched within a radius of 1.
y = np.array(df.loc[41820][features]).reshape(1, -1)
distances, indexes = nn.radius_neighbors(y, radius=1)
distances, indexes

(array([array([0.9664188 , 0.92794688, 0.57507739, 0.63525957, 0.91583807,
               0.        , 0.43182715])                                   ],
       dtype=object),
 array([array([146855, 574560, 377640, 528245, 217004,  41820, 274594])],
       dtype=object))

In [15]:
# Full rows of the query track (41820) and of its closest returned neighbour
# (274594, distance ~0.43 in the result above), shown together for comparison.
df.loc[41820], df.loc[274594]

(id                                 608HuKHFh2N62z3Ja5vxXj
 name                I Love You More Than You'll Ever Know
 popularity                                             43
 explicit                                                0
 artists                          ['Blood, Sweat & Tears']
 danceability                                        0.456
 energy                                              0.391
 key                                                     0
 loudness                                           -9.687
 mode                                                    0
 speechiness                                        0.0419
 acousticness                                        0.665
 instrumentalness                                  0.00008
 liveness                                            0.209
 valence                                             0.411
 tempo                                              133.95
 time_signature                                         

In [16]:
# Randomly picked track, searched within a radius of 1.
y = np.array(df.loc[327690][features]).reshape(1, -1)
distances, indexes = nn.radius_neighbors(y, radius=1)
distances, indexes

(array([array([0.        , 0.88651311])], dtype=object),
 array([array([327690, 457396])], dtype=object))

In [17]:
# Names of the query track (327690) and of its only returned neighbour (457396).
# Listening note: these two sound VERY similar.
df.loc[327690, 'name'], df.loc[457396, 'name']

('วัยหวาน', 'La Machine')

## Part 3: One-Hot Encoding Categorical Features (Coming Soon)

In [None]:
# Inspect the current frame before encoding the categorical columns.
df

In [None]:
def get_artists(val):
    """Parse the stringified artist list of one track into a list of names.

    The ``artists`` column holds text such as ``"['Gentle Bones', 'Clara Benin']"``.
    The text is evaluated as a Python literal rather than split on ``", "``, so a
    name that itself contains a comma (``"['Blood, Sweat & Tears']"``) stays in one
    piece, and a name written with double quotes because it contains an apostrophe
    is returned without its surrounding quotes.

    Args:
        val: The raw cell value (a string), or a list/tuple that was already parsed.

    Returns:
        list: The artist names in their original order. ``"[]"`` yields ``[]``.
    """
    import ast  # local import: only this helper needs it

    # Already parsed (e.g. the conversion was applied twice): return a copy.
    if isinstance(val, (list, tuple)):
        return list(val)

    try:
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return list(parsed)

    # Not a valid list literal: fall back to splitting on ", " and trimming quotes.
    return [artist.strip("'\"") for artist in val.strip("[]").split(", ")]

# Quick check of the parser on a two-artist value.
get_artists("['Gentle Bones', 'Clara Benin']")

In [None]:
# Replace every value of the artists column with the list that get_artists returns for it.
df['artists'] = df['artists'].apply(get_artists)

In [None]:
# One-hot encode the per-track artist lists in a single bincount pass.
# Expects df.artists to hold one list of artist names per row.
v = df.artists.values                    # array of per-track artist lists
l = [len(x) for x in v.tolist()]         # number of artists on each track
f, u = pd.factorize(np.concatenate(v))   # f: integer code per (track, artist) pair, u: distinct artists
n, m = len(v), u.size                    # n tracks, m distinct artists
i = np.arange(n).repeat(l)               # row position of each (track, artist) pair

# Each (row, artist) pair maps to cell i * m + f of a flattened n x m grid; bincount
# counts the pairs per cell and the reshape restores the n x m layout.
# NOTE(review): this allocates a dense n * m integer array. The neighbour indices seen
# earlier exceed 570,000, so n is large and the array may not fit in memory — confirm,
# or move to a sparse encoding.
# axis=1 joins the artist indicator columns to the feature columns side by side;
# the default axis=0 would stack the two frames as extra rows instead.
dummied = pd.concat([pd.DataFrame(
    np.bincount(i * m + f, minlength=n * m).reshape(n, m),
    df.index, u
), df[features]], axis=1)

In [None]:
# Shrink 64-bit numeric columns to the narrowest dtype that still reproduces their values.
# int64 columns try int8, int16, int32; float64 columns try float16, float32. A candidate
# is accepted only when np.allclose reports that the cast column matches the original,
# so a column that no candidate reproduces keeps its dtype.
for column in df:
    # Check the column's own dtype instead of rebuilding (and copying) a dtype-filtered
    # frame for every column.
    current_dtype = str(df[column].dtype)
    if current_dtype == 'int64':
        candidates = ('int8', 'int16', 'int32')
    elif current_dtype == 'float64':
        candidates = ('float16', 'float32')
    else:
        continue

    for candidate in candidates:
        converted = df[column].astype(candidate)
        if np.allclose(df[column], converted):
            df[column] = converted
            break  # keep the first (narrowest) dtype that matches