In this first notebook, I use Spotify data to find songs that are similar to a given song. Similar songs are simply those whose vector embeddings are close to the embedding of the given song. I construct each embedding from (a scaled version of) attributes like "acousticness", ..., "valence".

In [None]:
import numpy as np
import pandas as pd
import glob
import tqdm
from collections import OrderedDict
import langid # I want english songs

In [None]:
# Locate the dataset. Exactly one data.csv is expected under /kaggle/input;
# fail with a clear message otherwise instead of star-unpacking the glob
# result into read_csv (which would pass any extra paths as further
# positional arguments, or raise an opaque TypeError when nothing matches).
data_files = sorted(glob.glob("/kaggle/input/*/data.csv"))
if not data_files:
    raise FileNotFoundError("no file matches /kaggle/input/*/data.csv")
if len(data_files) > 1:
    raise ValueError(
        f"expected exactly one data.csv, found {len(data_files)}: {data_files}"
    )
csv = pd.read_csv(data_files[0])

In [None]:
def scale_0_1(df, col):
    """Standardize column ``col`` of ``df`` in place to mean 0 and std 1.

    Despite the name, values are not mapped onto the [0, 1] range: every
    value becomes ``(x - mean) / std`` (a z-score), where ``std`` is what
    ``Series.std()`` returns.

    Args:
        df: DataFrame to modify; it is mutated in place.
        col: Label of a numeric column of ``df``.

    Returns:
        None.
    """
    column = df[col]
    # Replace the whole column instead of writing through ``.loc[:, col]``:
    # ``.loc`` sets values into the existing array, so an integer column
    # receiving float z-scores triggers pandas' incompatible-dtype upcast
    # path (a warning today, an error under PDEP-6).
    # NOTE: a constant column has std 0, so its result is all NaN.
    df[col] = (column - column.mean()) / column.std()

In [None]:
# Audio features that form each song's embedding vector.
attrs = [
    "acousticness",
    "danceability",
    "energy",
    "instrumentalness",
    "liveness",
    "loudness",
    "speechiness",
    "tempo",
    "valence",
]

# Rescale every feature column in place so it is roughly N(0, 1).
for attr in attrs:
    scale_0_1(csv, attr)

# Lower-case the titles, then keep only the first row for each distinct
# combination of feature values.
lowercase_names = csv["name"].str.lower()
csv.loc[:, "name"] = lowercase_names
csv = csv.drop_duplicates(subset=attrs)

In [None]:
def check_en(name):
    """Return True when langid classifies ``name`` as English.

    ``langid.classify`` is a fast way to check the language of a song name;
    the first element of its result is compared against ``"en"``.

    Args:
        name: Song title. Anything that is not a ``str`` (for example a
            missing value) is reported as not English.

    Returns:
        bool: True only if the detected language code is ``"en"``.
    """
    # Reject non-text values up front rather than relying on the classifier
    # to raise for them.
    if not isinstance(name, str):
        return False
    try:
        return langid.classify(name)[0] == "en"
    except Exception:
        # Best effort: a title the classifier cannot handle is treated as
        # non-English. Unlike a bare ``except:``, this still lets
        # KeyboardInterrupt / SystemExit stop a long-running apply.
        return False

# Enable pandas' progress_apply (with a tqdm progress bar), then keep only
# the rows whose song name passes check_en.
tqdm.tqdm.pandas()
is_english = csv["name"].progress_apply(check_en)
csv = csv[is_english]

In [None]:
song_name = "i'll do anything"
song_index = 2  # zero-based position of the query song within `rows`

# Match the title as a literal substring: with the default regex=True any
# regex metacharacter in a title ("(", "?", "+", ...) would be interpreted
# as a pattern. na=False keeps missing names out of the boolean mask.
name_matches = csv["name"].str.contains(song_name, regex=False, na=False)
rows = csv.loc[name_matches, ["name", "id", "artists"] + attrs]

if not 0 <= song_index < len(rows):
    raise IndexError(
        f"song_index={song_index} is out of range: "
        f"{len(rows)} song(s) match {song_name!r}"
    )

# Embedding of the chosen song (the third match when song_index is 2).
vec = rows[attrs].iloc[song_index].to_numpy()
rows

In [None]:
# Mean squared difference between the query embedding `vec` and every song,
# computed in a single vectorized NumPy pass instead of per-row arithmetic
# inside iterrows (which made this cell take minutes).
feature_matrix = csv[attrs].to_numpy(dtype=float)
query = np.asarray(vec, dtype=float)
dists = ((query - feature_matrix) ** 2).mean(axis=1).tolist()

# Map distance -> Series holding the song's "name" and "artists", inserted
# in dataset order.
# NOTE: the distance is the key, so two songs at exactly the same distance
# collapse into one entry (the later row wins).
similar_songs = OrderedDict()
labels = csv[["name", "artists"]]
for dist, (_, row) in tqdm.tqdm(zip(dists, labels.iterrows()), total=len(dists)):
    similar_songs[dist] = row

In [None]:
# recommendations: walk the songs from nearest to farthest and print each
# one until the distance exceeds 0.05
ranked = sorted(similar_songs.items(), key=lambda pair: pair[0])
for k, v in ranked:
    if k > 0.05:
        break
    print(k, v['name'], v['artists'])