### 1. Importing libraries

In [222]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.neighbors import KNeighborsRegressor

%matplotlib inline

### 2. Loading the song dataset

In [223]:
songs = pd.read_csv('user_spotify_v3.json.tracks1.csv')
print(songs.shape)
songs.head(10)

(109233, 15)


Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genres,id
0,0.779,0.587,259550,0.299,0.0,8,0.123,-7.365,1,0.0263,94.992,3,0.356,pop dance pop pop pop rap post-teen pop r&b,1bhUWB0zJMIKr9yVPrkEuI
1,0.245,0.638,205748,0.658,4e-06,3,0.0919,-6.318,1,0.0456,105.076,4,0.33,dance pop edm pop tropical house uk funky danc...,2xmrfQpmS2iJExTlklLoAL
2,0.633,0.765,229573,0.688,0.0,4,0.0734,-5.566,1,0.0841,90.013,4,0.434,pop rap rap,42CeaId2XNlxugDvyqHfDf
3,0.129,0.72,197993,0.807,0.0,11,0.183,-4.59,0,0.0432,124.946,4,0.305,dance pop pop post-teen pop brostep edm progre...,0tBbt8CrmxbjRP0pueQkyU
4,0.00413,0.653,202805,0.718,0.0,3,0.0537,-5.232,0,0.213,82.034,4,0.216,hip hop pop rap rap southern hip hop trap musi...,0OI7AFifLSoGzpb8bdBLLV
5,0.0835,0.648,190643,0.608,0.0,8,0.105,-5.16,1,0.0587,126.12,4,0.488,dance pop pop pop christmas,7eFmN6wnsb7WowRKAqRFfs
6,0.032,0.667,174800,0.726,0.0,8,0.0745,-4.172,1,0.054,103.001,4,0.77,dance pop pop post-teen pop big room dance pop...,5Gu0PDLN4YJeW75PpBSg9p
7,0.316,0.661,212120,0.715,0.0,5,0.178,-5.651,0,0.119,148.027,4,0.411,,2amzBJRBPOGszBem4FedfE
8,0.0233,0.845,187521,0.709,0.0,10,0.094,-4.547,0,0.0714,98.062,4,0.62,dance pop pop pop rap post-teen pop pop rap ra...,2z4pcBLQXF2BXKFvd0BuB6
9,0.398,0.751,199095,0.579,2.3e-05,2,0.133,-4.036,1,0.0321,105.031,4,0.349,dance pop pop post-teen pop latin latin hip ho...,3whrwq4DtvucphBPUogRuJ


In [224]:
songs.describe()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
count,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0
mean,0.380904,0.552116,269176.0,0.543602,0.148816,5.229436,0.194541,-10.618803,0.664369,0.151028,117.32221,3.850732,0.440351
std,0.363885,0.174904,246220.8,0.286918,0.301031,3.561608,0.167634,7.077973,0.472213,0.245035,30.634027,0.568357,0.252394
min,0.0,0.0,1155.0,0.0,0.0,0.0,0.0,-58.555,0.0,0.0,0.0,0.0,0.0
25%,0.0293,0.443,189493.0,0.303,0.0,2.0,0.0968,-13.548,0.0,0.0354,94.995,4.0,0.236
50%,0.248,0.574,223962.0,0.576,0.000109,5.0,0.125,-8.279,1.0,0.0488,116.538,4.0,0.417
75%,0.753,0.678,272000.0,0.792,0.0496,8.0,0.237,-5.577,1.0,0.103,135.522,4.0,0.632
max,0.996,0.985,5925082.0,1.0,1.0,11.0,0.997,1.974,1.0,0.969,232.69,5.0,0.999


In [225]:
# Removing duplicate rows and rows with null values
print("Original shape: {}".format(songs.shape))
songs.drop_duplicates(inplace=True)
songs.dropna(how='any', inplace=True)
print("Shape of dataset after modifications: {}".format(songs.shape))

Original shape: (109233, 15)
Shape of dataset after modifications: (56452, 15)


In [226]:
# Getting genres (taking the first genre of the list)
genre = []

for s in songs['genres']:
    g = s[:s.find(" ")]
    genre.append(g)
#     print(s)
    
songs['genre'] = genre

In [227]:
songs.head(10)

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genres,id,genre
0,0.779,0.587,259550,0.299,0.0,8,0.123,-7.365,1,0.0263,94.992,3,0.356,pop dance pop pop pop rap post-teen pop r&b,1bhUWB0zJMIKr9yVPrkEuI,pop
1,0.245,0.638,205748,0.658,4e-06,3,0.0919,-6.318,1,0.0456,105.076,4,0.33,dance pop edm pop tropical house uk funky danc...,2xmrfQpmS2iJExTlklLoAL,dance
2,0.633,0.765,229573,0.688,0.0,4,0.0734,-5.566,1,0.0841,90.013,4,0.434,pop rap rap,42CeaId2XNlxugDvyqHfDf,pop
3,0.129,0.72,197993,0.807,0.0,11,0.183,-4.59,0,0.0432,124.946,4,0.305,dance pop pop post-teen pop brostep edm progre...,0tBbt8CrmxbjRP0pueQkyU,dance
4,0.00413,0.653,202805,0.718,0.0,3,0.0537,-5.232,0,0.213,82.034,4,0.216,hip hop pop rap rap southern hip hop trap musi...,0OI7AFifLSoGzpb8bdBLLV,hip
5,0.0835,0.648,190643,0.608,0.0,8,0.105,-5.16,1,0.0587,126.12,4,0.488,dance pop pop pop christmas,7eFmN6wnsb7WowRKAqRFfs,dance
6,0.032,0.667,174800,0.726,0.0,8,0.0745,-4.172,1,0.054,103.001,4,0.77,dance pop pop post-teen pop big room dance pop...,5Gu0PDLN4YJeW75PpBSg9p,dance
8,0.0233,0.845,187521,0.709,0.0,10,0.094,-4.547,0,0.0714,98.062,4,0.62,dance pop pop pop rap post-teen pop pop rap ra...,2z4pcBLQXF2BXKFvd0BuB6,dance
9,0.398,0.751,199095,0.579,2.3e-05,2,0.133,-4.036,1,0.0321,105.031,4,0.349,dance pop pop post-teen pop latin latin hip ho...,3whrwq4DtvucphBPUogRuJ,dance
10,0.413,0.827,187250,0.419,0.0,10,0.115,-10.329,0,0.112,119.974,4,0.227,underground hip hop,3al2hpm92xE0pBalqWQHdD,underground


In [228]:
songs = songs.reset_index(drop=True)

### Selecting four features to define similarity: acousticness, danceability, energy and liveness

We need to take a couple of steps:
1. Scale all the data
2. Select a random song for a particular genre 
3. Get the closest X songs on those features (by euclidean distance)




#### 1. Scaling the data

In [229]:
# Getting features
features = songs.iloc[:,:(songs.shape[1]-3)]


# Scaling featues
scaler = MinMaxScaler().fit(features)
data = scaler.transform(features)
data = pd.DataFrame(data, columns= features.columns)
data['genre'] = songs['genre']
data['id'] = songs['id']

In [230]:
data.head(100)

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genre,id
0,0.782129,0.595939,0.047149,0.299,0.000000,0.727273,0.123370,0.827932,1.0,0.027198,0.408234,0.6,0.356356,pop,1bhUWB0zJMIKr9yVPrkEuI
1,0.245984,0.647716,0.037332,0.658,0.000004,0.272727,0.092177,0.847222,1.0,0.047156,0.451571,0.8,0.330330,dance,2xmrfQpmS2iJExTlklLoAL
2,0.635542,0.776650,0.041680,0.688,0.000000,0.363636,0.073621,0.861078,1.0,0.086970,0.386837,0.8,0.434434,pop,42CeaId2XNlxugDvyqHfDf
3,0.129518,0.730964,0.035917,0.807,0.000000,1.000000,0.183551,0.879060,0.0,0.044674,0.536963,0.8,0.305305,dance,0tBbt8CrmxbjRP0pueQkyU
4,0.004147,0.662944,0.036795,0.718,0.000000,0.272727,0.053862,0.867232,0.0,0.220269,0.352546,0.8,0.216216,hip,0OI7AFifLSoGzpb8bdBLLV
5,0.083835,0.657868,0.034576,0.608,0.000000,0.727273,0.105316,0.868558,1.0,0.060703,0.542009,0.8,0.488488,dance,7eFmN6wnsb7WowRKAqRFfs
6,0.032129,0.677157,0.031685,0.726,0.000000,0.727273,0.074724,0.886762,1.0,0.055843,0.442653,0.8,0.770771,dance,5Gu0PDLN4YJeW75PpBSg9p
7,0.023394,0.857868,0.034006,0.709,0.000000,0.909091,0.094283,0.879853,0.0,0.073837,0.421428,0.8,0.620621,dance,2z4pcBLQXF2BXKFvd0BuB6
8,0.399598,0.762437,0.036118,0.579,0.000023,0.181818,0.133400,0.889268,1.0,0.033195,0.451377,0.8,0.349349,dance,3whrwq4DtvucphBPUogRuJ
9,0.414659,0.839594,0.033957,0.419,0.000000,0.909091,0.115346,0.773321,0.0,0.115822,0.515596,0.8,0.227227,underground,3al2hpm92xE0pBalqWQHdD


#### 2. Select a random song from the given genre

In [231]:
print("Number of genres available: {}".format(len(data['genre'].unique())))
data['genre'].unique()

Number of genres available: 716


array(['pop', 'dance', 'hip', 'underground', 'bmore', 'dwn', 'latin',
       'rap', 'big', 'brostep', 'edm', 'detroit', 'drill', 'post-teen',
       'dirty', 'deep', 'east', 'ra', 'crunk', 'bass', 'indie',
       'chillwave', 'danish', 'canadian', 'irish', 'blues-rock',
       'alternative', 'alt-indie', 'escape', 'modern', 'emo', 'garage',
       'melodic', 'glam', 'folk-pop', 'contemporary', 'australian', 'lift',
       'christian', 'chicago', 'bachata', 'trap', 'reggaeton', 'cumbia',
       'reggaeto', 'colombian', 'aussietronica', 'house', 'chamber', 'boy',
       'acoustic', 'vapor', 'brooklyn', 'progressive', 'quebecoi',
       'indiecoustic', 'viral', 'channel', 'dreamo', 'folk-po', 'focu',
       'classify', 'compositional', 'new', 'ambient', 'soul', 'nu', 'bow',
       'scorecore', 'focus', 'austindie', 'funk', 'neo', 'folk', 'freak',
       'portland', 'michigan', 'chanson', 'anti-folk', 'vancouver',
       'norwegian', 'seattle', 'electroclash', 'bay', 'adult', 'tropical',
 

In [232]:
selected_genre = 'pop'
N = 50

genre_data = data[data.genre==selected_genre]


ind = data[data.genre==selected_genre].index
r = np.random.choice(ind,1)[0]


In [233]:
seed = data.iloc[r,:]
seed

acousticness                      0.284137
danceability                      0.828426
duration_ms                      0.0437185
energy                               0.851
instrumentalness                         0
key                               0.727273
liveness                          0.544634
loudness                          0.886872
mode                                     0
speechiness                       0.196484
tempo                             0.592926
time_signature                         0.8
valence                           0.568569
genre                                dance
id                  1ARJhjuI6TNYZCxYygFQ4F
Name: 43343, dtype: object

In [234]:
# Getting feature values for our seed song
acousticness = seed.acousticness
danceability = seed.danceability
energy = seed.energy
liveness = seed.liveness

#### 3. Get the closest N songs on those features (by euclidean distance)

In [235]:
# Calculating euclidean distance for every song with respect to the seed song
distance = []

for i in genre_data.index:
#     print(i)
    d = np.sqrt((genre_data.loc[i,'acousticness']-acousticness)**2 + (genre_data.loc[i,'danceability']-danceability)**2 + (genre_data.loc[i,'energy']-energy)**2 + (genre_data.loc[i,'liveness']-liveness)**2)
    distance.append(d)
    
distance

[0.5254318584430302,
 0.40709066964352608,
 0.56679802111181432,
 0.56818176928998443,
 0.54021641748088445,
 0.51066941863330917,
 0.55180710008814515,
 0.55961727127666527,
 0.51885494109757835,
 0.54056421970441393,
 0.54600826057501528,
 0.71826825083966406,
 0.5427045244830826,
 0.34364759591880562,
 0.84724610103059483,
 0.63908039692557328,
 0.51498452989950438,
 0.52356726450042235,
 0.56111514748003755,
 0.46429544651083182,
 0.38611482720663787,
 0.47662260370881693,
 0.46831037688575433,
 0.4414090566245028,
 0.40676126716490374,
 0.61738336695789853,
 0.25545314115009682,
 0.55252587815150134,
 0.35625536452742451,
 0.58244451998298519,
 0.63297720279014724,
 0.50822438887827581,
 0.47183762035528887,
 0.72010076477313389,
 0.6077459640755738,
 0.67836232459561885,
 0.48747560646723864,
 0.60689932289746773,
 0.59633596855539284,
 0.6986012087626825,
 0.5649331669560419,
 0.68646906992630419,
 0.63702965251687249,
 0.51851848782927112,
 0.53193067437693553,
 0.7305809478126

In [236]:
genre_data = genre_data.reset_index(drop=True)
genre_data['distance'] = distance
genre_data

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genre,id,distance
0,0.245984,0.647716,0.037332,0.658,0.000004,0.272727,0.092177,0.847222,1.0,0.047156,0.451571,0.8,0.330330,dance,2xmrfQpmS2iJExTlklLoAL,0.525432
1,0.129518,0.730964,0.035917,0.807,0.000000,1.000000,0.183551,0.879060,0.0,0.044674,0.536963,0.8,0.305305,dance,0tBbt8CrmxbjRP0pueQkyU,0.407091
2,0.083835,0.657868,0.034576,0.608,0.000000,0.727273,0.105316,0.868558,1.0,0.060703,0.542009,0.8,0.488488,dance,7eFmN6wnsb7WowRKAqRFfs,0.566798
3,0.032129,0.677157,0.031685,0.726,0.000000,0.727273,0.074724,0.886762,1.0,0.055843,0.442653,0.8,0.770771,dance,5Gu0PDLN4YJeW75PpBSg9p,0.568182
4,0.023394,0.857868,0.034006,0.709,0.000000,0.909091,0.094283,0.879853,0.0,0.073837,0.421428,0.8,0.620621,dance,2z4pcBLQXF2BXKFvd0BuB6,0.540216
5,0.399598,0.762437,0.036118,0.579,0.000023,0.181818,0.133400,0.889268,1.0,0.033195,0.451377,0.8,0.349349,dance,3whrwq4DtvucphBPUogRuJ,0.510669
6,0.047189,0.652792,0.029691,0.783,0.000000,0.909091,0.083250,0.844643,1.0,0.088521,0.662186,0.8,0.579580,dance,7iDa6hUg2VgEL1o1HjmfBn,0.551807
7,0.077008,0.808122,0.036256,0.606,0.000003,0.454545,0.086560,0.866826,0.0,0.067942,0.472762,0.8,0.420420,dance,04JL2liXXV9B9coeGuUsPw,0.559617
8,0.002751,0.767513,0.037986,0.682,0.000009,0.818182,0.147442,0.842450,0.0,0.077870,0.498552,0.8,0.589590,dance,6tF92PMv01Ug9Dh8Rmy6nH,0.518855
9,0.060241,0.685279,0.032867,0.736,0.000000,0.000000,0.088164,0.897227,1.0,0.031231,0.508698,0.8,0.607608,dance,7y9iMe8SOB6z3NoHE2OfXl,0.540564


In [237]:
genre_data = genre_data.sort_values(by=['distance'], ascending=True)

In [238]:
playlist = genre_data.iloc[:N,:]
print(playlist.shape)
playlist

(50, 16)


Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genre,id,distance
3100,0.284137,0.828426,0.043718,0.851,0.0,0.727273,0.544634,0.886872,0.0,0.196484,0.592926,0.8,0.568569,dance,1ARJhjuI6TNYZCxYygFQ4F,0.0
745,0.246988,0.759391,0.050604,0.815,0.0,0.545455,0.548646,0.873846,0.0,0.105481,0.550389,0.8,0.848849,dance,33LLMc7LEuf4tc8h0NzhvC,0.08636
3109,0.324297,0.77868,0.039217,0.82,0.0,0.909091,0.482447,0.857098,0.0,0.078594,0.429679,0.8,0.754755,dance,4JtVOeyOJyPLg72MbGoNn8,0.094423
2609,0.283133,0.738071,0.036286,0.866,4e-06,1.0,0.58676,0.884625,1.0,0.191313,0.545834,0.8,0.653654,dance,0iYebKFUSfF72fUu2OW6ZT,0.10082
2292,0.291165,0.73198,0.036286,0.867,3e-06,1.0,0.504514,0.884367,1.0,0.202689,0.545932,0.8,0.63964,dance,7gKIt3rDGIMJDFVSPBnGmj,0.10591
2496,0.354418,0.767513,0.030154,0.87,0.0,1.0,0.496489,0.907269,0.0,0.124095,0.558705,0.8,0.654655,dance,31zwmEokj7CXoY6elLIbMZ,0.106437
3407,0.153614,0.77868,0.04783,0.872,0.0,0.909091,0.519559,0.885325,0.0,0.078697,0.483906,0.8,0.565566,dance,74OqjT3PMVWlZAyqUt4FgG,0.143459
1142,0.285141,0.789848,0.039585,0.824,0.0,0.909091,0.406219,0.855071,0.0,0.07363,0.429859,0.8,0.756757,dance,3ZFTkvIE7kyPt6Nu3PEa7V,0.146209
1766,0.312249,0.706599,0.040492,0.909,0.0,0.454545,0.467402,0.898277,0.0,0.071872,0.502858,0.8,0.645646,dance,3b7OgfU9SY8C7YBJgTKS74,0.15799
3785,0.325301,0.782741,0.039621,0.759,0.0,0.909091,0.429288,0.814666,0.0,0.084902,0.429877,0.8,0.774775,dance,7sLxtsFaexe1yVDtkUsddw,0.159845
