### 1. Importing libraries

In [239]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.neighbors import KNeighborsRegressor

%matplotlib inline

### 2. Loading the song dataset

In [240]:
songs = pd.read_csv('user_spotify_v3.json.tracks1.csv')
print(songs.shape)
songs.head(10)

(109233, 15)


Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genres,id
0,0.779,0.587,259550,0.299,0.0,8,0.123,-7.365,1,0.0263,94.992,3,0.356,pop dance pop pop pop rap post-teen pop r&b,1bhUWB0zJMIKr9yVPrkEuI
1,0.245,0.638,205748,0.658,4e-06,3,0.0919,-6.318,1,0.0456,105.076,4,0.33,dance pop edm pop tropical house uk funky danc...,2xmrfQpmS2iJExTlklLoAL
2,0.633,0.765,229573,0.688,0.0,4,0.0734,-5.566,1,0.0841,90.013,4,0.434,pop rap rap,42CeaId2XNlxugDvyqHfDf
3,0.129,0.72,197993,0.807,0.0,11,0.183,-4.59,0,0.0432,124.946,4,0.305,dance pop pop post-teen pop brostep edm progre...,0tBbt8CrmxbjRP0pueQkyU
4,0.00413,0.653,202805,0.718,0.0,3,0.0537,-5.232,0,0.213,82.034,4,0.216,hip hop pop rap rap southern hip hop trap musi...,0OI7AFifLSoGzpb8bdBLLV
5,0.0835,0.648,190643,0.608,0.0,8,0.105,-5.16,1,0.0587,126.12,4,0.488,dance pop pop pop christmas,7eFmN6wnsb7WowRKAqRFfs
6,0.032,0.667,174800,0.726,0.0,8,0.0745,-4.172,1,0.054,103.001,4,0.77,dance pop pop post-teen pop big room dance pop...,5Gu0PDLN4YJeW75PpBSg9p
7,0.316,0.661,212120,0.715,0.0,5,0.178,-5.651,0,0.119,148.027,4,0.411,,2amzBJRBPOGszBem4FedfE
8,0.0233,0.845,187521,0.709,0.0,10,0.094,-4.547,0,0.0714,98.062,4,0.62,dance pop pop pop rap post-teen pop pop rap ra...,2z4pcBLQXF2BXKFvd0BuB6
9,0.398,0.751,199095,0.579,2.3e-05,2,0.133,-4.036,1,0.0321,105.031,4,0.349,dance pop pop post-teen pop latin latin hip ho...,3whrwq4DtvucphBPUogRuJ


In [241]:
songs.describe()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
count,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0
mean,0.380904,0.552116,269176.0,0.543602,0.148816,5.229436,0.194541,-10.618803,0.664369,0.151028,117.32221,3.850732,0.440351
std,0.363885,0.174904,246220.8,0.286918,0.301031,3.561608,0.167634,7.077973,0.472213,0.245035,30.634027,0.568357,0.252394
min,0.0,0.0,1155.0,0.0,0.0,0.0,0.0,-58.555,0.0,0.0,0.0,0.0,0.0
25%,0.0293,0.443,189493.0,0.303,0.0,2.0,0.0968,-13.548,0.0,0.0354,94.995,4.0,0.236
50%,0.248,0.574,223962.0,0.576,0.000109,5.0,0.125,-8.279,1.0,0.0488,116.538,4.0,0.417
75%,0.753,0.678,272000.0,0.792,0.0496,8.0,0.237,-5.577,1.0,0.103,135.522,4.0,0.632
max,0.996,0.985,5925082.0,1.0,1.0,11.0,0.997,1.974,1.0,0.969,232.69,5.0,0.999


In [242]:
# Removing duplicate rows and rows with null values
print("Original shape: {}".format(songs.shape))
songs.drop_duplicates(inplace=True)
songs.dropna(how='any', inplace=True)
print("Shape of dataset after modifications: {}".format(songs.shape))

Original shape: (109233, 15)
Shape of dataset after modifications: (56452, 15)


In [243]:
# Getting genres (taking the first genre of the list)
genre = []

for s in songs['genres']:
    g = s[:s.find(" ")]
    genre.append(g)
#     print(s)
    
songs['genre'] = genre

In [244]:
songs.head(10)

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genres,id,genre
0,0.779,0.587,259550,0.299,0.0,8,0.123,-7.365,1,0.0263,94.992,3,0.356,pop dance pop pop pop rap post-teen pop r&b,1bhUWB0zJMIKr9yVPrkEuI,pop
1,0.245,0.638,205748,0.658,4e-06,3,0.0919,-6.318,1,0.0456,105.076,4,0.33,dance pop edm pop tropical house uk funky danc...,2xmrfQpmS2iJExTlklLoAL,dance
2,0.633,0.765,229573,0.688,0.0,4,0.0734,-5.566,1,0.0841,90.013,4,0.434,pop rap rap,42CeaId2XNlxugDvyqHfDf,pop
3,0.129,0.72,197993,0.807,0.0,11,0.183,-4.59,0,0.0432,124.946,4,0.305,dance pop pop post-teen pop brostep edm progre...,0tBbt8CrmxbjRP0pueQkyU,dance
4,0.00413,0.653,202805,0.718,0.0,3,0.0537,-5.232,0,0.213,82.034,4,0.216,hip hop pop rap rap southern hip hop trap musi...,0OI7AFifLSoGzpb8bdBLLV,hip
5,0.0835,0.648,190643,0.608,0.0,8,0.105,-5.16,1,0.0587,126.12,4,0.488,dance pop pop pop christmas,7eFmN6wnsb7WowRKAqRFfs,dance
6,0.032,0.667,174800,0.726,0.0,8,0.0745,-4.172,1,0.054,103.001,4,0.77,dance pop pop post-teen pop big room dance pop...,5Gu0PDLN4YJeW75PpBSg9p,dance
8,0.0233,0.845,187521,0.709,0.0,10,0.094,-4.547,0,0.0714,98.062,4,0.62,dance pop pop pop rap post-teen pop pop rap ra...,2z4pcBLQXF2BXKFvd0BuB6,dance
9,0.398,0.751,199095,0.579,2.3e-05,2,0.133,-4.036,1,0.0321,105.031,4,0.349,dance pop pop post-teen pop latin latin hip ho...,3whrwq4DtvucphBPUogRuJ,dance
10,0.413,0.827,187250,0.419,0.0,10,0.115,-10.329,0,0.112,119.974,4,0.227,underground hip hop,3al2hpm92xE0pBalqWQHdD,underground


In [245]:
songs = songs.reset_index(drop=True)

### Selecting four features to define similarity: acousticness, danceability, energy and liveness

We need to take a couple of steps:
1. Scale all the data
2. Select a random song for a particular genre 
3. Get the closest X songs on those features (by euclidean distance)




#### 1. Scaling the data

In [246]:
# Getting features
features = songs.iloc[:,:(songs.shape[1]-3)]


# Scaling featues
scaler = MinMaxScaler().fit(features)
data = scaler.transform(features)
data = pd.DataFrame(data, columns= features.columns)
data['genre'] = songs['genre']
data['id'] = songs['id']

In [247]:
data.head(100)

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genre,id
0,0.782129,0.595939,0.047149,0.299,0.000000,0.727273,0.123370,0.827932,1.0,0.027198,0.408234,0.6,0.356356,pop,1bhUWB0zJMIKr9yVPrkEuI
1,0.245984,0.647716,0.037332,0.658,0.000004,0.272727,0.092177,0.847222,1.0,0.047156,0.451571,0.8,0.330330,dance,2xmrfQpmS2iJExTlklLoAL
2,0.635542,0.776650,0.041680,0.688,0.000000,0.363636,0.073621,0.861078,1.0,0.086970,0.386837,0.8,0.434434,pop,42CeaId2XNlxugDvyqHfDf
3,0.129518,0.730964,0.035917,0.807,0.000000,1.000000,0.183551,0.879060,0.0,0.044674,0.536963,0.8,0.305305,dance,0tBbt8CrmxbjRP0pueQkyU
4,0.004147,0.662944,0.036795,0.718,0.000000,0.272727,0.053862,0.867232,0.0,0.220269,0.352546,0.8,0.216216,hip,0OI7AFifLSoGzpb8bdBLLV
5,0.083835,0.657868,0.034576,0.608,0.000000,0.727273,0.105316,0.868558,1.0,0.060703,0.542009,0.8,0.488488,dance,7eFmN6wnsb7WowRKAqRFfs
6,0.032129,0.677157,0.031685,0.726,0.000000,0.727273,0.074724,0.886762,1.0,0.055843,0.442653,0.8,0.770771,dance,5Gu0PDLN4YJeW75PpBSg9p
7,0.023394,0.857868,0.034006,0.709,0.000000,0.909091,0.094283,0.879853,0.0,0.073837,0.421428,0.8,0.620621,dance,2z4pcBLQXF2BXKFvd0BuB6
8,0.399598,0.762437,0.036118,0.579,0.000023,0.181818,0.133400,0.889268,1.0,0.033195,0.451377,0.8,0.349349,dance,3whrwq4DtvucphBPUogRuJ
9,0.414659,0.839594,0.033957,0.419,0.000000,0.909091,0.115346,0.773321,0.0,0.115822,0.515596,0.8,0.227227,underground,3al2hpm92xE0pBalqWQHdD


#### 2. Select a random song from the given genre

In [248]:
print("Number of genres available: {}".format(len(data['genre'].unique())))
data['genre'].unique()

Number of genres available: 716


array(['pop', 'dance', 'hip', 'underground', 'bmore', 'dwn', 'latin',
       'rap', 'big', 'brostep', 'edm', 'detroit', 'drill', 'post-teen',
       'dirty', 'deep', 'east', 'ra', 'crunk', 'bass', 'indie',
       'chillwave', 'danish', 'canadian', 'irish', 'blues-rock',
       'alternative', 'alt-indie', 'escape', 'modern', 'emo', 'garage',
       'melodic', 'glam', 'folk-pop', 'contemporary', 'australian', 'lift',
       'christian', 'chicago', 'bachata', 'trap', 'reggaeton', 'cumbia',
       'reggaeto', 'colombian', 'aussietronica', 'house', 'chamber', 'boy',
       'acoustic', 'vapor', 'brooklyn', 'progressive', 'quebecoi',
       'indiecoustic', 'viral', 'channel', 'dreamo', 'folk-po', 'focu',
       'classify', 'compositional', 'new', 'ambient', 'soul', 'nu', 'bow',
       'scorecore', 'focus', 'austindie', 'funk', 'neo', 'folk', 'freak',
       'portland', 'michigan', 'chanson', 'anti-folk', 'vancouver',
       'norwegian', 'seattle', 'electroclash', 'bay', 'adult', 'tropical',
 

In [249]:
selected_genre = 'pop'
N = 50

genre_data = data[data.genre==selected_genre]


ind = data[data.genre==selected_genre].index
r = np.random.choice(ind,1)[0]


In [250]:
seed = data.iloc[r,:]
seed

acousticness                    0.00267068
danceability                      0.813198
duration_ms                      0.0453948
energy                               0.604
instrumentalness                         0
key                              0.0909091
liveness                          0.159478
loudness                          0.819401
mode                                     1
speechiness                       0.437435
tempo                             0.507504
time_signature                         0.8
valence                           0.359359
genre                                  pop
id                  3hmrgEKcLEQi0rlk3mJfZ7
Name: 31641, dtype: object

In [251]:
# Getting feature values for our seed song
acousticness = seed.acousticness
danceability = seed.danceability
energy = seed.energy
liveness = seed.liveness

#### 3. Get the closest N songs on those features (by euclidean distance)

In [252]:
# Calculating euclidean distance for every song with respect to the seed song
distance = []

for i in genre_data.index:
#     print(i)
    d = np.sqrt((genre_data.loc[i,'acousticness']-acousticness)**2 + (genre_data.loc[i,'danceability']-danceability)**2 + (genre_data.loc[i,'energy']-energy)**2 + (genre_data.loc[i,'liveness']-liveness)**2)
    distance.append(d)
    
distance

[0.86549681934558387,
 0.64520509386070035,
 0.29560506014627747,
 0.25413832671721365,
 0.18667302860965107,
 0.26788610865434681,
 0.56458724531414761,
 0.46621308750855883,
 0.2868300120303332,
 0.14976604469976512,
 0.13039112363217564,
 0.32935866103844025,
 0.12076813467755369,
 0.37076104664536252,
 0.36851294723377037,
 0.070497559075709426,
 0.36904399880842953,
 0.20728382679700544,
 0.58866860803876953,
 0.3388789345923488,
 0.54862150317720237,
 0.19178440403181127,
 0.21037654105855014,
 0.56886933261516859,
 0.26353396236583765,
 0.36147828829341039,
 0.63129511302274466,
 0.099957891555003592,
 0.44350392350806206,
 0.17334633373727648,
 0.19135868855169727,
 0.20743190775291223,
 0.085053633406401305,
 0.31541292972809098,
 0.44024361773784448,
 0.049141760199770956,
 0.13222662634097812,
 0.22769293814257191,
 0.1505331783019935,
 0.13901300838598676,
 0.19756282104628572,
 0.25202326503574846,
 0.10158658124398941,
 0.1811991947730017,
 0.1982184364429774,
 0.28767304

In [253]:
genre_data = genre_data.reset_index(drop=True)
genre_data['distance'] = distance
genre_data

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genre,id,distance
0,0.782129,0.595939,0.047149,0.299,0.000000,0.727273,0.123370,0.827932,1.0,0.027198,0.408234,0.6,0.356356,pop,1bhUWB0zJMIKr9yVPrkEuI,0.865497
1,0.635542,0.776650,0.041680,0.688,0.000000,0.363636,0.073621,0.861078,1.0,0.086970,0.386837,0.8,0.434434,pop,42CeaId2XNlxugDvyqHfDf,0.645205
2,0.053614,0.597970,0.048850,0.731,0.000000,0.181818,0.308927,0.846762,1.0,0.089762,0.377790,0.8,0.191191,pop,3YU6vJbjYUG0tiJyXf9x5V,0.295605
3,0.078012,0.581726,0.040543,0.543,0.000000,0.727273,0.199599,0.864929,0.0,0.041158,0.618488,0.8,0.308308,pop,75ZvA4QfFiZvzhj2xkaWAh,0.254138
4,0.011747,0.850761,0.044565,0.771,0.000000,0.090909,0.085557,0.893782,1.0,0.252327,0.756186,0.8,0.405405,pop,2Xqd0wUttjueBfdcltADOv,0.186673
5,0.128514,0.588832,0.039626,0.531,0.000127,0.454545,0.143430,0.841456,0.0,0.080248,0.686690,0.8,0.141141,pop,1OmcAT5Y8eg5bUPv9qJT4R,0.267886
6,0.000643,0.380711,0.031975,0.930,0.000000,0.181818,0.318957,0.915154,1.0,0.058014,0.632275,0.8,0.491491,pop,33SNO8AaciGbNaQFkxvPrW,0.564587
7,0.447791,0.722843,0.043675,0.510,0.000025,0.727273,0.112337,0.813247,1.0,0.065460,0.528446,0.8,0.268268,pop,2Za1AlJNvksouPPWbXpR2X,0.466213
8,0.275100,0.873096,0.034364,0.561,0.000000,0.545455,0.108325,0.857559,0.0,0.191313,0.657458,0.8,0.354354,pop,7r6LNJT2LqpLpEyZQJPygt,0.286830
9,0.011245,0.944162,0.034196,0.673,0.000000,0.909091,0.138415,0.828871,0.0,0.075801,0.580102,0.8,0.219219,pop,4EsYkJjHKMejYLp54woB9c,0.149766


In [254]:
genre_data = genre_data.sort_values(by=['distance'], ascending=True)

In [255]:
playlist = genre_data.iloc[:N,:]
print(playlist.shape)
playlist

(50, 16)


Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genre,id,distance
236,0.002671,0.813198,0.045395,0.604,0.0,0.090909,0.159478,0.819401,1.0,0.437435,0.507504,0.8,0.359359,pop,3hmrgEKcLEQi0rlk3mJfZ7,0.0
35,0.003635,0.846701,0.036254,0.618,2.7e-05,0.727273,0.126379,0.822626,0.0,0.238883,0.583992,0.8,0.553554,pop,2ya9d4B06EQ8Fpj5SSqiTH,0.049142
426,0.002329,0.830457,0.042096,0.607,0.0,1.0,0.10331,0.863768,1.0,0.072906,0.558821,0.8,0.461461,pop,6eoD4UZyqWIdSGapVI8uG0,0.058838
395,0.029418,0.825381,0.037772,0.563,0.0,0.636364,0.125376,0.833128,1.0,0.27818,0.356736,0.8,0.761762,pop,60ZMdbcC5wwmzbGr5XkDHd,0.060892
57,0.051004,0.784772,0.033475,0.624,0.0,0.090909,0.146439,0.87263,1.0,0.0394,0.58455,0.8,0.154154,pop,7IAGWiFEtS9MpDPS8EMJJN,0.060944
233,0.018273,0.834518,0.033458,0.575,4.4e-05,0.545455,0.109328,0.879116,0.0,0.047983,0.472852,0.8,0.286286,pop,1eN1K7YPRButXU7Mq0V58N,0.063671
15,0.01496,0.748223,0.040431,0.624,0.0,0.454545,0.145436,0.867158,0.0,0.205791,0.739336,0.8,0.803804,pop,4XP6qLtHI0n9UjIPTsuqe8,0.070498
47,0.009277,0.852792,0.02661,0.65,0.0,0.090909,0.201605,0.840037,1.0,0.270941,0.627324,0.8,0.527528,pop,5b06JZjts6dgyyuY1m7roq,0.074175
393,0.009277,0.774619,0.037945,0.533,0.0,0.090909,0.157472,0.840276,1.0,0.080869,0.433955,0.8,0.868869,pop,4qib9Vp9ZBukba1uOOWWQ7,0.081099
364,0.047691,0.791878,0.036417,0.658,0.0,0.090909,0.195587,0.893837,0.0,0.154085,0.61049,0.8,0.405405,pop,70mkjE0eVDfuuVKXX7KZXB,0.081861
