### 1. Importing libraries

In [290]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.neighbors import KNeighborsRegressor

%matplotlib inline

### 2. Loading the song dataset

In [291]:
songs = pd.read_csv('user_spotify_v3.json.tracks1.csv')
print(songs.shape)
songs.head(10)

(109233, 15)


Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genres,id
0,0.779,0.587,259550,0.299,0.0,8,0.123,-7.365,1,0.0263,94.992,3,0.356,pop dance pop pop pop rap post-teen pop r&b,1bhUWB0zJMIKr9yVPrkEuI
1,0.245,0.638,205748,0.658,4e-06,3,0.0919,-6.318,1,0.0456,105.076,4,0.33,dance pop edm pop tropical house uk funky danc...,2xmrfQpmS2iJExTlklLoAL
2,0.633,0.765,229573,0.688,0.0,4,0.0734,-5.566,1,0.0841,90.013,4,0.434,pop rap rap,42CeaId2XNlxugDvyqHfDf
3,0.129,0.72,197993,0.807,0.0,11,0.183,-4.59,0,0.0432,124.946,4,0.305,dance pop pop post-teen pop brostep edm progre...,0tBbt8CrmxbjRP0pueQkyU
4,0.00413,0.653,202805,0.718,0.0,3,0.0537,-5.232,0,0.213,82.034,4,0.216,hip hop pop rap rap southern hip hop trap musi...,0OI7AFifLSoGzpb8bdBLLV
5,0.0835,0.648,190643,0.608,0.0,8,0.105,-5.16,1,0.0587,126.12,4,0.488,dance pop pop pop christmas,7eFmN6wnsb7WowRKAqRFfs
6,0.032,0.667,174800,0.726,0.0,8,0.0745,-4.172,1,0.054,103.001,4,0.77,dance pop pop post-teen pop big room dance pop...,5Gu0PDLN4YJeW75PpBSg9p
7,0.316,0.661,212120,0.715,0.0,5,0.178,-5.651,0,0.119,148.027,4,0.411,,2amzBJRBPOGszBem4FedfE
8,0.0233,0.845,187521,0.709,0.0,10,0.094,-4.547,0,0.0714,98.062,4,0.62,dance pop pop pop rap post-teen pop pop rap ra...,2z4pcBLQXF2BXKFvd0BuB6
9,0.398,0.751,199095,0.579,2.3e-05,2,0.133,-4.036,1,0.0321,105.031,4,0.349,dance pop pop post-teen pop latin latin hip ho...,3whrwq4DtvucphBPUogRuJ


In [292]:
songs.describe()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
count,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0,109233.0
mean,0.380904,0.552116,269176.0,0.543602,0.148816,5.229436,0.194541,-10.618803,0.664369,0.151028,117.32221,3.850732,0.440351
std,0.363885,0.174904,246220.8,0.286918,0.301031,3.561608,0.167634,7.077973,0.472213,0.245035,30.634027,0.568357,0.252394
min,0.0,0.0,1155.0,0.0,0.0,0.0,0.0,-58.555,0.0,0.0,0.0,0.0,0.0
25%,0.0293,0.443,189493.0,0.303,0.0,2.0,0.0968,-13.548,0.0,0.0354,94.995,4.0,0.236
50%,0.248,0.574,223962.0,0.576,0.000109,5.0,0.125,-8.279,1.0,0.0488,116.538,4.0,0.417
75%,0.753,0.678,272000.0,0.792,0.0496,8.0,0.237,-5.577,1.0,0.103,135.522,4.0,0.632
max,0.996,0.985,5925082.0,1.0,1.0,11.0,0.997,1.974,1.0,0.969,232.69,5.0,0.999


In [293]:
# Removing duplicate rows and rows with null values
print("Original shape: {}".format(songs.shape))
songs.drop_duplicates(inplace=True)
songs.dropna(how='any', inplace=True)
print("Shape of dataset after modifications: {}".format(songs.shape))

Original shape: (109233, 15)
Shape of dataset after modifications: (56452, 15)


In [294]:
# Getting genres (taking the first genre of the list)
genre = []

for s in songs['genres']:
    g = s[:s.find(" ")]
    genre.append(g)
#     print(s)
    
songs['genre'] = genre

In [295]:
songs.head(10)

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genres,id,genre
0,0.779,0.587,259550,0.299,0.0,8,0.123,-7.365,1,0.0263,94.992,3,0.356,pop dance pop pop pop rap post-teen pop r&b,1bhUWB0zJMIKr9yVPrkEuI,pop
1,0.245,0.638,205748,0.658,4e-06,3,0.0919,-6.318,1,0.0456,105.076,4,0.33,dance pop edm pop tropical house uk funky danc...,2xmrfQpmS2iJExTlklLoAL,dance
2,0.633,0.765,229573,0.688,0.0,4,0.0734,-5.566,1,0.0841,90.013,4,0.434,pop rap rap,42CeaId2XNlxugDvyqHfDf,pop
3,0.129,0.72,197993,0.807,0.0,11,0.183,-4.59,0,0.0432,124.946,4,0.305,dance pop pop post-teen pop brostep edm progre...,0tBbt8CrmxbjRP0pueQkyU,dance
4,0.00413,0.653,202805,0.718,0.0,3,0.0537,-5.232,0,0.213,82.034,4,0.216,hip hop pop rap rap southern hip hop trap musi...,0OI7AFifLSoGzpb8bdBLLV,hip
5,0.0835,0.648,190643,0.608,0.0,8,0.105,-5.16,1,0.0587,126.12,4,0.488,dance pop pop pop christmas,7eFmN6wnsb7WowRKAqRFfs,dance
6,0.032,0.667,174800,0.726,0.0,8,0.0745,-4.172,1,0.054,103.001,4,0.77,dance pop pop post-teen pop big room dance pop...,5Gu0PDLN4YJeW75PpBSg9p,dance
8,0.0233,0.845,187521,0.709,0.0,10,0.094,-4.547,0,0.0714,98.062,4,0.62,dance pop pop pop rap post-teen pop pop rap ra...,2z4pcBLQXF2BXKFvd0BuB6,dance
9,0.398,0.751,199095,0.579,2.3e-05,2,0.133,-4.036,1,0.0321,105.031,4,0.349,dance pop pop post-teen pop latin latin hip ho...,3whrwq4DtvucphBPUogRuJ,dance
10,0.413,0.827,187250,0.419,0.0,10,0.115,-10.329,0,0.112,119.974,4,0.227,underground hip hop,3al2hpm92xE0pBalqWQHdD,underground


In [296]:
songs = songs.reset_index(drop=True)

### Selecting four features to define similarity: acousticness, danceability, energy and liveness

We need to take a couple of steps:
1. Scale all the data
2. Select a random song for a particular genre 
3. Get the closest X songs on those features (by euclidean distance)




#### 1. Scaling the data

In [297]:
# Getting features
features = songs.iloc[:,:(songs.shape[1]-3)]


# Scaling featues
scaler = MinMaxScaler().fit(features)
data = scaler.transform(features)
data = pd.DataFrame(data, columns= features.columns)
data['genre'] = songs['genre']
data['id'] = songs['id']

In [298]:
data.head(100)

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genre,id
0,0.782129,0.595939,0.047149,0.299,0.000000,0.727273,0.123370,0.827932,1.0,0.027198,0.408234,0.6,0.356356,pop,1bhUWB0zJMIKr9yVPrkEuI
1,0.245984,0.647716,0.037332,0.658,0.000004,0.272727,0.092177,0.847222,1.0,0.047156,0.451571,0.8,0.330330,dance,2xmrfQpmS2iJExTlklLoAL
2,0.635542,0.776650,0.041680,0.688,0.000000,0.363636,0.073621,0.861078,1.0,0.086970,0.386837,0.8,0.434434,pop,42CeaId2XNlxugDvyqHfDf
3,0.129518,0.730964,0.035917,0.807,0.000000,1.000000,0.183551,0.879060,0.0,0.044674,0.536963,0.8,0.305305,dance,0tBbt8CrmxbjRP0pueQkyU
4,0.004147,0.662944,0.036795,0.718,0.000000,0.272727,0.053862,0.867232,0.0,0.220269,0.352546,0.8,0.216216,hip,0OI7AFifLSoGzpb8bdBLLV
5,0.083835,0.657868,0.034576,0.608,0.000000,0.727273,0.105316,0.868558,1.0,0.060703,0.542009,0.8,0.488488,dance,7eFmN6wnsb7WowRKAqRFfs
6,0.032129,0.677157,0.031685,0.726,0.000000,0.727273,0.074724,0.886762,1.0,0.055843,0.442653,0.8,0.770771,dance,5Gu0PDLN4YJeW75PpBSg9p
7,0.023394,0.857868,0.034006,0.709,0.000000,0.909091,0.094283,0.879853,0.0,0.073837,0.421428,0.8,0.620621,dance,2z4pcBLQXF2BXKFvd0BuB6
8,0.399598,0.762437,0.036118,0.579,0.000023,0.181818,0.133400,0.889268,1.0,0.033195,0.451377,0.8,0.349349,dance,3whrwq4DtvucphBPUogRuJ
9,0.414659,0.839594,0.033957,0.419,0.000000,0.909091,0.115346,0.773321,0.0,0.115822,0.515596,0.8,0.227227,underground,3al2hpm92xE0pBalqWQHdD


#### 2. Select a random song from the given genre

In [299]:
print("Number of genres available: {}".format(len(data['genre'].unique())))
data['genre'].unique()

Number of genres available: 716


array(['pop', 'dance', 'hip', 'underground', 'bmore', 'dwn', 'latin',
       'rap', 'big', 'brostep', 'edm', 'detroit', 'drill', 'post-teen',
       'dirty', 'deep', 'east', 'ra', 'crunk', 'bass', 'indie',
       'chillwave', 'danish', 'canadian', 'irish', 'blues-rock',
       'alternative', 'alt-indie', 'escape', 'modern', 'emo', 'garage',
       'melodic', 'glam', 'folk-pop', 'contemporary', 'australian', 'lift',
       'christian', 'chicago', 'bachata', 'trap', 'reggaeton', 'cumbia',
       'reggaeto', 'colombian', 'aussietronica', 'house', 'chamber', 'boy',
       'acoustic', 'vapor', 'brooklyn', 'progressive', 'quebecoi',
       'indiecoustic', 'viral', 'channel', 'dreamo', 'folk-po', 'focu',
       'classify', 'compositional', 'new', 'ambient', 'soul', 'nu', 'bow',
       'scorecore', 'focus', 'austindie', 'funk', 'neo', 'folk', 'freak',
       'portland', 'michigan', 'chanson', 'anti-folk', 'vancouver',
       'norwegian', 'seattle', 'electroclash', 'bay', 'adult', 'tropical',
 

In [300]:
selected_genre = 'hip'
N = 10

genre_data = data[data.genre==selected_genre]


ind = data[data.genre==selected_genre].index
r = np.random.choice(ind,1)[0]


In [301]:
seed = data.iloc[r,:]
seed

acousticness                      0.118474
danceability                      0.679188
duration_ms                      0.0570461
energy                                0.49
instrumentalness                  2.63e-05
key                                      1
liveness                          0.592778
loudness                          0.819825
mode                                     0
speechiness                       0.438469
tempo                             0.636405
time_signature                         0.8
valence                           0.258258
genre                                  hip
id                  3DFjTEueCBYqg7YE05eUJP
Name: 47558, dtype: object

In [302]:
# Getting feature values for our seed song
acousticness = seed.acousticness
danceability = seed.danceability
energy = seed.energy
liveness = seed.liveness

#### 3. Get the closest N songs on those features (by euclidean distance)

In [303]:
# Calculating euclidean distance for every song with respect to the seed song
distance = []

for i in genre_data.index:
#     print(i)
    d = np.sqrt((genre_data.loc[i,'acousticness']-acousticness)**2 + (genre_data.loc[i,'danceability']-danceability)**2 + (genre_data.loc[i,'energy']-energy)**2 + (genre_data.loc[i,'liveness']-liveness)**2)
    distance.append(d)
    
distance

[0.59644769566172307,
 0.61979187585450535,
 0.37768696043819994,
 0.49843560852592306,
 0.5153417097485633,
 0.46225083653822974,
 0.94826892189746992,
 0.44972138964520264,
 0.39464180823503359,
 0.42145877414732058,
 0.39309825838246171,
 0.36808859874060956,
 0.62034689830360534,
 0.49602671537480875,
 0.61412009338507889,
 0.53951990398928318,
 0.64390846005622682,
 0.52030401595752451,
 0.80687082500083684,
 0.46659297933294752,
 0.48390858579789298,
 0.32681650701324488,
 0.38813942358010439,
 0.51290289519732191,
 0.37253579273755949,
 0.57635844458091601,
 0.47994412112167167,
 0.42885534303410694,
 0.43911486486192769,
 0.45589878077567225,
 0.53275029809324981,
 0.56474699222150415,
 0.64476731481203509,
 0.57472443023361608,
 0.53045510435315979,
 0.38042844550397215,
 0.49217976999292695,
 0.49303066798414458,
 0.50690442881404441,
 0.4834311665372123,
 0.56997023056999274,
 0.49271528270149328,
 0.56358804198096091,
 0.29453824373255133,
 0.40565444562242242,
 0.580083484

In [304]:
genre_data = genre_data.reset_index(drop=True)
genre_data['distance'] = distance
genre_data

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genre,id,distance
0,0.004147,0.662944,0.036795,0.718,0.000000,0.272727,0.053862,0.867232,0.0,0.220269,0.352546,0.8,0.216216,hip,0OI7AFifLSoGzpb8bdBLLV,0.596448
1,0.112450,0.709645,0.044285,0.890,0.000000,0.545455,0.120361,0.898683,1.0,0.233713,0.722291,0.8,0.721722,hip,1baLpIuaLdNehFwV5N3WUm,0.619792
2,0.181727,0.644670,0.039741,0.777,0.000000,0.090909,0.358074,0.852750,1.0,0.238883,0.550475,0.8,0.177177,hip,0RyA3o15NOLJYtm9NlDu5c,0.377687
3,0.291165,0.810152,0.038728,0.582,0.000000,0.909091,0.153460,0.827655,1.0,0.106515,0.541815,0.8,0.766767,hip,6PGoSes0D9eUDeeAafB2As,0.498436
4,0.019177,0.699492,0.037685,0.531,0.000000,0.000000,0.089168,0.828521,0.0,0.247156,0.601113,0.8,0.366366,hip,5QZfSiRJhIgDlolnAK8MQF,0.515342
5,0.095984,0.684264,0.037675,0.536,0.000045,0.363636,0.133400,0.817024,1.0,0.088418,0.369303,0.8,0.452452,hip,6Gd123r71KDdpH8JRdYvrh,0.462251
6,0.862450,0.389848,0.041409,0.633,0.000000,0.454545,0.101304,0.832004,0.0,0.137539,0.538055,0.8,0.360360,hip,05pdoheuKPSotkjMgIVX6I,0.948269
7,0.498996,0.661929,0.046078,0.477,0.000004,0.818182,0.354062,0.807333,1.0,0.059359,0.524148,0.8,0.443443,hip,0Iv5zus2xWVY8fGbMovMGn,0.449721
8,0.010944,0.475127,0.055591,0.800,0.000000,0.090909,0.512538,0.840700,1.0,0.156153,0.398552,1.0,0.093493,hip,1IjxCFAyR1ysajk10iHsKh,0.394642
9,0.000419,0.850761,0.040986,0.475,0.714000,0.363636,0.226680,0.862202,0.0,0.037229,0.456474,0.8,0.711712,hip,0eEgMbSzOHmkOeVuNC3E0k,0.421459


In [305]:
genre_data = genre_data.sort_values(by=['distance'], ascending=True)

In [306]:
playlist = genre_data.iloc[:N,:]
print(playlist.shape)
playlist

(10, 16)


Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genre,id,distance
212,0.118474,0.679188,0.057046,0.49,2.6e-05,1.0,0.592778,0.819825,0.0,0.438469,0.636405,0.8,0.258258,hip,3DFjTEueCBYqg7YE05eUJP,0.0
136,0.119478,0.690355,0.057046,0.489,2.9e-05,1.0,0.607823,0.819235,0.0,0.446743,0.318192,0.8,0.266266,hip,5ujh1I7NZH5agbwf7Hp8Hc,0.01879
202,0.119478,0.690355,0.057046,0.489,2.9e-05,1.0,0.607823,0.819235,0.0,0.446743,0.318192,0.8,0.266266,hip,41UYoaW5ErVF7bH4wuXwvo,0.01879
220,0.011847,0.809137,0.055657,0.521,0.0,0.090909,0.565697,0.812363,1.0,0.210962,0.446882,0.8,0.391391,hip,6lVJb47gQEh3PV585qgRoy,0.173062
61,0.03745,0.714721,0.080453,0.606,0.0,0.636364,0.735206,0.80737,1.0,0.358842,0.626714,0.8,0.627628,hip,21FjKQQHLuF5jMw5F83Gb9,0.203885
91,0.228916,0.660914,0.057893,0.616,0.0,0.818182,0.71013,0.826845,0.0,0.309204,0.357059,0.8,0.518519,hip,2J2JIGPDrPip1reebfz2BL,0.205375
201,0.001888,0.702538,0.044426,0.661,0.0,0.636364,0.678034,0.874325,1.0,0.127198,0.354459,0.8,0.703704,hip,04QTusNVhoUHOx7L9jHRHZ,0.225049
228,0.037751,0.530964,0.040512,0.592,0.0,0.090909,0.477432,0.848181,1.0,0.500517,0.337303,1.0,0.880881,hip,3Eq7yD58dIXqOgw1j7NFhY,0.228462
132,0.114458,0.772589,0.047027,0.696,0.0,1.0,0.560682,0.856564,0.0,0.249224,0.494035,0.8,0.272272,hip,3u9HxfcMCFYwJ2R0nkpDWV,0.228486
98,0.001376,0.701523,0.044429,0.653,0.0,0.636364,0.714142,0.873791,1.0,0.094105,0.354601,0.8,0.702703,hip,7kMOzDgfcS6qXQHvfXfByU,0.235604
