Music Recommendation System
===========================
**Predict genres**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df_song_list = pd.read_json('../MasterSongList.json')
df_song_list['genres'] = df_song_list['genres'].apply(''.join)
df_song_list['genres'] = df_song_list['genres'].map(lambda x: x.split(':')[0] if len(x) > 0 else np.nan)
df_song_list.loc[:, 'moods'] = df_song_list.loc[:, 'moods'].apply(lambda x: x if len(x) > 0 else np.nan)
df_moods = df_song_list.loc[:, 'moods'].str.join(',').str.get_dummies(sep=',')

In [3]:
feature_col_names = [
                        'key',
                        'energy',
                        'liveliness',
                        'tempo',
                        'speechiness',
                        'acousticness',
                        'instrumentalness',
                        'time_signature',
                        'duration',
                        'loudness',
                        'valence',
                        'danceability',
                        'mode',
                        'time_signature_confidence',
                        'tempo_confidence',
                        'key_confidence',
                        'mode_confidence'
                    ]
df_audio_features = pd.DataFrame(df_song_list.loc[:, 'audio_features'].tolist(), columns=feature_col_names)
df_audio_features.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,-3.89,0.752186,0.72692,0.552,0.541,1.0,1.0
1,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,-7.687,0.351282,0.691817,0.737,0.634,0.796,1.0
2,5.0,0.709932,0.231455,130.03,0.121741,0.036662,0.0,0.0,4.0,232.46104,-5.15,0.37439,0.704729,0.565,0.565,0.743,1.0
3,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,-3.898,0.592798,0.875137,0.004,0.114,1.0,0.742
4,3.0,0.741757,0.072774,129.985,0.051255,0.096732,0.000474,0.0,4.0,285.42667,-5.86,0.58563,0.730711,0.271,0.324,0.822,1.0


In [4]:
df_audio_features_moods = pd.concat([df_audio_features, df_moods, df_song_list['genres']], axis=1)
df_audio_features_moods.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,...,sexual,soothing,spacey,sprightly,sweet,trashy,trippy,visceral,warm,genres
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,...,0,0,0,0,0,0,0,0,0,pop
1,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,...,0,0,0,0,0,0,0,0,0,pop
2,5.0,0.709932,0.231455,130.03,0.121741,0.036662,0.0,0.0,4.0,232.46104,...,0,0,0,0,0,0,0,0,0,
3,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,...,0,0,0,0,0,0,0,0,0,dance
4,3.0,0.741757,0.072774,129.985,0.051255,0.096732,0.000474,0.0,4.0,285.42667,...,0,0,0,0,0,0,0,0,0,reggaeton


In [5]:
df_audio_features_genres = pd.concat([df_audio_features, df_song_list['genres']], axis=1)
df_audio_features_genres.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence,genres
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,-3.89,0.752186,0.72692,0.552,0.541,1.0,1.0,pop
1,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,-7.687,0.351282,0.691817,0.737,0.634,0.796,1.0,pop
2,5.0,0.709932,0.231455,130.03,0.121741,0.036662,0.0,0.0,4.0,232.46104,-5.15,0.37439,0.704729,0.565,0.565,0.743,1.0,
3,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,-3.898,0.592798,0.875137,0.004,0.114,1.0,0.742,dance
4,3.0,0.741757,0.072774,129.985,0.051255,0.096732,0.000474,0.0,4.0,285.42667,-5.86,0.58563,0.730711,0.271,0.324,0.822,1.0,reggaeton


In [6]:
df_audio_features_genres.genres.value_counts()

rock                       7485
rap                        2931
r&b                        2721
dance                      2391
jazz                       2295
indie                      2089
electronica                1476
latin                      1312
country                    1284
singer-songwriter          1189
classical                  1092
blues & blues rock          902
pop                         901
reggae & ska                754
funk                        566
oldies                      548
folk                        487
international/world         462
int'l                       329
dubstep & drum 'n' bass     318
bluegrass                   302
children's                  260
film scores                 245
christian                   236
easy listening              196
reggaeton                   127
showtunes                   105
nature sounds                34
hawaiian                     20
Name: genres, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def build_genres_model(X, y):
    knn = KNeighborsClassifier()
    lg  = LogisticRegression()
    svm = SVC(probability=True)
    nb  = GaussianNB()
    clfs = [
        ('knn', knn),
        ('lg' , lg),
        ('svm', svm),
        ('nb' , nb)
    ]
    
    voting_cls = VotingClassifier(clfs, voting='soft', n_jobs=4)
    scaler = StandardScaler()
    
    steps = [
        ('scaler', scaler),
        ('voting_cls',voting_cls)
    ]
    return Pipeline(steps=steps)
    
def train_genres_model(X, y):
    pass
def test_genres_model(X, y):
    pass
df_audio_features_genres.dropna(inplace=True)
X = df_audio_features_genres.drop('genres', axis=1)
y = df_audio_features_genres['genres']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
vtcls = build_genres_model(df_audio_features_genres, 'genres')
vtcls.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('voting_cls', VotingClassifier(estimators=[('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')), ('lg'...GaussianNB(priors=None))],
         flatten_transform=None, n_jobs=4, voting='soft', weights=None))])

In [8]:
from sklearn.metrics import classification_report
knn = KNeighborsClassifier()
lg  = LogisticRegression()
svm = SVC(probability=True)
nb  = GaussianNB()
clfs = [
    ('knn', knn),
    ('lg' , lg),
    ('svm', svm),
    ('nb' , nb)
]
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
for name, estimator in clfs:
    estimator.fit(X_train_scaled, y_train)

In [None]:
clfs_plus = clfs + [('voting_cls', vtcls)]

for name, estimator in clfs_plus:
    print("estimator: {}".format(name))
    print(classification_report(y_test, estimator.predict(X_test_scaled)))

In [None]:
 print(classification_report(y_test, vtcls.predict(X_test)))