Music Recommendation System
===========================
**Predict genres**

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [11]:
df_song_list = pd.read_json('../MasterSongList.json')
df_song_list['genres'] = df_song_list['genres'].apply(''.join)
df_song_list['genres'] = df_song_list['genres'].map(lambda x: x.split(':')[0] if len(x) > 0 else np.nan)
df_song_list.loc[:, 'moods'] = df_song_list.loc[:, 'moods'].apply(lambda x: x if len(x) > 0 else np.nan)
df_moods = df_song_list.loc[:, 'moods'].str.join(',').str.get_dummies(sep=',')
df_song_list.loc[df_song_list['genres'] == 'blues & blues rock', 'genres'] = 'blues'

In [12]:
feature_col_names = [
                        'key',
                        'energy',
                        'liveliness',
                        'tempo',
                        'speechiness',
                        'acousticness',
                        'instrumentalness',
                        'time_signature',
                        'duration',
                        'loudness',
                        'valence',
                        'danceability',
                        'mode',
                        'time_signature_confidence',
                        'tempo_confidence',
                        'key_confidence',
                        'mode_confidence'
                    ]
df_audio_features = pd.DataFrame(df_song_list.loc[:, 'audio_features'].tolist(), columns=feature_col_names)
df_audio_features.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,-3.89,0.752186,0.72692,0.552,0.541,1.0,1.0
1,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,-7.687,0.351282,0.691817,0.737,0.634,0.796,1.0
2,5.0,0.709932,0.231455,130.03,0.121741,0.036662,0.0,0.0,4.0,232.46104,-5.15,0.37439,0.704729,0.565,0.565,0.743,1.0
3,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,-3.898,0.592798,0.875137,0.004,0.114,1.0,0.742
4,3.0,0.741757,0.072774,129.985,0.051255,0.096732,0.000474,0.0,4.0,285.42667,-5.86,0.58563,0.730711,0.271,0.324,0.822,1.0


In [13]:
df_song_list_filtered = df_song_list.groupby('genres').filter(lambda x: len(x) > 900)
df_song_list_filtered.reset_index(inplace=True, drop=True)
df_audio_features_filtered = pd.DataFrame(df_song_list_filtered.loc[:, 'audio_features'].tolist(), columns=feature_col_names)
df_moods_filtered = df_song_list_filtered.loc[:, 'moods'].str.join(',').str.get_dummies(sep=',')
df_audio_features_moods_filtered = pd.concat([df_audio_features_filtered, df_moods_filtered, df_song_list_filtered['genres']], axis=1)
df_audio_features_genres_filtered = pd.concat([df_audio_features_filtered, df_song_list_filtered['genres']], axis=1)
df_song_list_filtered.head()
df_audio_features_filtered.head()
# df_song_list_filtered.shape
# df_audio_features_filtered.shape
df_moods_filtered.head()
df_audio_features_moods_filtered.head()
df_audio_features_genres_filtered.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence,genres
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,-3.89,0.752186,0.72692,0.552,0.541,1.0,1.0,pop
1,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,-7.687,0.351282,0.691817,0.737,0.634,0.796,1.0,pop
2,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,-3.898,0.592798,0.875137,0.004,0.114,1.0,0.742,dance
3,8.0,0.777375,0.054104,104.946,0.029302,0.13035,0.0,1.0,4.0,228.29333,-5.112,0.525632,0.729051,0.817,0.672,0.394,0.963,r&b
4,7.0,0.585564,0.108297,120.014,0.038924,0.011707,5e-06,1.0,4.0,193.57333,-6.583,0.622176,0.781822,0.97,0.861,0.792,1.0,pop


In [14]:
df_audio_features_moods = pd.concat([df_audio_features, df_moods, df_song_list['genres']], axis=1)
df_audio_features_moods.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,...,sexual,soothing,spacey,sprightly,sweet,trashy,trippy,visceral,warm,genres
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,...,0,0,0,0,0,0,0,0,0,pop
1,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,...,0,0,0,0,0,0,0,0,0,pop
2,5.0,0.709932,0.231455,130.03,0.121741,0.036662,0.0,0.0,4.0,232.46104,...,0,0,0,0,0,0,0,0,0,
3,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,...,0,0,0,0,0,0,0,0,0,dance
4,3.0,0.741757,0.072774,129.985,0.051255,0.096732,0.000474,0.0,4.0,285.42667,...,0,0,0,0,0,0,0,0,0,reggaeton


In [15]:
df_audio_features_genres = pd.concat([df_audio_features, df_song_list['genres']], axis=1)
df_audio_features_genres.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence,genres
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,-3.89,0.752186,0.72692,0.552,0.541,1.0,1.0,pop
1,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,-7.687,0.351282,0.691817,0.737,0.634,0.796,1.0,pop
2,5.0,0.709932,0.231455,130.03,0.121741,0.036662,0.0,0.0,4.0,232.46104,-5.15,0.37439,0.704729,0.565,0.565,0.743,1.0,
3,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,-3.898,0.592798,0.875137,0.004,0.114,1.0,0.742,dance
4,3.0,0.741757,0.072774,129.985,0.051255,0.096732,0.000474,0.0,4.0,285.42667,-5.86,0.58563,0.730711,0.271,0.324,0.822,1.0,reggaeton


In [16]:
df_audio_features_genres.genres.value_counts()

rock                       7485
rap                        2931
r&b                        2721
dance                      2391
jazz                       2295
indie                      2089
electronica                1476
latin                      1312
country                    1284
singer-songwriter          1189
classical                  1092
blues                       902
pop                         901
reggae & ska                754
funk                        566
oldies                      548
folk                        487
international/world         462
int'l                       329
dubstep & drum 'n' bass     318
bluegrass                   302
children's                  260
film scores                 245
christian                   236
easy listening              196
reggaeton                   127
showtunes                   105
nature sounds                34
hawaiian                     20
Name: genres, dtype: int64

In [74]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def build_genres_model(X, y):
    nb =  GaussianNB()
    svm = SVC(probability=True, class_weight='balanced')
    rfc  = RandomForestClassifier(class_weight='balanced')
    clfs = [
        ('nb' , nb),
        ('svm', svm),
        ('rfc', rfc)
    ]
    
    voting_cls = VotingClassifier(clfs, voting='soft', n_jobs=-1)
    scaler = MinMaxScaler()
    
    steps = [
        ('scaler', scaler),
        ('voting_cls',voting_cls)
    ]
    return Pipeline(steps=steps)
    
def train_genres_model(X, y):
    pass
def test_genres_model(X, y):
    pass

## Model Selection

## pipeline + gridSearchCV
```python
pipe = Pipeline([
    ('reduce_dim', PCA()),
    ('classify', LinearSVC())
])

N_FEATURES_OPTIONS = [2, 4, 8]
C_OPTIONS = [1, 10, 100, 1000]
param_grid = [
    {
        'reduce_dim': [PCA(iterated_power=7), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS
    },
    {
        'reduce_dim': [SelectKBest(chi2)],
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS
    },
]
reducer_labels = ['PCA', 'NMF', 'KBest(chi2)']

grid = GridSearchCV(pipe, cv=3, n_jobs=1, param_grid=param_grid)
```

In [64]:
from sklearn.model_selection import GridSearchCV

In [88]:
param_grids = {
    'voting_cls__voting': ['hard', 'soft'],
    'voting_cls__svm__C':[0.1,1, 10, 100, 1000],
    'voting_cls__svm__gamma':[1,0.1,0.01,0.001,0.0001],
    'voting_cls__rfc__n_estimators': [5, 10, 100],
    'voting_cls__rfc__min_samples_split': [2, 3, 4, 5, 10],
    'voting_cls__rfc__max_features': ['sqrt', 'log2', 'auto']
}

## Feature selection

In [46]:
from sklearn.feature_selection import chi2, SelectKBest
X = df_audio_features_genres.drop('genres', axis=1)
y = df_audio_features_genres['genres']
kbest = SelectKBest(k = 10)
kbest.fit(X, y)

SelectKBest(k=10, score_func=<function f_classif at 0x7fb7149cc378>)

In [60]:
kbest.get_support()
print(kbest.scores_)
print(kbest.get_support())
print(kbest.get_params())

[  3.33527448 464.06962699  18.88506472  42.91744741 373.7629087
 612.64705323 284.5312043   51.19627414  24.95778269 102.8895384
 367.45060945 211.62303198 465.11556891  52.00999985  79.26048311
 140.33772996 119.78246729]
[False  True False False  True  True  True False False  True  True  True
  True False False  True  True]
{'k': 10, 'score_func': <function f_classif at 0x7fb7149cc378>}


In [59]:
X.columns[kbest.get_support()]

Index(['energy', 'speechiness', 'acousticness', 'instrumentalness', 'loudness',
       'valence', 'danceability', 'mode', 'key_confidence', 'mode_confidence'],
      dtype='object')

In [30]:
df_audio_features_genres.dropna(inplace=True)
df_audio_features_genres['genres'].value_counts()

rock                       6435
rap                        2452
r&b                        2344
dance                      2000
jazz                       1889
indie                      1834
electronica                1249
country                    1075
singer-songwriter          1034
latin                      1032
blues                       727
pop                         689
reggae & ska                617
funk                        470
classical                   455
oldies                      454
folk                        402
international/world         311
dubstep & drum 'n' bass     272
int'l                       256
bluegrass                   213
christian                   204
children's                  196
film scores                 180
easy listening              151
reggaeton                   102
showtunes                    91
nature sounds                15
hawaiian                     12
Name: genres, dtype: int64

In [31]:
X = df_audio_features_genres.drop('genres', axis=1)
y = df_audio_features_genres['genres']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
vtcls = build_genres_model(df_audio_features_genres, 'genres')
vtcls.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('voting_cls', VotingClassifier(estimators=[('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')), ('lg', LogisticR...se=0, warm_start=False))],
         flatten_transform=None, n_jobs=4, voting='soft', weights=None))])

In [65]:
from sklearn.metrics import classification_report
nb  = GaussianNB()
svm = SVC(probability=True, class_weight='balanced')
rfc = RandomForestClassifier()
clfs = [
    ('nb' , nb),
    ('svm', svm),
    ('rfc' , rfc)
]

In [32]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
for name, estimator in clfs:
    estimator.fit(X_train_scaled, y_train)

In [33]:
clfs_plus = clfs + [('voting_cls', vtcls)]

for name, estimator in clfs_plus:
    print("estimator: {}".format(name))
    print(classification_report(y_test, estimator.predict(X_test_scaled)))

estimator: knn


  'precision', 'predicted', average, warn_for)


                         precision    recall  f1-score   support

              bluegrass       0.18      0.28      0.22        65
                  blues       0.14      0.26      0.19       200
             children's       0.03      0.05      0.04        55
              christian       0.01      0.02      0.01        61
              classical       0.31      0.49      0.38       144
                country       0.15      0.25      0.19       309
                  dance       0.36      0.49      0.41       625
dubstep & drum 'n' bass       0.10      0.05      0.07        84
         easy listening       0.11      0.06      0.08        47
            electronica       0.29      0.21      0.25       383
            film scores       0.14      0.09      0.11        57
                   folk       0.11      0.09      0.10       114
                   funk       0.11      0.09      0.10       140
              hawaiian        0.00      0.00      0.00         6
                  indie 

  if diff:


In [10]:
 print(classification_report(y_test, vtcls.predict(X_test)))

                         precision    recall  f1-score   support

              bluegrass       0.46      0.29      0.36        65
     blues & blues rock       0.42      0.21      0.28       200
             children's       0.00      0.00      0.00        55
              christian       0.00      0.00      0.00        61
              classical       0.38      0.69      0.49       144
                country       0.24      0.26      0.25       309
                  dance       0.42      0.54      0.48       625
dubstep & drum 'n' bass       0.71      0.06      0.11        84
         easy listening       0.15      0.09      0.11        47
            electronica       0.45      0.20      0.28       383
            film scores       0.37      0.19      0.25        57
                   folk       0.14      0.03      0.04       114
                   funk       0.21      0.09      0.12       140
              hawaiian        0.00      0.00      0.00         6
                  indie 

  if diff:


## Filtered genres (more than 900 instance)

In [34]:
df_audio_features_genres_filtered_clean = df_audio_features_genres_filtered.dropna()
X = df_audio_features_genres_filtered_clean.drop('genres', axis=1)
y = df_audio_features_genres_filtered_clean['genres']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
vtcls_filtered = build_genres_model(df_audio_features_genres_filtered_clean, 'genres')
vtcls_filtered.fit(X_train, y_train)
# df_audio_features_genres_filtered.head()

Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('voting_cls', VotingClassifier(estimators=[('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')), ('lg', LogisticR...se=0, warm_start=False))],
         flatten_transform=None, n_jobs=4, voting='soft', weights=None))])

In [35]:
from sklearn.metrics import classification_report
print(classification_report(y_test, vtcls_filtered.predict(X_test)))

                   precision    recall  f1-score   support

            blues       0.53      0.22      0.31       220
        classical       0.53      0.49      0.51       136
          country       0.28      0.16      0.21       299
            dance       0.48      0.52      0.50       590
      electronica       0.48      0.23      0.31       387
            indie       0.26      0.04      0.06       559
             jazz       0.43      0.59      0.50       550
            latin       0.40      0.25      0.31       288
              pop       0.19      0.01      0.03       219
              r&b       0.36      0.37      0.36       684
              rap       0.65      0.69      0.67       739
             rock       0.52      0.78      0.63      1973
singer-songwriter       0.32      0.25      0.28       321

      avg / total       0.45      0.48      0.44      6965



  if diff:


In [27]:
# import pickle
# pickle.dump(vtcls_filtered, open('predict_genres.pickle', 'wb'))

In [13]:
df_audio_features_genres
' '.join(df_song_list.iloc[2].lyrics_features)

'party rock yeah woo let s go party rock is in the house tonight everybody just have a good time and we gon make you loose your mind wooo everybody just have a good time party rock is in the house tonight ooww everybody just have a good time and we gon make you loose your mind yeah we just wanna see you shake that in the club party rock look a pretty girl she on my jock huh non stop when we in the spot booty on the way like she on the block wooo with a drink i gots to know tight jeans tattoos cause i m rock n roll half black half white domino gain the money out the door yoo i m runnin through these hoes like drano i got that devilish flow rock n roll no halo we party rock yeah that s the crew that i m reppin on the rise to the stop no led in our zeppelin heeeey party rock is in the house tonight wooo everybody just have a good time hey and we gon make you loose your mind everybody just have a good time let s go party rock is in the house tonight everybody just have a good time and we g

In [15]:
df_song_list.iloc[2].audio_features

[5,
 0.709932,
 0.231455,
 130.03,
 0.12174099999999999,
 0.036662,
 0.0,
 0,
 4,
 232.46104,
 -5.15,
 0.37439000000000006,
 0.7047289999999999,
 0.5650000000000001,
 0.5650000000000001,
 0.743,
 1.0]

## Using pickle to load dataframe

In [36]:
import pickle
my_database = pd.read_pickle('my_database_new.pickle')
my_database.head()

Unnamed: 0,artist,name,audio_features,lyrics_features,genres,moods
0,Merle Haggard,Workin' Man Blues,"[2, 0.419332, 0.031391999999999996, 105.352, 0...","[it, s, a, big, job, just, gettin, by, with, n...",country,[earthy]
1,DJ Center,Yes! (Featuring Zaki Ibrahim),"[5, 0.548063, 0.10377199999999999, 92.022, 0.0...",[],rap,"[happy, sad]"
2,Devendra Banhart,Freely,"[4, 0.196956, 0.10985099999999999, 140.55, 0.0...","[it, ain, t, about, a, heart, to, find, it, s,...",indie,[sad]
3,Bobby Vee,Sharing You,"[8, 0.356188, 0.169035, 110.14, 0.030354, 0.70...","[peak, billboard, position, 15, in, 1962, word...",rock,[earthy]
4,Bone Thugs-N-Harmony,Hardtimes,"[10, 0.5808869999999999, 0.13563, 140.22, 0.02...","[yeah, i, m, just, tryin, to, prepare, myself,...",rap,"[happy, sad]"


In [37]:
my_database['audio_features'].isnull().value_counts()
my_database['audio_features'].str.len().value_counts()

17    21459
0      4518
Name: audio_features, dtype: int64

In [38]:
my_database['genres'].str.len().value_counts()

4     9780
3     6553
5     5792
11    1476
7     1284
9     1092
Name: genres, dtype: int64

In [39]:
feature_col_names = [
                        'key',
                        'energy',
                        'liveliness',
                        'tempo',
                        'speechiness',
                        'acousticness',
                        'instrumentalness',
                        'time_signature',
                        'duration',
                        'loudness',
                        'valence',
                        'danceability',
                        'mode',
                        'time_signature_confidence',
                        'tempo_confidence',
                        'key_confidence',
                        'mode_confidence'
                    ]
n_df_audio_features = pd.DataFrame(my_database.loc[:, 'audio_features'].tolist(), columns=feature_col_names)
n_df_audio_features.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence
0,2.0,0.419332,0.031392,105.352,0.033737,0.286198,0.015986,1.0,4.0,162.12,-16.718,0.844026,0.833461,0.541,0.64,0.848,1.0
1,5.0,0.548063,0.103772,92.022,0.040763,0.146351,0.081452,0.0,4.0,259.42667,-6.125,0.580668,0.755222,0.285,0.462,0.94,0.99
2,4.0,0.196956,0.109851,140.55,0.041798,0.677967,0.496227,1.0,4.0,298.53333,-18.621,0.168069,0.338994,0.333,0.462,0.017,0.849
3,8.0,0.356188,0.169035,110.14,0.030354,0.707248,6e-06,1.0,4.0,123.10667,-13.528,0.439992,0.503739,0.272,0.294,0.255,1.0
4,10.0,0.580887,0.13563,140.22,0.027578,0.258097,0.40578,1.0,4.0,169.4,-6.654,0.791455,0.735931,0.817,0.377,0.456,1.0


In [63]:
my_database['genres'].value_counts()

rock           7485
rap            2931
r&b            2721
dance          2391
jazz           2295
indie          2089
electronica    1476
latin          1312
country        1284
classical      1092
pop             901
Name: genres, dtype: int64

In [42]:
n_df_audio_features_genres = pd.concat([my_database['genres'], n_df_audio_features], axis=1)

In [76]:
n_df_audio_features_genres.dropna(inplace=True)
X = n_df_audio_features_genres.drop(['genres', 'time_signature_confidence','tempo_confidence','key_confidence','mode_confidence'], axis=1)
y = n_df_audio_features_genres['genres']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
vtcls = build_genres_model(n_df_audio_features_genres, 'genres')

In [None]:
# GridSearchCV
from sklearn.model_selection import GridSearchCV
vtcls_gs = GridSearchCV(vtcls, param_grid=param_grids, cv = 10, n_jobs=4, verbose=1)
vtcls_gs.fit(X_train, y_train)

Fitting 10 folds for each of 2250 candidates, totalling 22500 fits


  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  **self._backend_args)
  if diff:
  **self._backend_args)
  if diff:
  **self._backend_args)
  if diff:
  **self._backend_args)
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  **self._backend_args)
  if diff:
  **self._backend_args)
  if diff:
  **self._backend_args)
  if diff:
  **self._backend_args)
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  **self._backend_args)
  if diff:
  **self._backend_args)
  if diff:
  **self._backend_args)
  if diff:
  **self._backend_args)
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  **self._backend_args)
  if diff:
  **self._backend_args)
  if diff:
  **self._backend_args)
  if diff:
  **self._backend_args)
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  **self._backend_args)
  if diff:
  **self._backend_args)
  if diff:
  **self._backend_args)
  if diff:
  **self

In [82]:
print(vtcls.get_params().keys())

dict_keys(['memory', 'steps', 'scaler', 'voting_cls', 'scaler__copy', 'scaler__feature_range', 'voting_cls__estimators', 'voting_cls__flatten_transform', 'voting_cls__n_jobs', 'voting_cls__voting', 'voting_cls__weights', 'voting_cls__nb', 'voting_cls__svm', 'voting_cls__rfc', 'voting_cls__nb__priors', 'voting_cls__svm__C', 'voting_cls__svm__cache_size', 'voting_cls__svm__class_weight', 'voting_cls__svm__coef0', 'voting_cls__svm__decision_function_shape', 'voting_cls__svm__degree', 'voting_cls__svm__gamma', 'voting_cls__svm__kernel', 'voting_cls__svm__max_iter', 'voting_cls__svm__probability', 'voting_cls__svm__random_state', 'voting_cls__svm__shrinking', 'voting_cls__svm__tol', 'voting_cls__svm__verbose', 'voting_cls__rfc__bootstrap', 'voting_cls__rfc__class_weight', 'voting_cls__rfc__criterion', 'voting_cls__rfc__max_depth', 'voting_cls__rfc__max_features', 'voting_cls__rfc__max_leaf_nodes', 'voting_cls__rfc__min_impurity_decrease', 'voting_cls__rfc__min_impurity_split', 'voting_cls__

In [61]:
vtcls = build_genres_model(n_df_audio_features_genres, 'genres')
vtcls.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('voting_cls', VotingClassifier(estimators=[('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')), ('lg', LogisticR...se=0, warm_start=False))],
         flatten_transform=None, n_jobs=4, voting='soft', weights=None))])

In [44]:
print(n_df_audio_features_genres['genres'].value_counts())
print(n_df_audio_features_genres.shape)
print(my_database.shape)

rock           6435
rap            2452
r&b            2344
dance          2000
jazz           1889
indie          1834
electronica    1249
country        1075
latin          1032
pop             689
classical       455
Name: genres, dtype: int64
(21454, 18)
(25977, 6)


In [62]:
from sklearn.metrics import classification_report
print(classification_report(y_test, vtcls.predict(X_test)))

             precision    recall  f1-score   support

  classical       0.59      0.46      0.52       137
    country       0.25      0.17      0.20       303
      dance       0.49      0.52      0.50       575
electronica       0.44      0.21      0.29       403
      indie       0.27      0.04      0.07       536
       jazz       0.49      0.60      0.54       605
      latin       0.39      0.21      0.27       287
        pop       0.36      0.02      0.04       204
        r&b       0.34      0.32      0.33       704
        rap       0.64      0.70      0.67       729
       rock       0.54      0.80      0.64      1954

avg / total       0.47      0.50      0.46      6437



  if diff:


## Use moods label to predict genres

## Use predicted moods to predict genres

In [12]:
import pickle
predict_genres = pickle.load(open('predict_genres.pickle', 'rb'))

In [None]:
# predict_genres.predict()