# Predicting Moods 
**Date: June 8, 2018**

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier

from sklearn.multioutput import ClassifierChain
from sklearn.multiclass import OneVsRestClassifier

from sklearn.pipeline import Pipeline
from sklearn.base import clone


In [2]:
df = pickle.load(open('audio_lyrics_moods_new.pickle', 'rb'))

In [3]:
df.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence,lyrics_features,moods
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,-3.89,0.752186,0.72692,0.552,0.541,1.0,1.0,oppa gangnam style gangnam style najeneun ttas...,[energetic]
1,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,-7.687,0.351282,0.691817,0.737,0.634,0.796,1.0,late ve ve lose sleep dream thing babi ve ve p...,[happy]
2,5.0,0.709932,0.231455,130.03,0.121741,0.036662,0.0,0.0,4.0,232.46104,-5.15,0.37439,0.704729,0.565,0.565,0.743,1.0,parti rock yeah woo let s parti rock hous toni...,"[cocky, happy]"
3,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,-3.898,0.592798,0.875137,0.004,0.114,1.0,0.742,alagamun lan weh wakun heya hanun gon alagamun...,"[energetic, happy]"
4,3.0,0.741757,0.072774,129.985,0.051255,0.096732,0.000474,0.0,4.0,285.42667,-5.86,0.58563,0.730711,0.271,0.324,0.822,1.0,j lo s new generat mr worldwid parti peopl flo...,[energetic]


In [4]:
np.random.seed(100)

## Audio

In [21]:
audio_moods = df.drop('lyrics_features', axis = 1)
audio_moods.dropna(how = 'any', inplace = True)
audio_moods.reset_index(drop = True, inplace = True)
audio_moods.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence,moods
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,-3.89,0.752186,0.72692,0.552,0.541,1.0,1.0,[energetic]
1,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,-7.687,0.351282,0.691817,0.737,0.634,0.796,1.0,[happy]
2,5.0,0.709932,0.231455,130.03,0.121741,0.036662,0.0,0.0,4.0,232.46104,-5.15,0.37439,0.704729,0.565,0.565,0.743,1.0,"[cocky, happy]"
3,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,-3.898,0.592798,0.875137,0.004,0.114,1.0,0.742,"[energetic, happy]"
4,3.0,0.741757,0.072774,129.985,0.051255,0.096732,0.000474,0.0,4.0,285.42667,-5.86,0.58563,0.730711,0.271,0.324,0.822,1.0,[energetic]


In [22]:
mlb_audio = MultiLabelBinarizer()
y_audio = mlb_audio.fit_transform(audio_moods['moods'])

In [23]:
X_audio = audio_moods.drop('moods', axis = 1)

In [24]:
audio_train_test_list = train_test_split(X_audio, y_audio, test_size = 0.33)

In [25]:
audio_rfc = RandomForestClassifier(n_estimators= 100, class_weight='balanced')
audio_rfc.fit(audio_train_test_list[0], audio_train_test_list[2])

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [26]:
audio_rfc_prediction = audio_rfc.predict(audio_train_test_list[1])
print(classification_report(audio_train_test_list[3], audio_rfc_prediction))

             precision    recall  f1-score   support

          0       0.61      0.08      0.15      1570
          1       0.68      0.14      0.24      1998
          2       0.61      0.08      0.14      1650
          3       0.60      0.24      0.34      3265
          4       0.59      0.26      0.36      2275
          5       0.62      0.04      0.07      1878
          6       0.67      0.15      0.25      1702

avg / total       0.62      0.16      0.24     14338



In [27]:
estimator = OneVsRestClassifier(RandomForestClassifier(class_weight='balanced', n_estimators=100),
                                n_jobs = 2)

audio_chain = ClassifierChain(estimator, 
                              order=[4, 5, 6, 1, 3, 2, 0])

audio_chain.fit(audio_train_test_list[0], audio_train_test_list[2])
audio_chain_prediction = audio_chain.predict(audio_train_test_list[1])

print(classification_report(audio_train_test_list[3], audio_chain_prediction))

             precision    recall  f1-score   support

          0       0.39      0.42      0.40      1570
          1       0.57      0.30      0.39      1998
          2       0.45      0.21      0.29      1650
          3       0.49      0.59      0.53      3265
          4       0.58      0.26      0.36      2275
          5       0.53      0.07      0.12      1878
          6       0.67      0.18      0.28      1702

avg / total       0.53      0.32      0.36     14338



In [28]:
pickle.dump(audio_rfc, open('audio_predict_moods.rfc.pickle', 'wb'))
pickle.dump(audio_chain, open('audio_predict_moods.chain.pickle', 'wb'))

## Lyrics

In [10]:
lyrics_moods = df.loc[df['lyrics_features'] != "", ['lyrics_features', 'moods']].copy()

lyrics_moods.reset_index(inplace = True)

print(lyrics_moods.shape)
lyrics_moods.head()

(20930, 3)


Unnamed: 0,index,lyrics_features,moods
0,0,oppa gangnam style gangnam style najeneun ttas...,[energetic]
1,1,late ve ve lose sleep dream thing babi ve ve p...,[happy]
2,2,parti rock yeah woo let s parti rock hous toni...,"[cocky, happy]"
3,3,alagamun lan weh wakun heya hanun gon alagamun...,"[energetic, happy]"
4,4,j lo s new generat mr worldwid parti peopl flo...,[energetic]


In [11]:
mlb_lyrics = MultiLabelBinarizer()

y_lyrics = mlb_lyrics.fit_transform(lyrics_moods['moods'])

In [12]:
lyrics_train_test_list = train_test_split(lyrics_moods['lyrics_features'], y_lyrics, test_size = 0.3)

In [15]:
lyrics_rfc = Pipeline([ ('tf_idf', TfidfVectorizer()), 
                         ('clf', RandomForestClassifier(class_weight='balanced', n_estimators=100)) 
                        ])

lyrics_rfc.fit(lyrics_train_test_list[0], lyrics_train_test_list[2])
lyrics_rfc_prediction = lyrics_rfc.predict(lyrics_train_test_list[1])

print(classification_report(lyrics_train_test_list[3], lyrics_rfc_prediction))

             precision    recall  f1-score   support

          0       0.45      0.02      0.03      1305
          1       0.42      0.02      0.03      1066
          2       0.44      0.05      0.09      1105
          3       0.61      0.08      0.14      2125
          4       0.53      0.03      0.05      1437
          5       0.54      0.05      0.10      1050
          6       0.31      0.02      0.03       825

avg / total       0.49      0.04      0.08      8913



In [14]:
lyrics_chain = Pipeline([ ('tf_idf', TfidfVectorizer()), 
                         ('clf', ClassifierChain(RandomForestClassifier(class_weight='balanced', n_estimators=100), 
                                                 order=[4, 5, 6, 1, 3, 2, 0])) 
                        ])

lyrics_chain.fit(lyrics_train_test_list[0], lyrics_train_test_list[2])
lyrics_chain_prediction = lyrics_chain.predict(lyrics_train_test_list[1])

print(classification_report(lyrics_train_test_list[3], lyrics_chain_prediction))

             precision    recall  f1-score   support

          0       0.53      0.06      0.10      1305
          1       0.45      0.02      0.04      1066
          2       0.37      0.10      0.16      1105
          3       0.56      0.23      0.32      2125
          4       0.52      0.04      0.07      1437
          5       0.57      0.10      0.17      1050
          6       0.32      0.02      0.03       825

avg / total       0.49      0.10      0.15      8913



In [16]:
pickle.dump(lyrics_rfc, open('lyrics_predict_moods.rfc.pickle', 'wb'))
pickle.dump(lyrics_chain, open('lyrics_predict_moods.chain.pickle', 'wb'))