In [40]:
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler

from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

from sklearn.multioutput import ClassifierChain
from sklearn.multiclass import OneVsRestClassifier

from sklearn.pipeline import Pipeline
from sklearn.base import clone

In [2]:
df = pickle.load(open('audio_lyrics_moods_new.pickle', 'rb'))

In [3]:
df.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence,lyrics_features,moods
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,-3.89,0.752186,0.72692,0.552,0.541,1.0,1.0,oppa gangnam style gangnam style najeneun ttas...,[energetic]
1,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,-7.687,0.351282,0.691817,0.737,0.634,0.796,1.0,late ve ve lose sleep dream thing babi ve ve p...,[happy]
2,5.0,0.709932,0.231455,130.03,0.121741,0.036662,0.0,0.0,4.0,232.46104,-5.15,0.37439,0.704729,0.565,0.565,0.743,1.0,parti rock yeah woo let s parti rock hous toni...,"[cocky, happy]"
3,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,-3.898,0.592798,0.875137,0.004,0.114,1.0,0.742,alagamun lan weh wakun heya hanun gon alagamun...,"[energetic, happy]"
4,3.0,0.741757,0.072774,129.985,0.051255,0.096732,0.000474,0.0,4.0,285.42667,-5.86,0.58563,0.730711,0.271,0.324,0.822,1.0,j lo s new generat mr worldwid parti peopl flo...,[energetic]


In [4]:
np.random.seed(99)

In [26]:
def optimizing (estimator, param_grid, train_test_list, n_jobs):
    
    output = {}
    X_train, X_test, y_train, y_test = train_test_list
    
    grid = GridSearchCV(estimator, param_grid, refit = True, n_jobs = n_jobs)
    grid.fit(X_train, y_train) 
    
    output['estimator'] = grid.best_estimator_
    output['params'] = grid.best_params_
    output['prediction'] = grid.predict(X_test)
    
    return output


def estimator_searching (init_classifiers, param_grids, train_test_list, n_jobs):
    
    classifiers = {method : {} for method in init_classifiers.keys()}
    
    for method in classifiers.keys():
        
        estimator = init_classifiers [method]
        param_grid = param_grids [method]
        
        classifiers[method] = optimizing(estimator, param_grid, train_test_list, n_jobs)
    
    return classifiers

def show_selected_params (classifiers, train_test_list):
        
    y_true = train_test_list[3]
        
    for model, result in classifiers.items():
        print("    Classification report of {} model:".format(model)) 
        print(classification_report(y_true, result['prediction']))
        print(" ...with selected params: {} \n".format(result['params']))

    print("\n")

## Audio

In [6]:
audio_moods = df.drop('lyrics_features', axis = 1)
audio_moods.dropna(how = 'any', inplace = True)
audio_moods.reset_index(drop = True, inplace = True)
audio_moods.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence,moods
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,-3.89,0.752186,0.72692,0.552,0.541,1.0,1.0,[energetic]
1,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,-7.687,0.351282,0.691817,0.737,0.634,0.796,1.0,[happy]
2,5.0,0.709932,0.231455,130.03,0.121741,0.036662,0.0,0.0,4.0,232.46104,-5.15,0.37439,0.704729,0.565,0.565,0.743,1.0,"[cocky, happy]"
3,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,-3.898,0.592798,0.875137,0.004,0.114,1.0,0.742,"[energetic, happy]"
4,3.0,0.741757,0.072774,129.985,0.051255,0.096732,0.000474,0.0,4.0,285.42667,-5.86,0.58563,0.730711,0.271,0.324,0.822,1.0,[energetic]


In [7]:
mlb_audio = MultiLabelBinarizer()

In [8]:
y_audio = mlb_audio.fit_transform(audio_moods['moods'])
y_audio

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ..., 
       [1, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 1, 0]])

In [9]:
y_audio.shape

(30296, 7)

In [10]:
mlb_audio.classes_

array(['cocky', 'earthy', 'energetic', 'happy', 'sad', 'seductive',
       'spacey'], dtype=object)

In [11]:
X_1 = audio_moods.drop('moods', axis = 1)
X_1.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,-3.89,0.752186,0.72692,0.552,0.541,1.0,1.0
1,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,-7.687,0.351282,0.691817,0.737,0.634,0.796,1.0
2,5.0,0.709932,0.231455,130.03,0.121741,0.036662,0.0,0.0,4.0,232.46104,-5.15,0.37439,0.704729,0.565,0.565,0.743,1.0
3,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,-3.898,0.592798,0.875137,0.004,0.114,1.0,0.742
4,3.0,0.741757,0.072774,129.985,0.051255,0.096732,0.000474,0.0,4.0,285.42667,-5.86,0.58563,0.730711,0.271,0.324,0.822,1.0


In [12]:
X_audio = np.array(X_1)
X_audio.shape

(30296, 17)

In [13]:
audio_train_test_list = train_test_split(X_audio, y_audio, test_size = 0.33)

----------------------------------

In [14]:
rfc = RandomForestClassifier(n_estimators= 100, class_weight='balanced')
rfc.fit(audio_train_test_list[0], audio_train_test_list[2])

rfc_prediction = rfc.predict(audio_train_test_list[1])

In [15]:
print(classification_report(audio_train_test_list[3], rfc_prediction))

             precision    recall  f1-score   support

          0       0.53      0.06      0.11      1611
          1       0.67      0.14      0.24      1905
          2       0.62      0.07      0.12      1661
          3       0.59      0.25      0.36      3225
          4       0.61      0.25      0.35      2279
          5       0.51      0.04      0.07      1945
          6       0.63      0.15      0.24      1690

avg / total       0.60      0.15      0.23     14316



In [16]:
import pickle

pickle.dump(rfc, open('audio_predict_moods.rfc.pickle', 'wb'))

------------------------------

In [19]:
audio_init_classifiers = {
        'rfc': RandomForestClassifier(class_weight='balanced')
    }

audio_param_grids = {    
    'rfc': 
    {
        'n_estimators': [5, 10, 50, 100],
        'min_samples_split': [2, 3, 4, 5, 10],
        'max_features': ['sqrt', 'log2', 'auto']
    }
}

In [20]:
audio_rfc = estimator_searching(audio_init_classifiers, audio_param_grids, audio_train_test_list, n_jobs=2)

In [27]:
show_selected_params(audio_rfc, audio_train_test_list)

    Classification report of rfc model:
             precision    recall  f1-score   support

          0       0.46      0.16      0.24      1611
          1       0.54      0.21      0.30      1905
          2       0.41      0.16      0.23      1661
          3       0.51      0.28      0.36      3225
          4       0.51      0.30      0.37      2279
          5       0.38      0.13      0.19      1945
          6       0.50      0.24      0.33      1690

avg / total       0.48      0.22      0.30     14316

 ...with selected params: {'max_features': 'auto', 'min_samples_split': 4, 'n_estimators': 10} 





-----------------

In [29]:
mlb_audio.classes_

array(['cocky', 'earthy', 'energetic', 'happy', 'sad', 'seductive',
       'spacey'], dtype=object)

In [33]:
estimator = OneVsRestClassifier(RandomForestClassifier(class_weight='balanced', n_estimators=100),
                                n_jobs = 2)

audio_chain = ClassifierChain(estimator, 
                              order=[4, 5, 6, 0, 2, 3, 1])

In [34]:
audio_chain.fit(audio_train_test_list[0], audio_train_test_list[2])
audio_chain_prediction = audio_chain.predict(audio_train_test_list[1])

print(classification_report(audio_train_test_list[3], audio_chain_prediction))

             precision    recall  f1-score   support

          0       0.53      0.15      0.24      1611
          1       0.42      0.43      0.43      1905
          2       0.57      0.15      0.24      1661
          3       0.46      0.65      0.54      3225
          4       0.60      0.25      0.35      2279
          5       0.53      0.07      0.12      1945
          6       0.62      0.19      0.29      1690

avg / total       0.53      0.31      0.34     14316



------------------------

In [76]:
audio_chain_init_classifiers = {
        'chain_rfc': ClassifierChain(RandomForestClassifier(class_weight='balanced'))
    }

audio_chain_param_grids = {    
    'chain_rfc': 
    {
        'base_estimator__n_estimators': [5, 10, 50, 100],
        'base_estimator__max_features': ['sqrt', 'log2', 'auto'],
        'order': [[4, 5, 6, 0, 2, 3, 1], [4, 5, 6, 1, 3, 2, 0]]
    }
}

In [77]:
audio_chain_optimized = estimator_searching(audio_chain_init_classifiers, 
                                            audio_chain_param_grids, 
                                            audio_train_test_list, n_jobs=3)

In [78]:
show_selected_params(audio_chain_optimized, audio_train_test_list)

    Classification report of chain_rfc model:
             precision    recall  f1-score   support

          0       0.55      0.15      0.24      1611
          1       0.44      0.43      0.43      1905
          2       0.55      0.15      0.23      1661
          3       0.46      0.67      0.55      3225
          4       0.61      0.26      0.37      2279
          5       0.53      0.07      0.12      1945
          6       0.62      0.20      0.30      1690

avg / total       0.53      0.32      0.35     14316

 ...with selected params: {'base_estimator__max_features': 'sqrt', 'base_estimator__n_estimators': 100, 'order': [4, 5, 6, 0, 2, 3, 1]} 





In [80]:
estimator = OneVsRestClassifier(RandomForestClassifier(class_weight='balanced', n_estimators=100),
                                n_jobs = 2)

audio_chain = ClassifierChain(estimator, 
                              order=[4, 5, 6, 1, 3, 2, 0])

In [81]:
audio_chain.fit(audio_train_test_list[0], audio_train_test_list[2])
audio_chain_prediction = audio_chain.predict(audio_train_test_list[1])

print(classification_report(audio_train_test_list[3], audio_chain_prediction))

             precision    recall  f1-score   support

          0       0.40      0.40      0.40      1611
          1       0.55      0.30      0.39      1905
          2       0.50      0.22      0.30      1661
          3       0.48      0.60      0.53      3225
          4       0.60      0.25      0.35      2279
          5       0.56      0.07      0.13      1945
          6       0.63      0.19      0.29      1690

avg / total       0.53      0.32      0.36     14316



In [79]:
audio_chain_init_classifiers = {
        'chain_logreg': ClassifierChain(LogisticRegression(class_weight='balanced'))
    }

audio_chain_param_grids = {    
    'chain_logreg': 
    {
        'base_estimator__multi_class': ['ovr'],
        'order': [[4, 5, 6, 0, 2, 3, 1], [4, 5, 6, 1, 3, 2, 0]]
    }
}

audio_chain_optimized = estimator_searching(audio_chain_init_classifiers, 
                                            audio_chain_param_grids, 
                                            audio_train_test_list, n_jobs=3)

show_selected_params(audio_chain_optimized, audio_train_test_list)

    Classification report of chain_logreg model:
             precision    recall  f1-score   support

          0       0.35      0.35      0.35      1611
          1       0.32      0.43      0.36      1905
          2       0.29      0.70      0.41      1661
          3       0.45      0.35      0.39      3225
          4       0.40      0.66      0.50      2279
          5       0.30      0.62      0.41      1945
          6       0.28      0.69      0.40      1690

avg / total       0.35      0.53      0.41     14316

 ...with selected params: {'base_estimator__multi_class': 'ovr', 'order': [4, 5, 6, 1, 3, 2, 0]} 





In [56]:
ovr_svm = OneVsRestClassifier(SVC(class_weight='balanced'))
ovr_svm.get_params().keys()

dict_keys(['estimator__C', 'estimator__cache_size', 'estimator__class_weight', 'estimator__coef0', 'estimator__decision_function_shape', 'estimator__degree', 'estimator__gamma', 'estimator__kernel', 'estimator__max_iter', 'estimator__probability', 'estimator__random_state', 'estimator__shrinking', 'estimator__tol', 'estimator__verbose', 'estimator', 'n_jobs'])

In [None]:
audio_chain_init_classifiers = {
        'chain_svm': Pipeline([('scaler', MinMaxScaler()), 
                               ('clf', ClassifierChain(ovr_svm))
                              ])
    }

audio_chain_param_grids = {    
    'chain_svm': 
    {
        'clf__base_estimator__estimator__C': [0.1, 1],
        'clf__base_estimator__estimator__gamma': [1, 0.1, 0.01, 0.001],
        'clf__order': [[4, 5, 6, 1, 3, 2, 0]]
    }
}

audio_chain_optimized = estimator_searching(audio_chain_init_classifiers, 
                                            audio_chain_param_grids, 
                                            audio_train_test_list, n_jobs=3)

show_selected_params(audio_chain_optimized, audio_train_test_list)

------------------

In [37]:
from sklearn.ensemble import VotingClassifier

audio_voter = VotingClassifier(estimators= [('chain', audio_chain), ('rfc', rfc)],
                              voting = 'soft')

In [38]:
audio_voter.fit(audio_train_test_list[0], audio_train_test_list[2])
audio_voter_prediction = audio_voter.predict(audio_train_test_list[1])

print(classification_report(audio_train_test_list[3], audio_voter_prediction))

NotImplementedError: Multilabel and multi-output classification is not supported.

## Lyrics

In [91]:
lyrics_moods = df.loc[df['lyrics_features'] != "", ['lyrics_features', 'moods']].copy()

lyrics_moods.reset_index(inplace = True)

print(lyrics_moods.shape)
lyrics_moods.head()

(20930, 3)


Unnamed: 0,index,lyrics_features,moods
0,0,oppa gangnam style gangnam style najeneun ttas...,[energetic]
1,1,late ve ve lose sleep dream thing babi ve ve p...,[happy]
2,2,parti rock yeah woo let s parti rock hous toni...,"[cocky, happy]"
3,3,alagamun lan weh wakun heya hanun gon alagamun...,"[energetic, happy]"
4,4,j lo s new generat mr worldwid parti peopl flo...,[energetic]


In [62]:
mlb_lyrics = MultiLabelBinarizer()

y_lyrics = mlb_lyrics.fit_transform(lyrics_moods['moods'])

In [63]:
mlb_lyrics.classes_

array(['cocky', 'earthy', 'energetic', 'happy', 'sad', 'seductive',
       'spacey'], dtype=object)

In [64]:
y_lyrics.shape

(20930, 7)

In [65]:
lyrics_train_test_list = train_test_split(lyrics_moods['lyrics_features'], y_lyrics, test_size = 0.3)

---------------------------------------

In [66]:
lyrics_pipeline = Pipeline([ ('tf_idf', TfidfVectorizer()), 
                            ('clf', RandomForestClassifier(n_estimators=10, class_weight='balanced')) ])

In [67]:
lyrics_pipeline.fit(lyrics_train_test_list[0], lyrics_train_test_list[2])

Pipeline(memory=None,
     steps=[('tf_idf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...imators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])

In [68]:
prediction = lyrics_pipeline.predict(lyrics_train_test_list[1])

In [69]:
print(classification_report(lyrics_train_test_list[3], prediction))

             precision    recall  f1-score   support

          0       0.46      0.05      0.09      1254
          1       0.26      0.01      0.03      1080
          2       0.48      0.03      0.06      1109
          3       0.46      0.14      0.21      2145
          4       0.48      0.06      0.11      1472
          5       0.46      0.05      0.10      1089
          6       0.42      0.03      0.05       878

avg / total       0.44      0.07      0.11      9027



-----------------

In [121]:
lyrics_chain = Pipeline([ ('tf_idf', TfidfVectorizer()), 
                         ('clf', ClassifierChain(RandomForestClassifier(class_weight='balanced'), 
                                                 order=[4, 5, 6, 1, 3, 2, 0])) 
                        ])

In [122]:
lyrics_chain.fit(lyrics_train_test_list[0], lyrics_train_test_list[2])
lyrics_chain_prediction = lyrics_chain.predict(lyrics_train_test_list[1])

print(classification_report(lyrics_train_test_list[3], lyrics_chain_prediction))

             precision    recall  f1-score   support

          0       0.43      0.09      0.15      1254
          1       0.33      0.04      0.08      1080
          2       0.37      0.09      0.14      1109
          3       0.48      0.26      0.33      2145
          4       0.40      0.07      0.13      1472
          5       0.45      0.08      0.14      1089
          6       0.33      0.02      0.04       878

avg / total       0.41      0.11      0.17      9027



--------------------------------

In [82]:
from sklearn.decomposition import TruncatedSVD

In [89]:
lyrics_chain_svd = Pipeline([ ('tf_idf', TfidfVectorizer()), ('svd', TruncatedSVD(n_components=500)),
                         ('clf', ClassifierChain(RandomForestClassifier(class_weight='balanced'), 
                                                 order=[4, 5, 6, 1, 3, 2, 0])) 
                        ])

lyrics_chain_svd.fit(lyrics_train_test_list[0], lyrics_train_test_list[2])

Pipeline(memory=None,
     steps=[('tf_idf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...    verbose=0, warm_start=False),
        cv=None, order=[4, 5, 6, 1, 3, 2, 0], random_state=None))])

In [90]:
lyrics_chain_svd_prediction = lyrics_chain_svd.predict(lyrics_train_test_list[1])

print(classification_report(lyrics_train_test_list[3], lyrics_chain_svd_prediction))

             precision    recall  f1-score   support

          0       0.32      0.13      0.18      1254
          1       0.28      0.02      0.04      1080
          2       0.32      0.05      0.08      1109
          3       0.42      0.16      0.23      2145
          4       0.40      0.05      0.09      1472
          5       0.46      0.05      0.09      1089
          6       0.33      0.02      0.03       878

avg / total       0.37      0.08      0.12      9027



In [95]:
from gensim.models import Doc2Vec

In [96]:
doc2vec_model = Doc2Vec.load('./song_lyrics.d2v')

In [109]:
doc2vec_train = doc2vec_model[lyrics_train_test_list[0].index]
doc2vec_test = doc2vec_model[lyrics_train_test_list[1].index]

In [116]:
doc2vec_train.shape

(14651, 100)

In [110]:
doc2vec_rfc = RandomForestClassifier(class_weight='balanced')

doc2vec_rfc.fit(doc2vec_train, lyrics_train_test_list[2])

print(classification_report(lyrics_train_test_list[3], doc2vec_rfc.predict(doc2vec_test)))

             precision    recall  f1-score   support

          0       0.43      0.03      0.06      1254
          1       0.16      0.00      0.01      1080
          2       0.29      0.01      0.02      1109
          3       0.43      0.09      0.15      2145
          4       0.35      0.03      0.06      1472
          5       0.25      0.01      0.02      1089
          6       0.25      0.00      0.01       878

avg / total       0.33      0.04      0.06      9027



In [111]:
doc2vec_chain = ClassifierChain(RandomForestClassifier(class_weight='balanced'),
                                order=[4, 5, 6, 1, 3, 2, 0])

doc2vec_chain.fit(doc2vec_train, lyrics_train_test_list[2])

print(classification_report(lyrics_train_test_list[3], doc2vec_chain.predict(doc2vec_test)))

             precision    recall  f1-score   support

          0       0.27      0.18      0.21      1254
          1       0.31      0.05      0.08      1080
          2       0.32      0.04      0.08      1109
          3       0.42      0.26      0.33      2145
          4       0.42      0.06      0.11      1472
          5       0.35      0.02      0.03      1089
          6       0.45      0.01      0.02       878

avg / total       0.37      0.11      0.15      9027

