Step 1: predict moods:

Moods were consolidated into 9 classes: happy, energetic, sad, trippy, seductive, aggressive, glassy, earthy, and funky.

        X1 - audio feature --> Y1 - probabilities of moods
        X2 - lyrics feature --> Y2 - probabilities of moods
        
        Y = mean(Y1, Y2)
        

Step 3: similarity:

    - moods: cosine similarity
    - genre: cosine similarity
    - audio features: cosine similarity

In [1]:
import pickle

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.svm import SVC
sns.set_style("whitegrid")

% matplotlib inline

In [2]:
# Load the combined audio/lyrics/moods dataset.
# The context manager guarantees the file handle is closed even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code -- only load files from a trusted source.
with open('audio_lyrics_moods.pickle', 'rb') as dataset_file:
    df = pickle.load(dataset_file)

FileNotFoundError: [Errno 2] No such file or directory: 'audio_lyrics_moods.pickle'

In [None]:
df.head()

In [None]:
# Seed NumPy's global random generator before any split or model cell runs.
# NOTE(review): presumably this is what makes the later train_test_split / RandomForest
# calls (none of which pass random_state) repeatable -- confirm, especially with n_jobs > 1.
np.random.seed(99)

## Audio

In [None]:
# Audio-only view of the dataset: discard the lyrics column, drop every row that
# still has a missing value, and renumber the rows from 0.
audio_moods = (
    df.drop('lyrics_features', axis = 1)
      .dropna(how = 'any')
      .reset_index(drop = True)
)
audio_moods.head()

In [None]:
mlb_audio = MultiLabelBinarizer()

In [None]:
# Fit the binarizer on the 'moods' column and encode it as the audio target matrix
# (the Lyrics section's y_lyrics of the same form has shape (n_rows, 9)).
y_audio = mlb_audio.fit_transform(audio_moods['moods'])
y_audio

In [None]:
y_audio.shape

In [None]:
mlb_audio.classes_

In [None]:
# Feature frame: every column of audio_moods except the 'moods' target.
X_1 = audio_moods.drop('moods', axis = 1)
X_1.head()

In [None]:
# Convert the feature frame to a plain NumPy array for scikit-learn.
X_audio = np.array(X_1)
X_audio.shape

In [None]:
# Hold out 33% of the rows for evaluation. The result is kept as one list and indexed
# positionally by later cells: [0] X_train, [1] X_test, [2] y_train, [3] y_test.
audio_train_test_list = train_test_split(X_audio, y_audio, test_size = 0.33)

In [None]:
def optimizing(estimator, param_grid, train_test_list, n_jobs):
    """Grid-search ``estimator`` on the training split, then predict the test split.

    Parameters
    ----------
    estimator : object
        Estimator (or pipeline) handed to ``GridSearchCV``.
    param_grid : dict
        Parameter grid handed to ``GridSearchCV``.
    train_test_list : sequence
        Exactly four items, unpacked as ``X_train, X_test, y_train, y_test``.
        ``y_test`` is not used here.
    n_jobs : int
        Forwarded to ``GridSearchCV``.

    Returns
    -------
    dict
        ``'estimator'`` (the refitted best estimator), ``'params'`` (the best
        parameter combination) and ``'prediction'`` (predictions for ``X_test``).
    """
    features_train, features_test, targets_train, _ = train_test_list

    search = GridSearchCV(estimator, param_grid, refit = True, n_jobs = n_jobs)
    search.fit(features_train, targets_train)

    # refit=True means search.predict delegates to the best estimator found.
    return {
        'estimator': search.best_estimator_,
        'params': search.best_params_,
        'prediction': search.predict(features_test),
    }


def estimator_searching(init_classifiers, param_grids, train_test_list, n_jobs):
    """Run ``optimizing`` once per named classifier.

    Parameters
    ----------
    init_classifiers : dict
        Maps a method name to the estimator to tune.
    param_grids : dict
        Maps the same method names to their parameter grids.
    train_test_list : sequence
        Passed through unchanged to ``optimizing``.
    n_jobs : int
        Passed through unchanged to ``optimizing``.

    Returns
    -------
    dict
        Maps each method name to the result returned by ``optimizing``.
    """
    return {
        method: optimizing(estimator, param_grids[method], train_test_list, n_jobs)
        for method, estimator in init_classifiers.items()
    }

def show_selected_params(classifiers, train_test_list):
    """Print the test accuracy and the selected parameters of each tuned model.

    Relies on ``accuracy_score`` from ``sklearn.metrics`` being imported at
    notebook level; without that import the first loop iteration fails with
    a NameError.

    Parameters
    ----------
    classifiers : dict
        Maps a model name to a result dict holding at least the keys
        ``'prediction'`` and ``'params'``.
    train_test_list : sequence
        Only index 3 is read, as the true test labels.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    y_true = train_test_list[3]

    for model, result in classifiers.items():
        # NOTE(review): for multi-label indicator targets accuracy_score is presumably
        # subset accuracy (all labels of a sample must match) -- confirm against sklearn docs.
        accuracy = accuracy_score(y_true, result['prediction'])
        print("    Accuracy in {} model: {}".format(model, accuracy))
        print(" ...with selected params: {} \n".format(result['params']))

    print("\n")

In [None]:
# Candidate models for the audio features, keyed by a short method name.
# Each is a pipeline: standardize the features, then fit the classifier ('clf').
audio_init_classifiers = {
        'rfc': Pipeline([ ('scaling', StandardScaler()), ('clf', RandomForestClassifier()) ])
    }

# Grid-search space per method; the 'clf__' prefix routes a parameter to the 'clf' step.
audio_param_grids = {
    'rfc':
    {
        'clf__n_estimators': [5, 10, 100],
        'clf__min_samples_split': [2, 3, 4, 5, 10],
        # 'auto' was dropped: for RandomForestClassifier it is an alias of 'sqrt', so it
        # only repeated a third of the grid, and newer scikit-learn releases reject it.
        'clf__max_features': ['sqrt', 'log2']
    }
}

In [None]:
# Tune every classifier in audio_init_classifiers over its grid (2 parallel jobs).
# The result maps method name -> {'estimator', 'params', 'prediction'}.
audio_rfc = estimator_searching(audio_init_classifiers, audio_param_grids, audio_train_test_list, n_jobs=2)

In [None]:
audio_rfc

In [None]:
# Per-class precision/recall/F1 of the grid-searched forest on the held-out labels (index 3 = y_test).
print(classification_report(audio_train_test_list[3], audio_rfc['rfc']['prediction']))

In [None]:
# Second audio model: a 100-tree forest with balanced class weights, fitted directly on
# the raw (unscaled) training split -- index 0 = X_train, index 2 = y_train.
rfc = RandomForestClassifier(class_weight='balanced', n_estimators= 100).fit(
    audio_train_test_list[0],
    audio_train_test_list[2],
)

# Predictions for the held-out features (index 1 = X_test).
rfc_prediction = rfc.predict(audio_train_test_list[1])

In [None]:
# Same report for the class-weighted forest, for comparison with the grid-searched one.
print(classification_report(audio_train_test_list[3], rfc_prediction))

In [None]:
import pickle

# Persist the class-weighted audio model. The context manager closes the file handle,
# so the pickle is fully flushed to disk before the next cell runs.
with open('audio_predict_moods.rfc.pickle', 'wb') as model_file:
    pickle.dump(rfc, model_file)

## Lyrics

In [72]:
# Lyrics-only view of the dataset: keep the lyrics text and the mood labels, drop every
# row that has a missing value in either column, and renumber the rows from 0.
lyrics_moods = (
    df.loc[:, ['lyrics_features', 'moods']]
      .dropna(how='any')
      .reset_index(drop = True)
)
lyrics_moods.head()

Unnamed: 0,lyrics_features,moods
0,oppa gangnam style gangnam style najeneun ttas...,[energetic]
1,late ve ve lose sleep dream thing babi ve ve p...,[happy]
2,parti rock yeah woo let s parti rock hous toni...,"[happy, aggressive]"
3,alagamun lan weh wakun heya hanun gon alagamun...,"[energetic, happy]"
4,j lo s new generat mr worldwid parti peopl flo...,[energetic]


In [73]:
# Separate binarizer for the lyrics subset (its rows differ from the audio subset).
mlb_lyrics = MultiLabelBinarizer()

# Encode each song's mood list as one indicator row; the recorded shape is (36733, 9).
y_lyrics = mlb_lyrics.fit_transform(lyrics_moods['moods'])

In [74]:
y_lyrics.shape

(36733, 9)

In [75]:
# Hold out 30% of the songs. The features are still raw text here; vectorization happens
# inside the pipeline. Indexed positionally: [0] X_train, [1] X_test, [2] y_train, [3] y_test.
lyrics_train_test_list = train_test_split(lyrics_moods['lyrics_features'], y_lyrics, test_size = 0.3)

In [87]:
# Lyrics model: TF-IDF vectorization of the text followed by a 100-tree random forest.
lyrics_pipeline = Pipeline([ ('tf_idf', TfidfVectorizer()), ('clf', RandomForestClassifier(n_estimators=100)) ])

In [88]:
lyrics_pipeline.fit(lyrics_train_test_list[0], lyrics_train_test_list[2])

Pipeline(memory=None,
     steps=[('tf_idf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [89]:
prediction = lyrics_pipeline.predict(lyrics_train_test_list[1])

In [91]:
# Per-class report on the held-out lyrics; rows 0-8 follow the column order of y_lyrics
# (presumably that of mlb_lyrics.classes_ -- confirm).
# NOTE(review): in the recorded output recall is 0.01-0.06 for every class, i.e. the model
# almost never predicts a mood -- treat the saved lyrics model with caution.
print(classification_report(lyrics_train_test_list[3], prediction))

             precision    recall  f1-score   support

          0       0.28      0.01      0.01      1715
          1       0.35      0.01      0.02       584
          2       0.56      0.03      0.06      3446
          3       0.61      0.06      0.11      3083
          4       0.44      0.01      0.03      1075
          5       0.47      0.02      0.03      1473
          6       0.39      0.01      0.02      2648
          7       0.34      0.01      0.03      1487
          8       0.36      0.01      0.02      2144

avg / total       0.45      0.02      0.04     17655



In [81]:
from sklearn.metrics import jaccard_similarity_score

In [100]:
# Spot-check a single held-out song (row 66).
# NOTE(review): both arguments are 1-D 0/1 vectors, so the recorded 0.78 (= 7/9) looks like
# the share of the 9 label positions that agree, not a set-overlap Jaccard index -- confirm.
print(jaccard_similarity_score(lyrics_train_test_list[3][66], prediction[66]))

0.777777777778


In [96]:
# Persist the fitted lyrics pipeline (vectorizer + forest). The context manager closes the
# file handle, so the pickle is fully flushed to disk before the next cell runs.
with open('lyrics_predict_moods.rfc.pickle', 'wb') as model_file:
    pickle.dump(lyrics_pipeline, model_file)

In [98]:
# Experiment: score two plain lists of mood names.
# NOTE(review): the recorded 0.5 matches a position-by-position comparison (only the second
# entry, 'aggressive', agrees), not a comparison of the two lists as sets.
print(jaccard_similarity_score(['happy', 'aggressive'], ['trippy', 'aggressive']))

0.5


In [99]:
# Experiment: both lists contain 'happy', but at different positions.
# NOTE(review): the recorded 0.0 shows the shared label is not counted, so this call does
# not measure set overlap between mood lists.
print(jaccard_similarity_score(['happy', 'aggressive'], ['trippy', 'happy']))

0.0
