Predict genres using audio features and lyrics
==============================================

In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the raw song catalogue; every other frame in this section derives from it.
df_song_list = pd.read_json('../MasterSongList.json')

# Concatenate each song's genre entries into one string and keep only the text
# before the first ':'; songs with no genre text become NaN.
df_song_list['genres'] = (
    df_song_list['genres']
    .apply(''.join)
    .map(lambda joined: joined.split(':')[0] if joined else np.nan)
)

# Empty mood lists are replaced by NaN so they can be dropped later.
df_song_list.loc[:, 'moods'] = df_song_list['moods'].apply(
    lambda tags: tags if len(tags) > 0 else np.nan
)

# One indicator column per mood tag.
df_moods = df_song_list['moods'].str.join(',').str.get_dummies(sep=',')

# Fold the 'blues & blues rock' label into plain 'blues'.
df_song_list.loc[df_song_list['genres'].eq('blues & blues rock'), 'genres'] = 'blues'

In [3]:
# Names assigned, in order, to the 17 values stored in each song's
# `audio_features` list.
# NOTE(review): in the preview of this frame 'duration' shows 4.0, 'loudness'
# shows ~218 and 'valence' shows negative values, so the labels from
# 'time_signature' onward may be shifted relative to the data -- confirm
# against the dataset's schema before interpreting individual columns.
feature_col_names = [
    'key', 'energy', 'liveliness', 'tempo', 'speechiness', 'acousticness',
    'instrumentalness', 'time_signature', 'duration', 'loudness', 'valence',
    'danceability', 'mode', 'time_signature_confidence', 'tempo_confidence',
    'key_confidence', 'mode_confidence',
]

# Expand the per-song feature lists into one numeric column per feature.
df_audio_features = pd.DataFrame(
    df_song_list['audio_features'].tolist(),
    columns=feature_col_names,
)
df_audio_features.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,-3.89,0.752186,0.72692,0.552,0.541,1.0,1.0
1,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,-7.687,0.351282,0.691817,0.737,0.634,0.796,1.0
2,5.0,0.709932,0.231455,130.03,0.121741,0.036662,0.0,0.0,4.0,232.46104,-5.15,0.37439,0.704729,0.565,0.565,0.743,1.0
3,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,-3.898,0.592798,0.875137,0.004,0.114,1.0,0.742
4,3.0,0.741757,0.072774,129.985,0.051255,0.096732,0.000474,0.0,4.0,285.42667,-5.86,0.58563,0.730711,0.271,0.324,0.822,1.0


In [4]:
# Keep only songs whose genre has more than 900 rows, then renumber the rows
# from 0 so the frames built below line up when concatenated column-wise.
df_song_list_filtered = (
    df_song_list
    .groupby('genres')
    .filter(lambda x: len(x) > 900)
    .reset_index(drop=True)
)

# Audio features and one-hot mood columns for the filtered songs.
df_audio_features_filtered = pd.DataFrame(
    df_song_list_filtered.loc[:, 'audio_features'].tolist(),
    columns=feature_col_names,
)
df_moods_filtered = df_song_list_filtered.loc[:, 'moods'].str.join(',').str.get_dummies(sep=',')

# Modelling tables: audio (+ moods) features with the genre label as last column.
df_audio_features_moods_filtered = pd.concat(
    [df_audio_features_filtered, df_moods_filtered, df_song_list_filtered['genres']],
    axis=1,
)
df_audio_features_genres_filtered = pd.concat(
    [df_audio_features_filtered, df_song_list_filtered['genres']],
    axis=1,
)

# Only the last expression of a cell is rendered, so the intermediate
# `.head()` calls that used to precede this line were dropped.
df_audio_features_genres_filtered.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence,genres
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,-3.89,0.752186,0.72692,0.552,0.541,1.0,1.0,pop
1,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,-7.687,0.351282,0.691817,0.737,0.634,0.796,1.0,pop
2,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,-3.898,0.592798,0.875137,0.004,0.114,1.0,0.742,dance
3,8.0,0.777375,0.054104,104.946,0.029302,0.13035,0.0,1.0,4.0,228.29333,-5.112,0.525632,0.729051,0.817,0.672,0.394,0.963,r&b
4,7.0,0.585564,0.108297,120.014,0.038924,0.011707,5e-06,1.0,4.0,193.57333,-6.583,0.622176,0.781822,0.97,0.861,0.792,1.0,pop


In [5]:
# Unfiltered table: audio features, one-hot mood columns and the genre label,
# concatenated column-wise (rows are matched on the index).
df_audio_features_moods = pd.concat([df_audio_features, df_moods, df_song_list['genres']], axis=1)
df_audio_features_moods.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,...,sexual,soothing,spacey,sprightly,sweet,trashy,trippy,visceral,warm,genres
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,...,0,0,0,0,0,0,0,0,0,pop
1,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,...,0,0,0,0,0,0,0,0,0,pop
2,5.0,0.709932,0.231455,130.03,0.121741,0.036662,0.0,0.0,4.0,232.46104,...,0,0,0,0,0,0,0,0,0,
3,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,...,0,0,0,0,0,0,0,0,0,dance
4,3.0,0.741757,0.072774,129.985,0.051255,0.096732,0.000474,0.0,4.0,285.42667,...,0,0,0,0,0,0,0,0,0,reggaeton


In [6]:
# Unfiltered table: audio features plus the genre label (no mood columns).
df_audio_features_genres = pd.concat([df_audio_features, df_song_list['genres']], axis=1)
df_audio_features_genres.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence,genres
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,-3.89,0.752186,0.72692,0.552,0.541,1.0,1.0,pop
1,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,-7.687,0.351282,0.691817,0.737,0.634,0.796,1.0,pop
2,5.0,0.709932,0.231455,130.03,0.121741,0.036662,0.0,0.0,4.0,232.46104,-5.15,0.37439,0.704729,0.565,0.565,0.743,1.0,
3,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,-3.898,0.592798,0.875137,0.004,0.114,1.0,0.742,dance
4,3.0,0.741757,0.072774,129.985,0.051255,0.096732,0.000474,0.0,4.0,285.42667,-5.86,0.58563,0.730711,0.271,0.324,0.822,1.0,reggaeton


## audio_features + lyrics_features + genres

In [7]:
from string import punctuation
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.stem.snowball import SnowballStemmer

# Built once instead of on every call: clean_text runs once per song, and
# neither the translation table nor the stemmer depends on the input text.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation)
_ENGLISH_STEMMER = SnowballStemmer('english')


def clean_text(text):
    """Normalise a lyrics string for vectorisation.

    The text is lower-cased, ASCII punctuation is deleted, the result is split
    on whitespace, English stop words are dropped, and every remaining word is
    stemmed with the English Snowball stemmer.

    Parameters
    ----------
    text : str
        Raw lyrics as a single string.

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces (an empty
        string when nothing survives the filtering).
    """
    stripped = text.lower().translate(_PUNCTUATION_TABLE)
    return ' '.join(
        _ENGLISH_STEMMER.stem(word)
        for word in stripped.split()
        # Stop words are matched on the un-stemmed word, as before.
        if word not in ENGLISH_STOP_WORDS
    )

In [8]:
def clean_text_lambda(x):
    """Join a song's lyric tokens and clean them; an empty token list gives NaN."""
    if len(x) > 0:
        return clean_text(' '.join(x))
    return np.nan


# Cleaned lyrics per song (NaN where the song has no lyric tokens).
lyrics_features = df_song_list['lyrics_features'].apply(clean_text_lambda)

In [9]:
# Modelling table: genre label, cleaned lyrics text and the audio feature
# columns, concatenated column-wise on the shared index.
df_audio_features_lyrics_genres = pd.concat([df_song_list.genres, lyrics_features, df_audio_features], axis = 1)
df_audio_features_lyrics_genres.head()

Unnamed: 0,genres,lyrics_features,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence
0,pop,oppa gangnam style gangnam style najeneun ttas...,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,-3.89,0.752186,0.72692,0.552,0.541,1.0,1.0
1,pop,late ve ve lose sleep dream thing babi ve ve p...,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,-7.687,0.351282,0.691817,0.737,0.634,0.796,1.0
2,,parti rock yeah woo let s parti rock hous toni...,5.0,0.709932,0.231455,130.03,0.121741,0.036662,0.0,0.0,4.0,232.46104,-5.15,0.37439,0.704729,0.565,0.565,0.743,1.0
3,dance,alagamun lan weh wakun heya hanun gon alagamun...,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,-3.898,0.592798,0.875137,0.004,0.114,1.0,0.742
4,reggaeton,j lo s new generat mr worldwid parti peopl flo...,3.0,0.741757,0.072774,129.985,0.051255,0.096732,0.000474,0.0,4.0,285.42667,-5.86,0.58563,0.730711,0.271,0.324,0.822,1.0


In [10]:
# Songs per genre among the rows that have no missing value in any column.
df_audio_features_lyrics_genres.dropna().genres.value_counts()

rock                       4816
rap                        1860
r&b                        1778
indie                      1438
dance                       919
singer-songwriter           863
country                     788
pop                         638
jazz                        525
latin                       480
electronica                 386
blues                       301
oldies                      294
reggae & ska                245
funk                        245
folk                        184
international/world         159
children's                  122
christian                   120
int'l                       108
reggaeton                    88
dubstep & drum 'n' bass      86
showtunes                    71
bluegrass                    45
classical                    42
easy listening               31
film scores                  17
Name: genres, dtype: int64

In [11]:
# How many songs have cleaned lyrics (False) versus none (True).
lyrics_features.isna().value_counts()

False    20931
True     15802
Name: lyrics_features, dtype: int64

## audio_features + lyrics_features + moods

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def build_genres_model(X=None, y=None):
    """Assemble an unfitted genre classifier: scaling followed by soft voting.

    The pipeline standardises the features and feeds them to a soft-voting
    ensemble of k-nearest-neighbours, logistic regression, an SVC and Gaussian
    naive Bayes, all with their default hyper-parameters.

    Parameters
    ----------
    X, y : optional
        Not used when building the model. They are kept so existing calls
        keep working and now default to ``None``, matching the later
        definition of this function in the notebook.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps ``'scaler'`` and ``'voting_cls'``.
    """
    clfs = [
        ('knn', KNeighborsClassifier()),
        ('lg', LogisticRegression()),
        # probability=True makes SVC provide class probabilities, which the
        # soft-voting ensemble averages.
        ('svm', SVC(probability=True)),
        ('nb', GaussianNB()),
    ]
    voting_cls = VotingClassifier(clfs, voting='soft', n_jobs=4)

    return Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('voting_cls', voting_cls),
    ])
    
def train_genres_model(X, y):
    """Placeholder for a training routine; not implemented, returns ``None``."""
    return None
def test_genres_model(X, y):
    pass

In [13]:
# Preview the raw song table, including the nested list-valued columns.
df_song_list.head()

Unnamed: 0,_id,album,artist,audio_features,context,decades,genres,lyrics_features,moods,name,new_context,picture,recording_id,sub_context,yt_id,yt_views
0,{'$oid': '52fdfb440b9398049f3d7a8c'},Gangnam Style (강남스타일),PSY,"[11, 0.912744, 0.083704, 132.069, 0.293137, 0....",[work out],[],pop,"[oppa, gangnam, style, gangnam, style, najeneu...","[energetic, motivational]",Gangnam Style (강남스타일),work out,http://images.musicnet.com/albums/073/463/405/...,50232.0,[working out: cardio],9bZkp7q19f0,2450112089
1,{'$oid': '52fdfb3d0b9398049f3cbc8e'},Native,OneRepublic,"[6, 0.7457039999999999, 0.11995499999999999, 1...",[energetic],[2012],pop,"[lately, i, ve, been, i, ve, been, losing, sle...",[happy],Counting Stars,energetic,http://images.musicnet.com/albums/081/851/887/...,5839.0,[energy boost],hT_nvWreIhg,1020297206
2,{'$oid': '52fdfb420b9398049f3d3ea5'},Party Rock Anthem,LMFAO,"[5, 0.709932, 0.231455, 130.03, 0.121740999999...","[energetic, energetic, energetic, energetic]",[],,"[party, rock, yeah, woo, let, s, go, party, ro...","[happy, celebratory, rowdy]",Party Rock Anthem,housework,http://images.musicnet.com/albums/049/414/127/...,52379.0,"[energy boost, pleasing a crowd, housework, dr...",KQ6zr6kCPj8,971128436
3,{'$oid': '52fdfb410b9398049f3d1eac'},Gentleman,PSY,"[3, 0.705822, 0.053292, 126.009, 0.126016, 0.0...","[party, party, party, party, party, party]",[2010s],dance,"[alagamun, lan, weh, wakun, heya, hanun, gon, ...","[happy, energetic, celebratory]",Gentleman,energetic,http://images.musicnet.com/albums/082/950/461/...,12353.0,"[driving in the left lane, energy boost, girls...",ASO_zypdnsQ,892096527
4,{'$oid': '52fdfb400b9398049f3d0b19'},On The Floor,Jennifer Lopez,"[3, 0.741757, 0.07277399999999999, 129.985, 0....","[party, party]",[2000s],reggaeton,"[j, lo, the, other, side, out, my, mine, it, s...",[energetic],On The Floor,work out,http://images.musicnet.com/albums/050/131/765/...,29502.0,"[working out: cardio, dance party: sweaty]",t4H_Zoh7G5A,873285189


## Load Word2Vec model (GoogleNews vectors)

In [9]:
from gensim.models.doc2vec import Doc2Vec
from gensim.models.word2vec import Word2Vec as w

In [10]:
# Load the pre-trained GoogleNews vectors from the gzipped binary file.
# NOTE(review): newer gensim releases expose this loader on
# `gensim.models.KeyedVectors` instead of `Word2Vec` -- confirm against the
# installed gensim version before re-running.
model = w.load_word2vec_format('../GoogleNews-vectors-negative300.bin.gz', binary=True)

In [5]:
from string import punctuation
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.stem.snowball import SnowballStemmer
def clean_text(raw_text):
    """Lower-case, strip punctuation, drop English stop words and stem.

    This repeats the `clean_text` defined earlier in the notebook; whichever
    cell ran last is the definition in effect.

    Parameters
    ----------
    raw_text : str
        Lyrics as one string.

    Returns
    -------
    str
        Space-joined Snowball stems of the words that are not stop words.
    """
    stemmer = SnowballStemmer('english')
    strip_punctuation = str.maketrans('', '', punctuation)
    lowered = raw_text.lower().translate(strip_punctuation)

    kept = []
    for token in lowered.split():
        if token in ENGLISH_STOP_WORDS:
            continue
        kept.append(stemmer.stem(token))
    return ' '.join(kept)

In [13]:
class MeanEmbeddingVectorizer(object):
    """Represent each tokenised document as the mean of its word vectors.

    Parameters
    ----------
    word2vec : mapping
        Maps a word to its embedding vector. It must support ``in``, item
        lookup and ``values()``.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # If a text is empty we return a vector of zeros with the same
        # dimensionality as all the other vectors, so remember that size.
        # `next(iter(...values()))` works on Python 3, unlike the Python 2-only
        # `itervalues().next()`; an empty mapping gives dimension 0.
        first_vector = next(iter(word2vec.values()), None)
        self.dim = len(first_vector) if first_vector is not None else 0

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so calls can be chained."""
        return self

    def transform(self, X):
        """Average the vectors of the known words in each document.

        Parameters
        ----------
        X : iterable of iterables of str
            One token sequence per document.

        Returns
        -------
        numpy.ndarray
            One row per document; a document with no known word gives a row
            of zeros.
        """
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [14]:
class TfidfEmbeddingVectorizer(object):
    """Represent each tokenised document as the mean of IDF-weighted word vectors.

    Parameters
    ----------
    word2vec : mapping
        Maps a word to its embedding vector. It must support ``in``, item
        lookup and ``values()``.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Filled in by fit(): word -> IDF weight.
        self.word2weight = None
        # `next(iter(...values()))` works on Python 3, unlike the Python 2-only
        # `itervalues().next()`; an empty mapping gives dimension 0.
        first_vector = next(iter(word2vec.values()), None)
        self.dim = len(first_vector) if first_vector is not None else 0

    def fit(self, X, y=None):
        """Learn one IDF weight per word from the tokenised documents ``X``.

        Returns ``self``.
        """
        # Imported here because the notebook has no module-level import of it.
        from collections import defaultdict

        # The identity analyzer makes the vectorizer use the given tokens as-is.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Average the weighted vectors of the known words in each document.

        Parameters
        ----------
        X : iterable of iterables of str
            One token sequence per document.

        Returns
        -------
        numpy.ndarray
            One row per document; a document with no known word gives a row
            of zeros.
        """
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

## Model

### Vectorizer

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA, SparsePCA, TruncatedSVD
from sklearn.base import TransformerMixin

## ================================

In [10]:
def extract_lyric_features(X):
    """Return the ``'lyrics_features'`` column of ``X``."""
    column = 'lyrics_features'
    return X[column]

def extract_audio_features(X):
    """Return a copy of ``X`` without the ``'lyrics_features'`` column."""
    return X.drop(columns=['lyrics_features'])

# Wrap the column selectors as transformers so they can be used as pipeline
# steps. validate=False is passed so the input is handed to the function
# without scikit-learn's array validation.
get_lyric_features = FunctionTransformer(extract_lyric_features, validate=False)
get_audio_features = FunctionTransformer(extract_audio_features, validate=False)

```python
process_and_join_features = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector', get_text_data),
                ('vec', CountVectorizer())
            ]))
         ])),
    ('clf', OneVsRestClassifier(LogisticRegression()))
])
```

In [22]:
# Lyrics + audio -> SVC.
# Lyrics branch: select the text column, TF-IDF vectorise it, then reduce it
# to 17 components with TruncatedSVD. Audio branch: every column except the
# lyrics. The two branches are concatenated, min-max scaled and classified
# with an SVC using default hyper-parameters.
# NOTE(review): 17 components presumably mirrors the 17 audio columns -- confirm.
step_audio_lyric_tfidf_SVC = [
    ('Features', FeatureUnion([
        ('lyric_features', Pipeline([
            ('selector', get_lyric_features),
            ('vect', TfidfVectorizer()),
            ('pca', TruncatedSVD(n_components=17))
        ])),
        ('audio_features', Pipeline([
            ('selector', get_audio_features)
        ]))
    ])),
    ('scaler', MinMaxScaler()),
    ('clf', SVC())
]
pipe_tfidf_svc = Pipeline(step_audio_lyric_tfidf_SVC)

In [23]:
from xgboost import XGBClassifier
# Lyrics + audio -> XGBoost, with no dimensionality reduction and no scaling:
# the full TF-IDF matrix of the lyrics is concatenated with the audio columns
# and passed straight to an XGBClassifier with default hyper-parameters.
step_audio_lyric_tfidf_XGB = [
    ('Feature_Union', FeatureUnion([
        ('lyric_features', Pipeline([
            ('selector', get_lyric_features),
            ('vect', TfidfVectorizer())
        ])),
        ('audio_features', Pipeline([
            ('selector', get_audio_features)
        ]))
    ])),
    ('clf', XGBClassifier())
]
pipe_tfidf = Pipeline(step_audio_lyric_tfidf_XGB)

In [24]:
# Lyrics + audio -> XGBoost, "scaled" variant: the TF-IDF matrix is reduced to
# 17 components with TruncatedSVD, concatenated with the audio columns and
# min-max scaled before the XGBClassifier.
step_audio_lyric_tfidf_2_XGB = [
    ('Feature_Union', FeatureUnion([
        ('lyric_features', Pipeline([
            ('selector', get_lyric_features),
            ('vect', TfidfVectorizer()),
            ('pca', TruncatedSVD(n_components=17)),
        ])),
        ('audio_features', Pipeline([
            ('selector', get_audio_features)
        ]))
    ])),
    ('scaler', MinMaxScaler()),
    ('clf', XGBClassifier())
]
pipe_tfidf_2 = Pipeline(step_audio_lyric_tfidf_2_XGB)

In [21]:
from sklearn.model_selection import train_test_split
# Rows missing the genre, the lyrics or any audio value are discarded first.
df_audio_features_lyrics_genres_dropna = df_audio_features_lyrics_genres.dropna(axis=0, how='any')

# Features are every column except the label; the label is the genre.
X = df_audio_features_lyrics_genres_dropna.drop(columns=['genres'])
y = df_audio_features_lyrics_genres_dropna['genres']

# 70/30 hold-out split with a fixed seed so the reports are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [28]:
# Fit the unscaled TF-IDF + XGBoost pipeline and print per-genre
# precision/recall/F1 on the held-out split.
pipe_tfidf.fit(X_train, y_train)
print(classification_report(y_test, pipe_tfidf.predict(X_test)))

                         precision    recall  f1-score   support

              bluegrass       1.00      0.14      0.25         7
                  blues       0.53      0.32      0.40        91
             children's       0.00      0.00      0.00        40
              christian       0.62      0.14      0.23        35
              classical       0.44      0.44      0.44         9
                country       0.56      0.30      0.39       264
                  dance       0.47      0.40      0.43       266
dubstep & drum 'n' bass       0.00      0.00      0.00        23
         easy listening       1.00      0.10      0.18        10
            electronica       0.30      0.06      0.10       126
            film scores       0.00      0.00      0.00         6
                   folk       0.20      0.02      0.04        47
                   funk       0.33      0.15      0.20        75
                  indie       0.38      0.18      0.24       418
                  int'l 

  if diff:
  'precision', 'predicted', average, warn_for)


In [34]:
# Fit the SVD-reduced, min-max-scaled XGBoost pipeline and print its report on
# the held-out split.
pipe_tfidf_2.fit(X_train, y_train)
print(classification_report(y_test, pipe_tfidf_2.predict(X_test)))

                         precision    recall  f1-score   support

              bluegrass       0.00      0.00      0.00         7
                  blues       0.44      0.35      0.39        91
             children's       0.00      0.00      0.00        40
              christian       0.80      0.11      0.20        35
              classical       0.33      0.22      0.27         9
                country       0.49      0.27      0.35       264
                  dance       0.46      0.38      0.42       266
dubstep & drum 'n' bass       0.50      0.04      0.08        23
         easy listening       0.33      0.10      0.15        10
            electronica       0.38      0.07      0.12       126
            film scores       0.00      0.00      0.00         6
                   folk       1.00      0.02      0.04        47
                   funk       0.47      0.19      0.27        75
                  indie       0.30      0.16      0.21       418
                  int'l 

  if diff:
  'precision', 'predicted', average, warn_for)


In [32]:
# NOTE(review): `pipe_bow` is not defined in any cell visible in this notebook,
# so this cell relies on leftover kernel state -- define the pipeline (or
# remove this cell) so a fresh Restart & Run All succeeds.
pipe_bow.fit(X_train, y_train)
print(classification_report(y_test, pipe_bow.predict(X_test)))

                         precision    recall  f1-score   support

              bluegrass       0.00      0.00      0.00         7
                  blues       0.00      0.00      0.00        91
             children's       0.00      0.00      0.00        40
              christian       0.00      0.00      0.00        35
              classical       0.00      0.00      0.00         9
                country       0.00      0.00      0.00       264
                  dance       0.52      0.12      0.20       266
dubstep & drum 'n' bass       0.00      0.00      0.00        23
         easy listening       0.00      0.00      0.00        10
            electronica       0.00      0.00      0.00       126
            film scores       0.00      0.00      0.00         6
                   folk       0.00      0.00      0.00        47
                   funk       0.00      0.00      0.00        75
                  indie       0.00      0.00      0.00       418
                  int'l 

  'precision', 'predicted', average, warn_for)


In [106]:
class DenseTransformer(TransformerMixin):
    """Stateless pipeline step that densifies its input via ``X.todense()``."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X.todense()``; ``y`` and ``fit_params`` are ignored."""
        dense = X.todense()
        return dense

    def fit_transform(self, X, y=None, **fit_params):
        """Call ``fit`` with the given arguments, then return ``transform(X)``."""
        self.fit(X, y, **fit_params)
        dense = self.transform(X)
        return dense

In [29]:
# Lyrics + audio -> SVC. Same structure as `pipe_tfidf_svc` (TF-IDF reduced to
# 17 SVD components, joined with the audio columns, min-max scaled, default
# SVC); only the name of the first step differs. The pipeline is fitted in
# this same cell.
step_audio_lyric_tfidf = [
    ('Feature_Union', FeatureUnion([
        ('lyric_features', Pipeline([
            ('selector', get_lyric_features),
            ('vect', TfidfVectorizer()),
            ('pca', TruncatedSVD(n_components=17)),
        ])),
        ('audio_features', Pipeline([
            ('selector', get_audio_features)
        ]))
    ])),
    ('scaler', MinMaxScaler()),
    ('clf', SVC())
]
pipe_tfidf_svc_2 = Pipeline(step_audio_lyric_tfidf)
pipe_tfidf_svc_2.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('Feature_Union', FeatureUnion(n_jobs=1,
       transformer_list=[('lyric_features', Pipeline(memory=None,
     steps=[('selector', FunctionTransformer(accept_sparse=False,
          func=<function extract_lyric_features at 0x7f7b652daa60>,
          inv_kw_args=None, inverse_func=None, kw_ar...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [30]:
# Per-genre precision/recall/F1 of the fitted SVC pipeline on the held-out split.
print(classification_report(y_test, pipe_tfidf_svc_2.predict(X_test)))

             precision    recall  f1-score   support

  classical       0.00      0.00      0.00         9
    country       0.25      0.01      0.02       239
      dance       0.52      0.11      0.17       276
electronica       0.00      0.00      0.00       117
      indie       0.00      0.00      0.00       445
       jazz       0.47      0.44      0.45       149
      latin       0.75      0.64      0.69       136
        pop       0.00      0.00      0.00       174
        r&b       0.50      0.45      0.48       536
        rap       0.80      0.78      0.79       564
       rock       0.50      0.94      0.65      1456

avg / total       0.44      0.55      0.45      4101



  'precision', 'predicted', average, warn_for)


In [119]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def build_genres_model(X=None, y=None):
    """Return an unfitted pipeline: standard scaling, then a soft-voting ensemble.

    The ensemble combines k-nearest-neighbours, logistic regression, an SVC
    with probability estimates enabled, and Gaussian naive Bayes, each with
    default settings and run with ``n_jobs=4``.

    ``X`` and ``y`` are accepted but not used.
    """
    members = [
        ('knn', KNeighborsClassifier()),
        ('lg', LogisticRegression()),
        ('svm', SVC(probability=True)),
        ('nb', GaussianNB()),
    ]
    ensemble = VotingClassifier(members, voting='soft', n_jobs=4)

    pipeline_steps = [('scaler', StandardScaler())]
    pipeline_steps.append(('voting_cls', ensemble))
    return Pipeline(steps=pipeline_steps)
    
def train_genres_model(X, y):
    """Placeholder for a training routine; not implemented, returns ``None``."""
    return None
def test_genres_model(X, y):
    pass

In [118]:
# Audio-only baseline: the lyrics column is dropped before building, fitting
# and evaluating the voting ensemble, so only the numeric features are used.
voting_model = build_genres_model(X_train.drop('lyrics_features', axis=1), y_train)
voting_model.fit(X_train.drop('lyrics_features', axis=1), y_train)
print(classification_report(y_test, voting_model.predict(X_test.drop('lyrics_features', axis=1))))

                         precision    recall  f1-score   support

              bluegrass       0.00      0.00      0.00         7
                  blues       0.14      0.02      0.04        91
             children's       0.00      0.00      0.00        40
              christian       0.33      0.03      0.05        35
              classical       0.11      0.44      0.17         9
                country       0.31      0.24      0.27       264
                  dance       0.34      0.38      0.36       266
dubstep & drum 'n' bass       0.00      0.00      0.00        23
         easy listening       1.00      0.10      0.18        10
            electronica       0.24      0.06      0.09       126
            film scores       0.00      0.33      0.01         6
                   folk       0.00      0.00      0.00        47
                   funk       0.35      0.08      0.13        75
                  indie       0.24      0.03      0.06       418
                  int'l 

  if diff:
  'precision', 'predicted', average, warn_for)


# New dataset

In [2]:
import pickle
# Load the prepared song table for the "new dataset" experiments.
# Unpickling can execute arbitrary code, so only load pickle files you trust.
my_database = pd.read_pickle('my_database_new.pickle')
# Preview the first rows (the original `heaad()` typo raised AttributeError).
my_database.head()

Unnamed: 0,artist,name,audio_features,lyrics_features,genres,moods
0,Merle Haggard,Workin' Man Blues,"[2, 0.419332, 0.031391999999999996, 105.352, 0...","[it, s, a, big, job, just, gettin, by, with, n...",country,[earthy]
1,DJ Center,Yes! (Featuring Zaki Ibrahim),"[5, 0.548063, 0.10377199999999999, 92.022, 0.0...",[],rap,"[happy, sad]"
2,Devendra Banhart,Freely,"[4, 0.196956, 0.10985099999999999, 140.55, 0.0...","[it, ain, t, about, a, heart, to, find, it, s,...",indie,[sad]
3,Bobby Vee,Sharing You,"[8, 0.356188, 0.169035, 110.14, 0.030354, 0.70...","[peak, billboard, position, 15, in, 1962, word...",rock,[earthy]
4,Bone Thugs-N-Harmony,Hardtimes,"[10, 0.5808869999999999, 0.13563, 140.22, 0.02...","[yeah, i, m, just, tryin, to, prepare, myself,...",rap,"[happy, sad]"


In [3]:
# Names assigned, in order, to the 17 values stored in each song's
# `audio_features` list (same labels as used for the first dataset).
# NOTE(review): in the preview of this frame 'duration' shows 4.0 and
# 'loudness' shows values in the hundreds, so the labels from 'time_signature'
# onward may be shifted relative to the data -- confirm against the schema.
feature_col_names = [
    'key', 'energy', 'liveliness', 'tempo', 'speechiness', 'acousticness',
    'instrumentalness', 'time_signature', 'duration', 'loudness', 'valence',
    'danceability', 'mode', 'time_signature_confidence', 'tempo_confidence',
    'key_confidence', 'mode_confidence',
]

# One numeric column per audio feature for the new dataset.
n_df_audio_features = pd.DataFrame(
    my_database['audio_features'].tolist(),
    columns=feature_col_names,
)

# Genre label followed by the audio feature columns.
n_df_audio_features_genres = pd.concat(
    [my_database['genres'], n_df_audio_features],
    axis=1,
)
n_df_audio_features.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence
0,2.0,0.419332,0.031392,105.352,0.033737,0.286198,0.015986,1.0,4.0,162.12,-16.718,0.844026,0.833461,0.541,0.64,0.848,1.0
1,5.0,0.548063,0.103772,92.022,0.040763,0.146351,0.081452,0.0,4.0,259.42667,-6.125,0.580668,0.755222,0.285,0.462,0.94,0.99
2,4.0,0.196956,0.109851,140.55,0.041798,0.677967,0.496227,1.0,4.0,298.53333,-18.621,0.168069,0.338994,0.333,0.462,0.017,0.849
3,8.0,0.356188,0.169035,110.14,0.030354,0.707248,6e-06,1.0,4.0,123.10667,-13.528,0.439992,0.503739,0.272,0.294,0.255,1.0
4,10.0,0.580887,0.13563,140.22,0.027578,0.258097,0.40578,1.0,4.0,169.4,-6.654,0.791455,0.735931,0.817,0.377,0.456,1.0


In [6]:
# Genre label followed by the audio feature columns of the new dataset.
n_df_audio_features_genres = pd.concat(
    [my_database['genres'], n_df_audio_features],
    axis=1,
)


def clean_text_lambda(x):
    """Join a song's lyric tokens and clean them; an empty token list gives NaN."""
    if len(x) > 0:
        return clean_text(' '.join(x))
    return np.nan


# Cleaned lyrics per song (NaN where the song has no lyric tokens).
n_df_lyrics_features = my_database['lyrics_features'].apply(clean_text_lambda)

In [7]:
# Modelling table for the new dataset: genre label, cleaned lyrics text and
# the audio feature columns, concatenated column-wise on the shared index.
n_df_audio_features_lyrics_genres = pd.concat([my_database.genres, n_df_lyrics_features, n_df_audio_features], axis = 1)
n_df_audio_features_lyrics_genres.head()

Unnamed: 0,genres,lyrics_features,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence
0,country,s big job just gettin kid wife workin man dang...,2.0,0.419332,0.031392,105.352,0.033737,0.286198,0.015986,1.0,4.0,162.12,-16.718,0.844026,0.833461,0.541,0.64,0.848,1.0
1,rap,,5.0,0.548063,0.103772,92.022,0.040763,0.146351,0.081452,0.0,4.0,259.42667,-6.125,0.580668,0.755222,0.285,0.462,0.94,0.99
2,indie,ain t heart s insid s love hide s waitin let o...,4.0,0.196956,0.109851,140.55,0.041798,0.677967,0.496227,1.0,4.0,298.53333,-18.621,0.168069,0.338994,0.333,0.462,0.017,0.849
3,rock,peak billboard posit 15 1962 word music gerri ...,8.0,0.356188,0.169035,110.14,0.030354,0.707248,6e-06,1.0,4.0,123.10667,-13.528,0.439992,0.503739,0.272,0.294,0.255,1.0
4,rap,yeah m just tryin prepar ya know m sayin war c...,10.0,0.580887,0.13563,140.22,0.027578,0.258097,0.40578,1.0,4.0,169.4,-6.654,0.791455,0.735931,0.817,0.377,0.456,1.0


In [20]:
from sklearn.model_selection import train_test_split
# Rows missing the genre, the lyrics or any audio value are discarded first.
# The variable names below are shared with the first dataset's split, so this
# cell replaces that split.
df_audio_features_lyrics_genres_dropna = n_df_audio_features_lyrics_genres.dropna(axis=0, how='any')

# Features are every column except the label; the label is the genre.
X = df_audio_features_lyrics_genres_dropna.drop(columns=['genres'])
y = df_audio_features_lyrics_genres_dropna['genres']

# 70/30 hold-out split with a fixed seed so the reports are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [21]:
# Class balance of the label vector before splitting.
y.value_counts()

rock           4816
rap            1860
r&b            1778
indie          1438
dance           919
country         788
pop             638
jazz            525
latin           480
electronica     386
classical        42
Name: genres, dtype: int64

## tfidf - SVC

In [25]:
# Fit the TF-IDF + SVD + SVC pipeline on the current training split and print
# per-genre precision/recall/F1 on the held-out split.
pipe_tfidf_svc.fit(X_train, y_train)
print(classification_report(y_test, pipe_tfidf_svc.predict(X_test)))

             precision    recall  f1-score   support

  classical       0.00      0.00      0.00         9
    country       0.20      0.01      0.02       239
      dance       0.56      0.10      0.17       276
electronica       0.00      0.00      0.00       117
      indie       0.00      0.00      0.00       445
       jazz       0.46      0.44      0.45       149
      latin       0.75      0.64      0.69       136
        pop       0.00      0.00      0.00       174
        r&b       0.50      0.45      0.47       536
        rap       0.80      0.79      0.79       564
       rock       0.50      0.94      0.65      1456

avg / total       0.44      0.54      0.45      4101



  'precision', 'predicted', average, warn_for)


## tfidf - XGBoost

In [26]:
# Fit the unscaled TF-IDF + XGBoost pipeline on the current training split and
# print its report. This is the same `pipe_tfidf` object that is pickled at
# the end of the notebook.
pipe_tfidf.fit(X_train, y_train)
print(classification_report(y_test, pipe_tfidf.predict(X_test)))

             precision    recall  f1-score   support

  classical       1.00      0.11      0.20         9
    country       0.61      0.32      0.42       239
      dance       0.52      0.44      0.48       276
electronica       0.32      0.05      0.09       117
      indie       0.41      0.15      0.22       445
       jazz       0.49      0.54      0.51       149
      latin       0.75      0.79      0.77       136
        pop       0.59      0.06      0.10       174
        r&b       0.59      0.56      0.57       536
        rap       0.82      0.84      0.83       564
       rock       0.59      0.87      0.71      1456

avg / total       0.59      0.61      0.57      4101



  if diff:


## tfidf - XGBoost (scaled)

In [27]:
# Fit the SVD-reduced, min-max-scaled XGBoost pipeline on the current training
# split and print its report.
pipe_tfidf_2.fit(X_train, y_train)
print(classification_report(y_test, pipe_tfidf_2.predict(X_test)))

             precision    recall  f1-score   support

  classical       0.67      0.22      0.33         9
    country       0.49      0.28      0.36       239
      dance       0.50      0.39      0.44       276
electronica       0.32      0.07      0.11       117
      indie       0.43      0.19      0.26       445
       jazz       0.52      0.54      0.53       149
      latin       0.73      0.76      0.74       136
        pop       0.33      0.03      0.06       174
        r&b       0.52      0.54      0.53       536
        rap       0.80      0.82      0.81       564
       rock       0.60      0.86      0.71      1456

avg / total       0.57      0.60      0.56      4101



  if diff:


In [28]:
# Persist the fitted TF-IDF + XGBoost pipeline. The context manager closes the
# file even if pickling fails; the bare `open(...)` left the handle open.
with open('pipe_tfidf_xgb.pkl', 'wb') as model_file:
    pickle.dump(pipe_tfidf, model_file)