## lyrics_classifier

In [1]:
# Import dependencies
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer

import json
import re
import string

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.multiclass import OneVsRestClassifier
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier

In [2]:
# Read lyrics data
lyrics = pd.read_csv('.\\data\\lyrics.csv', encoding='unicode_escape')
lyrics['artist_name'] = lyrics.apply(lambda x: x['artist_name'].lower(), axis=1)
lyrics['original_line'] = lyrics['line']

### Remove misclassified artists

In [3]:
# Import genre artist lists
file = open('.\\data\\json_genres.json')
genres = json.load(file)
file.close()

In [4]:
# Check whether artists in a given genre are (potentially) misclassified
def artistCheck(genre):
    misclasses = []
    artists = lyrics[lyrics.genre==genre].artist_name.unique()
    for artist in artists:
        if artist not in genres[genre]:
            misclasses.append(artist)
    return(misclasses)

In [5]:
# Collect misclassified artists of every genre
misclass_dict = {}
for genre in lyrics.genre.unique():
    misclass_dict[genre] = artistCheck(genre)
misclass_dict

{'country': ['tyler, the creator',
  'george jones & tammy wynette',
  'hank williams jr.'],
 'metal': ['queensrÿche', 'a$ap rocky', 'motörhead', "guns n' roses"],
 'pop': ['andrea bocelli & céline dion', 'drake'],
 'rap': ['a$ap rocky', 'royce da 5\x929\x94', 'j.i.d'],
 'rock': ['florence + the machine', 'hieroglyphics', 'wild belle'],
 'soul': ['sam smith', 'sir sly', 'telly', 'prince ea']}

In [6]:
# Remove all actual misclasses
lyrics = lyrics[~((lyrics.genre=='country') & (lyrics.artist_name=='tyler, the creator'))]
lyrics = lyrics[~((lyrics.genre=='metal') & (lyrics.artist_name=='a$ap rocky'))]
lyrics = lyrics[~((lyrics.genre=='pop') & (lyrics.artist_name=='drake'))]
lyrics = lyrics[~((lyrics.genre=='rock') & (lyrics.artist_name=='hieroglyphics'))]
lyrics = lyrics[~((lyrics.genre=='rock') & (lyrics.artist_name=='wild belle'))]
lyrics = lyrics[~((lyrics.genre=='soul') & (lyrics.artist_name=='prince ea'))]
lyrics = lyrics[~((lyrics.genre=='soul') & (lyrics.artist_name=='sam smith'))]
lyrics = lyrics[~((lyrics.genre=='soul') & (lyrics.artist_name=='sir sly'))]
lyrics = lyrics[~((lyrics.genre=='soul') & (lyrics.artist_name=='telly'))]
lyrics = lyrics.reset_index(drop=True)

In [7]:
# Remove artists that "blur the line" b/t genres [manual decision]
removes_country = []
removes_metal = []
removes_pop = ['chris brown', 'nickelback', 'sheryl crow', 'taylor swift', 'matchbox twenty']
removes_rap = []
removes_rock = []
removes_soul = []
removes = removes_country + removes_metal + removes_pop + removes_rap + removes_rock + removes_soul

lyrics = lyrics[~lyrics.artist_name.isin(removes)]

### Text preprocessing

In [8]:
# Process text for classification modeling
def preprocessText(text, remove_stops=False):
    
    # Remove everything between hard brackets
    text = re.sub(pattern="\[.+?\]( )?", repl='', string=text)

    # Change "walkin'" to "walking", for example
    text = re.sub(pattern="n\\\' ", repl='ng ', string=text)

    # Remove x4 and (x4), for example
    text = re.sub(pattern="(\()?x\d+(\))?", repl=' ', string=text)

    # Fix apostrophe issues
    text = re.sub(pattern="\\x91", repl="'", string=text)
    text = re.sub(pattern="\\x92", repl="'", string=text)
    text = re.sub(pattern="<u\+0092>", repl="'", string=text)
    
    # Make lowercase
    text = text.lower()
    
    # Special cases/words
    text = re.sub(pattern="'til", repl="til", string=text)
    text = re.sub(pattern="'til", repl="til", string=text)
    text = re.sub(pattern="gon'", repl="gon", string=text)

    # Remove \n from beginning
    text = re.sub(pattern='^\n', repl='', string=text)

    # Strip , ! ?, : and remaining \n from lyrics
    text = ''.join([char.strip(",!?:") for char in text])
    text = text.replace('\n', ' ')

    # Remove contractions
    # specific
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"won\’t", "will not", text)
    text = re.sub(r"can't", "can not", text)
    text = re.sub(r"can\’t", "can not", text)
    text = re.sub(r"let's", "let us", text)
    text = re.sub(r"let\’s", "let us", text)
    text = re.sub(r"ain't", "aint", text)
    text = re.sub(r"ain\’t", "aint", text)

    # general
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r"n\’t", " not", text)
    text = re.sub(r"\’re", " are", text)
    text = re.sub(r"\’s", " is", text)
    text = re.sub(r"\’d", " would", text)
    text = re.sub(r"\’ll", " will", text)
    text = re.sub(r"\’t", " not", text)
    text = re.sub(r"\’ve", " have", text)
    text = re.sub(r"\’m", " am", text)

    # Remove remaining punctuation
    punc = string.punctuation
    text = ''.join([char for char in text if char not in punc])

    # Remove stopwords
    if remove_stops:
        stops = stopwords.words('english')
        text = ' '.join([word for word in text.split(' ') if word not in stops])
    
    # Remove double spaces and beginning/trailing whitespace
    text = re.sub(pattern='( ){2,}', repl=' ', string=text)
    text = text.strip()
    
    return(text)

In [9]:
lyrics['line'] = lyrics.apply(lambda x: preprocessText(x['line']), axis=1)

### Stem words

In [10]:
stemmer = EnglishStemmer()
lyrics['line_stems'] = lyrics.apply(lambda x: ' '.join([stemmer.stem(word) for word in x['line'].split(' ')]), axis=1)
lyrics['unique_stems'] = lyrics.apply(lambda x: ' '.join([stemmer.stem(word) for word in x['unique_words']]), axis=1)

### Genre Classifier Models

In [11]:
# Combine metal & rock
lyrics['genre'] = lyrics.apply(lambda x: 'metal/rock' if x['genre'] in ['metal', 'rock'] else x['genre'], axis=1)

# Train/test split
train, test = train_test_split(lyrics, test_size=0.2, stratify=lyrics.genre, random_state=1)

#### Naive Bayes

In [12]:
# Naive Bayes model w/ k-fold CV
text_mnb = Pipeline([('vect', CountVectorizer()),
                     ('mnb', MultinomialNB(fit_prior=False))])
text_mnb = text_mnb.fit(train.line_stems, train.genre)
cross_val_score(estimator=text_mnb, X=train.line_stems, y=train.genre, cv=7).mean()

0.7063501162374193

In [13]:
# Naive Bayes test results
print(text_mnb.score(y=test.genre, X=test.line_stems))
preds = text_mnb.predict(test.line_stems)
print(classification_report(y_pred=preds, y_true=test.genre))
pd.crosstab(preds, test.genre)

0.7082847141190198
             precision    recall  f1-score   support

    country       0.60      0.66      0.63       296
 metal/rock       0.77      0.72      0.75       595
        pop       0.55      0.57      0.56       231
        rap       0.88      0.91      0.90       303
       soul       0.64      0.63      0.64       289

avg / total       0.71      0.71      0.71      1714



genre,country,metal/rock,pop,rap,soul
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
country,195,60,23,7,38
metal/rock,51,427,34,7,32
pop,20,44,132,11,33
rap,4,15,14,277,3
soul,26,49,28,1,183


#### SVM

In [14]:
# SVM model with k-fold CV
text_svm = Pipeline([('vect', TfidfVectorizer(ngram_range=(1,2))),
                     ('svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-4,
                                           n_iter=25, random_state=123))])
text_svm = text_svm.fit(train.line_stems, train.genre)
cross_val_score(estimator=text_svm, X=train.line, y=train.genre, cv=7).mean()

0.7302637115804914

In [15]:
# SVM test results
print(text_svm.score(y=test.genre, X=test.line_stems))
preds_svm = text_svm.predict(test.line_stems)
print(classification_report(y_pred=preds_svm, y_true=test.genre))
pd.crosstab(preds_svm, test.genre)

0.7409568261376897
             precision    recall  f1-score   support

    country       0.68      0.66      0.67       296
 metal/rock       0.73      0.82      0.78       595
        pop       0.69      0.48      0.56       231
        rap       0.92      0.95      0.93       303
       soul       0.65      0.65      0.65       289

avg / total       0.74      0.74      0.74      1714



genre,country,metal/rock,pop,rap,soul
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
country,196,38,14,2,37
metal/rock,59,490,58,8,52
pop,9,23,110,4,13
rap,2,9,15,287,0
soul,30,35,34,2,187


In [16]:
# Which songs did the svm model misclassify?
test['preds_svm'] = preds_svm
misclasses_svm = test[test.preds_svm != test.genre]
misclasses_svm['misclass_combo'] = misclasses_svm.apply(lambda x: x['genre']+'-'+x['preds_svm'], axis=1)

In [17]:
misclasses_svm.misclass_combo.value_counts()

country-metal/rock    59
pop-metal/rock        58
soul-metal/rock       52
metal/rock-country    38
soul-country          37
metal/rock-soul       35
pop-soul              34
country-soul          30
metal/rock-pop        23
pop-rap               15
pop-country           14
soul-pop              13
metal/rock-rap         9
country-pop            9
rap-metal/rock         8
rap-pop                4
rap-soul               2
rap-country            2
country-rap            2
Name: misclass_combo, dtype: int64

In [18]:
misclasses_svm[misclasses_svm.misclass_combo=='country-rap']

Unnamed: 0,line,song_id,song_name,artist_id,artist_name,characters,genre,original_line,unique_words,num_unique_words,line_stems,unique_stems,preds_svm,misclass_combo
656,splish splash i was taking a bath long about a...,1728080,Splish Splash,223639,conway twitty,1274,country,"Splish Splash, I was takin' a bath\nLong about...","[a, about, agroovin, agroovinwoo, all, alright...",107,splish splash i was take a bath long about a s...,a about agroovin agroovinwoo all alright amov ...,rap,country-rap
1411,gimme some straight talk straight talk and hol...,208250,Straight Talk,20955,dolly parton,1859,country,"[Chorus]\nGimme some straight talk, straight t...","[a, about, all, alone, alright, am, america, a...",129,gimm some straight talk straight talk and hold...,a about all alon alright am america an and ans...,rap,country-rap


In [19]:
# Re-train model on full data set
full_text_svm = Pipeline([('vect', TfidfVectorizer(ngram_range=(1,2))),
                          ('svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-4, 
                                                n_iter=20, random_state=123))])
full_text_svm = full_text_svm.fit(lyrics.line, lyrics.genre)

#### XGBoost

In [20]:
# XGB model
vect = TfidfVectorizer(ngram_range=(1,2))
vect.fit_transform(train.line_stems)
vect_test = vect.transform(pd.Series(test.line_stems))

In [21]:
xgb = XGBClassifier(learning_rate=0.25, subsample=0.8, gamma=1, random_state=123, max_depth=4, max_delta_step=1).fit(vect_train, train.genre)

In [22]:
print(xgb.score(y=test.genre, X=vect_test))
preds_xgb = xgb.predict(vect_test)
print(classification_report(y_pred=preds_xgb, y_true=test.genre))
pd.crosstab(xgb.predict(vect_test), test.genre)

0.7666277712952159
             precision    recall  f1-score   support

    country       0.71      0.62      0.67       296
 metal/rock       0.73      0.86      0.79       595
        pop       0.78      0.58      0.67       231
        rap       0.96      0.95      0.96       303
       soul       0.70      0.67      0.69       289

avg / total       0.77      0.77      0.76      1714



genre,country,metal/rock,pop,rap,soul
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
country,185,34,8,4,29
metal/rock,79,513,58,4,53
pop,6,12,135,6,14
rap,0,1,11,288,0
soul,26,35,19,1,193


In [23]:
# Which songs did the xgb model misclassify?
test['preds_xgb'] = preds_xgb
misclasses_xgb = test[test.preds_xgb != test.genre]
misclasses_xgb['misclass_combo'] = misclasses_xgb.apply(lambda x: x['genre']+'-'+x['preds_xgb'], axis=1)

In [24]:
misclasses_xgb.misclass_combo.value_counts()

country-metal/rock    79
pop-metal/rock        58
soul-metal/rock       53
metal/rock-soul       35
metal/rock-country    34
soul-country          29
country-soul          26
pop-soul              19
soul-pop              14
metal/rock-pop        12
pop-rap               11
pop-country            8
country-pop            6
rap-pop                6
rap-country            4
rap-metal/rock         4
rap-soul               1
metal/rock-rap         1
Name: misclass_combo, dtype: int64

In [25]:
misclasses_xgb[misclasses_xgb.misclass_combo=='metal/rock-rap'].sort_values('artist_name')

Unnamed: 0,line,song_id,song_name,artist_id,artist_name,characters,genre,original_line,unique_words,num_unique_words,line_stems,unique_stems,preds_svm,preds_xgb,misclass_combo
7108,this time i came to get mine i saw this cat ru...,338348,Liquid Diet,424,papa roach,1689,metal/rock,"[Chorus 1]\nThis time, I came to get mine\nI s...","[a, about, absurd, abused, am, and, as, back, ...",174,this time i came to get mine i saw this cat ru...,a about absurd abus am and as back bag bang be...,rap,rap,metal/rock-rap


#### Deploy SVM model on new text

In [26]:
# Process and predict genre of text (SVM)
def svmLyricClassifier(text):
    text = preprocessText(text)
    text = ' '.join([stemmer.stem(word) for word in text.split(' ')])
    print(full_text_svm.predict([text]))

In [27]:
# Zac Brown Band - Chicken Fried
svmLyricClassifier(
'''
Well I was raised up beneath the shade of a Georgia Pine
And that's home you know
Sweet tea, pecan pie, and homemade wine where the peaches grow
And my house it's not much to talk about
But it's filled with love that's grown in southern ground
''')

['country']


In [28]:
# Nickelback - Photograph
svmLyricClassifier(
'''
Look at this photograph
Every time I do, it makes me laugh
How did our eyes get so red?
And what the hell is on Joey's head?
And this is where I grew up
I think the present owner fixed it up
I never knew we ever went without
The second floor is hard for sneaking out
''')

['metal/rock']


In [29]:
# One Direction - What Makes You Beautiful
svmLyricClassifier(
'''
Baby, you light up my world like nobody else
The way that you flip your hair gets me overwhelmed
But when you smile at the ground, it ain't hard to tell
You don't know, oh, oh, you don't know you're beautiful
If only you saw what I can see
You'll understand why I want you so desperately
Right now I'm looking at you and I can't believe
You don't know, you don't know you're beautiful
That's what makes you beautiful
''')

['pop']


In [30]:
# Wu-Tang Clan - C.R.E.A.M.
svmLyricClassifier(
'''
I grew up on the crime side, the New York Times side
Stayin' alive was no jive
Had secondhands, Mom's bounced on old man
So then we moved to Shaolin land
A young youth, yo, rockin' the gold tooth, 'Lo goose
Only way I be gettin' the G off was drug loot
And let's start it like this, son
Rollin' with this one and that one
Pullin' out gats for fun
''')

['rap']


In [31]:
# Patti LaBelle - My Friend
svmLyricClassifier(
'''
The thought of you helps me carry on
When I feel all hope is gone
I see the world wit brand new eyes
Your love has made me realize
My future looks bright to me
Oh because you are my friend
I've been looking around and you were here all the time
I've been around and around and around and around
I've been looking around and you were here all the time
''')

['soul']


#### Deploy XGB model on new text

In [32]:
# Process and predict genre of text (XGB)
def xgbLyricClassifier(text):
    text = preprocessText(text)
    text = ' '.join([stemmer.stem(word) for word in text.split(' ')])
    text = vect.transform(pd.Series(text))
    return(xgb.predict(text))

In [33]:
# Trippie Redd - Poles1469
xgbLyricClassifier('''
Scum Gang!
I'm toting big shit (big shit)
Get your wig split (wig split)
4-5, I call that the big bitch (big stick)
We don't miss shit (miss shit)
Get your ribs hit (limbs hit)
Hollow tip, hit you with a cross hit (yeaaah)
Big drum on the gun, that's a pump with a red beam
Catch him in the bed while he sleep, that's a wet dream
(That's a wet dream)
Close range headshots, fuck where the vest be
Put a hole in his head, he a dolphin (you a dolphin)
Pull up to the block, you don’t want it (that's that dummy shit)
Pull up with the Glock, you won't want it (brrrah!, stoopid)
Scum Gang, bitch, we on it, you don't want it (ooh, ooh)
1400, bitch, you know I tote a pole
With 6ix9ine, bitch, you know we tote them poles
Tote a pole, tote a pole, tote a pole
You talk down, we gon' let that shit go (grrrat!)
Let it blow, let it blow, let it blow (let that bitch go)
Got the scope, got the scope, got the scope
.223's and I put 'em to your nose (to your nose)
Let it go, bitch, we let that shit go
''')

array(['rap'], dtype=object)