In [33]:
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from sklearn.base import TransformerMixin
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.tokenizer import Tokenizer

%matplotlib inline

In [34]:
# Location of the cleaned lyrics dataset, relative to this notebook.
LYRICS_JSON_PATH = '../data/lyrics_cleaned.json'
df = pd.read_json(LYRICS_JSON_PATH)

In [35]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,artist,title,lyrics,character_count,word_count
0,Drake,Toosie Slide,"Black leather gloves, no sequins (Yeah, yeah, ...",2371,447
1,Roddy Ricch,The Box,Pullin' out the coupe at the lot Told 'em fuc...,2689,551
2,Dua Lipa,Don't Start Now,"If you don't wanna see me Did a full 180, cra...",1576,304
3,Doja Cat,Say So,"Day and night til morning, keeping me in the m...",2203,446
4,Post Malone,Circles,"Hey, hey, hey, hey Oh, oh, oh-oh Oh, oh, oh-...",1570,318


In [36]:
# Replace whatever index came from the JSON file with a fresh 0..n-1 range.
# Reassigning (instead of inplace=True) keeps the cell a plain expression of
# its input and follows the pandas recommendation to avoid in-place mutation.
df = df.reset_index(drop=True)

In [37]:
# Inspect the raw lyrics stored at row position 6247.
df['lyrics'].iloc[6247]

">Freaky Tahverses Mr. CheeksIntro:[Freaky Tah]Everybody's buckin, don't give it a damneverbody...everbody(echo)Verse 1[Mr.Cheeks]Straight from cop killer Queensa juvenile named Jack >Jackat the age of 17 >uh huhyo this kid Jack started slingin crack >started slingin crackhe's on the road to riches >richesbaggin bitches >bitcheshe's in clubs takin pitchersdrink your finger always into sesshis lifestyles buckwild honey child >yeahgot a shorty named Val >Valshe stays on the Isle >ahhhhe started slingin at the age of 17 >uhh huhhis hearts made of steel >eh yokid his minds full of green >full of greenhe got his first ouncemade a grand 400 >hundred3 bills to get freshhe other bills to get blunted >to be bluntedand wit the letter G >Ghe bought the letter O >Onext thing you know manhe's rakin in the dough >rakin in the doughhe put his people downcuz say thats only right >aightfor dem to get keysdem and dem is mad tight >is mad tightset-up organation organize on the block>organize...blocknobod

In [38]:
# The precomputed length columns are not needed for the text processing below.
df = df.drop(columns=['character_count', 'word_count'])

In [39]:
# Check remaining columns, dtypes and non-null counts after the drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6252 entries, 0 to 6251
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  6252 non-null   object
 1   title   6252 non-null   object
 2   lyrics  6252 non-null   object
dtypes: object(3)
memory usage: 146.7+ KB


In [40]:
# Look at the first rows of the trimmed frame.
df.head()

Unnamed: 0,artist,title,lyrics
0,Drake,Toosie Slide,"Black leather gloves, no sequins (Yeah, yeah, ..."
1,Roddy Ricch,The Box,Pullin' out the coupe at the lot Told 'em fuc...
2,Dua Lipa,Don't Start Now,"If you don't wanna see me Did a full 180, cra..."
3,Doja Cat,Say So,"Day and night til morning, keeping me in the m..."
4,Post Malone,Circles,"Hey, hey, hey, hey Oh, oh, oh-oh Oh, oh, oh-..."


In [41]:
# Look at the last rows of the trimmed frame.
df.tail()

Unnamed: 0,artist,title,lyrics
6247,Lost Boyz,Lifestyles Of The Rich And Shameless,>Freaky Tahverses Mr. CheeksIntro:[Freaky Tah]...
6248,DJ Quik,Safe + Sound,"~Static~ ""Quik you're not a gangster we're not..."
6249,Ty Herndon,What Mattered Most,I thought I knew the girl so well If she was ...
6250,Madonna,Bedtime Story,Today Is the last day That I'm using words ...
6251,Livin' Joy,Dreamer,"Love, life and laughter Is all I beleive. My..."


In [None]:
# class predictors(TransformerMixin):
#     def transform(self, X, **transform_params):
#         return [clean_text(text) for text in X]

#     def fit(self, X, y=None, **fit_params):
#         return self

#     def get_params(self, deep=True):
#         return {}

# def clean_text(text):
#     return text.strip().lower()

In [None]:
# idx = np.random.randint(0, 6252)
# text = df.iloc[idx, 1]
# text

In [91]:
# Shared resources for the tokenizer helpers defined in this cell.
parser = English()                  # English pipeline built without loading a model; used by spacy_tokenizer
nlp = spacy.load('en_core_web_sm')  # small pretrained English model
stop_words = STOP_WORDS             # spaCy's English stop-word set (imported at the top)
punctuations = string.punctuation   # ASCII punctuation characters as a single string


def remove_bracketed(song):
    """Strip bracketed annotations from a lyrics string.

    Deletes, in this order, every ``[...]``, then every ``(...)``, then every
    ``{...}`` span. Matching is non-greedy, so each opening bracket pairs with
    the nearest closing one. Because ``.`` does not match a newline, a span
    that crosses a line break is left untouched.

    Args:
        song: Lyrics text to clean.

    Returns:
        The text with the bracketed spans removed. Whitespace around a removed
        span is not collapsed.
    """
    # Raw strings make the backslashes explicit regex escapes instead of
    # relying on Python passing invalid string escapes through (deprecated,
    # and reported as a warning by newer interpreters).
    text = song
    for pattern in (r"\[.*?\]", r"\(.*?\)", r"\{.*?\}"):
        text = re.sub(pattern, '', text)

    return text


def spacy_tokenizer(text, use_stopwords=True, custom_stopwords=frozenset()):
    """Turn one lyrics string into a list of normalised tokens.

    Steps: drop bracketed spans via ``remove_bracketed``, run the module-level
    ``parser`` over the result, then map every token to its lower-cased,
    stripped lemma. Tokens whose lemma is the literal ``"-PRON-"`` keep their
    lower-cased surface text instead (presumably spaCy v2's pronoun
    placeholder -- confirm against the installed spaCy version).

    Args:
        text: Raw lyrics text.
        use_stopwords: When true, tokens found in the module-level
            ``stop_words`` or in ``custom_stopwords`` are removed.
        custom_stopwords: Extra words to exclude; any iterable of strings.
            Ignored when ``use_stopwords`` is false.

    Returns:
        List of token strings in document order.
    """
    doc = parser(remove_bracketed(text))
    lemmas = [
        tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_
        for tok in doc
    ]

    # `w not in punctuations` is a substring test against the punctuation
    # string, so it also discards empty strings (e.g. whitespace-only tokens
    # after strip()).
    if use_stopwords:
        # Build the exclusion set once per call rather than once per token.
        excluded = stop_words.union(custom_stopwords)
        return [w for w in lemmas if w not in excluded and w not in punctuations]

    return [w for w in lemmas if w not in punctuations]


def tokenize_column(dataframe, column_name, use_stopwords=True, custom_stopwords=set()):
    """Tokenise every entry of one text column.

    Args:
        dataframe: Frame holding the text column.
        column_name: Name of the column to tokenise.
        use_stopwords: Forwarded to ``spacy_tokenizer``.
        custom_stopwords: Forwarded to ``spacy_tokenizer``.

    Returns:
        A list with one token list per row, in row order.
    """
    # Work on a re-indexed copy so the caller's frame is left as it was.
    reindexed = dataframe.reset_index(drop=True)
    return [
        spacy_tokenizer(entry, use_stopwords, custom_stopwords)
        for entry in reindexed[column_name]
    ]



In [103]:
# Extra words excluded from the lyrics on top of spaCy's stop-word list.
lyric_stopwords = {'hey', 'baby', 'woo', 'ha', 'like', 'oh', 'ooh', 'woah'}
df['lyrics_tokens'] = tokenize_column(
    df,
    'lyrics',
    use_stopwords=True,
    custom_stopwords=lyric_stopwords,
)

In [104]:
# Compare raw lyrics with the new lyrics_tokens column for the first ten songs.
df.head(10)

Unnamed: 0,artist,title,lyrics,lyrics_tokens
0,Drake,Toosie Slide,"Black leather gloves, no sequins (Yeah, yeah, ...","[black, leather, glove, sequin, buckles, jacke..."
1,Roddy Ricch,The Box,Pullin' out the coupe at the lot Told 'em fuc...,"[pullin, coupe, lot, told, 'em, fuck, 12, fuck..."
2,Dua Lipa,Don't Start Now,"If you don't wanna see me Did a full 180, cra...","[wanna, 180, crazy, thinking, way, heartbreak,..."
3,Doja Cat,Say So,"Day and night til morning, keeping me in the m...","[day, night, til, morning, moment, let, know, ..."
4,Post Malone,Circles,"Hey, hey, hey, hey Oh, oh, oh-oh Oh, oh, oh-...","[turn, til, upside, bad, guy, proud, try, beli..."
5,Justin Bieber Featuring Quavo,Intentions,"Picture perfect, you don't need no filter Gor...","[picture, perfect, need, filter, gorgeous, 'em..."
6,DaBaby Featuring Roddy Ricch,ROCKSTAR,"Woo, woo I pull up like How you pull up, Bab...","[pull, pull, pull, pull, pull, let, brand, new..."
7,Future Featuring Drake,Life Is Good,Workin' on the weekend like usual Way off in ...,"[workin, weekend, usual, way, deep, end, usual..."
8,Billie Eilish,everything i wanted,I had a dream I got everything I wanted Not ...,"[dream, want, think, bein, honest, nightmare, ..."
9,Camila Cabello Featuring DaBaby,My Oh My,"Ha, ha, ha, ha They say he likes a good time ...","[good, time, come, alive, midnight, mama, trus..."


In [107]:
# Pick a random row position for a spot check. The upper bound (exclusive)
# comes from the frame itself instead of a hard-coded row count, so the cell
# stays valid if the dataset grows or shrinks.
idx = np.random.randint(0, len(df))

In [108]:
# Show artist, title, raw lyrics and tokens for the sampled song. Columns are
# addressed by label rather than position, so the printout cannot silently
# show the wrong field if the column order changes.
song = df.iloc[idx]
print(song['artist'], ' - ', song['title'], '\n\n-----\n\n', song['lyrics'], '\n\n-----\n\n', song['lyrics_tokens'])

Uncle Kracker  -  Smile 

-----

 You're better than the best  I'm lucky just to linger in your light  Cooler then the flip side of my pillow, that's right   Completely unaware  Nothing can compare to where you send me,  Lets me know that it's OK, yeah it's OK  And the moments where my good times start to fade    You make me smile like the sun  Fall out of bed, sing like a bird  Dizzy in my head, spin like a record  Crazy on a Sunday night  You make me dance like a fool  Forget how to breathe  Shine like gold, buzz like a bee  Just the thought of you can drive me wild  Oh, you make me smile    Even when you're gone  Somehow you come along  Just like a flower poking through the sidewalk crack   and just like that  You steal away the rain and just like that...    You make me smile like the sun  Fall out of bed, sing like a bird  Dizzy in my head, spin like a record  Crazy on a Sunday night  You make me dance like a fool  Forget how to breathe  Shine like gold, buzz like a bee  Just the t

In [None]:
df.