In [90]:
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from sklearn.base import TransformerMixin
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.tokenizer import Tokenizer

%matplotlib inline

In [2]:
# Load the cleaned lyrics dataset from JSON. The path is relative, so it
# resolves against the notebook's working directory.
df = pd.read_json('../data/lyrics_cleaned.json')

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,artist,title,lyrics,character_count,word_count
0,Drake,Toosie Slide,"Black leather gloves, no sequins (Yeah, yeah, ...",2371,447
1,Roddy Ricch,The Box,Pullin' out the coupe at the lot Told 'em fuc...,2689,551
2,Dua Lipa,Don't Start Now,"If you don't wanna see me Did a full 180, cra...",1576,304
3,Doja Cat,Say So,"Day and night til morning, keeping me in the m...",2203,446
4,Post Malone,Circles,"Hey, hey, hey, hey Oh, oh, oh-oh Oh, oh, oh-...",1570,318


In [6]:
# Drop the title and the two length-count columns, leaving artist and lyrics.
# errors='ignore' keeps the cell re-runnable: on a second execution the columns
# are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['title', 'character_count', 'word_count'], errors='ignore')

In [7]:
# Preview the frame again now that title/character_count/word_count were dropped.
df.head()

Unnamed: 0,artist,lyrics
0,Drake,"Black leather gloves, no sequins (Yeah, yeah, ..."
1,Roddy Ricch,Pullin' out the coupe at the lot Told 'em fuc...
2,Dua Lipa,"If you don't wanna see me Did a full 180, cra..."
3,Doja Cat,"Day and night til morning, keeping me in the m..."
4,Post Malone,"Hey, hey, hey, hey Oh, oh, oh-oh Oh, oh, oh-..."


In [29]:
# Summarise the frame: index range, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6252 entries, 0 to 9474
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  6252 non-null   object
 1   lyrics  6252 non-null   object
dtypes: object(2)
memory usage: 306.5+ KB


In [88]:
# Pick the lyrics of one randomly chosen row to inspect by eye.
# The upper bound is taken from the frame itself (exclusive, like the previous
# hard-coded row count) and the column is selected by name instead of position,
# so the cell keeps working if rows are filtered or columns are reordered.
text = df['lyrics'].iloc[np.random.randint(0, len(df))]
text

"{Ja Rule}  Where would I be without my baby  The thought alone might break me   And I don't wanna go crazy  But every thug needs a lady  {Ja Rule}  Girl it feel like you and I been mourning together  Inseparable, we chose pain over pleasure  For that you'll forever be a, part of me  Mind body and soul ain't no I in we (baby)  When you cry who wipes your tears  When you scared, who's telling you there's nothin to fear  Girl I'll always be there  When you need a shoulder to lean on  Never hesitate knowing you can call on, your soul-mate  And vice versa, that's why I be the first to  See Jacob's and frost your wrist up  Now you owe me, I know you're tired of being lonely  So baby girl put it on me    {Chorus 2X: Ja Rule}  Where would I be without you (uh)  I only think about you (yeah)  I know you're tired of being lonely (lonely)  So baby girl put it on me (put it on me)    {Vita}  Yo, and I appreciate the rocks and gifts that you cop me baby  And that house on the hill when you drop li

In [104]:
# Module-level configuration read by spacy_tokenizer.
punctuations = string.punctuation  # ASCII punctuation characters as one string
nlp = spacy.load('en_core_web_sm')  # not referenced by spacy_tokenizer, which uses `parser`
# STOP_WORDS (imported at the top of the notebook) is spaCy's shared English
# stop-word collection; add() extends that shared object in place, no copy is made.
stop_words = STOP_WORDS
stop_words.add('baby')
parser = English()  # language object that spacy_tokenizer calls on the raw text

def spacy_tokenizer(text):
    """Tokenize ``text`` into lower-cased lemmas without stop words or punctuation.

    Parameters
    ----------
    text : str
        Raw text; it is passed unchanged to the module-level ``parser``.

    Returns
    -------
    list of str
        One normalised string per kept token: the lower-cased, stripped lemma,
        or the lower-cased surface form when the lemma is the ``"-PRON-"``
        placeholder. Empty strings, members of ``stop_words`` and tokens made
        up solely of characters from ``punctuations`` are left out.
    """
    kept = []
    for token in parser(text):
        if token.lemma_ == "-PRON-":
            normalised = token.lower_
        else:
            normalised = token.lemma_.lower().strip()

        # Whitespace-only tokens collapse to "" after strip(); drop them explicitly.
        if not normalised or normalised in stop_words:
            continue

        # Test character by character. ``normalised in punctuations`` would be a
        # substring test against the punctuation string, which lets
        # multi-character tokens such as "..." or "--" slip through.
        if all(char in punctuations for char in normalised):
            continue

        kept.append(normalised)

    return kept

In [None]:
class predictors(TransformerMixin):
    """Stateless transformer that runs ``clean_text`` over every input item."""

    def get_params(self, deep=True):
        """Return an empty parameter mapping; ``deep`` is accepted but unused."""
        return dict()

    def fit(self, X, y=None, **fit_params):
        """Do nothing with the data and hand back this instance."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding ``clean_text(item)`` for each item of ``X``, in order."""
        cleaned = []
        for item in X:
            cleaned.append(clean_text(item))
        return cleaned

def clean_text(text):
    """Return ``text`` with surrounding whitespace removed and all characters lower-cased."""
    trimmed = text.strip()
    return trimmed.lower()

In [105]:
# Run the tokenizer on the sampled lyric to inspect the resulting tokens.
spacy_tokenizer(text)

['ja',
 'rule',
 'think',
 'break',
 'wanna',
 'crazy',
 'thug',
 'need',
 'lady',
 'ja',
 'rule',
 'girl',
 'feel',
 'like',
 'mourn',
 'inseparable',
 'choose',
 'pain',
 'pleasure',
 'forever',
 'mind',
 'body',
 'soul',
 'cry',
 'wipe',
 'tear',
 'scare',
 'tell',
 'fear',
 'girl',
 'need',
 'shoulder',
 'lean',
 'hesitate',
 'know',
 'soul',
 'mate',
 'vice',
 'versa',
 '\ufeff1',
 'jacob',
 'frost',
 'wrist',
 'owe',
 'know',
 'tire',
 'lonely',
 'girl',
 'chorus',
 '2x',
 'ja',
 'rule',
 'uh',
 'think',
 'yes',
 'know',
 'tire',
 'lonely',
 'lonely',
 'girl',
 'vita',
 'yo',
 'appreciate',
 'rock',
 'gift',
 'cop',
 'house',
 'hill',
 'drop',
 'like',
 '80',
 'payment',
 'think',
 'damn',
 'life',
 'gravy',
 'honey',
 'twice',
 'lady',
 'night',
 'warm',
 'cold',
 'world',
 'girl',
 'catch',
 'storm',
 'accept',
 'riff',
 'catch',
 'wrong',
 'respect',
 'flip',
 'love',
 'strong',
 'hit',
 'block',
 'watch',
 '10',
 '4',
 'pop',
 'asleep',
 'snuck',
 'backdoor',
 'boy',
 'junior