In [32]:
import pandas as pd
import numpy as np
import re 

# Load the lyrics dataset from a CSV file in the current working directory.
df = pd.read_csv('lyrics.csv')
# Last expression of the cell: renders the first rows as a preview.
df.head()

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [33]:
import string

def clean_text_round1(text):
    """First cleaning pass over a single lyric string.

    Steps, applied in order:
      1. lowercase the text;
      2. drop anything enclosed in square brackets (e.g. ``[Verse 1:]``);
         the match is non-greedy and does not span line breaks;
      3. drop every ASCII punctuation character (``string.punctuation``);
      4. drop curly quotes and the ellipsis character;
      5. drop every word that contains at least one digit.

    Removed spans are replaced by the empty string, so surrounding whitespace
    (including newlines) is left untouched and may end up doubled.

    Parameters
    ----------
    text : str
        Raw lyric text.

    Returns
    -------
    str
        The cleaned text.
    """
    # lowercase
    text = text.lower()
    # remove text in square brackets
    # (raw strings: '\[' / '\w' / '\d' are invalid escapes in a normal literal)
    text = re.sub(r'\[.*?\]', '', text)
    # remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('[‘’“”…]', '', text)
    # remove any word that contains a digit
    text = re.sub(r'\w*\d\w*', '', text)
    return text

def round1(l):
    """Coerce ``l`` to ``str`` and run it through ``clean_text_round1``."""
    return clean_text_round1(str(l))

In [34]:
# Run the first cleaning pass on every lyric and turn each line break into a
# single space. round1 always returns a str, so the replacement can be applied
# directly to its result in the same pass.
df['lyrics'] = df['lyrics'].apply(
    lambda raw: round1(raw).replace('\n', ' ')
)

In [35]:
# Preview the frame after the first cleaning pass on the lyrics column.
df.head()

Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,oh baby how you doing you know im gonna cut ri...
1,1,then-tell-me,2009,beyonce-knowles,Pop,playin everything so easy its like you seem so...
2,2,honesty,2009,beyonce-knowles,Pop,if you search for tenderness it isnt hard to f...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,oh oh oh i oh oh oh i if i wrote a book about...
4,4,black-culture,2009,beyonce-knowles,Pop,party the people the people the party its popp...


In [36]:
# Number of space-separated tokens per lyric. Splitting on a single ' ' keeps
# empty tokens, so an empty lyric (or any one-token lyric) gets a count of 1.
df['word_count'] = df['lyrics'].str.split(' ').str.len()

# Drop one-token lyrics and songs whose genre is the 'Not Available' placeholder.
keep_rows = (df['word_count'] != 1) & (df['genre'] != 'Not Available')
# Explicit copy: later cells assign to df['lyrics'], which would otherwise
# operate on a filtered slice and raise SettingWithCopyWarning.
df = df[keep_rows].copy()

In [47]:
import nltk
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance used by clean_text_round2.
# NOTE(review): WordNetLemmatizer needs the NLTK 'wordnet' corpus and no
# nltk.download('wordnet') call is visible in this notebook — confirm the
# corpus is installed before running on a fresh environment.
lemmatizer = WordNetLemmatizer()

def clean_text_round2(text):
    """Second cleaning pass: lemmatize every space-separated token.

    The text is split on single spaces (empty tokens are kept), each token is
    passed through the module-level ``lemmatizer``, and the results are glued
    back together with one space after every token — so the returned string
    always ends with a trailing space.
    """
    return ''.join(
        lemmatizer.lemmatize(token) + ' '
        for token in text.split(' ')
    )

def round2(l):
    """Coerce ``l`` to ``str`` and run it through ``clean_text_round2``."""
    return clean_text_round2(str(l))

In [48]:
# Replace each lyric with its lemmatized version; the Series returned by
# apply is assigned to the column directly.
df['lyrics'] = df['lyrics'].apply(round2)
# Preview the result.
df.head()

Unnamed: 0,index,song,year,artist,genre,lyrics,word_count
0,0,ego-remix,2009,beyonce-knowles,Pop,oh baby how you doing you know im gonna cut ri...,433
1,1,then-tell-me,2009,beyonce-knowles,Pop,playin everything so easy it like you seem so ...,258
2,2,honesty,2009,beyonce-knowles,Pop,if you search for tenderness it isnt hard to f...,170
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,oh oh oh i oh oh oh i if i wrote a book about...,520
4,4,black-culture,2009,beyonce-knowles,Pop,party the people the people the party it poppi...,312


In [37]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# TF-IDF features over the cleaned lyrics.
# NOTE(review): NLTK's English stop-word list contains apostrophized forms
# (e.g. "don't"), while punctuation was stripped from the lyrics earlier —
# presumably those entries never match; verify.
tfidfconverter = TfidfVectorizer(
    max_features=1500,   # cap the vocabulary size
    min_df=5,            # ignore terms found in fewer than 5 lyrics
    max_df=0.5,          # ignore terms found in more than half of the lyrics
    stop_words=stopwords.words('english'),
)
lyrics = df['lyrics']
# .toarray() converts the vectorizer output into a dense array.
X = tfidfconverter.fit_transform(lyrics).toarray()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rudra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
import time

from sklearn.ensemble import RandomForestClassifier
# Import the metrics explicitly: a bare `import sklearn` does not load the
# `sklearn.metrics` submodule by itself.
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Timing starts after the imports so it covers only split, fit and evaluation.
startTime = time.time()

# Target labels: the genre column.
y = df['genre']
# Only 5% of the rows are used for training and another 5% for testing;
# random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.05, test_size=0.05, random_state=1)

classifier = RandomForestClassifier(n_estimators=10, random_state=0)
classifier.fit(X_train, y_train)

# Predict once and derive both the accuracy and the confusion matrix from the
# same predictions (classifier.score would run a second prediction pass).
preds = classifier.predict(X_test)
acc_score = accuracy_score(y_test, preds)
print(confusion_matrix(y_test, preds))

endTime = time.time()
elapsedTime = endTime - startTime
print("seconds elapsed: " + str(elapsedTime))
print("acc_score: " + str(acc_score))

[[  45    0    0    7    0    7   18    1   81    0  558]
 [   2    5    0   17    0    0   23    1   39    0  307]
 [   4    1    0    1    0    0    5    1   14    0   78]
 [   2    0    0  909    0    3    9    2   84    0  253]
 [   2    0    0    7    0    2   11    0   17    0  132]
 [  22    0    0    7    0   23   10    8   59    0  257]
 [  10    2    1   34    0    3  384    1   43    0  623]
 [   3    0    0   22    1    6   11   12   53    0  159]
 [  30    3    0   96    0   16   54    5  488    1 1352]
 [   6    1    0   12    0    0    4    0   34    1  127]
 [  91   13    3  105    0   20  280    9  558    1 4250]]
seconds elapsed: 8.168030977249146
acc_score: 0.5113693362313995


In [39]:
import time

# Import the metrics explicitly: a bare `import sklearn` does not load the
# `sklearn.metrics` submodule by itself.
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Timing starts after the imports so it covers only split, fit and evaluation.
startTime = time.time()

# Target labels: the genre column.
y = df['genre']
# Only 5% of the rows are used for training and another 5% for testing;
# random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.05, test_size=0.05, random_state=1)

classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predict once and derive both the accuracy and the confusion matrix from the
# same predictions (classifier.score would run a second prediction pass).
preds = classifier.predict(X_test)
acc_score = accuracy_score(y_test, preds)
print(confusion_matrix(y_test, preds))

endTime = time.time()
elapsedTime = endTime - startTime
print("seconds elapsed: " + str(elapsedTime))
print("acc_score: " + str(acc_score))

[[   0    0    0    3    0    0    2    0    3    0  709]
 [   0    0    0   13    0    0    6    0   23    0  352]
 [   0    0    0    0    0    0    1    0    9    0   94]
 [   0    0    0  793    0    0   14    0   72    0  383]
 [   0    0    0    2    0    0    1    0    9    0  159]
 [   0    0    0    3    0    0    6    0   13    0  364]
 [   0    0    0   22    0    0  337    0   41    0  701]
 [   0    0    0   18    0    0    5    3   57    0  184]
 [   0    0    0   59    0    0   17    0  276    0 1693]
 [   0    0    0    9    0    0    1    0    2    0  173]
 [   0    0    0   61    0    1   92    3  207    0 4966]]
seconds elapsed: 0.8567080497741699
acc_score: 0.5329376358468484
