In [137]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import metrics

In [138]:
# loading csv
# 'data.csv' is resolved relative to the notebook's working directory

original_df = pd.read_csv(filepath_or_buffer='data.csv')

In [139]:
# cleaning lyrics: strip the punctuation marks , . ! ? and lowercase the text

import re

# Raw-string pattern. Inside a character class '.' is already literal, so no
# backslash is required; the previous non-raw '[,\.!?]' relied on the invalid
# escape sequence '\.', which newer Python versions warn about.
_PUNCTUATION = re.compile(r'[,.!?]')

# One pass over the column: remove punctuation, then lowercase. The result is
# stored in a new 'lyrics_clean' column; 'Lyric' itself is left unchanged here.
original_df['lyrics_clean'] = original_df['Lyric'].map(
    lambda lyric: _PUNCTUATION.sub('', lyric).lower()
)

In [140]:
# removing uncleaned lyrics
# The raw 'Lyric' column is dropped in place, so running this cell a second time
# raises KeyError because the column no longer exists.

original_df.drop(columns='Lyric', inplace=True)
original_df

Unnamed: 0,Genre,lyrics_clean
0,Rock,handy dandy controversy surrounds him he been ...
1,Rock,same bed but it feels just a little bit bigger...
2,Pop,saw ya out by the pool on the 8th of july didn...
3,Hip Hop,got my shit down super tight got my shit down ...
4,Hip Hop,[talk:] ay soulja boy in da buildin ay i want ...
...,...,...
54995,unknown,i can't believe what you did to me down on my ...
54996,unknown,have all the songs been written have all the t...
54997,unknown,everything you do you do so right the clothes ...
54998,unknown,(trecho) (rule number two understanding what y...


In [141]:
# TF-IDF over the cleaned lyrics: English stop words are removed and terms that
# appear in fewer than 30 documents are dropped (an integer min_df is a document count)
# experimented min_df from 5-40 (increments of 5) -- 30 was the best
# NOTE(review): the vectorizer is fit on every row of original_df, including the
# rows that are later split off as the holdout and 'unknown' sets, so the
# vocabulary and IDF weights depend on evaluation rows -- confirm this is intended.

v = TfidfVectorizer(
    stop_words='english',
    min_df=30,
)
x = v.fit_transform(original_df['lyrics_clean'])

In [142]:
# turning TF-IDF into dataframe
# toarray() materialises the sparse TF-IDF matrix densely, one column per
# vocabulary term, which can be memory-hungry for large corpora.

tf_idf = pd.DataFrame(
    x.toarray(),
    # Reuse the source frame's index: the column-wise concat that follows aligns
    # rows by index label, so matching labels keep every TF-IDF row attached to
    # its own lyric even if original_df is ever filtered or re-indexed.
    index=original_df.index,
    columns=v.get_feature_names_out(),
)

In [143]:
# combining original_df with TF-IDF
# column-wise concatenation: rows are matched on index labels

features = pd.concat(
    [original_df, tf_idf],
    axis='columns',
)

In [144]:
# dropping the cleaned lyrics from the features dataset
# (in-place drop: running this cell twice raises KeyError on the missing column)

features.drop(columns='lyrics_clean', inplace=True)

In [145]:
# separating the unlabeled rows to predict (unknown) from the labeled rows (df)

is_unknown = features['Genre'] == 'unknown'
unknown = features.loc[is_unknown]
# Select the labeled rows with the inverse mask instead of by position:
# head(n_total - n_unknown) is only correct when every 'unknown' row sits at
# the very end of the frame, and silently mixes the two sets otherwise.
df = features.loc[~is_unknown]

In [146]:
# creating the holdout dataset: a seeded 10% random sample of the labeled rows
# NOTE: df is reassigned from itself, so re-running this cell without re-running
# the cell that builds df removes a further 10% of the rows.

holdout = df.sample(frac=0.1, random_state=1)
df = df.drop(index=holdout.index)
df

Unnamed: 0,Genre,000,10,100,1000,11,12,123,13,14,...,zeros,zevon,zig,zion,zip,zipper,zombie,zone,zoo,zoom
0,Rock,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Rock,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Pop,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Hip Hop,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Hip Hop,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,Rock,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49996,Hip Hop,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49997,Pop,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49998,Rock,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [147]:
# split df into training and testing (80/20, stratified on the target column)

from sklearn.model_selection import train_test_split

# name of the target column
t = 'Genre'

# stratify keeps the genre proportions the same in both splits;
# random_state makes the split reproducible.
training, testing = train_test_split(
    df,
    test_size=.2,
    stratify=df[t],
    random_state=1,
)

In [148]:
# build model: multinomial naive Bayes fit on every column except 'Genre',
# with 'Genre' as the target

from sklearn.metrics import f1_score, accuracy_score
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
classifier.fit(
    training.drop(columns='Genre'),
    training['Genre'],
)
# predicted genres for the testing split
pred = classifier.predict(testing.drop(columns='Genre'))

In [149]:
# compute the accuracy of the classifier on the testing split
# (the value is stored in `training_score`, but it is measured on `testing`)

training_score = metrics.accuracy_score(
    y_true=testing['Genre'],
    y_pred=pred,
)
print("accuracy:   %0.3f" % training_score)

accuracy:   0.659


In [150]:
# apply model to holdout dataset

holdout_inputs = holdout.drop(columns='Genre')
pred_holdout = classifier.predict(holdout_inputs)

# compute the performance measures of holdout

estimated_accuracy = metrics.accuracy_score(
    y_true=holdout['Genre'],
    y_pred=pred_holdout,
)
print("accuracy:   %0.3f" % estimated_accuracy)

# write the single accuracy value to 'ea.csv' with no index and no header row
pd.Series(estimated_accuracy).to_csv('ea.csv', index=False, header=False)

accuracy:   0.645


In [151]:
# predicting the unknown dataset
# the placeholder 'Genre' column is dropped so only the feature columns are passed

unknown_inputs = unknown.drop(columns='Genre')
pred_unknown = classifier.predict(unknown_inputs)

# one predicted genre per line in 'pred.csv', with no index and no header row
pd.Series(pred_unknown).to_csv('pred.csv', index=False, header=False)