Final Project
------

Group Name
-----

My Machine Learning Romance

Student Names
----

1. Rebecca Reilly
2. Viviana Marquez
3. Victoria Suarez
4. Fiorella Tenorio
5. Katja Wittfoth 

Load Data
-----

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import string
import re
from imblearn.over_sampling import SMOTE 

In [2]:
def clean_lyrics(text):
    """Normalize raw song lyrics into a space-separated string of words.

    Steps, in order:
      1. Drop ``[Verse N:]`` section markers.
      2. Turn newlines and tabs into spaces so neighbouring words stay apart.
      3. Delete every character that is not an ASCII letter or a space
         (digits, punctuation, accented characters, ...).
      4. Discard words of one or two letters (a, an, to, at, be, ...).

    Parameters
    ----------
    text : str
        Raw lyrics of a single song.

    Returns
    -------
    str
        The remaining words joined by single spaces; letter case is kept.
    """
    # Verse markers must be removed first: once digits, brackets and colons
    # are stripped the pattern can no longer match and "Verse" would survive
    # as an ordinary word.
    text = re.sub(r'\[Verse\s*\d*:]', ' ', text)
    # Replace (rather than delete) line breaks and tabs so that
    # "hello\tworld" does not collapse into "helloworld".
    text = re.sub(r'[\n\t]', ' ', text)
    # Keep only ASCII letters and spaces; this also removes digits.
    text = re.sub(r'[^A-Za-z ]', '', text)
    # split() without arguments collapses runs of spaces and ignores
    # leading/trailing blanks.
    return ' '.join(w for w in text.split() if len(w) > 2)

In [3]:
# Raw lyrics table, read from a path relative to the notebook's location.
lyrics_csv_path = '../Final Project/Data/lyrics.csv'
df = pd.read_csv(lyrics_csv_path)

In [4]:
# Keep only songs that have lyrics and a usable genre label.
has_lyrics = df['lyrics'].notnull()
usable_genre = ~df['genre'].isin(['Not Available', 'Other'])
# .copy() makes the filtered frame independent of the raw one, so adding a
# column below does not trigger pandas' SettingWithCopyWarning.
df = df[has_lyrics & usable_genre].copy()
# Cleaned lyrics text, one string per song, used as model input further down.
df['lyrics_clean'] = df['lyrics'].apply(clean_lyrics)

In [5]:
# Distinct genre labels that remain after filtering.
df['genre'].unique()

array(['Pop', 'Hip-Hop', 'Rock', 'Metal', 'Country', 'Jazz', 'Electronic',
       'Folk', 'R&B', 'Indie'], dtype=object)

In [6]:
# Restrict the data to the five genres the classifier is trained on.
selected_genres = ['Country', 'Metal', 'Hip-Hop', 'Jazz', 'Electronic']
df_clean = df[df['genre'].isin(selected_genres)]

Fit scikit-learn model
----

In [None]:
# 80/20 hold-out split. A fixed random_state makes the split, and therefore
# every score reported below, reproducible across kernel restarts.
train, test = train_test_split(df_clean, test_size=0.2, random_state=42)
train_lyrics = train['lyrics_clean']
train_genre = train['genre']

In [None]:
# Bag-of-words features: one column per word, values are raw counts.
# NOTE(review): stop_words='english' is combined with lowercase=False, so
# capitalised stop words (e.g. "The") are presumably not filtered out --
# confirm this is intended.
vectorizer = CountVectorizer(stop_words='english', 
                             lowercase=False, 
                             analyzer = 'word')
# Vectorize straight from the `train` frame instead of overwriting
# `train_lyrics` with its own transform; this keeps the cell safe to re-run
# (a second run would otherwise feed a sparse matrix back into fit_transform).
train_lyrics = vectorizer.fit_transform(train['lyrics_clean'])
# The test set only goes through transform(), so the vocabulary is learned
# from the training data alone.
test_lyrics = vectorizer.transform(test['lyrics_clean'])

In [None]:
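# Optional (currently disabled): oversample the minority genres with SMOTE
# before fitting. Recent imbalanced-learn releases expose this as
# `fit_resample`; the older `fit_sample` name has been removed.
# sm = SMOTE()
# train_lyrics, train_genre = sm.fit_resample(X = train_lyrics, y = train_genre)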

In [None]:
# Single-step pipeline; the step is named 'clf' so that its parameters can be
# addressed as clf__<param> in a parameter grid.
nb_classifier = MultinomialNB(alpha=0.1)
pipeline = Pipeline(steps=[('clf', nb_classifier)])

In [None]:
# Candidate smoothing strengths 0.01, 0.02, ..., 0.10 for the 'clf' step.
# alpha=0 is left out on purpose: it disables smoothing altogether, which
# MultinomialNB either warns about or handles with zero probabilities.
# linspace gives exact endpoints, unlike a float-step arange.
grid_params = dict(clf__alpha=np.linspace(0.01, 0.1, num=10))

# Exhaustive search over the grid, scored by accuracy with 10-fold
# cross-validation.
gs = GridSearchCV(estimator=pipeline,
                  param_grid=grid_params,
                  scoring='accuracy',
                  cv=10)

In [None]:
# Run the grid search on the training features, then report the score of the
# refitted best model on the held-out test set.
gs.fit(train_lyrics, train_genre)
test_accuracy = gs.score(test_lyrics, test['genre'])
f"{test_accuracy:.4f}"

In [None]:
# Parameter setting from the grid that the search selected as best.
gs.best_params_

In [None]:
# Look the classifier up by its step name instead of by position.
gs.best_estimator_.named_steps['clf']  # best model

In [None]:
# Predicted genre for every song in the test set, using the best pipeline
# found by the grid search.
best_pipeline = gs.best_estimator_
pred = best_pipeline.predict(test_lyrics)

Evaluation Metric
----

In [None]:
# Confusion matrix of true genres against predicted genres on the test set.
con_mat = confusion_matrix(y_true=test['genre'], y_pred=pred)

In [None]:
# confusion_matrix orders classes by the sorted union of true and predicted
# labels, so the tick labels must use that same order. unique() returns
# labels in order of appearance, which would mislabel the rows and columns.
genre_labels = sorted(set(test['genre']) | set(pred))

fig, ax = plt.subplots(figsize=(15, 15))
# con_mat is transposed for plotting: columns (x) are the true genres and
# rows (y) are the predicted genres.
sns.heatmap(con_mat.T, square=True, annot=True, fmt='d', cbar=True,
            xticklabels=genre_labels, yticklabels=genre_labels, cmap="Blues",
            ax=ax)
ax.set(xlabel='True genre', ylabel='Predicted genre');

In [None]:
# Overall test-set accuracy, rounded to three decimals.
round(accuracy_score(y_true=test['genre'], y_pred=pred), 3)