## Creating a Naive Bayes Classifier using only NumPy and Pandas

In [1]:
import numpy as np
import pandas as pd

In [2]:
def filter_col(df):
    """Load a CSV and discard its leftover ``Unnamed: 0`` index column.

    Parameters
    ----------
    df : str, path object or file-like object
        Despite the name, this is the CSV *source* that is handed straight to
        ``pd.read_csv``; it is not a DataFrame.

    Returns
    -------
    pandas.DataFrame
        The parsed file without the ``"Unnamed: 0"`` column. A file that does
        not contain that column is returned as parsed instead of raising
        ``KeyError``.
    """
    loaded = pd.read_csv(df)
    # errors="ignore" keeps the helper usable for CSVs that were written
    # without an index column, so every dataset can go through the same loader.
    return loaded.drop(columns="Unnamed: 0", errors="ignore")

In [3]:
def add_genre(df, genre):
    """Tag every row of ``df`` with a genre label.

    The ``'genre'`` column is written onto ``df`` itself (the caller's frame is
    modified), and that same object is returned so the call can be nested
    inside another expression.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label.
    genre : object
        Value assigned to the ``'genre'`` column for all rows.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    df["genre"] = genre
    return df

In [4]:
# Load each artist's lyrics CSV, tag it with its genre, and stack the frames
# row-wise into one dataset.
#
# Paths use forward slashes: inside a normal (non-raw) string literal a
# backslash starts an escape sequence ("\N" is even a hard syntax error, and
# "\A", "\C", ... are invalid escapes that newer Python versions warn about),
# whereas "/" is accepted as a separator on both Windows and POSIX systems.
lyrics_df = pd.concat([
    # NOTE(review): this file is read with plain pd.read_csv rather than
    # filter_col — presumably it has no "Unnamed: 0" column; confirm.
    add_genre(pd.read_csv("lyrics_datasets/ArianaGrande.csv"), 'pop'),
    add_genre(filter_col("lyrics_datasets/CardiB.csv"), 'rap'),
    add_genre(filter_col("lyrics_datasets/EdSheeran.csv"), 'pop'),
    add_genre(filter_col("lyrics_datasets/Eminem.csv"), 'rap'),
    add_genre(filter_col("lyrics_datasets/NickiMinaj.csv"), 'rap'),
    add_genre(filter_col("lyrics_datasets/Drake.csv"), 'rap'),
    add_genre(filter_col("lyrics_datasets/TaylorSwift.csv"), 'pop'),
    add_genre(filter_col("lyrics_datasets/KatyPerry.csv"), 'pop'),
], axis=0)

In [5]:
# Remove the song-metadata columns from lyrics_df in place.
lyrics_df.drop(
    columns=['Artist', 'Title', 'Date', 'Year', 'Album'],
    inplace=True,
)

In [6]:
# Preview the first five rows of the combined dataset.
lyrics_df.head(n=5)

Unnamed: 0,Lyric,genre
0,thought i'd end up with sean but he wasn't a m...,pop
1,yeah breakfast at tiffany's and bottles of bub...,pop
2,you you love it how i move you you love it how...,pop
3,ariana grande nicki minaj i've been here all ...,pop
4,right now i'm in a state of mind i wanna be in...,pop


In [7]:
# Class balance check: number of rows per genre label.
lyrics_df.loc[:, "genre"].value_counts()

pop    1408
rap    1385
Name: genre, dtype: int64

In [8]:
# Count the missing values in each column.
lyrics_df.isnull().sum()

Lyric    9
genre    0
dtype: int64

In [9]:
# Discard, in place, every row that has at least one missing value.
lyrics_df.dropna(axis="index", how="any", inplace=True)

In [22]:
from sklearn.model_selection import train_test_split

# Features are the raw lyric strings; targets are the genre labels.
X, y = lyrics_df['Lyric'], lyrics_df['genre']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
from models import NaiveBayesClassifier

# Build the project-local classifier from the training split.
# NOTE(review): no separate fit call is made before predict, so training
# presumably happens inside the constructor — confirm against models.py.
model = NaiveBayesClassifier(X_train, y_train)

In [24]:
# Ask the classifier for a prediction on each lyric in the held-out test split.
preds = model.predict(X_test)

In [25]:
# Display the raw predictions.
# NOTE(review): this echoes the entire prediction list; a slice or a per-class
# count would keep the cell output short.
preds

['pop',
 'pop',
 'pop',
 'rap',
 'pop',
 'pop',
 'pop',
 'rap',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'rap',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',
 'pop',


In [26]:
from sklearn.metrics import classification_report, confusion_matrix

# Print the per-class precision/recall/F1 table, followed on the next line by
# the confusion matrix of true vs. predicted labels.
print(
    classification_report(y_test, preds),
    confusion_matrix(y_test, preds),
    sep="\n",
)

              precision    recall  f1-score   support

         pop       0.51      1.00      0.67       280
         rap       1.00      0.01      0.02       277

    accuracy                           0.51       557
   macro avg       0.75      0.51      0.35       557
weighted avg       0.75      0.51      0.35       557

[[280   0]
 [274   3]]
