## Creating a Naive Bayes Classifier Using Only NumPy and Pandas

In [20]:
import numpy as np
import pandas as pd

In [21]:
def filter_col(df):
    """Read a CSV and return it without the ``"Unnamed: 0"`` column.

    Parameters
    ----------
    df : str, path-like or file-like
        Despite its name this is not a DataFrame: it is handed straight to
        ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed frame with the ``"Unnamed: 0"`` column removed when the
        file has one, and unchanged otherwise.
    """
    frame = pd.read_csv(df)
    # errors="ignore" lets files that lack the column go through this helper
    # too, instead of raising KeyError and forcing callers to use pd.read_csv
    # directly for those files.
    return frame.drop(columns="Unnamed: 0", errors="ignore")

In [22]:
def add_genre(df, genre):
    """Return a copy of ``df`` with a ``genre`` column set to ``genre``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label. It is left untouched.
    genre : scalar
        Value broadcast to every row of the new ``genre`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame carrying the extra (or overwritten) ``genre`` column.
    """
    # Work on a copy so that re-running a cell, or reusing the input frame
    # elsewhere, never sees a column that was silently added by this helper.
    labelled = df.copy()
    labelled['genre'] = genre
    return labelled

In [23]:
# Build one genre-labelled frame from the per-artist CSV files.
# Paths use forward slashes: the earlier "lyrics_datasets\Name.csv" literals
# were non-raw strings containing invalid escape sequences (\A, \C, \E, \T),
# which Python has deprecated, and the backslash separator only resolves on
# Windows. Forward slashes work on Windows and POSIX alike.
# ArianaGrande.csv is still read with plain pd.read_csv (no "Unnamed: 0" drop).
lyrics_df = pd.concat([
    add_genre(pd.read_csv("lyrics_datasets/ArianaGrande.csv"), 'pop'),
    add_genre(filter_col("lyrics_datasets/CardiB.csv"), 'rap'),
    add_genre(filter_col("lyrics_datasets/EdSheeran.csv"), 'pop'),
    add_genre(filter_col("lyrics_datasets/Eminem.csv"), 'rap'),
    add_genre(filter_col("lyrics_datasets/NickiMinaj.csv"), 'rap'),
    add_genre(filter_col("lyrics_datasets/TaylorSwift.csv"), 'pop'),
], axis = 0)

In [24]:
# Keep only the lyric text and its genre label; the metadata columns are removed in place.
lyrics_df.drop(columns=['Artist', 'Title', 'Date', 'Year', 'Album'], inplace=True)

In [25]:
# Number of missing values in each remaining column.
lyrics_df.isnull().sum()

Lyric    6
genre    0
dtype: int64

In [26]:
# Discard, in place, every row that has a missing value in any column.
lyrics_df.dropna(axis=0, how='any', inplace=True)

In [27]:
# Preview the first five rows of the cleaned frame.
lyrics_df.head(n=5)

Unnamed: 0,Lyric,genre
0,thought i'd end up with sean but he wasn't a m...,pop
1,yeah breakfast at tiffany's and bottles of bub...,pop
2,you you love it how i move you you love it how...,pop
3,ariana grande nicki minaj i've been here all ...,pop
4,right now i'm in a state of mind i wanna be in...,pop


In [28]:
from sklearn.model_selection import train_test_split

# Features are the lyric strings; targets are the genre labels.
X, y = lyrics_df['Lyric'], lyrics_df['genre']

# Hold out 20% of the rows for testing, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [29]:

from nltk.corpus import stopwords
import string


In [30]:
def clean(lyric):
    """Tokenise a lyric, dropping punctuation and English stopwords.

    Parameters
    ----------
    lyric : str
        Raw lyric text.

    Returns
    -------
    list of str
        The whitespace-separated words of ``lyric`` (original casing kept)
        after every ``string.punctuation`` character has been removed and
        every word whose lowercase form is an English stopword has been
        filtered out.
    """
    # Load the stopword list once per call and keep it in a set: the original
    # re-evaluated stopwords.words('english') for every single word and then
    # searched the resulting list linearly.
    english_stopwords = set(stopwords.words('english'))

    # NOTE(review): punctuation is stripped before the stopword check, so a
    # contraction such as "don't" is compared as "dont" — confirm that is the
    # intended matching behaviour.
    nopunc = ''.join(char for char in lyric if char not in string.punctuation)
    return [word for word in nopunc.split() if word.lower() not in english_stopwords]

In [31]:
# Persist the cleaned, labelled lyrics (index included) as a comma-separated file.
lyrics_df.to_csv(path_or_buf='lyrics.csv', sep=',')

In [32]:
# Share of rows belonging to each genre (fractions rather than raw counts).
lyrics_df.loc[:, 'genre'].value_counts(normalize=True)

pop    0.540581
rap    0.459419
Name: genre, dtype: float64

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training lyrics only, then apply the already
# fitted vectorizer to the test lyrics so both share one vocabulary.
# NOTE(review): X_train / X_test are rebound here from raw text to the
# vectorizer's output, so re-running this cell without first re-running the
# split cell feeds already-transformed data back into the vectorizer — confirm
# whether separate names for the transformed matrices are wanted.
tfidf_vectorizer =  TfidfVectorizer(stop_words='english')
X_train = tfidf_vectorizer.fit_transform(X_train)
X_test = tfidf_vectorizer.transform(X_test)

In [43]:
# Integer that the fitted vocabulary maps the term 'happy' to; it is used as an
# index into idf_ further down. Raises KeyError if the term was never seen.
tfidf_vectorizer.vocabulary_['happy']

9898

In [45]:
# IDF weight of the term 'happy'. The index is looked up through the fitted
# vocabulary instead of being hard-coded, so the cell stays correct if the
# training data (and therefore the vocabulary ordering) changes.
tfidf_vectorizer.idf_[tfidf_vectorizer.vocabulary_['happy']]

4.141775643617753

In [61]:
# Dense 1-D view of the first training row. Slice the sparse matrix first so
# only that single row is densified, rather than materialising the whole
# training matrix just to read row 0.
lst = X_train[0].toarray()[0]

In [62]:
# Print every non-zero entry of the row, one value per line.
for value in lst:
    if value == 0:
        continue
    print(value)

0.10875569761723804
0.07478830450442232
0.036550893346083434
0.05082351383617261
0.07151203573503453
0.07793222155094921
0.13663297757687332
0.025798678942807494
0.031149884260393838
0.04467779147830858
0.0996096643146941
0.15622901531752154
0.04045083043218156
0.6008389155440812
0.05718994067805746
0.4055456174754732
0.049963650630360124
0.08583413079201159
0.07421575697108532
0.09487036669693713
0.12568610077934267
0.10015622584898867
0.03700645881618444
0.05010308011696966
0.0182207499127651
0.08722598630760962
0.04604105825910031
0.07225718326775896
0.18712067584993605
0.1926148190925277
0.05671583269457126
0.04505296939155293
0.09334396269596842
0.15359578977417215
0.07478830450442232
0.09111998994406313
0.04955409235996377
0.062475799830109
0.06211861220638274
0.06782466307788956
0.08583413079201159
0.07793222155094921
0.10015622584898867
0.06401128222928654
0.06401128222928654
0.047222127650445506
0.07917473582931214
0.08382454040934786
0.03616025639747288
0.04733499876876013
0.

In [65]:
# Show the row at position 5 of the transformed test matrix as a dense 2-D array.
X_test[5].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])