## Naive Bayes Classifier from scratch (only NumPy and Pandas)

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('datasets/lyrics.csv')

In [2]:
def filter_col(df):
    _df = pd.read_csv(df)
    return _df.drop("Unnamed: 0", axis = 1)

def add_genre(df, genre):
    df['genre'] = genre
    return df

In [3]:
lyrics_df = pd.concat([
    add_genre(pd.read_csv("datasets\lyrics_datasets\EdSheeran.csv"), 'pop'),
    add_genre(filter_col("datasets\lyrics_datasets\CardiB.csv"), 'rap'),
    add_genre(filter_col("datasets\lyrics_datasets\Eminem.csv"), 'rap'),
    add_genre(filter_col("datasets\lyrics_datasets\TaylorSwift.csv"), 'pop'),
], axis = 0)

In [4]:
lyrics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1371 entries, 0 to 478
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  296 non-null    float64
 1   Artist      1371 non-null   object 
 2   Title       1371 non-null   object 
 3   Album       982 non-null    object 
 4   Year        983 non-null    float64
 5   Date        983 non-null    object 
 6   Lyric       1367 non-null   object 
 7   genre       1371 non-null   object 
dtypes: float64(2), object(6)
memory usage: 96.4+ KB


In [5]:
lyrics_df.head()

Unnamed: 0.1,Unnamed: 0,Artist,Title,Album,Year,Date,Lyric,genre
0,0.0,Ed Sheeran,Shape of You,÷ (Divide),2017.0,2017-01-06,the club isn't the best place to find a lover ...,pop
1,1.0,Ed Sheeran,Perfect,÷ (Divide),2017.0,2017-03-03,i found a love for me oh darling just dive rig...,pop
2,2.0,Ed Sheeran,Castle on the Hill,÷ (Divide),2017.0,2017-01-06,when i was six years old i broke my leg i was ...,pop
3,3.0,Ed Sheeran,Happier,÷ (Divide),2017.0,2017-03-03,walking down 9th and park i saw you in another...,pop
4,4.0,Ed Sheeran,Supermarket Flowers,÷ (Divide),2017.0,2017-03-03,i took the supermarket flowers from the window...,pop


In [6]:
# dropping other columns, as predictions are to be made only based on lyrics

lyrics_df.drop(['Unnamed: 0','Artist', 'Title', 'Date', 'Year', 'Album'], axis = 1, inplace=True)

In [7]:
lyrics_df.isna().sum()

Lyric    4
genre    0
dtype: int64

In [8]:
lyrics_df.dropna(inplace=True)

In [9]:
from nltk.corpus import stopwords
import string

def clean(lyric):
    ''' 
    removes punctuation and stopwords
    '''
    nopunc = ''.join([char for char in lyric if char not in string.punctuation])
    return ' '.join([word for word in nopunc.split() if word.lower() not in stopwords.words('english')])

In [10]:
lyrics_df['Lyric'] = lyrics_df['Lyric'].apply(lambda x: clean(x))

In [11]:
lyrics_df.head()

Unnamed: 0,Lyric,genre
0,club isnt best place find lover bar go friends...,pop
1,found love oh darling dive right follow lead w...,pop
2,six years old broke leg running brother friend...,pop
3,walking 9th park saw anothers arm month weve a...,pop
4,took supermarket flowers windowsill threw day ...,pop


In [12]:
lyrics_df.to_csv('cleaned_lyrics.csv')

In [13]:
df = pd.read_csv('datasets/cleaned_lyrics.csv')

df.rename(columns = {'genre':'Genre'}, inplace = True)

In [14]:
from sklearn.model_selection import train_test_split

X = df['Lyric']
y = df['Genre']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [15]:

# class for multinomial naive bayes classifier
class NaiveBayesClassifier:
    def __init__(self, alpha = 1):
        # dict of datasets split based on label
        self.category_dfs = {}

        # finds prior probabilities of all labels (P(rap) and P(pop))
        self.prior_probs = {}

        # number of words in all documents under each label (N(rap) and N(pop))
        self.label_counts = {}

        # smoothing parameter (eliminates zero probability)
        self.alpha = alpha

        # dict of probabilities of a particular word, given that they are from that label (P(w/rap), p(w/pop))
        self.label_parameters = {}
    
    def fit(self, X, y):
        ''' 
        fits the model based on training data
        '''
        df = pd.concat([X,y], axis = 1)
        self.fit_(df)

    def length_of_doc(self, val):
        ''' 
        number of words in a document
        '''
        return len(val.split())

    def clean(self, lyric):
        ''' 
        removes stopwords and punctuation
        '''
        nopunc = ''.join([char for char in lyric if char not in string.punctuation])
        return ' '.join([word for word in nopunc.split() if word.lower() not in stopwords.words('english')])

    def fit_(self, df):
        ''' 
        fits dataframe, calculates all parameters for model
        '''
        y = df.iloc[:,-1]
        X = df.iloc[:,:2]
        X = X[X.columns[0]]
        self.target = df.columns[-1]
        self.labels = y.unique()

        # finding set of vocablulary
        vocab = []
        for doc in X:
            for word in doc.split():
                if word not in vocab:
                    vocab.append(word)
        self.vocabulary = vocab
        self.n_vocab = len(self.vocabulary) # N(vocab)

        # count vectorisation
        wc_df = self.generate_word_count(X)
        X.index = X.index.sort_values()
        y.index = y.index.sort_values()
        clean_df = pd.concat([X, y, wc_df], axis = 1)
        clean_df = clean_df.dropna()

        # splitting training data based on label
        for label in self.labels:
            label_df = clean_df[clean_df[self.target] == label]
            self.category_dfs[label] = label_df

        # calculating prior probabilities and number of words in each doc
        for label in self.labels:
            label_prob = len(self.category_dfs[label])/len(clean_df)
            self.prior_probs[label] = label_prob
            words_per_label = self.category_dfs[label][self.target].apply(lambda x: self.length_of_doc(x))
            self.label_counts[label] = words_per_label.sum()
        
        # calculating p(w/label) for every word in every label
        for label in self.labels:
            parameters_label = {unique_word:0 for unique_word in self.vocabulary}
            for word in self.vocabulary:
                n_word_given_label = self.category_dfs[label][word].sum()
                p_word_given_label = (n_word_given_label + self.alpha) / (self.label_counts[label] + self.alpha*self.n_vocab)
                parameters_label[word] = p_word_given_label
            self.label_parameters[label] = parameters_label

    def _predict(self, doc):
        ''' 
        gives prediction for a single document
        '''
        doc = self.clean(doc)
        doc = doc.split()
        label_scores = {}

        # p(label) * p(w1/label) * p(w2/label) * .....
        for label in self.labels:
            p_label_given_doc = self.prior_probs[label]
            label_scores[label] = p_label_given_doc
        for word in doc:
            for label in self.labels:
                if word in self.label_parameters[label]:
                    # multiplying by 1000 to prevent probabilities to approximate to 0 if they become too small 
                    label_scores[label] *= (self.label_parameters[label][word]*(1000))
        return self.max_dict(label_scores)

    def max_dict(self, d):
        ''' 
        finds key with max value in a dictionary
        '''
        rev = dict(map(reversed, d.items()))
        return rev[max(list(d.values()))]
    
    def predict(self, X):
        ''' 
        predicts for a set of docs
        '''
        preds = []
        for i in X:
            preds.append(self._predict(i))
        return preds

    def generate_word_count(self, X):
        ''' 
            count-vectorizes the documents in corpus
        '''
        word_count = {word: [0] * len(X) for word in self.vocabulary}
        for index, doc in enumerate(X):
            for word in doc.split():
                word_count[word][index] += 1
        word_count = pd.DataFrame(word_count)
        return word_count

In [16]:


model = NaiveBayesClassifier()
model.fit(X_train, y_train)

In [17]:
preds = model.predict(X_test)

  label_scores[label] *= (self.label_parameters[label][word]*(1000))


In [18]:
from sklearn.metrics import classification_report

print(classification_report(y_test, preds))

              precision    recall  f1-score   support

         pop       0.59      0.99      0.73       160
         rap       0.50      0.02      0.03       114

    accuracy                           0.58       274
   macro avg       0.54      0.50      0.38       274
weighted avg       0.55      0.58      0.44       274



Accuracy may be improved usiong Tfidf instead of Count vectorizing