# Naive Bayes

## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.model_selection import KFold

## Load data

In [2]:
import os

# The preprocess script is loaded from one level above the notebook, so the
# working directory has to move up. A bare os.chdir("..") climbs one more
# level every time this cell is re-run; remembering the starting directory the
# first time through keeps the cell idempotent within a kernel session.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [3]:
# getTrainData is provided by the project's data_preprocess script.
# NOTE(review): presumably this import only resolves after the directory
# change in the previous cell has run -- confirm against the repo layout.
from data_preprocess import getTrainData
# train_data is later indexed with the 'text' and 'label' columns by
# cross_validation. The exact effect of include_random=True is defined in
# data_preprocess and is not visible from this notebook -- TODO confirm.
train_data = getTrainData(include_random=True)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Test

In [7]:
def cross_validation(word_vectorizer, model, n_splits=6, seed=42):
    """Score a vectorizer/model pair with shuffled K-fold cross-validation.

    On every fold the vectorizer is fitted on the training texts only and then
    applied to the held-out texts, so the validation split never contributes
    to the fitted vocabulary.

    The data comes from the notebook-level ``train_data`` frame: column
    ``'text'`` supplies the documents and column ``'label'`` the targets.

    Parameters
    ----------
    word_vectorizer : object exposing ``fit_transform`` and ``transform``
        Turns raw texts into a feature matrix.
    model : object exposing ``fit`` and ``predict``
        Classifier trained on the vectorized texts.
        Both objects are fitted in place on every fold.
    n_splits : int, default 6
        Number of folds handed to ``KFold``.
    seed : int, default 42
        ``random_state`` of the shuffled ``KFold`` split.

    Returns
    -------
    tuple of 5 floats
        Mean over the folds of accuracy, precision, recall, F1 and Matthews
        correlation coefficient, in that order. Each scorer is called with
        its default arguments.
    """
    scorers = (accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef)
    # One row per metric, one column per fold.
    fold_scores = np.zeros((len(scorers), n_splits))

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    X = train_data['text'].array
    y = train_data['label'].array

    for fold, (train_index, test_index) in enumerate(kf.split(X)):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        # Fit the vocabulary on the training split only, then reuse it as-is
        # for the validation split.
        X_train_cv = word_vectorizer.fit_transform(X_train)
        X_val_cv = word_vectorizer.transform(X_val)

        model.fit(X_train_cv, y_train)
        predictions = model.predict(X_val_cv)

        for row, scorer in enumerate(scorers):
            fold_scores[row, fold] = scorer(y_val, predictions)

    return tuple(np.mean(metric_row) for metric_row in fold_scores)

### Naive Bayes + CountVectorizer

In [8]:
# Bag-of-words counts: accents folded to ASCII, lower-cased, English stop
# words removed.
cv = CountVectorizer(
    strip_accents='ascii',
    lowercase=True,
    stop_words='english',
)
naive_bayes = MultinomialNB()

# Mean cross-validated metrics for this vectorizer/classifier pair.
accuracy, precision, recall, f1, mcc = cross_validation(cv, naive_bayes)

print('Naive bayes + CountVectorizer:')
for metric_label, metric_value in zip(
    ('Accuracy score: ', 'Precision score: ', 'Recall score: ', 'F1 score: ', 'MCC score: '),
    (accuracy, precision, recall, f1, mcc),
):
    print(metric_label, metric_value)

Naive bayes + CountVectorizer:
Accuracy score:  0.9303660093630303
Precision score:  0.9190840013298559
Recall score:  0.9445744953755112
F1 score:  0.931183706510507
MCC score:  0.8615014806127612


### Naive Bayes + TfidfVectorizer

In [9]:
# TF-IDF weighted features: accents folded to ASCII, lower-cased, English
# stop words removed.
cv = TfidfVectorizer(
    strip_accents='ascii',
    lowercase=True,
    stop_words='english',
)
naive_bayes = MultinomialNB()

# Mean cross-validated metrics for this vectorizer/classifier pair.
accuracy, precision, recall, f1, mcc = cross_validation(cv, naive_bayes)

print('Naive bayes + TfidfVectorizer:')
for metric_label, metric_value in zip(
    ('Accuracy score: ', 'Precision score: ', 'Recall score: ', 'F1 score: ', 'MCC score: '),
    (accuracy, precision, recall, f1, mcc),
):
    print(metric_label, metric_value)

Naive bayes + TfidfVectorizer:
Accuracy score:  0.92784082848631
Precision score:  0.9342512828450955
Recall score:  0.9235984645015577
F1 score:  0.9277208330824239
MCC score:  0.8579319792841967
