# Complement Naive Bayes

## Model 1 - Frequency of words

##### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import feather

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectPercentile, f_classif

#### Read FTR

In [2]:
# Load the Model 1 (word-frequency) training set from its Feather file.
train_m1 = pd.read_feather(
    '../dataset/train_m1.ftr',
    use_threads=True,
    columns=None,
)

In [3]:
# Remove the 'level_0' column, then preview the first rows.
train_m1 = train_m1.drop(labels=['level_0'], axis='columns')
train_m1.head()

Unnamed: 0,kara,harris,mani,levers,audio,zorro,sneaky,pennies,misinterpret,boss,...,deaths,leaved,obtain,glory,unenthusiastic,desire,proctor,adapted,persuade,polarityClass
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Load the Model 1 (word-frequency) test set from its Feather file.
test_m1 = pd.read_feather(
    '../dataset/test_m1.ftr',
    use_threads=True,
    columns=None,
)

In [5]:
# Remove the 'level_0' column, then preview the first rows.
test_m1 = test_m1.drop(labels=['level_0'], axis='columns')
test_m1.head()

Unnamed: 0,kara,harris,mani,levers,audio,zorro,sneaky,pennies,misinterpret,boss,...,deaths,leaved,obtain,glory,unenthusiastic,desire,proctor,adapted,persuade,polarityClass
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [6]:
# Separate the label from the features: 'polarityClass' becomes the
# training target and is removed from the feature frame.
train_m1_target = train_m1['polarityClass']
train_m1 = train_m1.drop(labels=['polarityClass'], axis='columns')

In [7]:
# Separate the label from the features: 'polarityClass' becomes the
# test target and is removed from the feature frame.
test_m1_target = test_m1['polarityClass']
test_m1 = test_m1.drop(labels=['polarityClass'], axis='columns')

### Create and fit the model

In [8]:
# Build a Complement Naive Bayes classifier with its default constructor
# arguments and train it on the Model 1 features and labels.
# fit() returns the estimator itself, so construction and fitting are chained.
naive_bayes_model = ComplementNB().fit(train_m1, train_m1_target)

# Display the fitted estimator as the cell output.
naive_bayes_model

ComplementNB()

In [9]:
# Predict a polarity class for every row of the Model 1 test set.
predictions = naive_bayes_model.predict(test_m1)

# Cross-tabulate actual labels (rows) against predicted labels (columns);
# margins=True appends the row/column totals.
confusion = pd.crosstab(
    index=test_m1_target,
    columns=predictions,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusion)

Predicted     0     1      2    All
Actual                             
0          1001   193    227   1421
1           570   353    579   1502
2          1376  1213  26119  28708
All        2947  1759  26925  31631


In [10]:
# Accuracy of the Model 1 predictions against the test labels.
score = accuracy_score(y_true=test_m1_target, y_pred=predictions)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.868547


In [11]:
# Print the classification report for the Model 1 test predictions.
print(
    classification_report(
        y_true=test_m1_target,
        y_pred=predictions,
    )
)

              precision    recall  f1-score   support

           0       0.34      0.70      0.46      1421
           1       0.20      0.24      0.22      1502
           2       0.97      0.91      0.94     28708

    accuracy                           0.87     31631
   macro avg       0.50      0.62      0.54     31631
weighted avg       0.91      0.87      0.88     31631



In [None]:
# Mean of the 5-fold cross-validation scores on the Model 1 training data.
pecc_cnb = cross_val_score(
    estimator=naive_bayes_model,
    X=train_m1,
    y=train_m1_target,
    cv=5,
).mean()

# Display the mean score as the cell output.
pecc_cnb

## Model 2 - Existence of words

#### Read FTR

In [None]:
# Load the Model 2 (word-existence) training set from its Feather file.
train_m2 = pd.read_feather(
    '../dataset/train_m2.ftr',
    use_threads=True,
    columns=None,
)

In [None]:
# Remove the 'level_0' column, then preview the first rows.
train_m2 = train_m2.drop(labels=['level_0'], axis='columns')
train_m2.head()

In [None]:
# Load the Model 2 (word-existence) test set from its Feather file.
test_m2 = pd.read_feather(
    '../dataset/test_m2.ftr',
    use_threads=True,
    columns=None,
)

In [None]:
# Remove the 'level_0' column, then preview the first rows.
test_m2 = test_m2.drop(labels=['level_0'], axis='columns')
test_m2.head()

In [None]:
# Separate the label from the features: 'polarityClass' becomes the
# training target and is removed from the feature frame.
train_m2_target = train_m2['polarityClass']
train_m2 = train_m2.drop(labels=['polarityClass'], axis='columns')

In [None]:
# Separate the label from the features: 'polarityClass' becomes the
# test target and is removed from the feature frame.
test_m2_target = test_m2['polarityClass']
test_m2 = test_m2.drop(labels=['polarityClass'], axis='columns')

### Create and fit the model

In [None]:
# Build a Complement Naive Bayes classifier with its default constructor
# arguments and train it on the Model 2 features and labels.
# fit() returns the estimator itself, so construction and fitting are chained.
# NOTE: this rebinds `naive_bayes_model`, replacing the Model 1 estimator.
naive_bayes_model = ComplementNB().fit(train_m2, train_m2_target)

# Display the fitted estimator as the cell output.
naive_bayes_model

In [None]:
# Predict a polarity class for every row of the Model 2 test set.
predictions = naive_bayes_model.predict(test_m2)

# Cross-tabulate actual labels (rows) against predicted labels (columns);
# margins=True appends the row/column totals.
confusion = pd.crosstab(
    index=test_m2_target,
    columns=predictions,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusion)

In [None]:
# Accuracy of the Model 2 predictions against the test labels.
score = accuracy_score(y_true=test_m2_target, y_pred=predictions)
print('Accuracy:{0:f}'.format(score))

In [None]:
# Print the classification report for the Model 2 test predictions.
print(
    classification_report(
        y_true=test_m2_target,
        y_pred=predictions,
    )
)

In [None]:
# Mean of the 5-fold cross-validation scores on the Model 2 training data.
pecc_cnb = cross_val_score(
    estimator=naive_bayes_model,
    X=train_m2,
    y=train_m2_target,
    cv=5,
).mean()

# Display the mean score as the cell output.
pecc_cnb