# One-Class Naive Bayes

### Model 1 - Frequency of words

##### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import feather


from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectPercentile, f_classif

#### Read FTR

In [2]:
# Load the full table (features plus the `polarityClass` column) from Feather.
data = pd.read_feather(
    '../output/dataframe1.ftr',
    columns=None,       # keep every column
    use_threads=True,
)

In [3]:
# Separate the target column from the features, then preview the features.
# NOTE(review): `data` is rebound without the target column, so re-running
# this cell on its own raises KeyError; re-run the load cell first.
data_y = data.loc[:, 'polarityClass']
data = data.drop('polarityClass', axis=1)
data.head()

Unnamed: 0,like,good,one,taste,great,coffee,flavor,tea,product,love,...,maxes,pliers,offshoot,distinctively,brilliance,auspices,synonyms,cowards,lofted,parachute
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,3,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,3,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
trainX, testX, trainy, testy = train_test_split(
    data,
    data_y,
    test_size=0.3,
    random_state=2,
)

In [5]:
# Inspect the label dtype before any relabelling.
testy.dtype

dtype('uint8')

In [6]:
# Cast to a signed dtype so the -1 label used below is representable.
trainy = trainy.astype('int8')
print(trainy.dtype)

int8


In [7]:
# Cast to a signed dtype so the -1 label used below is representable.
testy = testy.astype('int8')
print(testy.dtype)

int8


In [8]:
# Relabel for the "class 0 vs. rest" task: 1 = normal (class 0), -1 = anomaly.
#
# The labels are computed in one step from the untouched `data_y` (looked up by
# the split's index) instead of by two successive in-place masked assignments.
# The in-place form silently turned every label into -1 when the cell was run
# a second time, because no value equals 0 after the first pass.
trainy = pd.Series(
    np.where(data_y.loc[trainy.index] == 0, 1, -1),
    index=trainy.index,
    name=trainy.name,
    dtype='int8',
)
testy = pd.Series(
    np.where(data_y.loc[testy.index] == 0, 1, -1),
    index=testy.index,
    name=testy.name,
    dtype='int8',
)

In [9]:
# Class balance of the test labels after relabelling (1 = normal, -1 = anomaly).
testy.value_counts()

-1    12832
 1     2041
Name: polarityClass, dtype: int64

In [10]:
# Class balance of the training labels after relabelling (1 = normal, -1 = anomaly).
trainy.value_counts()

-1    30021
 1     4682
Name: polarityClass, dtype: int64

### Create and fit the model CLASS 0

In [11]:
# Multinomial Naive Bayes with smoothing alpha=1, trained on the
# class-0-vs-rest labels. `fit` returns the estimator itself, which is then
# echoed as the cell output.
model = MultinomialNB(alpha=1).fit(trainX, trainy)
model

MultinomialNB(alpha=1)

In [12]:
# Predict the held-out labels with the model fitted above.
predictions = model.predict(testX)

# Confusion matrix (rows: actual, columns: predicted) with marginal totals.
confusion = pd.crosstab(
    index=testy,
    columns=predictions,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusion)

Predicted     -1     1    All
Actual                       
-1         12189   643  12832
1            913  1128   2041
All        13102  1771  14873


In [13]:
# Fraction of test rows whose predicted label equals the true label.
score = accuracy_score(y_true=testy, y_pred=predictions)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.895381


In [14]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_true=testy, y_pred=predictions))

              precision    recall  f1-score   support

          -1       0.93      0.95      0.94     12832
           1       0.64      0.55      0.59      2041

    accuracy                           0.90     14873
   macro avg       0.78      0.75      0.77     14873
weighted avg       0.89      0.90      0.89     14873



#### Read FTR

In [15]:
# Load a fresh copy of the table for the class-1 experiment.
# NOTE(review): this is the same path already read into `data` above; a
# single load could be shared by all three one-vs-rest sections.
data2 = pd.read_feather(
    '../output/dataframe1.ftr',
    columns=None,       # keep every column
    use_threads=True,
)

In [16]:
# Separate the target column from the features, then preview the features.
data2_y = data2.loc[:, 'polarityClass']
data2 = data2.drop('polarityClass', axis=1)
data2.head()

Unnamed: 0,like,good,one,taste,great,coffee,flavor,tea,product,love,...,maxes,pliers,offshoot,distinctively,brilliance,auspices,synonyms,cowards,lofted,parachute
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,3,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,3,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# 70/30 hold-out split with the same seed as the class-0 experiment.
train2X, test2X, train2y, test2y = train_test_split(
    data2,
    data2_y,
    test_size=0.3,
    random_state=2,
)

In [18]:
# Cast to a signed dtype so the -1 label used below is representable.
train2y = train2y.astype('int8')
print(train2y.dtype)

int8


In [19]:
# Cast to a signed dtype so the -1 label used below is representable.
test2y = test2y.astype('int8')
print(test2y.dtype)

int8


In [20]:
# Relabel for the "class 1 vs. rest" task: 1 = normal (class 1), -1 = anomaly.
#
# The labels are computed in one step from the untouched `data2_y` (looked up
# by the split's index). This replaces the pair of in-place masked assignments,
# whose second statement (`x[x == 1] = 1`) had no effect, and matches the way
# the other one-vs-rest label sets are built.
train2y = pd.Series(
    np.where(data2_y.loc[train2y.index] == 1, 1, -1),
    index=train2y.index,
    name=train2y.name,
    dtype='int8',
)
test2y = pd.Series(
    np.where(data2_y.loc[test2y.index] == 1, 1, -1),
    index=test2y.index,
    name=test2y.name,
    dtype='int8',
)

In [21]:
# Class balance of the test labels after relabelling (1 = normal, -1 = anomaly).
test2y.value_counts()

-1    13773
 1     1100
Name: polarityClass, dtype: int64

In [22]:
# Class balance of the training labels after relabelling (1 = normal, -1 = anomaly).
train2y.value_counts()

-1    32193
 1     2510
Name: polarityClass, dtype: int64

### Create and fit the model CLASS 1

In [23]:
# Multinomial Naive Bayes with smoothing alpha=1, trained on the
# class-1-vs-rest labels. This rebinds `model`, replacing the previous fit.
model = MultinomialNB(alpha=1).fit(train2X, train2y)
model

MultinomialNB(alpha=1)

In [24]:
# Predict the held-out labels with the model fitted above.
predictions = model.predict(test2X)

# Confusion matrix (rows: actual, columns: predicted) with marginal totals.
confusion = pd.crosstab(
    index=test2y,
    columns=predictions,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusion)

Predicted     -1    1    All
Actual                      
-1         13283  490  13773
1            928  172   1100
All        14211  662  14873


In [25]:
# Fraction of test rows whose predicted label equals the true label.
score = accuracy_score(y_true=test2y, y_pred=predictions)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.904659


In [26]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_true=test2y, y_pred=predictions))

              precision    recall  f1-score   support

          -1       0.93      0.96      0.95     13773
           1       0.26      0.16      0.20      1100

    accuracy                           0.90     14873
   macro avg       0.60      0.56      0.57     14873
weighted avg       0.88      0.90      0.89     14873



#### Read FTR

In [27]:
# Load a fresh copy of the table for the class-2 experiment.
# NOTE(review): this is the same path already read into `data` above; a
# single load could be shared by all three one-vs-rest sections.
data3 = pd.read_feather(
    '../output/dataframe1.ftr',
    columns=None,       # keep every column
    use_threads=True,
)

In [28]:
# Separate the target column from the features, then preview the features.
data3_y = data3.loc[:, 'polarityClass']
data3 = data3.drop('polarityClass', axis=1)
data3.head()

Unnamed: 0,like,good,one,taste,great,coffee,flavor,tea,product,love,...,maxes,pliers,offshoot,distinctively,brilliance,auspices,synonyms,cowards,lofted,parachute
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,3,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,3,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# 70/30 hold-out split with the same seed as the earlier experiments.
train3X, test3X, train3y, test3y = train_test_split(
    data3,
    data3_y,
    test_size=0.3,
    random_state=2,
)

In [30]:
# Cast to a signed dtype so the -1 label used below is representable.
train3y = train3y.astype('int8')
print(train3y.dtype)

int8


In [31]:
# Cast to a signed dtype so the -1 label used below is representable.
test3y = test3y.astype('int8')
print(test3y.dtype)

int8


In [32]:
# Relabel for the "class 2 vs. rest" task: 1 = normal (class 2), -1 = anomaly.
#
# The labels are computed in one step from the untouched `data3_y` (looked up
# by the split's index) instead of by two successive in-place masked
# assignments. The in-place form silently turned every label into -1 when the
# cell was run a second time, because after the first pass no value equals 2.
train3y = pd.Series(
    np.where(data3_y.loc[train3y.index] == 2, 1, -1),
    index=train3y.index,
    name=train3y.name,
    dtype='int8',
)
test3y = pd.Series(
    np.where(data3_y.loc[test3y.index] == 2, 1, -1),
    index=test3y.index,
    name=test3y.name,
    dtype='int8',
)

In [33]:
# Class balance of the test labels after relabelling (1 = normal, -1 = anomaly).
test3y.value_counts()

 1    11732
-1     3141
Name: polarityClass, dtype: int64

In [34]:
# Class balance of the training labels after relabelling (1 = normal, -1 = anomaly).
train3y.value_counts()

 1    27511
-1     7192
Name: polarityClass, dtype: int64

### Create and fit the model CLASS 2

In [35]:
# Multinomial Naive Bayes with smoothing alpha=1, trained on the
# class-2-vs-rest labels. This rebinds `model`, replacing the previous fit.
model = MultinomialNB(alpha=1).fit(train3X, train3y)
model

MultinomialNB(alpha=1)

In [36]:
# Predict the held-out labels with the model fitted above.
predictions = model.predict(test3X)

# Confusion matrix (rows: actual, columns: predicted) with marginal totals.
confusion = pd.crosstab(
    index=test3y,
    columns=predictions,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusion)

Predicted    -1      1    All
Actual                       
-1         1899   1242   3141
1           805  10927  11732
All        2704  12169  14873


In [37]:
# Fraction of test rows whose predicted label equals the true label.
score = accuracy_score(y_true=test3y, y_pred=predictions)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.862368


In [38]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_true=test3y, y_pred=predictions))

              precision    recall  f1-score   support

          -1       0.70      0.60      0.65      3141
           1       0.90      0.93      0.91     11732

    accuracy                           0.86     14873
   macro avg       0.80      0.77      0.78     14873
weighted avg       0.86      0.86      0.86     14873

