In [1]:
import pandas as pd
import numpy as np
import sklearn

In [127]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit, ShuffleSplit
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score, recall_score

# Iris Dataset

In [51]:
with open("data/iris.names") as f:
    print(f.read())

1. Title: Iris Plants Database
	Updated Sept 21 by C.Blake - Added discrepency information

2. Sources:
     (a) Creator: R.A. Fisher
     (b) Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
     (c) Date: July, 1988

3. Past Usage:
   - Publications: too many to mention!!!  Here are a few.
   1. Fisher,R.A. "The use of multiple measurements in taxonomic problems"
      Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions
      to Mathematical Statistics" (John Wiley, NY, 1950).
   2. Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.
      (Q327.D83) John Wiley & Sons.  ISBN 0-471-22361-1.  See page 218.
   3. Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
      Structure and Classification Rule for Recognition in Partially Exposed
      Environments".  IEEE Transactions on Pattern Analysis and Machine
      Intelligence, Vol. PAMI-2, No. 1, 67-71.
      -- Results:
         -- very low misclassification rates (0% for t

In [52]:
col_names = ["sepal_length", "sepal_width", "petal_length", "petal_width", "class"]

In [53]:
df = pd.read_csv("data/iris.data", names=col_names)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Prepare the data for training

In [130]:
features = df.iloc[:, 0:4]
labels = df['class']

In [131]:
label_encoder = sklearn.preprocessing.LabelEncoder()
label_encoder.fit(labels)

LabelEncoder()

In [132]:
label_encoder.classes_

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [133]:
labels = label_encoder.transform(labels)

In [134]:
labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [135]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=8) 

## Multinomial NB

In [136]:
MultiNB = MultinomialNB()
MultiNB.fit(X_train, y_train)

MultinomialNB()

In [137]:
print(f"Accuracy Score of Training Set:  {accuracy_score(y_train, MultiNB.predict(X_train))}")

y_pred_MNB = MultiNB.predict(X_test)
print(f"Accuracy Score of Test Set: {accuracy_score(y_test, y_pred_MNB)}")

f1 = f1_score(y_test, y_pred_MNB, average='weighted')
print(f"F1 Score of Test Set: {f1}")
      
print("Classification Report")    
print(classification_report(y_test, y_pred_MNB))

print(confusion_matrix(y_pred_MNB, y_test))

Accuracy Score of Training Set:  0.9333333333333333
Accuracy Score of Test Set: 0.9
F1 Score of Test Set: 0.899248120300752
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.75      1.00      0.86         9
           2       1.00      0.73      0.84        11

    accuracy                           0.90        30
   macro avg       0.92      0.91      0.90        30
weighted avg       0.93      0.90      0.90        30

[[10  0  0]
 [ 0  9  3]
 [ 0  0  8]]


## Bernoulli NB

In [63]:
BernNB = BernoulliNB()
BernNB.fit(X_train, y_train)

BernoulliNB()

In [64]:
y_pred_BNB = BernNB.predict(X_test)

In [65]:
print(f"Accuracy Score of Training Set:  {accuracy_score(y_train, BernNB.predict(X_train))}")

print(f"Accuracy Score of Test Set: {accuracy_score(y_test, y_pred_BNB)}")

f1 = f1_score(y_test, y_pred_BNB, average='micro')
print(f"F1 Score of Test Set: {f1}")
      
print("Classification Report")    
print(classification_report(y_test, y_pred_BNB))

Accuracy Score of Training Set:  0.3416666666666667
Accuracy Score of Test Set: 0.3
F1 Score of Test Set: 0.3
Classification Report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.30      1.00      0.46         9
           2       0.00      0.00      0.00        11

    accuracy                           0.30        30
   macro avg       0.10      0.33      0.15        30
weighted avg       0.09      0.30      0.14        30



  _warn_prf(average, modifier, msg_start, len(result))


## Gaussian NB

In [66]:
GaussNB = GaussianNB()
GaussNB.fit(X_train, y_train)

GaussianNB()

In [67]:
y_pred_GNB = BernNB.predict(X_test)

In [68]:
print(f"Accuracy Score of Training Set:  {accuracy_score(y_train, GaussNB.predict(X_train))}")

print(f"Accuracy Score of Test Set: {accuracy_score(y_test, y_pred_GNB)}")

f1 = f1_score(y_test, y_pred_GNB, average='micro')
print(f"F1 Score of Test Set: {f1}")
      
print("Classification Report")    
print(classification_report(y_test, y_pred_GNB))

Accuracy Score of Training Set:  0.975
Accuracy Score of Test Set: 0.3
F1 Score of Test Set: 0.3
Classification Report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.30      1.00      0.46         9
           2       0.00      0.00      0.00        11

    accuracy                           0.30        30
   macro avg       0.10      0.33      0.15        30
weighted avg       0.09      0.30      0.14        30



  _warn_prf(average, modifier, msg_start, len(result))


## Hyperparameter Tuning

In [139]:
cv_sets = ShuffleSplit(n_splits=5, test_size=.2, random_state=8)
param_grid = {
    'alpha': [0.25, 0.5, 1, 1.5, 2], 
    'fit_prior': [False, True]
}

grid_search = GridSearchCV(estimator=MultiNB,
                               param_grid=param_grid,
                               scoring='accuracy',
                               cv=5)

In [140]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=MultinomialNB(),
             param_grid={'alpha': [0.25, 0.5, 1, 1.5, 2],
                         'fit_prior': [False, True]},
             scoring='accuracy')

In [141]:
grid_search.best_params_

{'alpha': 0.25, 'fit_prior': False}

In [142]:
grid_search.best_score_

0.9666666666666668

In [144]:
bestNB = grid_search.best_estimator_
print(f"Accuracy Score of Training Set:  {accuracy_score(y_train, bestNB.predict(X_train))}")

y_pred_bestNB = bestNB.predict(X_test)
print(f"Accuracy Score of Test Set: {accuracy_score(y_test, y_pred_bestNB)}")

f1 = f1_score(y_test, y_pred_bestNB, average='micro')
print(f"F1 Score of Test Set: {f1}")
      
print("Classification Report")    
print(classification_report(y_test, y_pred_bestNB))

print("Confusion Matrix: \n", confusion_matrix(y_pred_bestNB, y_test))

Accuracy Score of Training Set:  0.9583333333333334
Accuracy Score of Test Set: 0.9333333333333333
F1 Score of Test Set: 0.9333333333333333
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.89      0.89      0.89         9
           2       0.91      0.91      0.91        11

    accuracy                           0.93        30
   macro avg       0.93      0.93      0.93        30
weighted avg       0.93      0.93      0.93        30

Confusion Matrix: 
 [[10  0  0]
 [ 0  8  1]
 [ 0  1 10]]


# Diabetes Dataset

In [80]:
diabetes = pd.read_csv('data/diabetes.tab.txt', delimiter = "\t")

In [81]:
diabetes.columns

Index(['AGE', 'SEX', 'BMI', 'BP', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'Y'], dtype='object')

In [145]:
features = diabetes.loc[:, diabetes.columns != 'SEX']
labels = diabetes['SEX']

In [146]:
features

Unnamed: 0,AGE,BMI,BP,S1,S2,S3,S4,S5,S6,Y
0,59,32.1,101.00,157,93.2,38.0,4.00,4.8598,87,151
1,48,21.6,87.00,183,103.2,70.0,3.00,3.8918,69,75
2,72,30.5,93.00,156,93.6,41.0,4.00,4.6728,85,141
3,24,25.3,84.00,198,131.4,40.0,5.00,4.8903,89,206
4,50,23.0,101.00,192,125.4,52.0,4.00,4.2905,80,135
...,...,...,...,...,...,...,...,...,...,...
437,60,28.2,112.00,185,113.8,42.0,4.00,4.9836,93,178
438,47,24.9,75.00,225,166.0,42.0,5.00,4.4427,102,104
439,60,24.9,99.67,162,106.6,43.0,3.77,4.1271,95,132
440,36,30.0,95.00,201,125.2,42.0,4.79,5.1299,85,220


In [147]:
labels

0      2
1      1
2      2
3      1
4      1
      ..
437    2
438    2
439    2
440    1
441    1
Name: SEX, Length: 442, dtype: int64

In [148]:
scaler = StandardScaler().fit(features)

In [149]:
X_scaled = scaler.transform(features)

In [150]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, labels, test_size=0.2, random_state=8) 

In [151]:
BernNB = BernoulliNB()
BernNB.fit(X_train, y_train)

y_pred_BNB = BernNB.predict(X_test)
print(f"Accuracy Score of Training Set:  {accuracy_score(y_train, BernNB.predict(X_train))}")

print(f"Accuracy Score of Test Set: {accuracy_score(y_test, y_pred_BNB)}")

f1 = f1_score(y_test, y_pred_BNB, average='micro')
print(f"F1 Score of Test Set: {f1}")
      
print("Classification Report")    
print(classification_report(y_test, y_pred_BNB))

print("Confusion Matrix: \n", confusion_matrix(y_pred_BNB, y_test))

Accuracy Score of Training Set:  0.660056657223796
Accuracy Score of Test Set: 0.6966292134831461
F1 Score of Test Set: 0.6966292134831461
Classification Report
              precision    recall  f1-score   support

           1       0.71      0.72      0.72        47
           2       0.68      0.67      0.67        42

    accuracy                           0.70        89
   macro avg       0.70      0.70      0.70        89
weighted avg       0.70      0.70      0.70        89

Confusion Matrix: 
 [[34 14]
 [13 28]]


In [152]:
GaussNB = GaussianNB()
GaussNB.fit(X_train, y_train)

print(f"Accuracy Score of Training Set:  {accuracy_score(y_train, GaussNB.predict(X_train))}")

y_pred_GNB = GaussNB.predict(X_test)
print(f"Accuracy Score of Test Set: {accuracy_score(y_test, y_pred_GNB)}")

f1 = f1_score(y_test, y_pred_GNB, average='micro')
print(f"F1 Score of Test Set: {f1}")
      
print("Classification Report")    
print(classification_report(y_test, y_pred_GNB))
print("Confusion Matrix: \n", confusion_matrix(y_pred_GNB, y_test))

Accuracy Score of Training Set:  0.6572237960339944
Accuracy Score of Test Set: 0.7415730337078652
F1 Score of Test Set: 0.7415730337078652
Classification Report
              precision    recall  f1-score   support

           1       0.75      0.77      0.76        47
           2       0.73      0.71      0.72        42

    accuracy                           0.74        89
   macro avg       0.74      0.74      0.74        89
weighted avg       0.74      0.74      0.74        89

Confusion Matrix: 
 [[36 12]
 [11 30]]


In [154]:
cv_sets = ShuffleSplit(n_splits=5, test_size=.2, random_state=8)
param_grid = {
    'alpha': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
    'fit_prior': [False, True]
}

grid_search = GridSearchCV(estimator=BernNB,
                               param_grid=param_grid,
                               scoring='f1_micro',
                               cv=cv_sets)

In [155]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=ShuffleSplit(n_splits=5, random_state=8, test_size=0.2, train_size=None),
             estimator=BernoulliNB(),
             param_grid={'alpha': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
                         'fit_prior': [False, True]},
             scoring='f1_micro')

In [156]:
grid_search.best_params_

{'alpha': 0.01, 'fit_prior': False}

In [157]:
bestNB = grid_search.best_estimator_
print(f"Accuracy Score of Training Set:  {accuracy_score(y_train, bestNB.predict(X_train))}")

y_pred_bestNB = bestNB.predict(X_test)
print(f"Accuracy Score of Test Set: {accuracy_score(y_test, y_pred_bestNB)}")

f1 = f1_score(y_test, y_pred_bestNB, average='micro')
print(f"F1 Score of Test Set: {f1}")
      
print("Classification Report")    
print(classification_report(y_test, y_pred_bestNB))

print("Confusion Matrix: \n", confusion_matrix(y_pred_bestNB, y_test))

Accuracy Score of Training Set:  0.6742209631728046
Accuracy Score of Test Set: 0.6853932584269663
F1 Score of Test Set: 0.6853932584269663
Classification Report
              precision    recall  f1-score   support

           1       0.71      0.68      0.70        47
           2       0.66      0.69      0.67        42

    accuracy                           0.69        89
   macro avg       0.69      0.69      0.69        89
weighted avg       0.69      0.69      0.69        89

Confusion Matrix: 
 [[32 13]
 [15 29]]


In [158]:
cv_sets = ShuffleSplit(n_splits=5, test_size=.2, random_state=8)
param_grid = {
    'var_smoothing': np.logspace(0, -9, num=100)
}

grid_search = GridSearchCV(estimator=GaussNB,
                               param_grid=param_grid,
                               scoring='f1',
                               cv=cv_sets)

In [160]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=ShuffleSplit(n_splits=5, random_state=8, test_size=0.2, train_size=None),
             estimator=GaussianNB(),
             param_grid={'var_smoothing': array([1.00000000e+00, 8.11130831e-01, 6.57933225e-01, 5.33669923e-01,
       4.32876128e-01, 3.51119173e-01, 2.84803587e-01, 2.31012970e-01,
       1.87381742e-01, 1.51991108e-01, 1.23284674e-01, 1.00000000e-01,
       8.11130831e-02, 6.57933225e...
       1.23284674e-07, 1.00000000e-07, 8.11130831e-08, 6.57933225e-08,
       5.33669923e-08, 4.32876128e-08, 3.51119173e-08, 2.84803587e-08,
       2.31012970e-08, 1.87381742e-08, 1.51991108e-08, 1.23284674e-08,
       1.00000000e-08, 8.11130831e-09, 6.57933225e-09, 5.33669923e-09,
       4.32876128e-09, 3.51119173e-09, 2.84803587e-09, 2.31012970e-09,
       1.87381742e-09, 1.51991108e-09, 1.23284674e-09, 1.00000000e-09])},
             scoring='f1')

In [161]:
grid_search.best_params_

{'var_smoothing': 0.2848035868435802}

In [162]:
bestNB = grid_search.best_estimator_
print(f"Accuracy Score of Training Set:  {accuracy_score(y_train, bestNB.predict(X_train))}")

y_pred_bestNB = bestNB.predict(X_test)
print(f"Accuracy Score of Test Set: {accuracy_score(y_test, y_pred_bestNB)}")

f1 = f1_score(y_test, y_pred_bestNB, average='micro')
print(f"F1 Score of Test Set: {f1}")
      
print("Classification Report")    
print(classification_report(y_test, y_pred_bestNB))
print("Confusion Matrix: \n", confusion_matrix(y_pred_bestNB, y_test))

Accuracy Score of Training Set:  0.6543909348441926
Accuracy Score of Test Set: 0.7415730337078652
F1 Score of Test Set: 0.7415730337078652
Classification Report
              precision    recall  f1-score   support

           1       0.75      0.77      0.76        47
           2       0.73      0.71      0.72        42

    accuracy                           0.74        89
   macro avg       0.74      0.74      0.74        89
weighted avg       0.74      0.74      0.74        89

Confusion Matrix: 
 [[36 12]
 [11 30]]


In [49]:
confusion_matrix(y_test, y_pred_bestNB)

array([[54, 19],
       [19, 41]], dtype=int64)

# Breast Cancer

In [99]:
with open("data/breast-cancer-wisconsin.names") as f:
    print(f.read())

Citation Request:
   This breast cancer databases was obtained from the University of Wisconsin
   Hospitals, Madison from Dr. William H. Wolberg.  If you publish results
   when using this database, then please include this information in your
   acknowledgements.  Also, please cite one or more of:

   1. O. L. Mangasarian and W. H. Wolberg: "Cancer diagnosis via linear 
      programming", SIAM News, Volume 23, Number 5, September 1990, pp 1 & 18.

   2. William H. Wolberg and O.L. Mangasarian: "Multisurface method of 
      pattern separation for medical diagnosis applied to breast cytology", 
      Proceedings of the National Academy of Sciences, U.S.A., Volume 87, 
      December 1990, pp 9193-9196.

   3. O. L. Mangasarian, R. Setiono, and W.H. Wolberg: "Pattern recognition 
      via linear programming: Theory and application to medical diagnosis", 
      in: "Large-scale numerical optimization", Thomas F. Coleman and Yuying
      Li, editors, SIAM Publications, Philadelphia 199

In [174]:
data = pd.read_csv("data/breast-cancer-wisconsin.data", header=None)

In [175]:
data = data[data[6] != '?']

In [176]:
data.shape

(683, 11)

In [177]:
# Preprocess

In [178]:
X = data.iloc[:, 1: -1]
y = data[10]

In [179]:
y = y.replace(2, 0)
y = y.replace(4, 1)

In [180]:
y

0      0
1      0
2      0
3      0
4      0
      ..
694    0
695    0
696    1
697    1
698    1
Name: 10, Length: 683, dtype: int64

In [181]:
X.shape

(683, 9)

In [182]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8) 

In [184]:
MultiNB = MultinomialNB()
MultiNB.fit(X_train, y_train)

print(f"Accuracy Score of Training Set:  {accuracy_score(y_train, MultiNB.predict(X_train))}")

y_pred_MNB = MultiNB.predict(X_test)
print(f"Accuracy Score of Test Set: {accuracy_score(y_test, y_pred_MNB)}")

f1 = f1_score(y_test, y_pred_MNB, average='micro')
print(f"F1 Score of Test Set: {f1}")
      
print("Classification Report")    
print(classification_report(y_test, y_pred_MNB))
print("Confusion Matrix: \n", confusion_matrix(y_pred_MNB, y_test))

Accuracy Score of Training Set:  0.8992673992673993
Accuracy Score of Test Set: 0.927007299270073
F1 Score of Test Set: 0.927007299270073
Classification Report
              precision    recall  f1-score   support

           0       0.91      0.98      0.94        84
           1       0.96      0.85      0.90        53

    accuracy                           0.93       137
   macro avg       0.93      0.91      0.92       137
weighted avg       0.93      0.93      0.93       137

Confusion Matrix: 
 [[82  8]
 [ 2 45]]


In [186]:
GaussNB = GaussianNB()
GaussNB.fit(X_train, y_train)

print(f"Accuracy Score of Training Set:  {accuracy_score(y_train, GaussNB.predict(X_train))}")

y_pred_GNB = GaussNB.predict(X_test)
print(f"Accuracy Score of Test Set: {accuracy_score(y_test, y_pred_GNB)}")

f1 = f1_score(y_test, y_pred_GNB, average='micro')
print(f"F1 Score of Test Set: {f1}")

print("Classification Report")    
print(classification_report(y_test, y_pred_GNB))
print("Confusion Matrix: \n", confusion_matrix(y_pred_GNB, y_test))

Accuracy Score of Training Set:  0.9542124542124543
Accuracy Score of Test Set: 0.9854014598540146
F1 Score of Test Set: 0.9854014598540146
Classification Report
              precision    recall  f1-score   support

           0       0.99      0.99      0.99        84
           1       0.98      0.98      0.98        53

    accuracy                           0.99       137
   macro avg       0.98      0.98      0.98       137
weighted avg       0.99      0.99      0.99       137

Confusion Matrix: 
 [[83  1]
 [ 1 52]]


In [125]:
BernNB = BernoulliNB()
BernNB.fit(X_train, y_train)

y_pred_BNB = BernNB.predict(X_test)
print(f"Accuracy Score of Training Set:  {accuracy_score(y_train, BernNB.predict(X_train))}")

print(f"Accuracy Score of Test Set: {accuracy_score(y_test, y_pred_BNB)}")

f1 = f1_score(y_test, y_pred_BNB, average='micro')
print(f"F1 Score of Test Set: {f1}")
      
print("Classification Report")    
print(classification_report(y_test, y_pred_BNB))

Accuracy Score of Training Set:  0.6593406593406593
Accuracy Score of Test Set: 0.6131386861313869
F1 Score of Test Set: 0.6131386861313869
Classification Report
              precision    recall  f1-score   support

           0       0.61      1.00      0.76        84
           1       0.00      0.00      0.00        53

    accuracy                           0.61       137
   macro avg       0.31      0.50      0.38       137
weighted avg       0.38      0.61      0.47       137



  _warn_prf(average, modifier, msg_start, len(result))


In [188]:
#cv_sets = ShuffleSplit(n_splits=5, test_size=.2, random_state=8)
param_grid = {
    'var_smoothing': np.logspace(0, -9, num=100)
}

grid_search = GridSearchCV(estimator=GaussNB,
                               param_grid=param_grid,
                               scoring='f1',
                               cv=5)

In [189]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=GaussianNB(),
             param_grid={'var_smoothing': array([1.00000000e+00, 8.11130831e-01, 6.57933225e-01, 5.33669923e-01,
       4.32876128e-01, 3.51119173e-01, 2.84803587e-01, 2.31012970e-01,
       1.87381742e-01, 1.51991108e-01, 1.23284674e-01, 1.00000000e-01,
       8.11130831e-02, 6.57933225e-02, 5.33669923e-02, 4.32876128e-02,
       3.51119173e-02, 2.84803587e-02, 2.31...
       1.23284674e-07, 1.00000000e-07, 8.11130831e-08, 6.57933225e-08,
       5.33669923e-08, 4.32876128e-08, 3.51119173e-08, 2.84803587e-08,
       2.31012970e-08, 1.87381742e-08, 1.51991108e-08, 1.23284674e-08,
       1.00000000e-08, 8.11130831e-09, 6.57933225e-09, 5.33669923e-09,
       4.32876128e-09, 3.51119173e-09, 2.84803587e-09, 2.31012970e-09,
       1.87381742e-09, 1.51991108e-09, 1.23284674e-09, 1.00000000e-09])},
             scoring='f1')

In [190]:
grid_search.best_params_

{'var_smoothing': 0.1}

In [191]:
bestNB = grid_search.best_estimator_
print(f"Accuracy Score of Training Set:  {accuracy_score(y_train, bestNB.predict(X_train))}")

y_pred_bestNB = bestNB.predict(X_test)
print(f"Accuracy Score of Test Set: {accuracy_score(y_test, y_pred_bestNB)}")

f1 = f1_score(y_test, y_pred_bestNB, average='micro')
print(f"F1 Score of Test Set: {f1}")
      
print("Classification Report")    
print(classification_report(y_test, y_pred_bestNB))
print("Confusion Matrix: \n", confusion_matrix(y_pred_bestNB, y_test))

Accuracy Score of Training Set:  0.9633699633699634
Accuracy Score of Test Set: 0.9927007299270073
F1 Score of Test Set: 0.9927007299270073
Classification Report
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        84
           1       1.00      0.98      0.99        53

    accuracy                           0.99       137
   macro avg       0.99      0.99      0.99       137
weighted avg       0.99      0.99      0.99       137

Confusion Matrix: 
 [[84  1]
 [ 0 52]]


In [36]:
class NaiveBayes:
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def prepare_train_test(self, test_size=0.2, random_state=8):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=test_size, random_state=random_state)
    
    def MultinomialNB(self):
        MultiNB = MultinomialNB()
        MultiNB.fit(self.X_train, self.y_train)
        
        print(f"Accuracy Score of Training Set:  {accuracy_score(self.y_train, MultiNB.predict(self.X_train))}")

        y_pred_MNB = MultiNB.predict(self.X_test)
        print(f"Accuracy Score of Test Set: {accuracy_score(self.y_test, y_pred_MNB)}")

        f1 = f1_score(self.y_test, y_pred_MNB, average='weighted')
        print(f"F1 Score of Test Set: {f1}")

        print("Classification Report")    
        print(classification_report(self.y_test, y_pred_MNB))
        
    def GaussianNB(self):
        GaussNB = GaussianNB()
        GaussNB.fit(self.X_train, self.y_train)

        print(f"Accuracy Score of Training Set:  {accuracy_score(self.y_train, GaussNB.predict(self.X_train))}")

        y_pred_GNB = GaussNB.predict(self.X_test)
        print(f"Accuracy Score of Test Set: {accuracy_score(self.y_test, y_pred_GNB)}")

        f1 = f1_score(self.y_test, y_pred_GNB, average='micro')
        print(f"F1 Score of Test Set: {f1}")

        print("Classification Report")    
        print(classification_report(self.y_test, y_pred_GNB))
        
    def BernoulliNB(self):
        BernNB = BernoulliNB()
        BernNB.fit(self.X_train, self.y_train)
        print(f"Accuracy Score of Training Set:  {accuracy_score(self.y_train, BernNB.predict(self.X_train))}")

        y_pred_BNB = BernNB.predict(self.X_test)
        print(f"Accuracy Score of Test Set: {accuracy_score(self.y_test, y_pred_BNB)}")

        f1 = f1_score(self.y_test, y_pred_BNB, average='micro')
        print(f"F1 Score of Test Set: {f1}")

        print("Classification Report")    
        print(classification_report(self.y_test, y_pred_BNB))