# ML Model: Naive Bayes

In [1]:
import numpy as np
import pandas as pd

## 1. Read the dataset into a pandas DataFrame

In [2]:
# Raw survey data; the path is resolved relative to the notebook's working directory.
DATA_PATH = 'diabetes_data_upload.csv'
dbf = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to sanity-check the parsed columns.
dbf.head(n=5)

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


### Convert categorical features to dummy variables

In [4]:
# Indicator-encode the string-valued columns (numeric columns pass through).
# drop_first=True discards the first level of each encoded column, which is why
# the label later appears as the single column `class_Positive`.
dbf_bin = pd.get_dummies(data=dbf, drop_first=True)

## 2. Training, Predicting Accuracy, and Cross Validation

In [5]:
# Splitting / cross-validation helpers and the metrics module used by every
# classifier section that follows.
from sklearn import metrics
from sklearn.model_selection import cross_val_predict, train_test_split

# Separate the encoded label from the predictor columns.
TARGET_COLUMN = 'class_Positive'
y = dbf_bin[TARGET_COLUMN]
X = dbf_bin.drop(columns=TARGET_COLUMN)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=109,
)

### 2a. Bernoulli Naive Bayes

In [6]:
from sklearn.naive_bayes import BernoulliNB

# Fit on the training split (fit() returns the estimator itself).
# NOTE(review): BernoulliNB binarizes inputs at 0.0 by default, so a strictly
# positive numeric column (Age appears to be one) collapses to a constant 1.
bnb = BernoulliNB(alpha=0.6).fit(X_train, y_train)

# Score the fitted model on the held-out rows.
bnb_pred = bnb.predict(X_test)
bnb_acc = metrics.accuracy_score(y_true=y_test, y_pred=bnb_pred)

In [7]:
# Hold-out accuracy of the Bernoulli model.
print(f"Bernoulli NB Accuracy: {bnb_acc}")

Bernoulli NB Accuracy: 0.8589743589743589


#### Cross Validation: Bernoulli NB

In [8]:
# 10-fold out-of-fold predictions over the full dataset, scored as one pooled
# accuracy (this is not the mean of the ten per-fold accuracies).
predicted_bnb = cross_val_predict(estimator=bnb, X=X, y=y, cv=10)
cv_bnb = metrics.accuracy_score(y_true=y, y_pred=predicted_bnb)
print(f'CV Accuracy Scores: {cv_bnb}')

CV Accuracy Scores: 0.8711538461538462


### 2b. Categorical Naive Bayes

In [9]:
from sklearn.naive_bayes import CategoricalNB

# CategoricalNB sizes each feature's probability table from the largest category
# code seen during fit. If the test split (or a CV fold) contains a larger code
# than the training rows did -- e.g. an Age value above the training maximum --
# predict() indexes past the table and raises IndexError. Pinning the minimum
# number of categories per feature from the full feature matrix removes that
# dependence on how the rows happen to be split. Only the range of feature
# values is used here, never the labels. When the original code ran without
# error, the table sizes (and therefore the results) are the same.
n_categories = X.astype(int).max(axis=0).to_numpy() + 1

cnb = CategoricalNB(min_categories=n_categories)
cnb.fit(X_train, y_train)

# Score the fitted model on the held-out rows.
cnb_pred = cnb.predict(X_test)
cnb_acc = metrics.accuracy_score(y_test, cnb_pred)

In [10]:
# Hold-out accuracy of the Categorical model.
print(f"Categorical NB Accuracy: {cnb_acc}")

Categorical NB Accuracy: 0.8525641025641025


#### Cross Validation: Categorical NB

In [11]:
# 10-fold out-of-fold predictions over the full dataset, scored as one pooled
# accuracy (this is not the mean of the ten per-fold accuracies).
predicted_cnb = cross_val_predict(estimator=cnb, X=X, y=y, cv=10)
cv_cnb = metrics.accuracy_score(y_true=y, y_pred=predicted_cnb)
print(f'CV Accuracy Scores: {cv_cnb}')

CV Accuracy Scores: 0.8711538461538462


### 2c. Complement Naive Bayes

In [12]:
from sklearn.naive_bayes import ComplementNB

# Fit on the training split (fit() returns the estimator itself).
conb = ComplementNB(alpha=0.2).fit(X_train, y_train)

# Score the fitted model on the held-out rows.
conb_pred = conb.predict(X_test)
conb_acc = metrics.accuracy_score(y_true=y_test, y_pred=conb_pred)

In [13]:
# Hold-out accuracy of the Complement model.
print(f"Complement NB Accuracy: {conb_acc}")

Complement NB Accuracy: 0.8782051282051282


#### Cross Validation: Complement NB

In [14]:
# 10-fold out-of-fold predictions over the full dataset, scored as one pooled
# accuracy (this is not the mean of the ten per-fold accuracies).
predicted_conb = cross_val_predict(estimator=conb, X=X, y=y, cv=10)
cv_conb = metrics.accuracy_score(y_true=y, y_pred=predicted_conb)
print(f'CV Accuracy Scores: {cv_conb}')

CV Accuracy Scores: 0.8980769230769231


### 2d. Gaussian Naive Bayes

In [15]:
from sklearn.naive_bayes import GaussianNB

# Fit on the training split (fit() returns the estimator itself).
gnb = GaussianNB().fit(X_train, y_train)

# Score the fitted model on the held-out rows.
gnb_pred = gnb.predict(X_test)
gnb_acc = metrics.accuracy_score(y_true=y_test, y_pred=gnb_pred)

In [16]:
# Hold-out accuracy of the Gaussian model.
print(f"Gaussian NB Accuracy: {gnb_acc}")

Gaussian NB Accuracy: 0.8653846153846154


#### Cross Validation: Gaussian NB

In [17]:
# 10-fold out-of-fold predictions over the full dataset, scored as one pooled
# accuracy (this is not the mean of the ten per-fold accuracies).
predicted_gnb = cross_val_predict(estimator=gnb, X=X, y=y, cv=10)
cv_gnb = metrics.accuracy_score(y_true=y, y_pred=predicted_gnb)
print(f'CV Accuracy Scores:   {cv_gnb}')

CV Accuracy Scores:   0.8807692307692307


### 2e. Multinomial Naive Bayes

In [18]:
from sklearn.naive_bayes import MultinomialNB

# Fit on the training split (fit() returns the estimator itself).
mnb = MultinomialNB(alpha=0.1).fit(X_train, y_train)

# Score the fitted model on the held-out rows and report it.
mnb_pred = mnb.predict(X_test)
mnb_acc = metrics.accuracy_score(y_true=y_test, y_pred=mnb_pred)
print(f"Multinomial NB Accuracy: {mnb_acc}")

Multinomial NB Accuracy: 0.8717948717948718


#### Cross Validation: Multinomial NB

In [19]:
# 10-fold out-of-fold predictions over the full dataset, scored as one pooled
# accuracy (this is not the mean of the ten per-fold accuracies).
predicted_mnb = cross_val_predict(estimator=mnb, X=X, y=y, cv=10)
cv_mnb = metrics.accuracy_score(y_true=y, y_pred=predicted_mnb)
print(f'Cross-validation Accuracy Scores:  {cv_mnb}')

Cross-validation Accuracy Scores:  0.8865384615384615


## 3. Summary of Accuracy

In [20]:
import operator

# Map each Naive Bayes variant to [hold-out accuracy, cross-validation accuracy].
nb_all = {
    'Bernoulli': [bnb_acc, cv_bnb],
    'Categorical': [cnb_acc, cv_cnb],
    'Complement': [conb_acc, cv_conb],
    'Gaussian': [gnb_acc, cv_gnb],
    'Multinomial': [mnb_acc, cv_mnb],
}

# Lists compare element by element, so the winner is the variant with the
# highest hold-out accuracy; the CV accuracy only breaks ties.
best_entry = max(nb_all.items(), key=operator.itemgetter(1))
nb_hi = best_entry[0]

In [21]:
print('The accuracies of each Naive Bayes algorithms are as follows:\n')

# One row per variant: (label, hold-out accuracy, cross-validation accuracy).
summary_rows = (
    ('Bernoulli', bnb_acc, cv_bnb),
    ('Categorical', cnb_acc, cv_cnb),
    ('Complement', conb_acc, cv_conb),
    ('Gaussian', gnb_acc, cv_gnb),
    ('Multinomial', mnb_acc, cv_mnb),
)
for nb_label, holdout_acc, crossval_acc in summary_rows:
    # Both figures are shown as percentages rounded to two decimals.
    print(f'{nb_label} NB:', round(holdout_acc * 100, 2), '% (CV', round(crossval_acc * 100, 2), '%)')

The accuracies of each Naive Bayes algorithms are as follows:

Bernoulli NB: 85.9 % (CV 87.12 %)
Categorical NB: 85.26 % (CV 87.12 %)
Complement NB: 87.82 % (CV 89.81 %)
Gaussian NB: 86.54 % (CV 88.08 %)
Multinomial NB: 87.18 % (CV 88.65 %)


In [22]:
# Percentages for the winning variant: index 0 is hold-out, index 1 is CV.
best_holdout_pct = round(nb_all[nb_hi][0] * 100, 2)
best_cv_pct = round(nb_all[nb_hi][1] * 100, 2)

print('\nBased on the observation above, the Naive Bayes algorithm with highest accuracy is:\n')
print(f'{nb_hi} NB with {best_holdout_pct}% accuracy and cross validation score of {best_cv_pct}%')


Based on the observation above, the Naive Bayes algorithm with highest accuracy is:

Complement NB with 87.82% accuracy and cross validation score of 89.81%
