In [12]:
import numpy as np
import pandas as pd
import urllib.request as request

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB, GaussianNB,MultinomialNB

In [25]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
req = request.urlopen(url)
dataset = np.loadtxt(req, delimiter=",")
print(dataset[0])

[  0.      0.64    0.64    0.      0.32    0.      0.      0.      0.
   0.      0.      0.64    0.      0.      0.      0.32    0.      1.29
   1.93    0.      0.96    0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.778   0.      0.
   3.756  61.    278.      1.   ]


In [50]:
X=dataset[:, 0:48]
y=dataset[:,-1]

In [38]:
print(X.shape)
print(y.shape)

(4601, 48)
(4601,)


In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

<b> Training a bernoulli NB before attempting to optimize <b>

In [115]:
# To use this model will need to convert all the data to binary
bernNB = BernoulliNB(binarize=0.1)
bernNB.fit(X_train, y_train)

BernoulliNB(binarize=0.1)

In [109]:
y_test = y_test.reshape(-1,1);

In [110]:
y_pred = bernNB.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9066232356134636


<b>## Finding the best fit model using Bernouli's using GridSearchCV </b>

In [75]:
## Finding the best fit model using Bernouli's
from sklearn.model_selection import GridSearchCV

In [121]:
bernNB_classifier = BernoulliNB()
params = {"binarize":np.arange(0, 1, 0.1)}
bernNB_grid = GridSearchCV(bernNB, param_grid=params, cv=10, return_train_score=False)
bernNB_grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=BernoulliNB(binarize=0.1),
             param_grid={'binarize': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])})

In [122]:
print("Binarise setting {}, Score:{}".format(bernNB_grid.best_params_, bernNB_grid.best_score_))

Binarise setting {'binarize': 0.30000000000000004}, Score:0.8907608695652174


<b style="color:red;"> This is worse than setting binarize to 0.1 see before optimization </b>

In [123]:
from sklearn.model_selection import RandomizedSearchCV
bernNB_classifierR = BernoulliNB()
bernNB_Random = RandomizedSearchCV(bernNB_classifierR, param_distributions=params, cv=10)
bernNB_Random.fit(X_train, y_train)
print("Binarise setting {}, Score:{}".format(bernNB_Random.best_params_, bernNB_Random.best_score_))

Binarise setting {'binarize': 0.30000000000000004}, Score:0.8907608695652174


In [71]:
multiNB = MultinomialNB()
multiNB.fit(X_train, y_train)
y_pred = multiNB.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8816503800217155


In [72]:
gaussNB = GaussianNB()
gaussNB.fit(X_train, y_train)
y_pred = gaussNB.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8197611292073833
