# Chapter 6 - Other Popular Machine Learning Methods
## Segment 5 - Naive Bayes Classifiers

In [1]:
import numpy as np
import pandas as pd
import urllib
import sklearn

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score

In [2]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

## Naive Bayes
### Using Naive Bayes to predict spam

In [3]:
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Legacy Python that doesn't verify HTTPS certificates by default
    pass
else:
    # Handle target environment that doesn't support HTTPS verification
    ssl._create_default_https_context = _create_unverified_https_context

In [4]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"

import urllib.request

raw_data = urllib.request.urlopen(url)
dataset = np.loadtxt(raw_data, delimiter=',')
dataset

array([[0.000e+00, 6.400e-01, 6.400e-01, ..., 6.100e+01, 2.780e+02,
        1.000e+00],
       [2.100e-01, 2.800e-01, 5.000e-01, ..., 1.010e+02, 1.028e+03,
        1.000e+00],
       [6.000e-02, 0.000e+00, 7.100e-01, ..., 4.850e+02, 2.259e+03,
        1.000e+00],
       ...,
       [3.000e-01, 0.000e+00, 3.000e-01, ..., 6.000e+00, 1.180e+02,
        0.000e+00],
       [9.600e-01, 0.000e+00, 0.000e+00, ..., 5.000e+00, 7.800e+01,
        0.000e+00],
       [0.000e+00, 0.000e+00, 6.500e-01, ..., 5.000e+00, 4.000e+01,
        0.000e+00]])

In [5]:
dataset[0]

array([  0.   ,   0.64 ,   0.64 ,   0.   ,   0.32 ,   0.   ,   0.   ,
         0.   ,   0.   ,   0.   ,   0.   ,   0.64 ,   0.   ,   0.   ,
         0.   ,   0.32 ,   0.   ,   1.29 ,   1.93 ,   0.   ,   0.96 ,
         0.   ,   0.   ,   0.   ,   0.   ,   0.   ,   0.   ,   0.   ,
         0.   ,   0.   ,   0.   ,   0.   ,   0.   ,   0.   ,   0.   ,
         0.   ,   0.   ,   0.   ,   0.   ,   0.   ,   0.   ,   0.   ,
         0.   ,   0.   ,   0.   ,   0.   ,   0.   ,   0.   ,   0.   ,
         0.   ,   0.   ,   0.778,   0.   ,   0.   ,   3.756,  61.   ,
       278.   ,   1.   ])

In [6]:
X = dataset[:,0:48]

y = dataset[:,-1]

In [7]:
X

array([[0.  , 0.64, 0.64, ..., 0.  , 0.  , 0.  ],
       [0.21, 0.28, 0.5 , ..., 0.  , 0.  , 0.  ],
       [0.06, 0.  , 0.71, ..., 0.06, 0.  , 0.  ],
       ...,
       [0.3 , 0.  , 0.3 , ..., 1.2 , 0.  , 0.  ],
       [0.96, 0.  , 0.  , ..., 0.32, 0.  , 0.  ],
       [0.  , 0.  , 0.65, ..., 0.65, 0.  , 0.  ]])

In [8]:
y

array([1., 1., 1., ..., 0., 0., 0.])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [10]:
bernoulli = BernoulliNB(binarize=True)
bernoulli.fit(X_train, y_train)

BernoulliNB(binarize=True)

In [11]:
y_pred = bernoulli.predict(X_test)
accuracy_score(y_test, y_pred)

0.8686210640608035

In [12]:
multinomial = MultinomialNB()
multinomial.fit(X_train, y_train)
y_pred = multinomial.predict(X_test)
accuracy_score(y_test, y_pred)

0.8957654723127035

In [13]:
guassian = GaussianNB()
guassian.fit(X_train, y_train)
y_pred = guassian.predict(X_test)
accuracy_score(y_test, y_pred)

0.8089033659066233

In [24]:
bernoulli = BernoulliNB(binarize=0.1)
bernoulli.fit(X_train, y_train)
y_pred = bernoulli.predict(X_test)
accuracy_score(y_test, y_pred)

0.9239956568946797