## Types of Naive Bayes

**Bernouli**

In [1]:
import numpy as np

In [2]:
from sklearn.datasets import make_classification
nb_samples = 300
X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0)

In [3]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)
bnb = BernoulliNB(binarize=0.0)
bnb.fit(X_train, Y_train)
bnb.score(X_test, Y_test)

0.88

In [4]:
data = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
bnb.predict(data)

array([0, 1, 0, 1])

**Multinominal**

A multinomial distribution is useful to model feature vectors where each value represents, for example, the number of occurrences of a term or its relative frequency. If the feature vectors have n elements and each of them can assume k different values with probability pk.

In [5]:
from sklearn.feature_extraction import DictVectorizer
data = [
{'house': 100, 'street': 50, 'shop': 25, 'car': 100, 'tree': 20},

{'house': 5, 'street': 5, 'shop': 0, 'car': 10, 'tree': 500, 'river': 1}
] 

dv = DictVectorizer(sparse=False)
X = dv.fit_transform(data)
Y = np.array([1, 0])

X

array([[100., 100.,   0.,  25.,  50.,  20.],
       [ 10.,   5.,   1.,   0.,   5., 500.]])

In [6]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(X, Y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [7]:
test_data = data = [
{'house': 80, 'street': 20, 'shop': 15, 'car': 70, 'tree': 10, 'river': 
1},

]
{'house': 10, 'street': 5, 'shop': 1, 'car': 8, 'tree': 300, 'river': 0} 
mnb.predict(dv.fit_transform(test_data))


array([1])

**Gaussian**

Gaussian Naive Bayes is useful when working with continuous values which probabilities can be modeled using a Gaussian distribution:

In [8]:
from sklearn.datasets import make_classification

nb_samples = 300
X, Y = make_classification(n_samples=nb_samples, n_features=2, n_informative=2, n_redundant=0)

In [9]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)
gnb = GaussianNB()
gnb.fit(X_train, Y_train)
Y_gnb_score = gnb.predict_proba(X_test)
lr = LogisticRegression() 
lr.fit(X_train, Y_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
Y_lr_score = lr.decision_function(X_test)
fpr_gnb, tpr_gnb, thresholds_gnb = roc_curve(Y_test, Y_gnb_score[:, 1])
fpr_lr, tpr_lr, thresholds_lr = roc_curve(Y_test, Y_lr_score)

In [11]:
from sklearn.datasets import load_digits
from sklearn.model_selection import cross_val_score

digits = load_digits()

gnb = GaussianNB()
mnb = MultinomialNB()

cross_val_score(gnb, digits.data, digits.target, scoring='accuracy', cv=10).mean()

0.8103537583567821

In [12]:
cross_val_score(mnb, digits.data, digits.target, scoring='accuracy', cv=10).mean()


0.8819396216300838