### Naive Bayes classifiers
- Gaussian Naive Bayes
- Multinomial Naive Bayes
- Bernoulli Naive Bayes
- Categorical Naive Bayes

In [28]:
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split

#### Gaussian Naive Bayes
- continuous features

In [1]:
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_iris

In [2]:
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                        test_size=0.5, random_state=0)

In [8]:
Counter(y)

Counter({0: 50, 1: 50, 2: 50})

In [7]:
X_train.shape

(75, 4)

In [9]:
X_train

array([[4.6, 3.1, 1.5, 0.2],
       [5.9, 3. , 5.1, 1.8],
       [5.1, 2.5, 3. , 1.1],
       [4.6, 3.4, 1.4, 0.3],
       [6.2, 2.2, 4.5, 1.5],
       [7.2, 3.6, 6.1, 2.5],
       [5.7, 2.9, 4.2, 1.3],
       [4.8, 3. , 1.4, 0.1],
       [7.1, 3. , 5.9, 2.1],
       [6.9, 3.2, 5.7, 2.3],
       [6.5, 3. , 5.8, 2.2],
       [6.4, 2.8, 5.6, 2.1],
       [5.1, 3.8, 1.6, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [6.5, 3.2, 5.1, 2. ],
       [6.7, 3.3, 5.7, 2.1],
       [4.5, 2.3, 1.3, 0.3],
       [6.2, 3.4, 5.4, 2.3],
       [4.9, 3. , 1.4, 0.2],
       [5.7, 2.5, 5. , 2. ],
       [6.9, 3.1, 5.4, 2.1],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [7.2, 3. , 5.8, 1.6],
       [5.1, 3.5, 1.4, 0.3],
       [4.4, 3. , 1.3, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [5.5, 2.3, 4. , 1.3],
       [6.8, 3.2, 5.9, 2.3],
       [7.6, 3. , 6.6, 2.1],
       [5.1, 3.5, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [5.7, 2.8, 4.5, 1.3],
       [6.6, 3

In [3]:
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)

In [4]:
print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 75 points : 4


- Explore model attributes

In [23]:
# training data class distribution 
gnb.class_count_

array([29., 20., 26.])

In [29]:
np.round(gnb.class_prior_,3)

array([0.387, 0.267, 0.347])

In [30]:
gnb.n_features_in_

4

In [33]:
# variance of each feature per class
np.round(gnb.var_, 3)

array([[0.103, 0.132, 0.016, 0.008],
       [0.256, 0.083, 0.255, 0.046],
       [0.389, 0.101, 0.313, 0.048]])

In [34]:
# mean of each feature per class
np.round(gnb.theta_, 3)

array([[4.976, 3.359, 1.448, 0.234],
       [5.935, 2.71 , 4.185, 1.3  ],
       [6.777, 3.092, 5.735, 2.108]])

#### Multinomial Naive Bayes
- primarily used in text classification
- data are typically represented as count vector representation of text

In [53]:
from sklearn.naive_bayes import MultinomialNB

In [54]:
from sklearn.datasets import fetch_20newsgroups
news_train = fetch_20newsgroups(subset='train', 
                                categories=['alt.atheism','comp.graphics'], 
                                shuffle=True, random_state=42)

In [55]:
len(news_train.data)

1064

In [56]:
Counter(news_train.target)

Counter({0: 480, 1: 584})

In [57]:
# vectorize the training data
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(stop_words='english', max_df=0.8, min_df=5)
X_train_vec = count_vect.fit_transform(news_train.data)
X_train_vec.shape

(1064, 4230)

In [58]:
# fit the classifier
# self-explore other parameter settings: alpha, class_prior
mnb = MultinomialNB(alpha=1.0, class_prior=None)  
mnb.fit(X_train_vec, news_train.target)

MultinomialNB()

In [59]:
# vectorize the test data
news_test = fetch_20newsgroups(subset='test', 
                               categories=['alt.atheism','comp.graphics'], 
                               shuffle=True, random_state=42)

X_test_vec = count_vect.transform(news_test.data)
X_test_vec.shape

(708, 4230)

In [60]:
# apply the fitted classifier to predict for test data
y_pred = mnb.predict(X_test_vec)

# model accuracy
np.round(mnb.score(X_test_bi, news_test.target), 3)

0.979

In [61]:
# check wrong predictions 
print("Number of mislabeled data out of a total %d data : %d"
      % (X_test_vec.shape[0], (news_test.target != y_pred).sum()))

Number of mislabeled data out of a total 708 data : 11


- Explore model attributes

In [62]:
# training data class distribution
mnb.class_count_

array([480., 584.])

In [63]:
# smoothed empirical log probability for each class
np.round(mnb.class_log_prior_,3)

array([-0.796, -0.6  ])

In [64]:
# Empirical log probability of features given a class, P(x_i|y).
np.round(mnb.feature_log_prob_,3)

array([[ -9.793,  -7.624,  -8.614, ..., -11.179, -10.486, -11.179],
       [ -7.53 ,  -8.195,  -8.141, ...,  -8.687,  -7.153,  -8.687]])

#### Bernoulli Naive Bayes
- Assumes each feature is a binary-valued variable

In [65]:
from sklearn.naive_bayes import BernoulliNB

In [66]:
# use the same dataset but with binary vectorization
bi_vect = CountVectorizer(stop_words='english', binary=True,
                             max_df=0.8, min_df=5)
X_train_bi = bi_vect.fit_transform(news_train.data)
X_train_bi.shape

(1064, 4230)

In [67]:
# vectorize the test data
X_test_bi = bi_vect.transform(news_test.data)
X_test_bi.shape

(708, 4230)

In [68]:
# build the classifier 
# self-explore other parameter settings
bnb = BernoulliNB(alpha=1.0, fit_prior=True, class_prior=None)

In [69]:
# train and test and the classifier
y_pred = bnb.fit(X_train_bi,news_train.target).predict(X_test_bi)
# model accuracy
np.round(bnb.score(X_test_bi, news_test.target), 3)

0.952

In [70]:
# check wrong predictions 
print("Number of mislabeled data out of a total %d data : %d"
      % (X_test_bi.shape[0], (news_test.target != y_pred).sum()))

Number of mislabeled data out of a total 708 data : 34


- Explore model attributes

In [71]:
# training data class distribution
bnb.class_count_

array([480., 584.])

In [73]:
# smoothed empirical log probability for each class
np.round(bnb.class_log_prior_, 3)

array([-0.796, -0.6  ])

In [74]:
# Empirical log probability of features given a class, P(x_i|y).
np.round(bnb.feature_log_prob_, 3)

array([[-4.792, -3.693, -3.613, ..., -6.178, -5.485, -6.178],
       [-3.329, -3.888, -3.975, ..., -3.975, -3.429, -3.975]])

#### Categorical Naive Bayes
- Assumes each feature has a categorical distribution 
- All features are encoded (OrdinalEncoder, OneHotEncoder)

In [75]:
from sklearn.naive_bayes import CategoricalNB

In [76]:
rng = np.random.RandomState(1)
X = rng.randint(5, size=(6, 100))
y = np.array([1, 2, 3, 4, 5, 6])

In [77]:
X.shape
# 6 samples, each sample represented by 100 features
# each feature is encoded as a categorical variable in between [0,4]

(6, 100)

In [80]:
X

array([[3, 4, 0, 1, 3, 0, 0, 1, 4, 4, 1, 2, 4, 2, 4, 3, 4, 2, 4, 2, 4, 1,
        1, 0, 1, 1, 1, 1, 0, 4, 1, 0, 0, 3, 2, 1, 0, 3, 1, 1, 3, 4, 0, 1,
        3, 4, 2, 4, 0, 3, 1, 2, 0, 4, 1, 2, 2, 1, 0, 1, 3, 4, 3, 1, 3, 0,
        0, 2, 2, 1, 3, 4, 2, 0, 0, 1, 1, 3, 0, 0, 4, 2, 4, 3, 3, 0, 3, 4,
        3, 4, 4, 4, 1, 0, 4, 2, 0, 2, 4, 1],
       [1, 0, 2, 4, 4, 0, 4, 1, 4, 1, 0, 2, 3, 1, 2, 4, 4, 2, 2, 0, 1, 2,
        2, 0, 1, 2, 4, 0, 1, 2, 1, 4, 2, 0, 0, 1, 0, 1, 3, 1, 1, 4, 4, 3,
        0, 3, 0, 3, 1, 2, 4, 0, 0, 3, 1, 1, 0, 0, 4, 2, 3, 4, 2, 0, 3, 3,
        1, 2, 4, 3, 0, 0, 4, 2, 4, 2, 0, 3, 0, 0, 4, 2, 1, 0, 4, 3, 0, 1,
        2, 4, 4, 3, 3, 3, 3, 2, 3, 3, 4, 3],
       [2, 4, 4, 0, 3, 3, 0, 3, 1, 0, 2, 2, 2, 0, 2, 1, 4, 0, 4, 4, 1, 3,
        1, 4, 1, 2, 1, 0, 0, 2, 4, 1, 0, 0, 3, 1, 0, 4, 3, 2, 3, 4, 4, 3,
        0, 0, 0, 4, 1, 4, 1, 2, 2, 4, 3, 4, 4, 0, 3, 2, 4, 3, 4, 2, 3, 0,
        2, 1, 3, 2, 0, 1, 4, 1, 3, 3, 1, 2, 0, 2, 4, 0, 2, 4, 3, 4, 3, 0,
        4, 2, 2, 4, 1,

In [81]:
# build the classifier 
# self-explore different parameter settings
cnb = CategoricalNB(alpha=1.0)

In [83]:
# train and test and the classifier
y_pred = cnb.fit(X,y)

# model prediction for a specific sample
cnb.predict(X[2:3])

array([3])

- Explore model attributes

In [85]:
# training data class distribution
cnb.class_count_

array([1., 1., 1., 1., 1., 1.])

In [94]:
# (6 classes, 5 categories) for 100 features
# cnb.category_count_ # 100*6*5

cnb.category_count_[1] # category count for the first feature

array([[0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.]])

In [95]:
# smoothed empirical log probability for each class
np.round(cnb.class_log_prior_, 3)

array([-1.792, -1.792, -1.792, -1.792, -1.792, -1.792])

In [101]:
# Empirical log probability of categories given the respective feature and class, P(x_i|y).
# corresponds to category_count_
np.round(cnb.feature_log_prob_[1], 3) # log prob for the first feature

array([[-1.792, -1.792, -1.792, -1.792, -1.099],
       [-1.099, -1.792, -1.792, -1.792, -1.792],
       [-1.792, -1.792, -1.792, -1.792, -1.099],
       [-1.792, -1.792, -1.099, -1.792, -1.792],
       [-1.792, -1.792, -1.792, -1.792, -1.099],
       [-1.792, -1.792, -1.792, -1.099, -1.792]])

In [103]:
# number of categories each feature has
cnb.n_categories_

array([4, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 4, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5,
       4, 5, 5, 5, 5, 3, 4, 5, 5, 5, 4, 4, 4, 5, 4, 5, 4, 5, 5, 5, 5, 5,
       4, 5, 3, 5, 5, 5, 5, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4,
       5, 4, 5, 4, 4, 5, 5, 5, 5, 4, 5, 4, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 4, 4, 5, 3, 4, 4, 5, 5])