## Day 21 Naive Bayes
13/Jan/2022, Thursday

In [52]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import CategoricalNB, GaussianNB, MultinomialNB, BernoulliNB 

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
from sklearn.preprocessing import LabelEncoder

## Load Data

In [5]:
# Load the mushroom dataset; the path is resolved relative to the current working directory.
data = pd.read_csv("./mushrooms.csv")

In [6]:
# (n_rows, n_columns) of the loaded DataFrame.
data.shape

(8124, 23)

In [7]:
# Preview the first rows (pandas shows five by default) to inspect column names and raw values.
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


## Train test split

In [9]:
# Target column ('class') pulled out of the frame as a plain NumPy array.
labels = data['class'].values

In [10]:
# Confirm that the targets are held in a NumPy array rather than a pandas Series.
type(labels)

numpy.ndarray

In [12]:
# Feature matrix: every column except the target, as a NumPy array.
X = data.drop(columns=['class']).values
print(X.shape)

(8124, 22)


In [13]:
# Confirm that the feature matrix is a NumPy array.
type(X)

numpy.ndarray

In [14]:
# Display the raw feature matrix (values are still the original category strings at this point).
X

array([['x', 's', 'n', ..., 'k', 's', 'u'],
       ['x', 's', 'y', ..., 'n', 'n', 'g'],
       ['b', 's', 'w', ..., 'n', 'n', 'm'],
       ...,
       ['f', 's', 'n', ..., 'b', 'c', 'l'],
       ['k', 'y', 'n', ..., 'w', 'v', 'l'],
       ['x', 's', 'n', ..., 'o', 'c', 'l']], dtype=object)

In [18]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.2, random_state=42
)
# Report (features, targets) shapes for the training split, then the test split.
for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, targets.shape)

(6499, 22) (6499,)
(1625, 22) (1625,)


## Label Encoding

In [20]:
def label_encoding(d):
    """Integer-encode every column of a 2-D array with its own ``LabelEncoder``.

    Parameters
    ----------
    d : numpy.ndarray
        Two-dimensional array; each column is encoded independently.

    Returns
    -------
    tuple
        ``(encoders, encoded)`` where ``encoders[i]`` is the ``LabelEncoder``
        fitted on column ``i`` and ``encoded`` is an array of the same shape
        as ``d`` (NumPy's default float dtype) holding the integer codes.
    """
    n_columns = d.shape[1]
    # One independent encoder per column so each keeps its own category list.
    encoders = [LabelEncoder() for _ in range(n_columns)]
    encoded = np.empty(d.shape)
    for col, encoder in enumerate(encoders):
        encoded[:, col] = encoder.fit_transform(d[:, col])
    return encoders, encoded

In [21]:
## Label encoding of training data
# Encoders are fitted on the training split only; `les` keeps the fitted
# encoders (one per column) so the same mapping can be reused on other data.
les, x_train = label_encoding(X_train)
print(x_train.shape)

(6499, 22)


In [23]:
# Compare the encoded codes of the first feature with its original category strings.
for matrix in (x_train, X_train):
    print(np.unique(matrix[:, 0]))

[0. 1. 2. 3. 4. 5.]
['b' 'c' 'f' 'k' 's' 'x']


In [26]:
# Number of fitted encoders, followed by the categories each one learned (by column index).
print(len(les))
for col, encoder in enumerate(les):
    print(col, encoder.classes_)

22
0 ['b' 'c' 'f' 'k' 's' 'x']
1 ['f' 'g' 's' 'y']
2 ['b' 'c' 'e' 'g' 'n' 'p' 'r' 'u' 'w' 'y']
3 ['f' 't']
4 ['a' 'c' 'f' 'l' 'm' 'n' 'p' 's' 'y']
5 ['a' 'f']
6 ['c' 'w']
7 ['b' 'n']
8 ['b' 'e' 'g' 'h' 'k' 'n' 'o' 'p' 'r' 'u' 'w' 'y']
9 ['e' 't']
10 ['?' 'b' 'c' 'e' 'r']
11 ['f' 'k' 's' 'y']
12 ['f' 'k' 's' 'y']
13 ['b' 'c' 'e' 'g' 'n' 'o' 'p' 'w' 'y']
14 ['b' 'c' 'e' 'g' 'n' 'o' 'p' 'w' 'y']
15 ['p']
16 ['n' 'o' 'w' 'y']
17 ['n' 'o' 't']
18 ['e' 'f' 'l' 'n' 'p']
19 ['b' 'h' 'k' 'n' 'o' 'r' 'u' 'w' 'y']
20 ['a' 'c' 'n' 's' 'v' 'y']
21 ['d' 'g' 'l' 'm' 'p' 'u' 'w']


In [27]:
## label encoding of X_test
# Reuse the encoders fitted on the training split (transform only, no re-fit)
# so both splits share the same category -> code mapping.
x_test = np.empty(X_test.shape)
for col, encoder in enumerate(les):
    x_test[:, col] = encoder.transform(X_test[:, col])

In [28]:
# Inspect the encoded test features (codes are stored in a float array).
x_test

array([[2., 0., 4., ..., 3., 3., 1.],
       [2., 2., 2., ..., 7., 4., 2.],
       [5., 3., 4., ..., 7., 4., 2.],
       ...,
       [5., 3., 4., ..., 7., 4., 4.],
       [3., 3., 4., ..., 7., 4., 4.],
       [5., 0., 9., ..., 1., 5., 4.]])

In [38]:
# Distinct target values; the result is not displayed because it is not the cell's last expression.
np.unique(labels)
# Numeric coding of the two target classes.
map_ = {'e': 0, 'p': 1}
Y_train = np.array([map_[label] for label in y_train])

In [39]:
# Sanity check: one encoded target per training row.
Y_train.shape

(6499,)

In [40]:
# Encode the test targets with the same mapping used for the training targets.
Y_test = np.array([map_[label] for label in y_test])
print(Y_test.shape)

(1625,)


In [41]:
# Distinct values of the encoded test targets, then of the original string targets.
for targets in (Y_test, y_test):
    print(np.unique(targets))

[0 1]
['e' 'p']


## Build the model

In [42]:
# Naive Bayes classifier for categorically distributed features, with default hyper-parameters.
nb = CategoricalNB()

In [43]:
# Train: fit the classifier on the encoded training features and the 0/1 targets.
nb.fit(x_train, Y_train)

CategoricalNB()

In [47]:
# Class labels the fitted model knows about.
nb.classes_

array([0, 1])

In [48]:
# Log prior probability of each class, in the same order as nb.classes_.
nb.class_log_prior_

array([-0.65822036, -0.72933817])

In [50]:
# Predict the held-out split and report the mean accuracy on it.
# NOTE(review): test_pred is not used by any later cell visible here.
test_pred = nb.predict(x_test)
test_acc = nb.score(x_test, Y_test)
print("Test accuracy", test_acc)

Test accuracy 0.9507692307692308


In [51]:
# Mean accuracy on the training split, for comparison with the test accuracy.
train_acc = nb.score(x_train, Y_train)
print("Training Accuracy", train_acc)

Training Accuracy 0.955377750423142


## HW
## Implement Naive Bayes

In [None]:
class NaiveBayes:
    """Categorical Naive Bayes classifier with additive (Laplace) smoothing.

    Every feature is treated as a discrete variable that is conditionally
    independent of the other features given the class, so the posterior is
    proportional to ``P(y=c) * prod_i P(x_i | y=c)``.  Products are evaluated
    as sums of logarithms so that many small probabilities do not underflow.

    Parameters
    ----------
    alpha : float, default=1.0
        Pseudo-count added to every (class, category) count.  Must be
        non-negative; a positive value prevents zero probabilities.

    Attributes
    ----------
    classes : numpy.ndarray of shape (n_classes,)
        Sorted distinct target values seen by ``fit``.
    class_priors : numpy.ndarray of shape (n_classes,)
        Empirical prior ``P(y=c)`` for each entry of ``classes``.
    categories : list of numpy.ndarray
        ``categories[i]`` holds the sorted distinct values of feature ``i``.
    feature_prob : list of numpy.ndarray
        ``feature_prob[i][k, v]`` is the smoothed estimate of
        ``P(x_i = categories[i][v] | y = classes[k])``.
    """

    def __init__(self, alpha=1.0):
        if alpha < 0:
            raise ValueError("alpha must be non-negative")
        self.alpha = alpha
        # Populated by fit(); None marks the model as not fitted yet.
        self.classes = None
        self.class_priors = None
        self.categories = None
        self.feature_prob = None

    # training
    def fit(self, X, y):
        """Estimate class priors and per-feature likelihoods.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Categorical features (strings or numeric codes).
        y : array-like of shape (n_samples,)
            Target labels.

        Returns
        -------
        NaiveBayes
            The fitted instance itself.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or disagrees with ``y`` in length.
        """
        X = np.asarray(X)
        y = np.asarray(y).ravel()
        if X.ndim != 2:
            raise ValueError("X must be 2-D with shape (n_samples, n_features)")
        if X.shape[0] == 0:
            raise ValueError("cannot fit on an empty dataset")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        # compute priors of each class
        self.classes, y_codes, class_counts = np.unique(
            y, return_inverse=True, return_counts=True
        )
        y_codes = y_codes.ravel()
        self.class_priors = class_counts / class_counts.sum()

        # compute likelihood, P(xi|y=c) for each feature
        self.categories = []
        self.feature_prob = []
        self._category_index = []
        n_classes = len(self.classes)
        for i in range(X.shape[1]):
            cats, x_codes = np.unique(X[:, i], return_inverse=True)
            x_codes = x_codes.ravel()
            counts = np.zeros((n_classes, len(cats)))
            # Unbuffered add so repeated (class, category) pairs all count.
            np.add.at(counts, (y_codes, x_codes), 1)
            smoothed = counts + self.alpha
            self.feature_prob.append(smoothed / smoothed.sum(axis=1, keepdims=True))
            self.categories.append(cats)
            # Value -> column lookup used at prediction time.
            self._category_index.append(
                {cat: k for k, cat in enumerate(cats.tolist())}
            )

        # Cache logarithms once; alpha == 0 can legitimately yield log(0) = -inf.
        with np.errstate(divide="ignore"):
            self._class_log_prior = np.log(self.class_priors)
            self._feature_log_prob = [np.log(p) for p in self.feature_prob]
        return self

    def _joint_log_likelihood(self, X):
        """Return ``log P(y=c) + sum_i log P(x_i | y=c)`` for every row of 2-D ``X``."""
        scores = np.tile(self._class_log_prior, (X.shape[0], 1))
        for i, (index, log_prob) in enumerate(
            zip(self._category_index, self._feature_log_prob)
        ):
            codes = np.array(
                [index.get(value, -1) for value in X[:, i].tolist()], dtype=int
            )
            # A value never seen during fit carries no evidence for any class,
            # so that feature is skipped for the affected rows.
            seen = codes >= 0
            scores[seen] += log_prob[:, codes[seen]].T
        return scores

    def predict(self, X):
        """Predict the most probable class for each sample.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or (n_features,)
            A 1-D input is interpreted as a single sample.

        Returns
        -------
        numpy.ndarray of shape (n_samples,), or a single label for 1-D input.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        ValueError
            If the number of features differs from the data used in ``fit``.
        """
        if self.classes is None:
            raise RuntimeError("NaiveBayes is not fitted yet; call fit() first")
        X = np.asarray(X)
        single = X.ndim == 1
        if single:
            X = X.reshape(1, -1)
        if X.ndim != 2 or X.shape[1] != len(self.feature_prob):
            raise ValueError(
                "X must have %d features per sample" % len(self.feature_prob)
            )
        # compute posterior prob for each class, P(y=c|X), up to a shared constant
        post = self._joint_log_likelihood(X)
        predicted = self.classes[np.argmax(post, axis=1)]
        return predicted[0] if single else predicted

    def score(self, X, y):
        """Return the mean accuracy of ``predict(X)`` against the labels ``y``."""
        return float(np.mean(self.predict(X) == np.asarray(y).ravel()))