In [1]:
import pandas as pd
import numpy as np

In [2]:
## LOAD THE DTAA
data = pd.read_csv("mushrooms.csv")
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
print(data.shape)

(8124, 23)


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [34]:
X, y = data.drop(['class'], axis=1), data['class']
print(X.shape, y.shape)

(8124, 22) (8124,)


In [35]:
# Get numpy array
X = X.values
y = y.values

In [16]:
class NaiveBayes:

    def fit(self, X, y):
        self.X = X
        self.y = y
        self.classes, count = np.unique(y, return_counts=True)
        self.priors = count/y.shape[0]

    def likelihood(self, x, cls):

        # compute product of P(xi|Y=cls)
        X = self.X[ self.y == cls ].copy()
        m = X.shape[0]
        p = 1
        for i in range(X.shape[1]):
            count = ( X[:,i] == x[i] ).sum()
            prob = count/m
            p *= prob
        return p

    def predict_helper(self, x):

        posteriors = []
        # compute posterior probabilty for each class
        for i, k in enumerate(self.classes):
            p = self.likelihood(x, k)*self.priors[i]
            posteriors.append(p)
        
        ix = np.argmax(posteriors)
        return self.classes[ix], posteriors[ix]/sum(posteriors) # pred_class, prob


    def predict(self, X):
        ypred = []
        for i in range(len(X)):
            pred = self.predict_helper(X[i])
            ypred.append(pred)
        return np.array(ypred) 
        

In [22]:
from sklearn.model_selection import train_test_split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(6499, 22) (6499,) (1625, 22) (1625,)


In [25]:
nb = NaiveBayes()
nb.fit(X_train, y_train)

print(nb.classes)
print(nb.priors)

['e' 'p']
[0.51638714 0.48361286]


In [26]:
pred = nb.predict(X_test)

In [20]:
pred[:2]

array([['p', '1.0'],
       ['e', '1.0']], dtype='<U32')

In [27]:
# Testing accuracy
(pred[:,0] == y_test).mean()

0.9956923076923077

In [None]:
## Predicting on training data
train_pred = nb.predict(X_train)

In [30]:
(train_pred[:,0] == y_train ).mean() # train accuracy

0.9973842129558393

In [31]:
np.unique(X[:,0])

array(['b', 'c', 'f', 'k', 's', 'x'], dtype=object)

In [33]:
count = ( X[:,0] == 'z' ).sum()
count

0