# Naive Bayes - Mushroom Dataset
- The goal is to predict the class of a mushroom from its categorical features. We will build a Naive Bayes classifier for this task.

# Load the dataset

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Read the mushroom records from a CSV in the working directory and preview the first rows.
df=pd.read_csv('mushrooms.csv')
print(df.head())

  type cap_shape cap_surface cap_color bruises odor gill_attachment  \
0    p         x           s         n       t    p               f   
1    e         x           s         y       t    a               f   
2    e         b           s         w       t    l               f   
3    p         x           y         w       t    p               f   
4    e         x           s         g       f    n               f   

  gill_spacing gill_size gill_color  ... stalk_surface_below_ring  \
0            c         n          k  ...                        s   
1            c         b          k  ...                        s   
2            c         b          n  ...                        s   
3            c         n          n  ...                        s   
4            w         b          k  ...                        s   

  stalk_color_above_ring stalk_color_below_ring veil_type veil_color  \
0                      w                      w         p          w   
1             

In [4]:
# (rows, columns) of the raw table.
print(df.shape)

(8124, 23)


# Encode the Categorical data into numerical data

In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [8]:
# Replace every column's category letters with integer codes.
# NOTE: the same encoder object is re-fit on each column in turn, so after this
# cell `le` only remembers the classes of the last column it processed.
# `df` is rebound to the encoded frame; the original letters are no longer
# reachable under this name.
le=LabelEncoder()
df=df.apply(le.fit_transform)

In [9]:
# Check that apply() still handed back a DataFrame.
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [14]:
# Preview the encoded table: the category letters are now integer codes.
print(df.head())

   type  cap_shape  cap_surface  cap_color  bruises  odor  gill_attachment  \
0     1          5            2          4        1     6                1   
1     0          5            2          9        1     0                1   
2     0          0            2          8        1     3                1   
3     1          5            3          8        1     6                1   
4     0          5            2          3        0     5                1   

   gill_spacing  gill_size  gill_color  ...  stalk_surface_below_ring  \
0             0          1           4  ...                         2   
1             0          0           4  ...                         2   
2             0          0           5  ...                         2   
3             0          1           5  ...                         2   
4             1          0           4  ...                         2   

   stalk_color_above_ring  stalk_color_below_ring  veil_type  veil_color  \
0               

In [15]:
# Switch to a plain NumPy array so rows and columns can be sliced by position.
data=df.values
print(type(data))
print(data.shape)

<class 'numpy.ndarray'>
(8124, 23)


In [16]:
# Column 0 (`type`) is the target; all remaining columns are the features.
data_x=data[:,1:]
data_y=data[:,0]

In [17]:
# Hold out 20% of the rows for evaluation. The split is seeded so that a
# "Restart & Run All" reproduces the same partition and the same accuracy;
# without a seed every run shuffles differently.
x_train,x_test,y_train,y_test=train_test_split(data_x,data_y,test_size=0.2,random_state=42)

In [18]:
# Confirm the split sizes and that features and labels have matching row counts.
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(6499, 22) (6499,)
(1625, 22) (1625,)


In [20]:
# Distinct class codes present in the training labels.
print(np.unique(y_train))

[0 1]


# Building our classifier

In [35]:
def prior_probability(Y_train,label):
    """Return the class prior: the fraction of entries in ``Y_train`` equal to ``label``.

    Args:
        Y_train: 1-D array of training labels.
        label: The class value whose relative frequency is wanted.

    Returns:
        Count of matching labels divided by the total number of labels.
    """
    matches = np.sum(Y_train == label)
    n_samples = Y_train.shape[0]
    return matches / float(n_samples)
def cond_probab(X_train,Y_train,featured_col,featured_val,label):
    """Estimate P(feature == value | class == label) from raw counts.

    Args:
        X_train: 2-D array of training features, one row per example.
        Y_train: 1-D array of training labels aligned with the rows of ``X_train``.
        featured_col: Column index of the feature being examined.
        featured_val: Feature value whose conditional frequency is wanted.
        label: Class to condition on.

    Returns:
        Among the rows belonging to ``label``, the fraction whose
        ``featured_col`` entry equals ``featured_val``.
    """
    in_class = Y_train == label
    class_count = np.sum(in_class)
    # Restrict to the rows of this class, then look only at the requested column.
    column = X_train[in_class][:, featured_col]
    hits = np.sum(column == featured_val)
    return hits / float(class_count)
def predict(x_train,y_train,x_test):
    """Classify a single sample with a count-based Naive Bayes model.

    For every class present in ``y_train`` the unnormalised posterior
    ``P(class) * prod_f P(x_f | class)`` is computed from frequency counts
    in the training data, and the class with the largest value wins.

    Args:
        x_train: 2-D array of training features, one row per example.
        y_train: 1-D array of training labels aligned with the rows of ``x_train``.
        x_test: 1-D feature vector of the sample to classify.

    Returns:
        The predicted class label, taken from the values found in ``y_train``.
        On ties the first class in sorted order is returned.
    """
    n_features=x_train.shape[1]
    classes=np.unique(y_train)
    post_probabs=[]
    for label in classes:
        # Naive independence assumption: multiply the per-feature conditionals.
        likelihood=1.0
        for f in range(n_features):
            likelihood*=cond_probab(x_train,y_train,f,x_test[f],label)
        prior=prior_probability(y_train,label)
        post_probabs.append(prior*likelihood)
    # argmax yields a position in `classes`, not a label; map it back so the
    # result is also correct when the labels are not exactly 0..k-1.
    return classes[np.argmax(post_probabs)]

In [42]:
# Spot-check: classify one test sample and compare with its true label.
output=predict(x_train,y_train,x_test[15])
print(output)
print(y_test[15])

1
1


In [44]:
def score(x_train,y_train,x_test,y_test):
    """Return the accuracy of ``predict`` on a labelled test set.

    Args:
        x_train: 2-D array of training features.
        y_train: 1-D array of training labels.
        x_test: 2-D array of samples to classify, one row per sample.
        y_test: 1-D array of true labels for ``x_test``.

    Returns:
        Fraction of rows in ``x_test`` whose predicted label equals ``y_test``.
    """
    n_samples = x_test.shape[0]
    # Each sample is classified independently against the full training set.
    predicted = np.array(
        [predict(x_train, y_train, x_test[row]) for row in range(n_samples)]
    )
    n_correct = np.sum(predicted == y_test)
    return n_correct / y_test.shape[0]

In [45]:
# Accuracy of the hand-written classifier on the held-out test rows.
print(score(x_train,y_train,x_test,y_test))

0.9963076923076923
