## Naive Bayes: Mushroom Classification

The goal is to predict the class of a mushroom given some of its features. We will use a naive Bayes model for this classification.



### Load the dataset

In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [9]:
# Read the mushroom table; the path is relative, so mushrooms.csv must sit in the notebook's working directory.
df = pd.read_csv('mushrooms.csv')

In [11]:
# Peek at the first five rows of the raw table; the values shown are single-letter codes.
print(df.head(n=5))

  class cap-shape cap-surface cap-color bruises odor gill-attachment  \
0     p         x           s         n       t    p               f   
1     e         x           s         y       t    a               f   
2     e         b           s         w       t    l               f   
3     p         x           y         w       t    p               f   
4     e         x           s         g       f    n               f   

  gill-spacing gill-size gill-color  ... stalk-surface-below-ring  \
0            c         n          k  ...                        s   
1            c         b          k  ...                        s   
2            c         b          n  ...                        s   
3            c         n          n  ...                        s   
4            w         b          k  ...                        s   

  stalk-color-above-ring stalk-color-below-ring veil-type veil-color  \
0                      w                      w         p          w   
1       

## Encode the categorical data into numeric data

In [12]:
# Fit a separate LabelEncoder for every column and keep each one in `encoders`.
# Re-fitting a single encoder on each column in turn would leave it holding only
# the last column's mapping, so the integer codes of every other column
# (including 'class') could not be decoded back to the original letters.
le = LabelEncoder()  # stays bound even if df has no columns; rebound in the loop
encoders = {}
for column in df.columns:
    le = LabelEncoder().fit(df[column])
    encoders[column] = le
# After the loop `le` is the encoder fitted on the last column.

# Applies the matching transformation on each column
ds = df.apply(lambda col: encoders[col.name].transform(col))

In [13]:
# Check what kind of object the column-wise apply returned.
print(type(ds))

<class 'pandas.core.frame.DataFrame'>


In [15]:
# First five rows of the encoded table, to compare with the raw rows printed earlier.
print(ds.head(n=5))

   class  cap-shape  cap-surface  cap-color  bruises  odor  gill-attachment  \
0      1          5            2          4        1     6                1   
1      0          5            2          9        1     0                1   
2      0          0            2          8        1     3                1   
3      1          5            3          8        1     6                1   
4      0          5            2          3        0     5                1   

   gill-spacing  gill-size  gill-color  ...  stalk-surface-below-ring  \
0             0          1           4  ...                         2   
1             0          0           4  ...                         2   
2             0          0           5  ...                         2   
3             0          1           5  ...                         2   
4             1          0           4  ...                         2   

   stalk-color-above-ring  stalk-color-below-ring  veil-type  veil-color  \
0         

In [19]:
# Convert the encoded DataFrame to a plain NumPy array and inspect it.
data = ds.to_numpy()
print(data.shape)
print(type(data))
print(data[:5])


# Every column after the first is a feature of the mushroom ...
data_x = data[:,1:]
# ... and the first column is the class of the mushroom (the label to predict).
data_y = data[:,0]

(8124, 23)
<class 'numpy.ndarray'>
[[1 5 2 4 1 6 1 0 1 4 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 9 1 0 1 0 0 4 0 2 2 2 7 7 0 2 1 4 3 2 1]
 [0 0 2 8 1 3 1 0 0 5 0 2 2 2 7 7 0 2 1 4 3 2 3]
 [1 5 3 8 1 6 1 0 1 5 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 3 0 5 1 1 0 4 1 3 2 2 7 7 0 2 1 0 3 0 1]]


## Split the data into training and test sets

In [20]:
# Hold out 20% of the samples for testing. random_state pins the shuffle so that
# the split, and every result computed from it, is the same on each
# Restart & Run All instead of changing from run to run.
x_train,x_test,y_train,y_test = train_test_split(data_x,data_y,test_size=0.2,random_state=42)

In [21]:
# Shapes of the feature matrices (train, test), then of the label vectors (train, test).
print(x_train.shape,x_test.shape)
print(y_train.shape,y_test.shape)

(6499, 22) (1625, 22)
(6499,) (1625,)


In [23]:
np.unique(y_train) # this gives the distinct mushroom classes present in the training data set.

array([0, 1])

## Building Classifier

In [24]:
def prior_prob(y_train,label):
    """Return the prior probability P(label) estimated from the training labels.

    The estimate is the fraction of entries in ``y_train`` that equal ``label``.

    Parameters
    ----------
    y_train : numpy array of training labels.
    label : the class value whose prior is wanted.

    Returns
    -------
    The number of matching entries divided by ``y_train.shape[0]``.
    """
    n_total = float(y_train.shape[0])
    n_in_class = (y_train == label).sum()
    return n_in_class / n_total

In [25]:
def conditional_prob(x_train,y_train,feature_col,feature_val,label):
    """Return P(feature == feature_val | class == label) from frequency counts.

    Parameters
    ----------
    x_train : 2-D numpy array of training features (rows are samples).
    y_train : numpy array of training labels, one per row of ``x_train``.
    feature_col : column index of the feature in ``x_train``.
    feature_val : the feature value whose probability is wanted.
    label : the class to condition on.

    Returns
    -------
    Among the training rows whose label equals ``label``, the fraction whose
    column ``feature_col`` equals ``feature_val``. No smoothing is applied, so
    a value never seen together with ``label`` gives 0.0.
    """
    in_class = (y_train == label)
    # Column of interest, restricted to the rows that belong to `label`.
    class_column = x_train[in_class, feature_col]
    n_matching = (class_column == feature_val).sum()
    n_in_class = in_class.sum()
    return n_matching / float(n_in_class)

## Compute the posterior probability for each test example and make predictions

In [28]:
def predict(x_train,y_train,x_test):
    """Predict the class label of a single test point with naive Bayes.

    For every class present in ``y_train`` the unnormalised posterior
    ``prior * product of per-feature conditional probabilities`` is computed
    from the training data, and the class with the largest value is returned.

    Parameters
    ----------
    x_train : 2-D numpy array of training features (rows are samples).
    y_train : numpy array of training labels, one per row of ``x_train``.
    x_test : a single testing point with n features (one value per column
        of ``x_train``).

    Returns
    -------
    The label (an element of ``np.unique(y_train)``) with the highest
    posterior. On a tie the first such label in sorted order is returned.

    Notes
    -----
    The conditional probabilities are plain frequency ratios, so a feature
    value that never occurs with a class drives that class's posterior to 0.
    """
    classes = np.unique(y_train)
    n_features = x_train.shape[1]
    post_probability = [] # List of prob for all classes and given a single testing point
    # compute posterior prob for each class.
    for label in classes:
        # post_c = likelihood*prior
        likelihood = 1.0
        for f in range(n_features):
            cond = conditional_prob(x_train,y_train,f,x_test[f],label)
            likelihood *= cond
        prior = prior_prob(y_train,label)
        post = likelihood*prior
        post_probability.append(post)
    # np.argmax gives a position in `classes`, not a label. Map it back so the
    # result is also correct when the labels are not exactly 0..k-1.
    best = np.argmax(post_probability)
    return classes[best]

## Predicting the class of the mushrooms

In [31]:
# Sanity check on one test sample: print the predicted label, then the true label.
output = predict(x_train,y_train,x_test[1])
print(output)
print(y_test[1])

1
1


## Calculating the accuracy

In [32]:
def score(x_train,y_train,x_test,y_test):
    """Return the accuracy of the naive Bayes classifier on a test set.

    Each row of ``x_test`` is classified with ``predict`` using the given
    training data, and the predictions are compared with ``y_test``.

    Parameters
    ----------
    x_train, y_train : training features and labels passed on to ``predict``.
    x_test : 2-D numpy array of test features (rows are samples).
    y_test : numpy array of true labels for the rows of ``x_test``.

    Returns
    -------
    The number of predictions equal to the true label, divided by
    ``y_test.shape[0]``.
    """
    n_samples = x_test.shape[0]
    predictions = np.array(
        [predict(x_train,y_train,x_test[row]) for row in range(n_samples)]
    )
    n_correct = np.sum(predictions == y_test)
    return n_correct/y_test.shape[0]

In [34]:
# Accuracy on the held-out test set; score calls predict once for every test row.
print(score(x_train,y_train,x_test,y_test))

0.9956923076923077
