# Naive Bayes - Mushroom Classification
- Goal is to predict the class of mushrooms, given some features of the mushrooms. We will use Naive Bayes Model for this classification.

## Load Dataset

In [1]:

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('./mushrooms.csv')
df.head(n=10)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
5,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
6,e,b,s,w,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,n,m
7,e,b,y,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,s,m
8,p,x,y,w,t,p,f,c,n,p,...,s,w,w,p,w,o,p,k,v,g
9,e,b,s,y,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m


In [3]:
df.shape

(8124, 23)

## Encode Categorical Data into numeric Data

In [7]:
le=LabelEncoder()
ds=df.apply(le.fit_transform)
ds.head()
# print(type(ds))

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [25]:
#coverting dataframe to numpy 
data = ds.values
# print(data.shape)
# print(type(data))
# print(data[:5,:])
data_y = data[:,0]
data_x = data[:,1:]

## Break inti train and test

In [26]:
x_train,x_test,y_train,y_test = train_test_split(data_x,data_y,test_size=0.2)

In [27]:
x_train.shape,y_train.shape,x_test.shape,y_test.shape

((6499, 22), (6499,), (1625, 22), (1625,))

In [28]:
np.unique(y_train)

array([0, 1])

# building our classifier

In [29]:
def prior_prob(y_train,label):
    total_examples=y_train.shape[0]
    label_match=np.sum(y_train==label)
    return (label_match)/float(total_examples)

In [30]:
def cond_prob(x_train,y_train,feature_col,feature_val,label):
    # feature col ke andar which val green red or brown mushroom 
    x_filtered = x_train[y_train==label]# taking only those rows
    numerator = np.sum(x_filtered[:,feature_col]==feature_val)
    denominator = np.sum(y_train==label)
    return numerator/float(denominator)

In [31]:
def predict(x_train,y_train,xtest):
    '''xtest is a single testing point '''
    classes=np.unique(y_train)
    no_of_features=x_train.shape[1]
    post_probs=[]
    for label in classes:
        likelihood=1.0
        for f in range(no_of_features):
            likelihood*=cond_prob(x_train,y_train,f,xtest[f],label)
        prior=prior_prob(y_train,label)
        post=likelihood*prior
        post_probs.append(post)
    pred=np.argmax(post_probs)
    return pred

In [36]:
output = predict(x_train,y_train,x_test[3])
print(output)
print(y_test[3])

1
1


In [43]:
def score(x_train,y_train,x_test,y_test):
    n=x_test.shape[0]
    preds=[]
    for i in range(n):
        predicition=predict(x_train,y_train,x_test[i])
        preds.append(prediction)
    accuracy=np.sum(preds==y_test)
    return accuracy
        

In [44]:
print(score(x_train,y_train,x_test,y_test))

NameError: name 'prediction' is not defined