# NAIVE BAYES - MUSHROOM DATASET
The goal is to predict the class of a mushroom given some of its features. We will use
a naive Bayes model for this classification.

## Load the dataset

In [1]:
import numpy as np
import pandas as pd

In [7]:
# Load the mushroom CSV (the path is relative to the current working directory).
df = pd.read_csv("./Mushrooms/mushrooms.csv")
print(df.shape)
# Last expression of the cell: the notebook renders the first five rows.
df.head()

(8124, 23)


Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


#### The data is categorical, but we need it to be numerical, so we will use sklearn to encode it as numbers

In [8]:
from sklearn.preprocessing import LabelEncoder

In [15]:
# Make an object of the LabelEncoder class.
le = LabelEncoder()
# df.apply calls le.fit_transform once per column, so every column is encoded
# to integers independently. The same encoder is refit on each call, so
# afterwards `le` only holds the classes of the last column it saw.
ds = df.apply(le.fit_transform)
ds.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


#### Convert DF into a Numpy array

In [17]:
# .values exposes the encoded DataFrame as a NumPy array.
data = ds.values
print(type(data))

<class 'numpy.ndarray'>


## Break the data into TRAIN and TEST

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# Column 0 is used as the target (the `type` column in the tables above);
# all remaining columns are the input features.
data_x = data[:,1:]
data_y = data[:,0]

In [31]:
# Hold out 20% of the rows for testing. No random_state is passed, so the
# split (and everything computed from it) differs from run to run.
x_train,x_test,y_train,y_test =train_test_split(data_x,data_y,test_size=0.2)

In [32]:
# Sanity check of the training split: (rows, features), then one label per row.
print(x_train.shape)
print(y_train.shape)

(6499, 22)
(6499,)


In [34]:
# Distinct class labels present in the training set.
# After encoding, one mushroom type is class 1 and the other is class 0.
np.unique(y_train)

array([0, 1])

# Building our Classifier!!

In [37]:
# Prior probability of a class, estimated from the training labels.
def prior(y_train, label):
    """Return the empirical prior P(y = label).

    Args:
        y_train: NumPy array of training labels; ``y_train.shape[0]`` is
            used as the total number of labels.
        label: The class value whose relative frequency is wanted.

    Returns:
        The number of entries equal to ``label`` divided by the total
        number of labels.
    """
    # Total number of labels, converted to float up front.
    n_samples = float(y_train.shape[0])

    # How many of the labels match the requested class.
    n_matching = (y_train == label).sum()

    return n_matching / n_samples

In [53]:
# The likelihood is the product of the per-feature conditional probabilities
# P(x_i = value | y = label). Each one is estimated as: among the training
# rows that carry this label, the fraction whose feature equals the value.

def conditional(x_train, y_train, feature_col, feature_value, label):
    """Estimate P(feature == feature_value | y == label) from training data.

    Args:
        x_train: 2-D NumPy array of encoded features, one row per sample.
        y_train: 1-D NumPy array of labels aligned with the rows of
            ``x_train``.
        feature_col: Column index of the feature inside ``x_train``.
        feature_value: Encoded category to look up (e.g. the code for a
            colour such as green or yellow).
        label: The class to condition on.

    Returns:
        The fraction of rows of class ``label`` whose feature equals
        ``feature_value``, or ``0.0`` when no training row has that label.
    """
    # First keep only the training rows that belong to the requested class.
    x_filtered = x_train[y_train == label]

    # Number of training rows in which the label occurs.
    denominator = x_filtered.shape[0]
    if denominator == 0:
        # Without this guard the division below is 0/0, which yields NaN
        # (plus a RuntimeWarning) and poisons any product built from it.
        return 0.0

    # Within that class, count how often the feature takes the given value.
    numerator = np.sum(x_filtered[:, feature_col] == feature_value)

    # NOTE: no smoothing is applied, so a value never seen together with
    # this class yields exactly 0.
    return numerator / denominator

## Now compute the posterior probability (likelihood * prior) of each class for a test example and make a prediction

In [56]:
def predict(x_train, y_train, x_test):
    """Predict the class of a single test point with naive Bayes.

    For every class found in ``y_train`` the unnormalised posterior
    ``prior * product of per-feature conditionals`` is computed, and the
    class with the largest value wins.

    Args:
        x_train: 2-D NumPy array of encoded training features.
        y_train: 1-D NumPy array of training labels.
        x_test: A single test point, indexable by feature position
            (``x_train.shape[1]`` features are read from it).

    Returns:
        The class label (an element of ``np.unique(y_train)``) with the
        highest posterior. On ties the first such class is returned.
    """
    # Unique labels, i.e. the types of mushroom.
    classes = np.unique(y_train)
    n_features = x_train.shape[1]

    # Posterior of each class, in the same order as `classes`.
    post_probs = []

    for label in classes:
        # The likelihood is the product of the conditional probability of
        # every feature value of the test point, given this class.
        likelihood = 1.0
        for f in range(n_features):
            # x_test[f] is the f-th feature of the test point.
            likelihood *= conditional(x_train, y_train, f, x_test[f], label)

        # The prior sits outside the feature loop because it is multiplied
        # in only once per class.
        posterior = prior(y_train, label) * likelihood
        post_probs.append(posterior)

    # argmax yields a position in `classes`; map it back to the label so the
    # result is correct even when the labels are not exactly 0..k-1.
    return classes[np.argmax(post_probs)]

##### Let's test our classifier

In [60]:
# Classify one held-out example and compare the result with its true label.
output = predict(x_train,y_train,x_test[1])
# our classifier's output
print(output)
# true value in the dataset
print(y_test[1])

0
0
