## Naive Bayes - Mushroom Dataset
Goal: Predict the type of mushrooms given some features, using Naive Bayes classification model

### Load Dataset

In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("../Datasets/Mushrooms/mushrooms.csv")

In [4]:
print(df.shape)
df.head()

(8124, 23)


Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


### Encode Categorical data into Numerical data

In [6]:
le = LabelEncoder()

ds = df.apply(le.fit_transform)     # Supply function to apply tranformation to each column (or row, specify in axis)

In [9]:
print(type(ds))
ds.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [10]:
data = ds.values
print(type(data))
print(data.shape)
print(data[:5, :])     # Converted to a numerical numpy array now

<class 'numpy.ndarray'>
(8124, 23)
[[1 5 2 4 1 6 1 0 1 4 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 9 1 0 1 0 0 4 0 2 2 2 7 7 0 2 1 4 3 2 1]
 [0 0 2 8 1 3 1 0 0 5 0 2 2 2 7 7 0 2 1 4 3 2 3]
 [1 5 3 8 1 6 1 0 1 5 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 3 0 5 1 1 0 4 1 3 2 2 7 7 0 2 1 0 3 0 1]]


In [11]:
# Split dataset into X and Y arrays
data_y = data[:, 0]
data_x = data[:, 1:]

### Break dataset into train and test sets

In [13]:
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(6499, 22) (6499,)
(1625, 22) (1625,)


In [23]:
print(np.unique(x_train[:,2]))
print(np.unique(y_train))

[0 1 2 3 4 5 6 7 8 9]
[0 1]


In [26]:
print(le.inverse_transform(y_train[:10]))    # To get back original values

['g' 'g' 'g' 'd' 'g' 'g' 'd' 'd' 'd' 'g']


### Naive Bayes Classifier

$ P(Y|X) = \frac{P(Y) P(X|Y)}{P(X)} = \frac{P(Y) \Pi_{i=1}^{n} P(x_i | y = c)}{P(X)} $ 

where, 
- P(Y) is prior probability
- P(X|Y) is conditional probability

In [36]:
label = 2
y = np.array([1,2,3,3,2,2,1,1,1,1])
print(np.sum(y == label)/len(y))
prior_prob(y, 2)

0.3


0.3

In [34]:
def prior_prob(y_train, label):
    class_examples = np.sum(y_train == label)
    total_examples = y_train.shape[0]
    return class_examples/(float)(total_examples)

In [38]:
def cond_prob(x_train, y_train, feature_col, feature_val, label):
    x_filtered = x_train[y_train == label]
    numerator = np.sum(x_filtered[:, feature_col] == feature_val)
    denominator = np.sum(y_train == label)
    return numerator/float(denominator)

In [45]:
def predict(x_train, y_train, xtest):  # x_test is a single testing point with n features  
    classes = np.unique(y_train)
    n_features = x_train.shape[1]
    post_probs = []                    # List of probability for all classes, given a single testing point
    
    for c in classes:
        # Posterior of class c, Post_c = likelihood*prior, and likelihood = product across all features
        likelihood = 1.0
        
        for f in range(n_features):
            likelihood *= cond_prob(x_train, y_train, f, xtest[f], c)
        prior = prior_prob(y_train, c)
        post = prior*likelihood
        post_probs.append(post)

    pred = np.argmax(post_probs)
    return pred

In [50]:
def score(x_train, y_train, x_test, y_test):
    preds = []
    
    for i in range(x_test.shape[0]):
        pred_label = predict(x_train, y_train, x_test[i])
        preds.append(pred_label)
        
    preds = np.array(preds)
    accuracy = np.sum(preds == y_test)/y_test.shape[0]
    return accuracy

In [51]:
output = predict(x_train, y_train, x_test[0])
print("Predicted:", output)
print("Actual:", y_test[0])

Predicted: 1
Actual: 1


In [52]:
print(score(x_train, y_train, x_test, y_test))

0.9975384615384615
