# Naive Bayes

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv(r"C:\Users\prajwal\Downloads\mushrooms.csv")

In [3]:
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
# Properties of Mushroom are given

In [5]:
# Given different Properties of mushroom, we have to predict wheather musroom is for edible purpose or not

In [6]:
df.shape

(8124, 23)

In [7]:
lab_enc = LabelEncoder()
# Convert String to Integer.

In [8]:
# df.apply -> Tell what we need to do with one col, it will do that for every col.

In [9]:
# Label Encoder works one only 1col at a time

In [10]:
df = df.apply(lab_enc.fit_transform)

In [11]:
df

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,3,2,4,0,5,0,0,0,11,...,2,5,5,0,1,1,4,0,1,2
8120,0,5,2,4,0,5,0,0,0,11,...,2,5,5,0,0,1,4,0,4,2
8121,0,2,2,4,0,5,0,0,0,5,...,2,5,5,0,1,1,4,0,1,2
8122,1,3,3,4,0,8,1,0,1,0,...,1,7,7,0,2,1,0,7,4,2


In [12]:
# Things are converted into Numbers.

In [13]:
data = df.values

In [14]:
data_X = data[:,1:] # kepping all rows and starting from 1st index col to the end.
data_Y = data[:,0]  # take all rows of 0th col.

In [15]:
split = 0.8

train_X = data_X[:int(split * data_X.shape[0]) , :]
train_Y = data_Y[:int(split * data_Y.shape[0])]
test_X = data_X[:int(split * data_X.shape[0]) : , :]
test_Y = data_Y[:int(split * data_Y.shape[0]) :]

In [16]:
print(train_X.shape, train_Y.shape, test_X.shape, test_Y.shape)

(6499, 22) (6499,) (6499, 22) (6499,)


In [17]:
# Also we don't need to normalise the data.

In [18]:
# We need to calculate prior and likelihood func to find -> Posterior func

In [19]:
def prior_prob(Y_data, label):# P(Y=0) has two variables -> Y, 0
    return np.sum(Y_data == label)/Y_data.shape[0]

In [20]:
# Prior->

In [21]:
print(prior_prob(train_Y, 0))

0.5694722264963841


In [22]:
print(prior_prob(train_Y, 0) + prior_prob(train_Y, 1))

1.0


In [23]:
train_Y.shape

(6499,)

In [24]:
# Likelihood ->

In [25]:
def likelihood_prob(X_train, Y_train, feat_col, value, label):
    X_train = X_train[Y_train == label]  # Selecting those rows where [y == label]
    numerator = np.sum(X_train[:, feat_col] == value)   # All rows with given col = 0 or col = 1.
    denominator = np.sum(Y_train == label)
    return numerator / denominator

In [26]:
# Right Now define for 1 col, will use loop for implementing into every col.

In [27]:
# Posterior_prob

In [28]:
def posterior_prob(X_train, Y_train, X_test):
    classes = np.unique(Y_train)   # either 0 or 1.
    post_prob = []   # Initialising an empty array where we will be storing 0 and 1 probability.

    for label in classes:   # For calculating (P(Y=0)/X) and (P(Y=1)/X)
        likelihood = 1

        for i in range(X_train.shape[1]):  # Likelihod is broken down into multiple individual properties -> P(X=i/Y).
            # Posterior = likelihood*prior
            likelihood *= likelihood_prob(X_train, Y_train, i, X_test[i], label)   # X_test[i] -> selecting every word  
            
        prior = prior_prob(Y_train, label)  # get Prior
        post_prob.append(likelihood * prior)  # store the posterior prob
    return np.argmax(post_prob), post_prob    

In [29]:
print(posterior_prob(train_X, train_Y, test_X[0]), test_Y[0])

(1, [0.0, 7.941929449943349e-13]) 1


In [30]:
corr = 0

for i in range(test_X.shape[0]):
    pred, _ = posterior_prob(train_X, train_Y, test_X[i])
    if pred == test_Y[i]:
        corr += 1
print('Accuracy', corr/test_X.shape[0])        

Accuracy 0.9947684259116787
