<a href="https://colab.research.google.com/github/preekshitsaklani/mushroom-nb/blob/main/Mushrooms_NaiveBayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## LIBRARIES

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

## Importing Drive

In [None]:
from google.colab import drive

## Mounting Drive

In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


## Reading the dataset

In [None]:
df = pd.read_csv('/content/drive/My Drive/mushrooms.csv')

## Displaying the DataFrame

In [None]:
df

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


## Checking DataFrame shape

In [None]:
df.shape

(8124, 23)

## Initializing LabelEncoder

In [None]:
lab_enc = LabelEncoder()

## Applying Label Encoding

In [None]:
# `DataFrame.apply` calls `lab_enc.fit_transform` once per column, so every
# column is replaced by its own integer codes. The result overwrites `df`, so
# the original string-valued frame is no longer available after this cell.
# NOTE(review): the single shared encoder is refit on each column, so afterwards
# it presumably only remembers the classes of the last column. Use one encoder
# per column if `inverse_transform` is ever needed.
df = df.apply(lab_enc.fit_transform)

## Displaying encoded DataFrame

In [None]:
df

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,3,2,4,0,5,0,0,0,11,...,2,5,5,0,1,1,4,0,1,2
8120,0,5,2,4,0,5,0,0,0,11,...,2,5,5,0,0,1,4,0,4,2
8121,0,2,2,4,0,5,0,0,0,5,...,2,5,5,0,1,1,4,0,1,2
8122,1,3,3,4,0,8,1,0,1,0,...,1,7,7,0,2,1,0,7,4,2


## Converting DataFrame to NumPy array

In [None]:
data = df.values

## Splitting data into features and target

In [None]:
# Column 0 of the encoded array is the class label; every remaining column is
# a feature.
data_Y = data[:, 0]
data_X = data[:, 1:]

## Splitting data into train and test sets

In [None]:
split = 0.8

# Row index where the test set begins. The leading `split` fraction of rows
# (taken in their existing order, without shuffling) forms the training set.
n_train = int(split * data_X.shape[0])

train_X, test_X = data_X[:n_train, :], data_X[n_train:, :]
train_Y, test_Y = data_Y[:n_train], data_Y[n_train:]

## Printing shapes of train and test sets

In [None]:
print(train_X.shape, train_Y.shape, test_X.shape, test_Y.shape)

(6499, 22) (6499,) (1625, 22) (1625,)


## Defining prior probability function

In [None]:
def prior_prob(Y_data, label):
    """Return the fraction of entries in ``Y_data`` that equal ``label``.

    Parameters
    ----------
    Y_data : numpy array of class labels (its first dimension is the sample count).
    label : the class value whose relative frequency is wanted.

    Returns
    -------
    The number of matching entries divided by ``Y_data.shape[0]``.
    """
    is_label = Y_data == label
    n_samples = Y_data.shape[0]
    n_matching = np.sum(is_label)
    return n_matching / n_samples

## Printing sum of prior probabilities

In [None]:
print(prior_prob(train_Y, 0) + prior_prob(train_Y, 1))

1.0


## Printing number of training samples

In [None]:
train_Y.shape[0]

6499

## Defining likelihood probability function

In [None]:
def likelihood_prob(X_train, Y_train, feat_col, value, label, alpha=0.0):
    """Estimate P(feature ``feat_col`` == ``value`` | class == ``label``).

    The estimate is the share of training rows of class ``label`` whose
    feature ``feat_col`` equals ``value``.

    Parameters
    ----------
    X_train : 2-D numpy array, one row per sample and one column per feature.
    Y_train : 1-D numpy array of class labels aligned with the rows of ``X_train``.
    feat_col : column index of the feature being examined.
    value : feature value whose conditional probability is wanted.
    label : class to condition on.
    alpha : non-negative additive (Laplace) smoothing strength. The default
        ``0.0`` gives the plain relative frequency. With ``alpha > 0`` the
        result is ``(count + alpha) / (class_count + alpha * k)``, where ``k``
        is the number of distinct values seen in column ``feat_col`` of
        ``X_train``, so unseen values no longer get probability zero.

    Returns
    -------
    The estimated conditional probability. If the denominator is zero (for
    example ``label`` never occurs in ``Y_train`` and ``alpha`` is 0), ``0.0``
    is returned instead of dividing by zero.

    Raises
    ------
    ValueError
        If ``alpha`` is negative.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    # Build the class mask once and reuse it for both counts.
    class_mask = Y_train == label
    class_count = np.sum(class_mask)
    matches = np.sum(X_train[class_mask, feat_col] == value)

    # Only pay for the distinct-value count when smoothing is requested.
    smoothing = alpha * np.unique(X_train[:, feat_col]).size if alpha else 0
    denominator = class_count + smoothing

    # A class with no training rows would otherwise yield 0/0 = NaN, which
    # would silently poison any product of likelihoods built on top of this.
    if denominator == 0:
        return 0.0
    return (matches + alpha) / denominator

## Defining posterior probability function

In [None]:
def posterior_prob(X_train, Y_train, X_test):
    """Classify a single sample with a categorical Naive Bayes rule.

    For every distinct label in ``Y_train`` the unnormalised posterior
    ``P(label) * prod_i P(x_i | label)`` is computed with ``prior_prob`` and
    ``likelihood_prob``; the label with the highest score is the prediction.

    Parameters
    ----------
    X_train : 2-D numpy array, one row per training sample, one column per feature.
    Y_train : 1-D numpy array of class labels aligned with the rows of ``X_train``.
    X_test : 1-D sequence holding the feature values of the one sample to
        classify, indexed in the same column order as ``X_train``.

    Returns
    -------
    tuple
        ``(predicted_label, post_prob)``. ``post_prob`` is the list of
        unnormalised posterior scores, ordered like ``np.unique(Y_train)``.
    """
    classes = np.unique(Y_train)
    post_prob = []

    for label in classes:
        # Naive independence assumption: multiply the per-feature likelihoods.
        likelihood = 1

        for i in range(X_train.shape[1]):
            likelihood *= likelihood_prob(X_train, Y_train, i, X_test[i], label)

        prior = prior_prob(Y_train, label)
        post_prob.append(likelihood * prior)

    # np.argmax gives a position within `classes`, not a label; map it back so
    # the prediction is also correct when the labels are not exactly 0..k-1.
    # If every score is equal (e.g. all zero), the first class wins the tie.
    return classes[np.argmax(post_prob)], post_prob

## Printing prediction for test sample

In [None]:
print(posterior_prob(train_X, train_Y, test_X[1500])[0], test_Y[1500])

0 0


## Calculating and printing accuracy

In [None]:
# Predict every held-out sample, then count how many predictions agree with
# the true labels.
preds = np.array([posterior_prob(train_X, train_Y, sample)[0] for sample in test_X])
corr = int(np.sum(preds == test_Y))
print('Accuracy:', corr/test_X.shape[0])

Accuracy: 0.940923076923077
