# Naive Bayes on Categorical Data

In [1]:
import pandas as pd

import numpy as np

import scipy.stats as s

import matplotlib.pyplot as plt

import seaborn as sns

In [2]:
data = pd.read_csv("mushrooms.csv")

In [3]:
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
training_data_len = int(0.7*data.shape[0])

cv_data_len = int(0.2*data.shape[0])

testing_data_len = int(0.1*data.shape[0])

In [5]:
poissonous_training_data = data[data['class'] == 'p'].iloc[0:training_data_len//2]

non_poissonous_training_data = data[data['class'] == 'e'].iloc[0:training_data_len//2]

training_data = pd.concat([poissonous_training_data, non_poissonous_training_data])

In [6]:
poissonous_remaining_data = data[data['class'] == 'p'].iloc[training_data_len//2:]

non_poissonous_remaining_data = data[data['class'] == 'e'].iloc[training_data_len//2:]

remaining_data = pd.concat([poissonous_remaining_data, non_poissonous_remaining_data])

remaining_data.shape

(2438, 23)

In [7]:
#Shuffling the remaining data

random_indices = np.random.choice(np.arange(0,2438), size = (2438,))

remaining_data = remaining_data.iloc[random_indices]

remaining_data = pd.DataFrame(data = remaining_data, columns =data.columns)

In [8]:
cv_data = remaining_data.iloc[0:cv_data_len]

testing_data = remaining_data.iloc[cv_data_len:]

In [9]:
data['cap-shape'].unique()

training_data[training_data['class'] == 'p']['cap-shape'].unique()

training_data[training_data['class'] == 'e']['cap-shape'].unique()

array(['x', 'b', 's', 'f'], dtype=object)

In [10]:
def lidstone_smoothing(column_name, category, alpha):
    
    D = dict()
    
    training_unique_feature_value = training_data[training_data['class'] == category][column_name].unique()
    
    data_unique_feature_value = data[column_name].unique()
    
    for feature_value in data_unique_feature_value:
        
        if feature_value not in training_unique_feature_value:
            
            D[feature_value] = alpha/(training_data[training_data['class'] == category].shape[0] + (training_unique_feature_value.shape[0]*alpha))
        
        else:
            
            D[feature_value] = np.count_nonzero(training_data[training_data['class'] == category][column_name] == feature_value)/(training_data[training_data['class'] == category].shape[0])
            
    return D

In [11]:
D_of_D = {'p':{},'e':{}}

for category in ['p','e']:
    
    for column_name in data.columns[1:]:
        
        D_of_D[category][column_name] = lidstone_smoothing(column_name,category,0.8)

In [12]:
D = dict()

In [13]:
D['e'] = lidstone_smoothing('cap-shape','e',1)

In [14]:
D['p'] = lidstone_smoothing('cap-shape','e',1)

In [15]:
D

{'e': {'x': 0.5666549419627155,
  'b': 0.09004572634540978,
  's': 0.011255715793176222,
  'f': 0.3320436158986986,
  'k': 0.00035124692658939234,
  'c': 0.00035124692658939234},
 'p': {'x': 0.5666549419627155,
  'b': 0.09004572634540978,
  's': 0.011255715793176222,
  'f': 0.3320436158986986,
  'k': 0.00035124692658939234,
  'c': 0.00035124692658939234}}

In [16]:
prior_p = training_data[training_data['class'] == 'p'].shape[0]/training_data.shape[0]

prior_e = training_data[training_data['class'] == 'e'].shape[0]/training_data.shape[0]

In [17]:
def cv_data_testing(data):
    
    likelihood_p = 1
    
    likelihood_e = 1
    
    for k,v in data.items():
    
        likelihood_p = likelihood_p * D_of_D['p'][k][v]
    
        likelihood_e = likelihood_e * D_of_D['e'][k][v]
    
    posterior_p = likelihood_p * prior_p
    
    posterior_e = likelihood_e * prior_e
    
    if posterior_p > posterior_e:
        
        return 'p'
    
    else:
        
        return 'e'

In [18]:
actual_results = np.array(cv_data['class'])

cv_data = np.array(cv_data.iloc[:,1:])

keys = data.columns[1:]

predicted_category = list()

for i in range(0,cv_data.shape[0]):
    
    data = dict(zip(keys,list(cv_data[i])))
    
    predicted_category.append(cv_data_testing(data))

In [19]:
predicted_category = np.array(predicted_category)

accuracy = np.count_nonzero(predicted_category == actual_results)/actual_results.shape[0]

accuracy

0.6422413793103449

In [20]:
actual_results = np.array(testing_data['class'])

testing_data = np.array(testing_data.iloc[:,1:])

data = pd.read_csv("mushrooms.csv")

keys = data.columns[1:]

predicted_category = list()

for i in range(0,testing_data.shape[0]):
    
    data = dict(zip(keys,list(testing_data[i])))
    
    predicted_category.append(cv_data_testing(data))

In [21]:
predicted_category = np.array(predicted_category)

accuracy = np.count_nonzero(predicted_category == actual_results)/actual_results.shape[0]

accuracy

0.6658476658476659