In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the iris dataset and assemble its four measurements into a DataFrame.
dataset = load_iris()
target = dataset.target
# Pull each measurement out of the data matrix as its own 1-D column array.
sepal_length, sepal_width, petal_length, petal_width = (
    dataset.data[:, column] for column in range(4)
)
target_variable = target[:]
# Column label -> measurement array; insertion order fixes the column order.
features = {
    "sepal length (cm)": sepal_length,
    "sepal width (cm)": sepal_width,
    "petal length (cm)": petal_length,
    "petal width (cm)": petal_width,
}
# Build the DataFrame from the labelled columns.
df = pd.DataFrame(features)

In [3]:
# Standardise the feature columns with StandardScaler.
# Only the feature columns are selected (by position), so this cell can be
# re-run after `df` has gained its "target class" column without feeding the
# labels to the scaler or breaking the 4-column DataFrame construction below.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split made later in this notebook, so held-out rows influence the scaling
# statistics. Fit on the training rows only if a leak-free evaluation is needed.
X = df.iloc[:, :len(dataset.feature_names)].values
sc = StandardScaler()
normalised = sc.fit_transform(X)
df = pd.DataFrame(data=normalised, columns=dataset.feature_names)
# Append the class labels as the last column of the DataFrame.
df["target class"] = target_variable

In [4]:
# Hold out 40 samples for testing; the fixed random_state makes the split
# reproducible across runs.
x, y = normalised, target_variable
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=40,
    random_state=42,
)

In [5]:
# Show the shapes of the four splits: train features/labels, test features/labels.
tuple(part.shape for part in (xtrain, ytrain, xtest, ytest))

((110, 4), (110,), (40, 4), (40,))

*Implementation of Naive-Bayes from scratch*
Algorithm:
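1. Compute the (unnormalised) posterior probability of every class c given the features X.
        P(y=c|X) is proportional to P(X|y=c)*P(y=c)
        where, P(y=c|X) = posterior probability of class c
        P(y=c) = prior probability of class c
        P(X|y=c) = likelihood or class conditional probability; under the naive independence assumption it is the product of the per-feature probabilities P(x_i|y=c)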
2. Take the argmax over the posterior probabilities of all classes; the class with the largest posterior is the prediction.

In [6]:
class My_Naive_Bayes:
    """Naive Bayes classifier whose likelihoods are exact-match frequencies.

    Every feature is treated as categorical: P(x_i = v | y = c) is the share
    of training rows of class c whose i-th feature equals v exactly.  A value
    that never occurs among the training rows of a class therefore gets
    probability 0 (unless ``alpha`` > 0), which zeroes that class's posterior.

    Parameters
    ----------
    alpha : float, default 0.0
        Additive (Laplace) smoothing strength.  The default 0.0 uses the raw
        frequencies; a positive value keeps unseen feature values from
        zeroing out a class.

    Raises
    ------
    ValueError
        If ``alpha`` is negative.
    """

    def __init__(self, alpha=0.0):
        if alpha < 0:
            raise ValueError("alpha must be non-negative")
        self.alpha = alpha

    def fit(self, x, y):
        """Store the training data.

        Inputs:
        x: 2-D array-like of shape (n_samples, n_features)
        y: 1-D array-like with one class label per row of x

        Returns the fitted instance so calls can be chained.
        Raises ValueError when the shapes of x and y are inconsistent.
        """
        # Convert up front so plain lists work with the boolean masks used later.
        x = np.asarray(x)
        y = np.asarray(y)
        if x.ndim != 2:
            raise ValueError("x must be 2-dimensional (n_samples, n_features)")
        if y.ndim != 1 or y.shape[0] != x.shape[0]:
            raise ValueError("y must be 1-dimensional with one label per row of x")
        self.x = x
        self.y = y
        return self

    def calculate_prior_probability(self, label):
        """Return P(y = label): the fraction of training rows carrying `label`.

        Inputs:
        label: one of the target classes
        """
        total_examples = self.y.shape[0]
        label_examples = np.sum(self.y == label)
        return label_examples / total_examples

    def class_conditional_probability(self, feature_column, feature_value, label):
        """Return P(feature == feature_value | y == label).

        If feature column=petal length, feature value=5 and label=1, this is
        the probability of seeing the value 5 in the petal-length column among
        the training rows whose label is 1.

        With alpha == 0 it is the raw frequency matches / class_size.  With
        alpha > 0 it is (matches + alpha) / (class_size + alpha * n_values),
        where n_values is the number of distinct values of that feature in the
        whole training set.
        """
        # Keep only the training rows that belong to `label`.
        xfiltered = self.x[self.y == label]
        numerator = np.sum(xfiltered[:, feature_column] == feature_value)
        denominator = len(xfiltered)
        if self.alpha == 0:
            return numerator / denominator
        n_values = len(np.unique(self.x[:, feature_column]))
        return (numerator + self.alpha) / (denominator + self.alpha * n_values)

    def predict_single_example(self, xtest):
        """Return the predicted class label for one example.

        Inputs:
        xtest: single example with n features

        The label with the largest prior * likelihood is returned.  On ties
        (including the case where every posterior is 0) np.argmax picks the
        first class in sorted order.
        """
        unique_classes = np.unique(self.y)
        n_features = self.x.shape[1]
        posterior_probabilities = []
        # Calculate the posterior (prior * likelihood) for each class.
        for label in unique_classes:
            likelihood = 1
            for feature in range(n_features):
                likelihood *= self.class_conditional_probability(
                    feature, xtest[feature], label
                )
            prior = self.calculate_prior_probability(label)
            posterior_probabilities.append(prior * likelihood)
        # Map the winning position back to the class label; returning the bare
        # index is only correct when the labels happen to be 0..k-1.
        return unique_classes[np.argmax(posterior_probabilities)]

    def predict(self, nd_array):
        """Return an array with the predicted class label for every row of `nd_array`."""
        return np.array([self.predict_single_example(point) for point in nd_array])

*Predictions using Naive Bayes from scratch:*

In [7]:
# Train the hand-written classifier on the training split, then report its
# accuracy on the held-out samples.
my_model = My_Naive_Bayes()
my_model.fit(xtrain, ytrain)
pred = my_model.predict(xtest)
print(accuracy_score(y_true=ytest, y_pred=pred))

0.875


*Predictions using sklearn's Naive Bayes:* 

In [8]:
# Baseline: scikit-learn's Gaussian Naive Bayes trained and scored on the same
# split. GaussianNB is imported in the notebook's import cell.
sklearn_model = GaussianNB()
sklearn_model.fit(xtrain, ytrain)
predicted = sklearn_model.predict(xtest)
print(accuracy_score(ytest, predicted))

1.0
