## Naive Bayes

Today we will implement Naive Bayes algorithm on IRIS Dataset

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
iris = datasets.load_iris()

In [3]:
type(iris)

sklearn.utils.Bunch

In [4]:
x = np.asarray(iris.data)
y = np.asarray(iris.target)

In [5]:
x.shape

(150, 4)

In [6]:
y.shape

(150,)

### Perform test/train split

In [7]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [8]:
print(x_train.shape,"   ",y_train.shape)
print(x_test.shape,"   ",y_test.shape)

(120, 4)     (120,)
(30, 4)     (30,)


In [9]:
def prior_prob(y_train,label):
    
    """This function calculates the prior probability of a label/class i.e. P(Y = label)"""
    
    total_examples = y_train.shape[0]
    class_examples = np.sum(y_train == label)
    
    return (class_examples)/float(total_examples)

In [10]:
def cond_prob(x_train,y_train,feature_col,feature_val,label):
    
    x_filtered = x_train[y_train == label]
    numerator = np.sum(x_filtered[:,feature_col] == feature_val)
    denominator = np.sum(y_train == label)
    
    return numerator/float(denominator)

In [11]:
def predict(x_train,y_train,x_test):
    
    classes = np.unique(y_train)
    n_features = x_train.shape[1]
    
    post_probs = []
    
    for label in classes:
        likelihood = 1.0
        
        for f in range(n_features):
            cond = cond_prob(x_train,y_train,f,x_test[f],label)
            likelihood *= cond
            
        prior = prior_prob(y_train,label)
        post = likelihood * prior
        
        post_probs.append(post)
        
    pred = np.argmax(post_probs)
    
    return pred

In [12]:
def score(X_train,X_test,Y_train,Y_test):
    
    m = X_test.shape[0]
    
    y_pred = []
    
    for i in range(m):
        pred = predict(X_train,Y_train,X_test[i])
        y_pred.append(pred)
    
    acc = accuracy_score(Y_test, y_pred)
    
    return acc

In [13]:
score(x_train,x_test,y_train,y_test)

0.9