# Naive Bayes Classification using Gaussian Distribution

In [1]:
# Importing Necessary Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Reading Data
SEED = 42             # fixed seed: makes the shuffle (and so the train/test split) repeatable
TRAIN_FRACTION = 0.7  # share of the rows used for training

data = pd.read_csv('iris.csv')
print(data.shape)
# Replace the species names with integer codes starting at 1
# (pd.factorize numbers the categories in order of first appearance).
data['species'] = pd.factorize(data['species'])[0] + 1
split = int(data.shape[0] * TRAIN_FRACTION)  # number of training rows
data = data.values.tolist()  # list of rows; the class code is the last element of each row

# Seed immediately before shuffling so re-running the notebook reproduces the same split.
np.random.seed(SEED)
np.random.shuffle(data)
train_data = data[:split]  # first `split` shuffled rows
test_data = data[split:]   # remaining rows

(150, 5)


In [3]:
# Separating data by class label
def separate(data):
    """Group rows by class label.

    The last element of every row is taken as its class label; the remaining
    elements (``row[:-1]``) are stored under that label.

    Returns a dict mapping each label to the list of its feature rows, with
    labels in order of first appearance.
    """
    grouped = {}
    for row in data:
        # setdefault creates the bucket the first time a label is seen
        grouped.setdefault(row[-1], []).append(row[:-1])
    return grouped

In [4]:
# Calculate statistics
def cal(data):
    """Summarise each feature column of ``data``.

    ``data`` is a sequence of rows; it is transposed with ``zip(*data)`` so
    that every feature is processed as one column.

    Returns a list with one ``(mean, std, count)`` tuple per column, where
    ``std`` is ``np.std`` with its default arguments and ``count`` is the
    number of values in the column.
    """
    stats = []
    for column in zip(*data):
        stats.append((np.mean(column), np.std(column), len(column)))
    return stats

In [5]:
# Fit the data
def fit(data):
    """Build the per-class model from labelled rows.

    Rows are grouped by class with ``separate`` and each group is summarised
    with ``cal``.

    Returns a dict mapping each class label to that class's list of
    per-feature summaries.
    """
    return {classVal: cal(rows) for classVal, rows in separate(data).items()}

In [6]:
# Gaussian Distribution
def gaussianDist(x, mean, std, min_std=1e-9):
    """Evaluate the Gaussian (normal) probability density at ``x``.

    Computes ``exp(-(x - mean)**2 / (2 * std**2)) / (sqrt(2 * pi) * std)``.

    Parameters
    ----------
    x : float or array-like
        Point(s) at which the density is evaluated.
    mean : float or array-like
        Mean of the distribution.
    std : float or array-like
        Standard deviation of the distribution (not the variance).
    min_std : float, optional
        Lower bound applied to ``std``. A feature whose values are all
        identical has ``std == 0``, which would otherwise cause a division
        by zero and a NaN density.

    Returns
    -------
    The density value(s), as produced by NumPy arithmetic on the inputs.
    """
    # Floor the standard deviation so degenerate (zero-variance) features stay finite.
    std = np.maximum(std, min_std)
    exponent = np.exp(-((x - mean) ** 2) / (2 * std ** 2))
    # The normalising constant is sqrt(2*pi) * std, i.e. sqrt(2*pi*std**2);
    # std must not sit un-squared under the square root.
    return exponent / (np.sqrt(2 * np.pi) * std)

In [7]:
# Calculate Posterior Probability
def cal_prob(summaries, row):
    """Score ``row`` against every class in ``summaries``.

    For each class the score starts at the class prior (the class row count
    divided by the total row count over all classes) and is multiplied by
    ``gaussianDist(row[i], mean, std)`` for each feature summary ``i`` of that
    class. The scores are not normalised to sum to one.

    Only the first ``len(classData)`` entries of ``row`` are read, so any
    further elements of ``row`` are ignored.

    Returns a dict mapping each class label to its score.
    """
    # The row count is read from the first feature summary of each class.
    class_sizes = [summaries[label][0][2] for label in summaries]
    total_rows = sum(class_sizes)

    prob = {}
    for classVal, classData in summaries.items():
        score = classData[0][2] / float(total_rows)  # class prior
        for i, (mean, std, _count) in enumerate(classData):
            score *= gaussianDist(row[i], mean, std)
        prob[classVal] = score
    return prob

In [8]:
# Predict the class for a given row
def predict(model, row):
    """Return the class label with the highest score for ``row``.

    Scores come from ``cal_prob(model, row)``. When several labels share the
    highest score, the one that appears first in the score dict is returned.
    Returns ``None`` if there are no classes to choose from.
    """
    scores = cal_prob(model, row)
    # max() keeps the first maximal key, matching a strict ">" scan over the dict.
    return max(scores, key=scores.get, default=None)

In [9]:
# Accuracy Metric function
def accuracy_metric(actual, predicted):
    """Return the percentage of positions where ``predicted`` equals ``actual``.

    Positions are compared index by index over the length of ``actual``.
    The result is a float between 0.0 and 100.0.
    """
    total = len(actual)
    hits = sum(1 for idx in range(total) if actual[idx] == predicted[idx])
    return hits / float(total) * 100.0

In [10]:
# Naive Bayes: fit the model on the training rows, then predict a label
# for every test row (in test_data order).
model = fit(train_data)
predictions = [predict(model, row) for row in test_data]

In [11]:
# Compare the true test labels with the predictions. The label is read as the
# last element of each row, the same convention separate() uses, rather than
# from a hard-coded column index.
actual = [row[-1] for row in test_data]
print("Accuracy :",accuracy_metric(actual,predictions))

Accuracy : 97.77777777777777


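#### Label encoding (codes are assigned by `pd.factorize` in order of first appearance in `iris.csv`; verify against your copy of the file): Iris-virginica : 1,  Iris-versicolor : 2,  Iris-setosa : 3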

In [12]:
# Classify one hand-written sample: four feature values, no class label.
row = [5.7, 2.9, 4.2, 1.3]
label = predict(model, row)
# The label is cast to int before printing.
message = 'Data=%s, Prediction: %s' % (row, int(label))
print(message)

Data=[5.7, 2.9, 4.2, 1.3], Prediction: 2


# Using Scikit Learn

In [13]:
# Imports used by the scikit-learn comparison
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Feature matrix (X) and response vector (y) from scikit-learn's iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split into train and test sets: test_size=0.4, with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)

# Train Gaussian Naive Bayes on the training set
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Predict the test set and report the accuracy score scaled to a percentage
y_pred = gnb.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)*100
print("Gaussian Naive Bayes model accuracy :", accuracy)


Gaussian Naive Bayes model accuracy : 95.0
