# Naive Bayes from scratch
# Milindi Shah
# J057


In [1]:
import pandas as pd 
from sklearn.datasets import load_iris 
import warnings
import numpy as np
warnings.simplefilter(action='ignore')

In [2]:
# Load the Iris dataset bundle and unpack the pieces used by later cells:
# the feature matrix, the label vector, the feature names and the class names.
iris = load_iris()
X, y = iris.data, iris.target
cols, classes = iris.feature_names, iris.target_names

In [3]:
# Put the feature matrix into a DataFrame with one named column per feature.
# NOTE: this rebinds `iris` from the loaded dataset bundle to the DataFrame.
iris = pd.DataFrame(X, columns=cols)
iris['target'] = y
# Replace each label with its class name. The lookup is keyed by the label
# value itself (position in `classes`), so it does not depend on the order in
# which labels happen to appear in the rows.
# Assumes the labels in `y` are integer positions into `classes`.
iris['target'] = iris['target'].map(dict(enumerate(classes)))

In [4]:
# Class priors P(class): the fraction of rows in `iris` labelled with each class.
# Every class gets its own entry, computed only from the rows whose 'target'
# equals that class, so unbalanced data yields distinct priors per class.
prob_classes = {
    unique_class: iris[iris['target'] == unique_class].shape[0] / iris.shape[0]
    for unique_class in classes
}

prob_classes

{'setosa': 0.3333333333333333,
 'versicolor': 0.3333333333333333,
 'virginica': 0.3333333333333333}

In [5]:
# Per-class Gaussian parameters, laid out as
#     data[class]['mean'][feature] and data[class]['std_dev'][feature].
data = {}
for c in classes:
    # Keep only this class's rows and only the four feature columns, so the
    # string-valued 'target' column never reaches mean()/std().
    class_features = iris.loc[iris['target'] == c, iris.columns[:4]]
    data[c] = {
        # One mean()/std() call per class covers every feature at once.
        'mean': class_features.mean().to_dict(),
        # pandas' default std() is the sample standard deviation (ddof=1).
        'std_dev': class_features.std().to_dict(),
    }
data

{'setosa': {'mean': {'sepal length (cm)': 5.005999999999999,
   'sepal width (cm)': 3.428000000000001,
   'petal length (cm)': 1.4620000000000002,
   'petal width (cm)': 0.2459999999999999},
  'std_dev': {'sepal length (cm)': 0.35248968721345136,
   'sepal width (cm)': 0.3790643690962887,
   'petal length (cm)': 0.17366399648018407,
   'petal width (cm)': 0.10538558938004565}},
 'versicolor': {'mean': {'sepal length (cm)': 5.936,
   'sepal width (cm)': 2.7700000000000005,
   'petal length (cm)': 4.26,
   'petal width (cm)': 1.3259999999999998},
  'std_dev': {'sepal length (cm)': 0.5161711470638634,
   'sepal width (cm)': 0.3137983233784114,
   'petal length (cm)': 0.46991097723995795,
   'petal width (cm)': 0.19775268000454405}},
 'virginica': {'mean': {'sepal length (cm)': 6.587999999999998,
   'sepal width (cm)': 2.9739999999999998,
   'petal length (cm)': 5.552,
   'petal width (cm)': 2.026},
  'std_dev': {'sepal length (cm)': 0.6358795932744322,
   'sepal width (cm)': 0.32249663817

In [6]:
# Unnormalised Gaussian Naive Bayes score for every row and every class:
#     P(class) * product over features of N(x; mean, std_dev)
# The densities are evaluated column-wise on the whole feature table, and the
# result is stored in a float-typed frame with one column per class.
feature_values = iris[iris.columns[:4]].astype(float)
prediction = pd.DataFrame(index=iris.index, columns=classes, dtype=float)

for c in classes:
    # Series built from the per-class dictionaries; pandas lines them up with
    # the feature columns by name in the arithmetic below.
    mean = pd.Series(data[c]['mean'])
    std_dev = pd.Series(data[c]['std_dev'])

    # Gaussian density split into its normalising factor and exponential part.
    norm = 1 / (std_dev * ((2 * np.pi) ** 0.5))
    exponent = -((feature_values - mean) ** 2) / (2 * (std_dev ** 2))
    density = np.exp(exponent).mul(norm, axis='columns')

    # Naive independence assumption: multiply the per-feature densities,
    # then weight by the class prior.
    prediction[c] = prob_classes[c] * density.prod(axis=1)
prediction

Unnamed: 0,setosa,versicolor,virginica
0,2.791534,0.0,0.0
1,1.488164,0.0,0.0
2,1.163145,0.0,0.0
3,1.085765,0.0,0.0
4,2.656738,0.0,0.0
...,...,...,...
145,0.0,0.0,0.132245
146,0.0,0.001295,0.045437
147,0.0,0.000096,0.217838
148,0.0,0.0,0.055163


In [7]:
# Predicted class per row = the class column holding the largest score.
# The labels are assigned in a single step, so the 'prediction' column holds
# class names from the start rather than an integer placeholder that is later
# overwritten with text.
# Only the class columns are considered, which keeps the cell safe to re-run
# after 'prediction' has already been added.
# Note: if several classes tie exactly, idxmax reports the first such column.
class_scores = prediction[list(classes)].astype(float)
prediction['prediction'] = class_scores.idxmax(axis=1)
prediction

Unnamed: 0,setosa,versicolor,virginica,prediction
0,2.791534,0.0,0.0,setosa
1,1.488164,0.0,0.0,setosa
2,1.163145,0.0,0.0,setosa
3,1.085765,0.0,0.0,setosa
4,2.656738,0.0,0.0,setosa
...,...,...,...,...
145,0.0,0.0,0.132245,virginica
146,0.0,0.001295,0.045437,virginica
147,0.0,0.000096,0.217838,virginica
148,0.0,0.0,0.055163,virginica


In [8]:
# Accuracy on the same rows the statistics were estimated from: count the rows
# whose predicted label equals the true label, then report it as a percentage.
correct = sum(
    1
    for i in range(iris.shape[0])
    if prediction['prediction'][i] == iris['target'][i]
)
print(f'Accuracy -> {correct/iris.shape[0]:.3%}')

Accuracy -> 96.000%


In [9]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
# Reference run with scikit-learn's GaussianNB: fit on every column except the
# last one (the features) against the last column (the class name), then score
# the fitted model on those same rows.
clf = GaussianNB().fit(iris.iloc[:, :-1], iris.iloc[:, -1])
predictions = clf.predict(iris.iloc[:, :-1])

print(f'Accuracy of scikit-learn -> {accuracy_score(iris.iloc[:,-1], predictions):.3%}')

Accuracy of scikit-learn -> 96.000%
