In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, model_selection, naive_bayes, metrics

In [2]:
def normalize(a):
    return (np.max(a) - a) / (np.max(a) - np.min(a))

def accuracy(true, pred):
    return np.mean(true == pred) * 100

In [3]:
x, y = datasets.load_iris(return_X_y=True)
x_norm = np.apply_along_axis(normalize, 0, x)
x_train, x_test, y_train, y_test = model_selection.train_test_split(x_norm, y, test_size=0.3)

In [4]:
nb = naive_bayes.GaussianNB().fit(x_train, y_train)
y_train_pred = nb.predict(x_train)
y_test_pred = nb.predict(x_test)
print("Training accuracy", accuracy(y_train, y_train_pred))
print("Testing accuracy:", accuracy(y_test, y_test_pred))

Training accuracy 96.19047619047619
Testing accuracy: 95.55555555555556


In [8]:
unique, counts = np.unique(y_train, return_counts=True)
unique, counts

(array([0, 1, 2]), array([35, 35, 35], dtype=int64))

In [3]:
means = np.empty((x.shape[1], len(p)))
stds = np.empty_like(means)
data = np.hstack((x, y.reshape(len(y), 1)))
for i in range(x.shape[1]):
    for j in p:
        means[i][j] = np.mean(data[data[:, -1] == j][:, i])
        stds[i][j] = np.std(data[data[:, -1] == j][:, i])
print(means)
print(stds)

[[5.006 5.936 6.588]
 [3.428 2.77  2.974]
 [1.462 4.26  5.552]
 [0.246 1.326 2.026]]
[[0.34894699 0.51098337 0.62948868]
 [0.37525458 0.31064449 0.31925538]
 [0.17191859 0.46518813 0.54634787]
 [0.10432641 0.19576517 0.27188968]]


In [4]:
import math
n = len(x_test)
pred = np.empty(n)
for k in range(n):
    maxp = -1
    label = -1
    for j in p:
        prob = 1
        for i in range(x.shape[1]):
            prob *= 1 / np.sqrt(2 * np.pi * stds[i][j]) * math.exp(-((x_test[k][i] - means[i][j]) ** 2) / (2 * stds[i][j] ** 2))
        prob *= p[j] / len(x_test)
        if prob > maxp:
            maxp = prob
            label = j
    pred[k] = label
print(pred)
# print(metrics.accuracy_score(y_test, pred))
a = y_test - pred
print(1 - len(a[a != 0]) / len(a))

[0. 1. 1. 0. 2. 1. 2. 0. 0. 2. 1. 0. 2. 1. 1. 0. 1. 1. 0. 0. 1. 1. 2. 0.
 2. 1. 0. 0. 1. 2. 1. 2. 1. 2. 2. 0. 1. 0. 1. 2. 2. 0. 1. 2. 1.]
0.9555555555555556


In [7]:
# from sklearn.datasets import load_iris
# import numpy as np
# import random
# ds = load_iris()
# x = ds.data
# y = ds.target
import random
dataSet = np.hstack((x, y.reshape(len(y), 1)))
trainingSize = int(len(x) * 0.5)
randomChoices = random.choices(range(len(x)), k = trainingSize)
training = dataSet[randomChoices]
testing = np.delete(dataSet, randomChoices, 0)

x_training = training[:, :-1]
y_training = training[:, -1]
x_testing = testing[:, :-1]
y_testing = testing[:, -1]
print(x_training)
print(y_training)
print(x_testing)
print(y_testing)

[[5.7 3.  4.2 1.2]
 [5.6 3.  4.1 1.3]
 [6.1 2.6 5.6 1.4]
 [4.4 3.  1.3 0.2]
 [5.6 2.8 4.9 2. ]
 [5.5 2.3 4.  1.3]
 [7.9 3.8 6.4 2. ]
 [5.  3.2 1.2 0.2]
 [5.1 3.5 1.4 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.7 1.5 0.4]
 [6.4 2.8 5.6 2.2]
 [5.1 3.8 1.5 0.3]
 [6.2 2.2 4.5 1.5]
 [7.7 2.6 6.9 2.3]
 [5.  3.5 1.6 0.6]
 [6.4 2.8 5.6 2.1]
 [6.2 2.8 4.8 1.8]
 [6.3 2.5 5.  1.9]
 [7.  3.2 4.7 1.4]
 [6.  2.2 4.  1. ]
 [6.1 3.  4.6 1.4]
 [5.6 2.8 4.9 2. ]
 [5.1 3.4 1.5 0.2]
 [4.9 3.1 1.5 0.1]
 [6.1 3.  4.9 1.8]
 [6.  2.9 4.5 1.5]
 [5.7 2.8 4.5 1.3]
 [5.1 3.8 1.9 0.4]
 [4.5 2.3 1.3 0.3]
 [5.5 2.3 4.  1.3]
 [7.9 3.8 6.4 2. ]
 [5.7 2.5 5.  2. ]
 [7.7 2.8 6.7 2. ]
 [5.7 2.9 4.2 1.3]
 [6.7 3.1 5.6 2.4]
 [5.  3.5 1.3 0.3]
 [6.1 2.8 4.7 1.2]
 [5.7 2.9 4.2 1.3]
 [6.  2.7 5.1 1.6]
 [6.3 2.9 5.6 1.8]
 [6.9 3.2 5.7 2.3]
 [6.3 2.3 4.4 1.3]
 [6.3 2.5 4.9 1.5]
 [5.7 2.5 5.  2. ]
 [4.8 3.  1.4 0.3]
 [6.4 3.2 5.3 2.3]
 [6.3 2.8 5.1 1.5]
 [6.9 3.1 5.1 2.3]
 [6.2 2.8 4.8 1.8]
 [7.2 3.6 6.1 2.5]
 [5.5 2.3 4.  1.3]
 [5.5 3.5 1.