In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [6]:
class NaiveBayes:
    """Naive Bayes classifier with independent per-feature Gaussian likelihoods.

    After ``fit`` the instance exposes:

    * ``full_stats`` -- dict mapping each class label to a dict with keys
      ``'P_apriori'`` (class prior) and ``'stats'`` (array of shape
      ``(n_features, 2)`` holding ``(mean, std)`` per feature).
    * ``classes_`` -- sorted array of the distinct class labels seen in ``fit``.

    Parameters
    ----------
    distr : str, default 'Gaussian'
        Name of the per-feature likelihood. Only ``'Gaussian'`` is implemented;
        any other value raises ``ValueError`` when a density is evaluated.
    """

    # Lower bound applied to standard deviations before they are used as a
    # divisor, so a feature that is constant within a class does not produce
    # NaN/inf densities. Standard deviations above this bound are untouched.
    _MIN_STD = 1e-9

    def __init__(self, distr='Gaussian'):
        self.distr = distr

    def _check_distr(self):
        """Raise ``ValueError`` if ``self.distr`` names an unsupported likelihood."""
        if self.distr != 'Gaussian':
            raise ValueError(
                "Unsupported distribution {!r}; only 'Gaussian' is implemented."
                .format(self.distr)
            )

    def get_stats(self, x):
        """Return ``(mean, standard deviation)`` of the 1-D sample ``x``.

        The standard deviation is the population one (``ddof=0``).
        """
        return (np.mean(x), np.std(x))

    def pdf(self, X, mean, std):
        """Evaluate the likelihood density of ``X`` for the given parameters.

        Parameters
        ----------
        X : array-like
            Values at which to evaluate the density.
        mean, std : float
            Mean and standard deviation of the Gaussian. ``std`` is clamped
            from below to ``_MIN_STD`` to avoid division by zero.

        Raises
        ------
        ValueError
            If ``self.distr`` is not ``'Gaussian'``.
        """
        self._check_distr()
        std = np.maximum(std, self._MIN_STD)
        return (1/(std * np.sqrt(2 * np.pi))) * np.exp((-1/2)*(((X - mean)/std)**2))

    def fit(self, X_train, y_train):
        """Estimate class priors and per-feature ``(mean, std)`` for each class.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
        y_train : array-like of shape (n_samples,)

        Raises
        ------
        ValueError
            If ``X_train`` and ``y_train`` have different numbers of samples.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train).ravel()
        if X_train.shape[0] != y_train.shape[0]:
            raise ValueError(
                "X_train and y_train have inconsistent lengths: {} != {}"
                .format(X_train.shape[0], y_train.shape[0])
            )

        self.full_stats = dict()
        classes, counts = np.unique(y_train, return_counts=True)
        self.classes_ = classes
        for label, count in zip(classes, counts):
            rows = X_train[y_train == label]
            # One (mean, std) pair per feature column, computed on this class only.
            feature_stats = [self.get_stats(rows[:, j]) for j in range(rows.shape[1])]
            self.full_stats[label] = {
                'P_apriori': count / y_train.shape[0],
                'stats': np.array(feature_stats),
            }

    def _joint_log_likelihood(self, X):
        """Return ``log(prior) + sum(log density)`` per class and sample.

        The result has shape ``(n_classes, n_samples)``, with rows in the same
        order as ``classes_``. Working in log space avoids the underflow that
        the plain product of densities suffers with many features.
        """
        self._check_distr()
        log_norm = 0.5 * np.log(2 * np.pi)
        rows = []
        for entry in self.full_stats.values():
            mean = entry['stats'][:, 0]
            std = np.maximum(entry['stats'][:, 1], self._MIN_STD)
            z = (X - mean) / std
            log_density = -np.log(std) - log_norm - 0.5 * z ** 2
            rows.append(np.log(entry['P_apriori']) + log_density.sum(axis=1))
        return np.array(rows)

    def predict_proba(self, X):
        """Return the unnormalised joint ``prior * likelihood`` for every class.

        Note that, despite the name, the values are not normalised to sum to
        one, and the array is class-major: shape ``(n_classes, n_samples)``,
        rows ordered like ``classes_``. This layout is kept for backward
        compatibility.
        """
        X = np.asarray(X)
        joint = []
        for entry in self.full_stats.values():
            likelihood = 1
            for j, (mean, std) in enumerate(entry['stats']):
                likelihood = likelihood * self.pdf(X[:, j], mean, std)
            joint.append(entry['P_apriori'] * likelihood)
        return np.array(joint)

    def predict(self, X):
        """Return the most probable class label for each row of ``X``.

        The returned array contains the original labels passed to ``fit``
        (not positional indices), chosen by the highest joint log-likelihood.
        """
        scores = self._joint_log_likelihood(np.asarray(X))
        return self.classes_[np.argmax(scores, axis=0)]

    def score(self, x_test, y_test):
        """Return the accuracy (fraction of exact label matches) on a test set.

        Labels are compared for equality, so non-numeric labels are supported.
        """
        y_test = np.asarray(y_test).ravel()
        n_correct = int(np.count_nonzero(self.predict(x_test) == y_test))
        return n_correct / y_test.shape[0]
       

In [7]:
# Load the Iris dataset as a (features, labels) pair.
X, y = load_iris(return_X_y=True)
# Split into training and validation sets; random_state is fixed so the split
# is the same on every run.
x_train, x_val, y_train, y_val = train_test_split(X, y, random_state=24)

In [8]:
# Sanity-check the split: training-set shape, then the distinct labels and
# their counts in the validation set.
print(x_train.shape)
print(np.unique(y_val, return_counts=True))

(112, 4)
(array([0, 1, 2]), array([12,  8, 18], dtype=int64))


In [9]:
# Fit the classifier on the training split, then report its accuracy on the
# training data and on the held-out validation data.
nb=NaiveBayes()
nb.fit(x_train,y_train)
print('Train score:', nb.score(x_train, y_train))
print('Test score: ', nb.score(x_val, y_val))

Train score: 0.9642857142857143
Test score:  0.9473684210526315
