# Introduction

# Setup

## Imports

In [2]:
import math
import pprint

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris, load_breast_cancer, load_boston
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.base import BaseEstimator, ClassifierMixin

## Loading Dataset

In [133]:
dataset = np.array([
    [1,1,1],
    [0,0,1],
    [1,1,1],
    [1,1,1],
    [1,1,1],
    [0,0,0],
    [0,0,0],
    [1,1,0],
    [1,0,0],
    [0,0,0],
])

X = dataset[:,0:2]
y = dataset[:,2]

# Simple NaiveBayes

In [138]:
class CustomSimpleNB(BaseEstimator, ClassifierMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        self._check_params(X, y)
        
        self.classes = list(set(y))
        self.xdim = X.shape[1]
        self._naive_bayes(X, y)
        
        self.fitted_ = True
        
        return self
    
    
    def predict(self, X):
        if self.fitted_ == None:
            raise Exception('"predict()" called before fit()')
        else:
            y_preds = []
            for x in X:
                y_pred = None
                probs = dict.fromkeys(self.classes, 1)
                
                for c in self.classes:
                    probs[c] = probs[c] * self.class_probs[c]
                    for i in range(self.xdim):
                        probs[c] = probs[c] * (self.feature_probs[i][c][x[i]])
                    
                y_pred = max(probs, key=(lambda k: probs[k]))
                y_preds.append(y_pred)
            return y_preds
        
    
    def _naive_bayes(self, X,y=None):
        df = pd.DataFrame(X)
        df['y'] = y
        
        #getting class probabilities
        class_probs = self._class_probs(pd.DataFrame.copy(df))
        
        #calculating conditional probabilities for each feature and each value of feature over each class
        feature_probs = {}
        for i in range(self.xdim):
            feature_probs[i] = self._feature_probs(df[[i, 'y']])
            
        self.class_probs = class_probs
        self.feature_probs = feature_probs
        
        
    def _feature_probs(self, df):
        values = list(set(df.iloc[:,0].values))
        probs = {}
        
        for c in self.classes:
            probs[c] = {}
            for value in values:
                probs[c][value] = df[df.y == c][df.iloc[:,0] == value].count().y / df[df.y == c].count().y
                
        return probs
            
    
    def _class_probs(self, df):
        probs = {}
        for c in self.classes:
            probs[c] = df[df.y == c].count().y / df.count().y
        return probs
    
    
    def _check_params(self, X, y=None):
        return

In [139]:
custModel = CustomSimpleNB()
custModel.fit(X, y)

accuracy_score(y_pred=custModel.predict(X), y_true=y)



0.80000000000000004

# GaussianNB

In [86]:
breast = load_breast_cancer()
iris = load_iris()

dataset = iris
X = dataset.data
y = dataset.target

xTrain, xTest, yTrain, yTest = train_test_split(X,y)

In [173]:
class CustomGaussianNB(BaseEstimator, ClassifierMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        self._check_params(X, y)
        
        self.classes = list(set(y))
        self.xdim = X.shape[1]
        self._naive_bayes(X, y)
        
        self.fitted_ = True
        
        return self
    
    
    def predict(self, X):
        if self.fitted_ == None:
            raise Exception('"predict()" called before fit()')
        else:
            y_preds = []
            for x in X:
                y_pred = None
                probs = dict.fromkeys(self.classes, 1)
                
                for c in self.classes:
                    probs[c] = self.class_probs[c]
                    for i in range(self.xdim):
                        probs[c] = probs[c] * self._pdf(c, i, x[i])
                    
                y_pred = max(probs, key=(lambda k: probs[k]))
                y_preds.append(y_pred)
            return y_preds
        
    
    def _naive_bayes(self, X,y=None):
        df = pd.DataFrame(X)
        df['y'] = y
        
        #getting class probabilities
        class_probs = self._class_probs(pd.DataFrame.copy(df))
        
        #calculating means and standard deviations for each feature to calculate Gaussian PDF.
        ms = {}
        for c in self.classes:
            ms[c] = {}
            for i in range(self.xdim):
                ms[c][i] = {}
                ms[c][i]['mean'] = np.mean(df[df.y == c][i].values)
                ms[c][i]['std'] = np.std(df[df.y == c][i].values)
            
        self.class_probs = class_probs
        self.ms = ms
        self.df = df
            
    
    def _pdf(self, c, i, x):
        std = self.ms[c][i]['std']
        mean = self.ms[c][i]['mean']
        
        num = math.e ** (-((x-mean) ** 2) / (2 * (std ** 2)))
        den = math.sqrt(2 * math.pi) * std
        return num/den
    
    
    def _class_probs(self, df):
        probs = {}
        for c in self.classes:
            probs[c] = df[df.y == c].count().y / df.count().y
        return probs
    
    
    def _check_params(self, X, y=None):
        return

### CustomGaussianNB vs GaussianNB

In [179]:
skModel = GaussianNB()
custModel = CustomGaussianNB()

skModel.fit(xTrain, yTrain)
custModel.fit(xTrain, yTrain)

print(cross_val_score(estimator=skModel, cv=5, X=xTest, y=yTest))
print(cross_val_score(estimator=custModel, cv=5, X=xTest, y=yTest))

[ 0.88888889  1.          1.          0.85714286  1.        ]
[ 0.88888889  1.          1.          0.85714286  1.        ]


# MultinomialNB

In [111]:
class CustomMultinomialNB(BaseEstimator, ClassifierMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        self._check_params(X, y)
        
        self.classes = list(set(y))
        self.xdim = X.shape[1]
        self._naive_bayes(X, y)
        
        self.fitted_ = True
        
        return self
    
    
    def predict(self, X):
        if self.fitted_ == None:
            raise Exception('"predict()" called before fit()')
        else:
            y_preds = []
            _ = 1
            for x in X:
                printl('{0}/{1}'.format(_, X.shape[0]))
                _ += 1
                y_pred = None
                probs = dict.fromkeys(self.classes, 1)
                
                for c in self.classes:
                    probs[c] = self.cprobs[0]
                    for i in range(self.xdim):
                        probs[c] = probs[c] * (self.fprobs[c][i] ** x[i])
                y_pred = max(probs, key=(lambda k: probs[k]))
                y_preds.append(y_pred)
            print()
            return y_preds
        
    
    def _naive_bayes(self, X,y=None):
        printl('constructing dataframe...')
        ds = np.hstack((X,y.reshape(-1,1)))
        self.cprobs, self.fprobs = self._probs(ds)
        
        
    def _probs(self, ds):
        cprobs = {}
        fprobs = {}
        
        for c in self.classes:
            printl('calculating class and feature probabilities for class {0}/{1}...'.format(c+1, len(self.classes)))
            temp = ds[ds[:,-1] == c]
            cprobs[c] = np.log(temp.shape[0] / ds.shape[0])
            fprobs[c] = np.log((temp.sum(axis=0) + 1) / (temp[:,:-1].sum() + ds.shape[1] - 1))
        print()
            
        return cprobs, fprobs
    
    
    def _check_params(self, X, y=None):
        return

In [109]:
breast = load_breast_cancer()
iris = load_iris()

dataset = iris
X = dataset.data
y = dataset.target

xTrain, xTest, yTrain, yTest = train_test_split(X,y)

### CustomMultinomialNB vs MultinomialNB

In [110]:
skModel = MultinomialNB().fit(xTrain, yTrain)
custModel = CustomMultinomialNB().fit(xTrain, yTrain)

print(cross_val_score(estimator=skModel, cv=5, X=xTest, y=yTest))
print(cross_val_score(estimator=custModel, cv=5, X=xTest, y=yTest))

[ 1.          0.75        0.625       0.57142857  0.66666667]
[ 1.          0.75        0.625       0.57142857  0.66666667]
