https://hackernoon.com/implementation-of-gaussian-naive-bayes-in-python-from-scratch-c4ea64e3944d

https://peterroelants.github.io/posts/multivariate-normal-primer/

In [None]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.naive_bayes import GaussianNB
from math import exp

In [None]:
class myGaussianClassifier(BaseEstimator, ClassifierMixin):
    """Gaussian classifier with one full covariance matrix per class.

    Every class is modelled as a multivariate normal distribution whose mean
    and covariance are estimated from the training rows of that class.
    Prediction picks the class with the largest prior-weighted likelihood.

    Parameters
    ----------
    alpha : float, default=1e-5
        Value added to the diagonal of every sample covariance matrix so the
        matrix stays invertible.

    Attributes (set by ``fit``)
    ---------------------------
    label : distinct class labels in sorted order (also exposed as
        ``classes_``, the conventional scikit-learn name).
    c_ : number of classes.
    d_ : number of features.
    prior_ : array of shape (c_,), fraction of training rows per class.
    mean_ : array of shape (c_, d_), per-class feature means.
    cov_ : array of shape (c_, d_, d_), per-class regularised covariances.
    """

    def __init__(self, alpha=1e-5):
        super().__init__()
        self.alpha = alpha

    def fit(self, train, target):
        """Estimate the prior, mean and covariance of every class.

        Parameters
        ----------
        train : array-like of shape (N, d)
            Training samples, one per row.
        target : array-like with N entries
            Class label of each training row.

        Returns
        -------
        self
        """
        train = np.asarray(train, dtype=float)
        labels = np.asarray(target).ravel()
        N, d = train.shape  # N: number of training samples
        self.label = np.unique(labels)  # np.unique already returns sorted values
        self.classes_ = self.label
        self.c_ = self.label.size
        self.d_ = d
        self.prior_ = np.zeros((self.c_,))
        self.mean_ = np.zeros((self.c_, self.d_))
        self.cov_ = np.zeros((self.c_, self.d_, self.d_))
        ridge = self.alpha * np.eye(d)
        for cid, y in enumerate(self.label):
            members = train[labels == y]
            if members.shape[0] > 1:
                # atleast_2d: np.cov returns a 0-d array when d == 1.
                scatter = np.atleast_2d(np.cov(members, rowvar=False))
            else:
                # A single sample has no sample covariance (np.cov would
                # give NaN); fall back to the ridge term alone.
                scatter = np.zeros((d, d))
            self.cov_[cid] = scatter + ridge
            self.mean_[cid] = members.mean(axis=0)
            # The prior is the share of training rows in this class, i.e. a
            # row count -- not a sum over the feature values.
            self.prior_[cid] = members.shape[0] / N
        return self

    def _joint_log_likelihood(self, X):
        """Return log(P(x | C) * P(C)) for each row of X and each class.

        The computation stays in the log domain because the determinant and
        the (2*pi)**d factor of the Gaussian density easily underflow or
        overflow once d reaches a few dozen features.
        """
        X = np.asarray(X, dtype=float)
        log_joint = np.empty((X.shape[0], self.c_))
        log_two_pi_d = self.d_ * np.log(2 * np.pi)
        for i in range(self.c_):
            centered = X - self.mean_[i]
            _, log_det = np.linalg.slogdet(self.cov_[i])
            # Squared Mahalanobis distance of every row; solving the linear
            # system avoids forming the explicit inverse covariance.
            whitened = np.linalg.solve(self.cov_[i], centered.T).T
            mahalanobis = np.sum(centered * whitened, axis=1)
            log_joint[:, i] = (
                -0.5 * (log_two_pi_d + log_det + mahalanobis)
                + np.log(self.prior_[i])
            )
        return log_joint

    def predict(self, X, y=None):
        """Return the most probable class label for every row of X.

        ``y`` is accepted for signature compatibility and ignored.
        """
        return self.label[np.argmax(self._joint_log_likelihood(X), axis=1)]

    def predict_proba(self, X, y=None):
        """Return the posterior P(C | x) for every row of X.

        Column ``j`` corresponds to ``self.label[j]`` and every row sums to 1.
        ``y`` is accepted for signature compatibility and ignored.
        """
        log_joint = self._joint_log_likelihood(X)
        # Subtract the row maximum before exponentiating (log-sum-exp trick)
        # so that at least one entry per row is exp(0) == 1.
        shifted = np.exp(log_joint - log_joint.max(axis=1, keepdims=True))
        return shifted / shifted.sum(axis=1, keepdims=True)

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn import neighbors, svm, naive_bayes
from sklearn.preprocessing import MinMaxScaler, scale
from sklearn.pipeline import Pipeline
import sklearn.datasets as ds
import numpy as np

# Load the breast-cancer data set as a (features, labels) pair.
# return_X_y is passed by keyword: recent scikit-learn releases declare the
# loader's arguments keyword-only, so a positional True is rejected there.
data, target = ds.load_breast_cancer(return_X_y=True)

# Classifiers under comparison.
gauss_clf = myGaussianClassifier()
knn_clf = neighbors.KNeighborsClassifier(n_neighbors=3, weights='uniform', algorithm='kd_tree', leaf_size=30)
svm_clf = svm.SVC(kernel='linear', C=1, probability=True)
gaussNB_clf = naive_bayes.GaussianNB()

# Hyper-parameter grids searched by the inner cross-validation.
knn_clf_param = {'n_neighbors': [1, 3, 5, 7]}
svm_clf_param = {'C': [0.01, 0.1, 1, 10]}
gauss_clf_param = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
gaussNB_clf_param = {'var_smoothing': np.logspace(-5, 2, 6)}

# Inner 5-fold cross-validation for hyper-parameter tuning.
# n_jobs=-1 reportedly misbehaves on Windows, so every search runs with n_jobs=1.
gauss_gs = GridSearchCV(estimator=gauss_clf, param_grid=gauss_clf_param, scoring='accuracy', cv=5, n_jobs=1, verbose=1)
knn_gs = GridSearchCV(estimator=knn_clf, param_grid=knn_clf_param, scoring='accuracy', cv=5, n_jobs=1, verbose=1)
svm_gs = GridSearchCV(estimator=svm_clf, param_grid=svm_clf_param, scoring='accuracy', cv=5, n_jobs=1, verbose=1)
# Only the SVM gets its features rescaled to [0, 1] before the grid search.
svm_pipeline = Pipeline([('scaler', MinMaxScaler()), ('svm_gs', svm_gs)])
gaussNB_gs = GridSearchCV(estimator=gaussNB_clf, param_grid=gaussNB_clf_param, scoring='accuracy', cv=5, n_jobs=1, verbose=1)

# Outer cross-validation for estimating the accuracy of each classifier.
# Classifiers that are compared must be evaluated on the same k-fold split,
# hence one shared splitter with a fixed random_state.
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=3)

# The outer loops run serially as well: the Gaussian classifier previously
# asked for 20 workers, contradicting the n_jobs=1 note above and the three
# sibling calls (which use the default).
gauss_scores = cross_val_score(gauss_gs, data, target, scoring='accuracy', cv=k_fold, n_jobs=1, verbose=10)
knn_scores = cross_val_score(knn_gs, data, target, scoring='accuracy', cv=k_fold, verbose=10)
svm_scores = cross_val_score(svm_pipeline, data, target, scoring='accuracy', cv=k_fold, verbose=10)
gaussNB_scores = cross_val_score(gaussNB_gs, data, target, scoring='accuracy', cv=k_fold, verbose=10)

NameError: ignored

In [None]:
# Print each classifier's banner followed by its mean outer-fold accuracy.
score_report = (
    ("----------------------GAUSS----------------------------", gauss_scores),
    ("----------------------KNN----------------------------", knn_scores),
    ("----------------------SVM----------------------------", svm_scores),
    ("----------------------GAUSS_NB----------------------------", gaussNB_scores),
)
for banner, fold_scores in score_report:
    print(banner)
    print(np.mean(fold_scores))

----------------------GAUSS----------------------------
0.9525689223057643
----------------------KNN----------------------------
0.9331453634085213
----------------------SVM----------------------------
0.982456140350877
----------------------GAUSS_NB----------------------------
0.9244047619047618


In [None]:
def test_pvalue(pvalue, cls1, cls2, alpha=0.05):
    """Print whether two classifiers differ significantly and return the p-value.

    Parameters
    ----------
    pvalue : float
        p-value of the statistical test comparing the two classifiers.
    cls1, cls2 : str
        Display names of the two classifiers.
    alpha : float, default=0.05
        Significance level; the difference is reported as significant when
        ``pvalue <= alpha``.

    Returns
    -------
    tuple
        The pair ``("pvalue:", pvalue)``.
    """
    print('------------------------')
    if pvalue <= alpha:
        print('the differrence in accuracy between ', cls1, ' and ', cls2, ' is significant')
    else:
        # A NaN p-value also lands here, because NaN <= alpha is False.
        print("the differrence in accuracy between", cls1, " and ", cls2, " is not significant")
    return "pvalue:", pvalue

In [None]:
# The SVM has the highest mean accuracy, so each other classifier is
# compared against it.
from scipy import stats

max_cls = "SVM Classifier"

# Related-samples t-tests on the per-fold accuracy arrays.
t1, pvalue1 = stats.ttest_rel(gauss_scores, svm_scores)
t2, pvalue2 = stats.ttest_rel(knn_scores, svm_scores)
t3, pvalue3 = stats.ttest_rel(gaussNB_scores, svm_scores)

# Report the verdicts in the same order the tests were run.
for challenger, challenger_pvalue in (
    ("Gauss Clasifier", pvalue1),
    ("KNN Clasifier", pvalue2),
    ("GaussNB Clasifier", pvalue3),
):
    print(test_pvalue(challenger_pvalue, challenger, max_cls))

------------------------
the differrence in accuracy between  Gauss Clasifier  and  SVM Classifier  is significant
('pvalue:', 0.005723268993808907)
------------------------
the differrence in accuracy between  KNN Clasifier  and  SVM Classifier  is significant
('pvalue:', 0.009607662177057304)
------------------------
the differrence in accuracy between  GaussNB Clasifier  and  SVM Classifier  is significant
('pvalue:', 0.0018882146755463938)
