In [0]:
# Switch the working directory to the course folder under the Colab Drive mount path.
# NOTE(review): presumably required so the local `util` module can be imported — confirm.
# The path is hardcoded, so this cell fails wherever that directory does not exist.
import os; os.chdir('/content/drive/My Drive/Colab Notebooks/Lazy courses/Unsupervised Learning')

In [0]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from scipy.stats import norm
from scipy.stats import multivariate_normal as mvn
from util import getKaggleMNIST

In [0]:
class GaussianNB(object):
    """Gaussian Naive Bayes classifier with a diagonal covariance per class.

    Each class is modelled by an independent Gaussian per feature (mean and
    variance estimated from the training rows of that class) together with a
    class prior equal to the class frequency in the training labels.

    Attributes set by ``fit``:
        gaussian: dict mapping class label -> {'mean': ..., 'var': ...}.
        priors:   dict mapping class label -> prior probability.
        classes_: sorted array of the distinct labels seen in ``fit``; column
                  ``k`` of the score matrix in ``predict`` belongs to
                  ``classes_[k]``.
    """

    def fit(self, X, Y, smoothing=1e-2):
        """Estimate per-class feature means, variances and priors.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            Y: 1-D array-like of n_samples class labels (any hashable,
               sortable label type; labels need not be 0..K-1).
            smoothing: constant added to every variance so that features with
                zero variance inside a class do not yield a singular Gaussian.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if X and Y have a different number of samples.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)
        if len(X) != len(Y):
            raise ValueError(
                "X and Y must have the same number of samples, got %d and %d"
                % (len(X), len(Y))
            )

        self.gaussian = dict()
        self.priors = dict()
        # Sorted unique labels give every class a stable column position,
        # independent of the label values themselves.
        self.classes_ = np.unique(Y)

        for c in self.classes_:
            mask = Y == c
            current_x = X[mask]
            self.gaussian[c] = {
                'mean': current_x.mean(axis=0),
                'var': current_x.var(axis=0) + smoothing,
            }
            self.priors[c] = float(mask.sum()) / len(Y)
        return self

    def score(self, X, Y):
        """Return the fraction of rows of X whose prediction equals Y."""
        P = self.predict(X)
        return np.mean(P == Y)

    def predict(self, X):
        """Return the most probable class label for every row of X.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).

        Returns:
            1-D array of n_samples labels taken from the labels seen in
            ``fit``.
        """
        X = np.asarray(X)
        N, _ = X.shape  # also enforces that X is 2-D
        K = len(self.classes_)
        P = np.zeros((N, K))
        # Index columns by class position, not by the label value: using the
        # label as an index only works when labels are exactly 0..K-1.
        for k, c in enumerate(self.classes_):
            g = self.gaussian[c]
            mean, var = g['mean'], g['var']
            # A 1-D `cov` is interpreted by scipy as a diagonal covariance.
            P[:, k] = mvn.logpdf(X, mean=mean, cov=var) + np.log(self.priors[c])
        # Map the winning column back to the original label.
        return self.classes_[np.argmax(P, axis=1)]

In [0]:
# Load the train/test data through the project helper getKaggleMNIST (imported from util).
# NOTE(review): presumably X* are feature matrices and Y* the matching label vectors — confirm against util.
Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()

In [28]:
# Baseline: Naive Bayes without PCA — fit on the original features, then report
# accuracy (fraction of predictions equal to the labels) on the train and test sets.
model1 = GaussianNB()
model1.fit(Xtrain, Ytrain)
print("NB train score:", model1.score(Xtrain, Ytrain))
print("NB test score:", model1.score(Xtest, Ytest))

NB train score: 0.8016101694915254
NB test score: 0.795


In [32]:
# Naive Bayes with PCA: same classifier, trained on a 50-component PCA projection.

# Fit PCA on the training set only, then apply that same projection to the test set.
pca = PCA(n_components=50)
Ztrain = pca.fit_transform(Xtrain)
Ztest = pca.transform(Xtest)

# Train on the PCA-transformed data and report train/test accuracy.
model2 = GaussianNB()
model2.fit(Ztrain, Ytrain)
print("NB train score:", model2.score(Ztrain, Ytrain))
print("NB test score:", model2.score(Ztest, Ytest))

NB train score: 0.8711694915254238
NB test score: 0.872
