In [225]:
from mnist import MNIST
from scipy.stats import norm
import numpy as np
import pandas as pd
import math
from tqdm import tqdm

## Read train and test data

In [293]:
# http://yann.lecun.com/exdb/mnist/

mndata = MNIST('data/mnist_data_files')
mndata.gz=True
train_images, train_labels = mndata.load_training()

In [295]:
train_images = np.array(train_images)
train_labels = np.array(train_labels)

In [297]:
train_images = pd.DataFrame(train_images)
train_labels = pd.DataFrame(train_labels,columns=["label"])

In [298]:
# 60000 rows 28*28 pixels
print(train_images.shape) 
print(train_labels.shape)

(60000, 784)
(60000, 1)


In [299]:
p_train_labels = train_labels["label"].value_counts(normalize=True)

In [300]:
print(p_train_labels)

1    0.112367
7    0.104417
3    0.102183
2    0.099300
9    0.099150
0    0.098717
6    0.098633
8    0.097517
4    0.097367
5    0.090350
Name: label, dtype: float64


In [301]:
test_images, test_labels = mndata.load_testing()

In [302]:
test_images = np.array(test_images)
test_labels = np.array(test_labels)

In [303]:
test_images = pd.DataFrame(test_images)
test_labels = pd.DataFrame(test_labels,columns=["label"])

In [304]:
print(test_images.shape)
print(test_labels[1:10])

(10000, 784)
   label
1      2
2      1
3      0
4      4
5      1
6      4
7      9
8      5
9      9


## Naive Bayes - normal distribution

In [193]:
params_ = np.apply_along_axis(lambda x: norm.fit(x), 1, train_images.T)

In [266]:
def calculate_likelihood_for_each_label(p_label, feature_vec, params):
    means = params.T[0]
    stds = params.T[1]
    likelihood = np.nansum(norm.logpdf(feature_vec, means, stds))
    print(likelihood)
    likelihood = likelihood + np.log(p_label[1])
    return np.array([p_label[0], likelihood])
    
def get_predict(likelihoods):
    max_row = [float("-inf"), float("-inf")]
    for likelihood in likelihoods:
        if(likelihood[1] > max_row[1]):
            max_row = likelihood
    return max_row[0]

### Evaluate

In [267]:
def predict(image):
    likelihoods = np.apply_along_axis(lambda label: calculate_likelihood_for_each_label(label,image, params), 1, p_train_labels)
    return get_predict(likelihoods)
    
predicts = np.apply_along_axis(lambda image: predict(image), 1, test_images[:100])


-3133.4518432115715
-3133.4518432115715
-3133.4518432115715
-3133.4518432115715
-3133.4518432115715
-3133.4518432115715
-3133.4518432115715
-3133.4518432115715
-3133.4518432115715
-3133.4518432115715
-4403.603236880467
-4403.603236880467
-4403.603236880467
-4403.603236880467
-4403.603236880467
-4403.603236880467
-4403.603236880467
-4403.603236880467
-4403.603236880467
-4403.603236880467
-3029.359672445281
-3029.359672445281
-3029.359672445281
-3029.359672445281
-3029.359672445281
-3029.359672445281
-3029.359672445281
-3029.359672445281
-3029.359672445281
-3029.359672445281
-3124.1014177160323
-3124.1014177160323
-3124.1014177160323
-3124.1014177160323
-3124.1014177160323
-3124.1014177160323
-3124.1014177160323
-3124.1014177160323
-3124.1014177160323
-3124.1014177160323
-3074.28658730994
-3074.28658730994
-3074.28658730994
-3074.28658730994
-3074.28658730994
-3074.28658730994
-3074.28658730994
-3074.28658730994
-3074.28658730994
-3074.28658730994
-3033.3898119140076
-3033.3898119140076


-3070.937211550481
-3070.937211550481
-3070.937211550481
-3070.937211550481
-3384.155578457876
-3384.155578457876
-3384.155578457876
-3384.155578457876
-3384.155578457876
-3384.155578457876
-3384.155578457876
-3384.155578457876
-3384.155578457876
-3384.155578457876
-3139.71141749602
-3139.71141749602
-3139.71141749602
-3139.71141749602
-3139.71141749602
-3139.71141749602
-3139.71141749602
-3139.71141749602
-3139.71141749602
-3139.71141749602
-3086.3879574306725
-3086.3879574306725
-3086.3879574306725
-3086.3879574306725
-3086.3879574306725
-3086.3879574306725
-3086.3879574306725
-3086.3879574306725
-3086.3879574306725
-3086.3879574306725
-4412.108346876814
-4412.108346876814
-4412.108346876814
-4412.108346876814
-4412.108346876814
-4412.108346876814
-4412.108346876814
-4412.108346876814
-4412.108346876814
-4412.108346876814
-3084.746606511471
-3084.746606511471
-3084.746606511471
-3084.746606511471
-3084.746606511471
-3084.746606511471
-3084.746606511471
-3084.746606511471
-3084.746606

-3169.118125881509
-3169.118125881509
-3169.118125881509
-3169.118125881509
-3169.118125881509


In [255]:
predicts

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [247]:
def calculate_accuracy(actual, predicts):
    TP = 0
    num_total = len(actual)
    for i in range(num_total):
        if actual[i] == predicts[i]:
            TP = TP + 1
    return TP/num_total

In [248]:
calculate_accuracy(test_labels, predicts)

0.1135