In [225]:
from mnist import MNIST
from scipy.stats import norm
import numpy as np
import pandas as pd
import math
from tqdm import tqdm

## Read train and test data

In [185]:
# http://yann.lecun.com/exdb/mnist/

mndata = MNIST('data/mnist_data_files')
mndata.gz=True
train_images, train_labels = mndata.load_training()

In [186]:
train_images = np.array(train_images)
train_labels = np.array(train_labels)

In [187]:
# 60000 rows 28*28 pixels
print(train_images.shape) 
print(train_labels.shape)

(60000, 784)
(60000,)


In [188]:
value_counts = pd.Series(train_labels).value_counts(normalize=True)
indices = np.asarray(value_counts.index.values)
value_counts = np.asarray(value_counts)
p_train_labels = np.vstack([indices, value_counts]).T


In [189]:
print(p_train_labels)

[[1.         0.11236667]
 [7.         0.10441667]
 [3.         0.10218333]
 [2.         0.0993    ]
 [9.         0.09915   ]
 [0.         0.09871667]
 [6.         0.09863333]
 [8.         0.09751667]
 [4.         0.09736667]
 [5.         0.09035   ]]


In [190]:
test_images, test_labels = mndata.load_testing()

In [191]:
test_images = np.array(test_images)
test_labels = np.array(test_labels)

In [201]:
print(test_images.shape)
print(test_labels[1:10])

(10000, 784)
[2 1 0 4 1 4 9 5 9]


## Naive Bayes - normal distribution

In [193]:
params = np.apply_along_axis(lambda x: norm.fit(x), 1, train_images.T)

In [229]:
def calculate_likelihood_for_each_label(p_label, feature_vec, params):
    means = params.T[0]
    variations = params.T[1]
    likelihood = np.nansum(norm.logpdf(feature_vec, means, np.sqrt(variations)))
    likelihood = likelihood + np.log(p_label[1])
    return np.array([p_label[0], likelihood])
    
def get_predict(likelihoods):
    max_row = [float("-inf"), float("-inf")]
    for likelihood in likelihoods:
        if(likelihood[1] > max_row[1]):
            max_row = likelihood
    return max_row[0]

### Evaluate

In [245]:
def predict(test_image):
    likelihoods = np.apply_along_axis(lambda label: calculate_likelihood_for_each_label(label, test_image, params), 1, p_train_labels)
    return get_predict(likelihoods)
    
predicts = np.apply_along_axis(lambda image: predict(image), 1, test_images)


In [246]:
predicts

array([1., 1., 1., ..., 1., 1., 1.])

In [247]:
def calculate_accuracy(actual, predicts):
    TP = 0
    num_total = len(actual)
    for i in range(num_total):
        if actual[i] == predicts[i]:
            TP = TP + 1
    return TP/num_total

In [248]:
calculate_accuracy(test_labels, predicts)

0.1135