In [14]:
from mnist import MNIST
from scipy.stats import norm
import numpy as np
import pandas as pd
import math
from PIL import Image
import matplotlib.pyplot as plt
from tqdm.autonotebook import tqdm
tqdm.pandas()

In [15]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Read train and test data

In [16]:
# Dataset source: http://yann.lecun.com/exdb/mnist/

# Point the loader at the local directory holding the MNIST files.
mndata = MNIST('data/mnist_data_files')
# NOTE(review): presumably makes the loader read the gzip-compressed files directly — confirm against the loader's docs.
mndata.gz=True
# Training split: the images and their labels.
train_images, train_labels = mndata.load_training()

### Process data and threshold

In [17]:
# Convert the loader's output to NumPy arrays before building DataFrames.
train_images, train_labels = np.array(train_images), np.array(train_labels)

In [18]:
# Binarise the pixels: values above 127 become 255, everything else becomes 0.
# `np.int` was removed in NumPy 1.24 (it was only an alias of the builtin),
# so the builtin `int` is used for the boolean -> integer cast.
train_images = (pd.DataFrame(train_images) > 127).astype(int).replace(1, 255)
# Keep the labels in a one-column frame so they can be joined to the images later.
train_labels = pd.DataFrame(train_labels, columns=["label"])

In [19]:
# 60000 rows 28*28 pixels
print(train_images.shape) 
print(train_labels.shape)

(60000, 784)
(60000, 1)


In [20]:
# Empirical prior of each digit: the fraction of training rows carrying that label.
value_counts = train_labels["label"].value_counts(normalize=True)
p_train_labels = pd.DataFrame({
    'label': value_counts.index,
    'probability': value_counts.values,
})

In [21]:
# Show the prior of every label (at most 10 rows, one per digit).
p_train_labels.head(10)

Unnamed: 0,label,probability
0,1,0.112367
1,7,0.104417
2,3,0.102183
3,2,0.0993
4,9,0.09915
5,0,0.098717
6,6,0.098633
7,8,0.097517
8,4,0.097367
9,5,0.09035


In [22]:
# Load the test split through the same loader used for the training data.
test_images, test_labels = mndata.load_testing()

In [23]:
# Convert the loader's output to NumPy arrays before building DataFrames.
test_images, test_labels = np.array(test_images), np.array(test_labels)

In [24]:
# Apply the same binarisation as for the training images: > 127 -> 255, else 0.
# `np.int` was removed in NumPy 1.24 (it was only an alias of the builtin),
# so the builtin `int` is used for the boolean -> integer cast.
test_images = (pd.DataFrame(test_images) > 127).astype(int).replace(1, 255)
test_labels = pd.DataFrame(test_labels, columns=["label"])

In [25]:
# Expect 10000 test rows of 784 pixels each.
print(test_images.shape)

(10000, 784)


In [26]:
def get_params(label_group):
    """Fit a normal distribution to every feature column of one label group.

    Parameters
    ----------
    label_group : pandas.DataFrame
        Rows belonging to a single label; must contain a ``'label'`` column,
        which is dropped before fitting.

    Returns
    -------
    The result of ``DataFrame.apply`` over the remaining columns, where each
    column holds the output of ``scipy.stats.norm.fit`` for that column
    (location first, scale second).
    """
    def _fit_column(column):
        # norm.fit returns a (loc, scale) tuple; store it as an array.
        return np.asarray(norm.fit(column))

    pixel_columns = label_group.drop(columns=['label'])
    return pixel_columns.apply(_fit_column, axis=0)

# label_params = train_df.groupby(['label']).apply(get_params)

In [27]:
# label_params.loc[0, :].loc[0]

## Naive Bayes - normal distribution - untouched

In [28]:
# params_ = np.apply_along_axis(lambda x: norm.fit(x), 1, train_images.T)

In [29]:
def calculate_likelihood_for_each_label(p_label, feature_vec, params):
    """Score one feature vector against one label's per-feature normal model.

    Parameters
    ----------
    p_label : mapping with ``'label'`` and ``'probability'`` entries
        The label and its prior probability.
    feature_vec : array-like
        Feature values of the sample being scored.
    params : pandas.DataFrame
        Row ``0`` holds the per-feature means, row ``1`` the per-feature
        standard deviations.

    Returns
    -------
    numpy.ndarray
        ``[label, score]`` where ``score`` is the sum of the per-feature
        normal log-densities (NaN terms are skipped by ``nansum``) plus the
        log of the prior.
    """
    mu, sigma = params.loc[0], params.loc[1]
    per_feature_logpdf = norm.logpdf(feature_vec, mu, sigma)
    # nansum drops the features whose log-density evaluates to NaN.
    score = np.nansum(per_feature_logpdf) + np.log(p_label['probability'])
    return np.array([p_label['label'], score])
    
def get_predict(likelihoods):
    """Return the label of the highest-scoring row.

    Each row of ``likelihoods`` is ``[label, score]``. The comparison is
    strict, so the first row wins a tie. If no row has a score greater than
    ``-inf`` (including when there are no rows), ``-inf`` is returned.
    """
    best_label = float("-inf")
    best_score = float("-inf")
    for row in likelihoods:
        if row[1] > best_score:
            best_label, best_score = row[0], row[1]
    return best_label

### Evaluate

In [39]:
def predict(image, label_params):
    """Predict the label of one image.

    Every row of the module-level ``p_train_labels`` frame supplies a label
    and its prior; the image is scored against that label's parameters
    (looked up in ``label_params`` by label) and the best-scoring label is
    returned via ``get_predict``.
    """
    scored_rows = [
        calculate_likelihood_for_each_label(
            prior_row, image, label_params.loc[prior_row['label'], :]
        )
        for _, prior_row in p_train_labels.iterrows()
    ]
    return get_predict(np.array(scored_rows))

# predict(test_images.loc[1])


In [31]:
#predicts = test_images.progress_apply(predict, axis=1)

In [32]:
def calculate_accuracy(actual, predicts):
    """Return the fraction of positions where ``actual`` equals ``predicts``.

    Parameters
    ----------
    actual : array-like
        True labels, either flat (``N``) or one label per row (``N x 1``).
    predicts : array-like
        Predicted labels, with at least ``len(actual)`` entries; only the
        first ``len(actual)`` are compared.

    Returns
    -------
    float
        Number of matches divided by ``len(actual)``, or ``nan`` when
        ``actual`` is empty (the accuracy is undefined and the previous
        implementation raised ``ZeroDivisionError``).

    Raises
    ------
    IndexError
        If ``predicts`` has fewer entries than ``actual``.
    """
    num_total = len(actual)
    if num_total == 0:
        return float("nan")
    if len(predicts) < num_total:
        raise IndexError(
            "predicts has %d entries but actual has %d" % (len(predicts), num_total)
        )

    # Flatten both sides to 1-D so an (N, 1) label column lines up with a flat
    # prediction vector instead of broadcasting to an N x N comparison.
    actual_flat = np.asarray(actual).reshape(num_total)
    predicted_flat = np.asarray(predicts)[:num_total].reshape(num_total)
    num_correct = int(np.count_nonzero(actual_flat == predicted_flat))
    return num_correct / num_total

In [33]:
# calculate_accuracy(np.array(test_labels), np.array(predicts))

## Image processing

In [49]:
def stretch_image(ori_image):
    """Crop a flattened 28x28 image to its bounding box and resize it back.

    The input is reshaped to 28x28 ``uint8`` pixels, cropped to the box
    reported by ``Image.getbbox`` and resized to 28x28 again.

    Returns
    -------
    pandas.Series
        The resized pixels, flattened back to ``ori_image.shape``.
    """
    pixel_grid = np.array(ori_image).reshape(28, 28).astype('uint8')
    source = Image.fromarray(pixel_grid)
    bounding_box = source.getbbox()
    resized = source.crop(bounding_box).resize((28, 28))
    flat_pixels = np.array(resized).reshape(ori_image.shape)
    return pd.Series(flat_pixels)
    
def stretch_images(ori_images):
    """Apply ``stretch_image`` to every row of ``ori_images`` with a progress bar."""
    print("Stretch images")
    stretched = ori_images.progress_apply(stretch_image, axis=1)
    return stretched

In [35]:
# stretch_image(test_images.loc[0])

## Entry point

In [41]:
def accuracy(distribution, stretched, train_images, test_images):
    """Train the naive Bayes model and return its accuracy on the test images.

    Parameters
    ----------
    distribution : str
        Per-feature distribution to fit. Only ``'norm'`` is implemented.
    stretched : bool
        When true, both image sets are passed through ``stretch_images``
        before training and prediction.
    train_images, test_images : pandas.DataFrame
        One image per row. Labels come from the module-level ``train_labels``
        and ``test_labels`` frames.

    Returns
    -------
    float
        Result of ``calculate_accuracy`` on the test labels and predictions.

    Raises
    ------
    NotImplementedError
        If ``distribution`` is anything other than ``'norm'``. Previously the
        argument was ignored, so any value silently produced the normal model.
    """
    if distribution != 'norm':
        raise NotImplementedError(
            "unsupported distribution %r; only 'norm' is implemented" % (distribution,)
        )

    if stretched:
        train_images = stretch_images(train_images)
        test_images = stretch_images(test_images)

    train_df = train_images.join(train_labels)
    label_params = train_df.groupby(['label']).apply(get_params)
    # get_params yields two rows (mean, std) per label and one column per pixel.
    expected_shape = (2 * train_labels['label'].nunique(), train_images.shape[1])
    assert label_params.shape == expected_shape, (
        "unexpected parameter table shape %s, expected %s"
        % (label_params.shape, expected_shape)
    )

    print('Get predicts...')
    predicts = test_images.progress_apply(predict, args=(label_params, ), axis=1)
    return calculate_accuracy(np.array(test_labels), np.array(predicts))

In [42]:
# Normal-distribution model on the thresholded images without stretching (stretched=False).
accuracy_norm_origin = accuracy('norm', False, train_images, test_images)

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

  x = np.asarray((x - loc)/scale, dtype=dtyp)
  x = np.asarray((x - loc)/scale, dtype=dtyp)
  return (self.a <= x) & (x <= self.b)
  return (self.a <= x) & (x <= self.b)


In [43]:
# Accuracy obtained without stretching.
print(accuracy_norm_origin)

0.6257


In [44]:
# TODO: accuracy('bernoulli', False, train_images, test_images)

In [50]:
# Same model, but with both image sets passed through stretch_images first (stretched=True).
accuracy_norm_stretched = accuracy('norm', True, train_images, test_images)

HBox(children=(IntProgress(value=0, max=60000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

In [46]:
# Accuracy obtained with stretched images.
print(accuracy_norm_stretched)

0.837


In [None]:
# TODO: accuracy('bernoulli', True, train_images, test_images)