In [1]:
import numpy as np
import pandas as pd

Load the data

In [2]:
mnist_data = pd.read_csv('../Data/mnist.data')

In [3]:
features = mnist_data.columns[1:]

In [4]:
num_lab = mnist_data.label.unique().shape[0]  # 10
num_feat = mnist_data.columns.shape[0] - 1  # 784 Becuase labels is one of the columns
num_cat = 256  # No of values each feature can take

Create an empty data frame to hold the probabilities for each value that a feature takes for all features and classes. 

In [5]:
lab_ind = np.arange(num_lab).repeat(num_cat)
feat_ind = np.arange(num_feat)
cat_ind = np.tile(np.arange(num_cat), num_lab)

# Create DataFrame
probabilities = pd.DataFrame(np.empty(shape=(num_lab * num_cat, num_feat)), index=[lab_ind, cat_ind], columns=features)
probabilities.index.names = ['labels', 'categories']
probabilities.columns.names = ['features']

Group data by classes and extract probabilities

In [6]:
grouped_data = mnist_data.groupby('label', axis=0)
group_counts = grouped_data.count()['pixel0']
smoothed_counts = group_counts + num_cat
smoothed_counts = pd.DataFrame(np.repeat(smoothed_counts.values, num_cat), index=probabilities.index, dtype=np.float64)
for label, label_data in grouped_data:
    for feature in features:
        probabilities.loc[label][feature] = label_data[feature].value_counts()
probabilities = probabilities.fillna(0)
probabilities += 1 # Laplace smoothing
probabilities = pd.DataFrame(-np.log(probabilities.values/smoothed_counts.values), index=probabilities.index, columns=probabilities.columns)

In [7]:
prior = grouped_data.count()/mnist_data['label'].count()
prior = pd.Series(-np.log(prior['pixel0'].values))

In [8]:
def predict_point(x):
    class_cond_prob = np.zeros(shape=num_lab)
    for i, val in enumerate(x):
        class_cond_prob += probabilities[features[i]].loc[:, val]
    class_cond_prob += prior
    return np.argmax(class_cond_prob)

In [9]:
def predict(test_data):
    predicted_labels = np.empty(test_data.shape[0])
    for index, point in enumerate(test_data):
        predicted_labels[index] = predict_point(point)
        
    return predicted_labels

In [10]:
histogram = probabilities.values.reshape((num_lab, -1, num_feat)).swapaxes(1,2)

In [11]:
data = np.genfromtxt('../Data/mnist.data', delimiter=',', names=True, dtype=np.uint8)

In [12]:
data_view = data.view(np.uint8).reshape(data.shape + (-1,))

In [13]:
classes = np.unique(data['label'])

In [27]:
x=np.random.random_integers(0, 255, 784)
ccp = histogram[:, np.arange(num_feat), x].sum(axis=1) + prior.values
np.argmax(ccp)

1

In [22]:
np.argmax(ccp)

1

In [None]:
histogram.shape