In [17]:
import scipy.io as sio

mat_contents = sio.loadmat("hw1data.mat")
mat_contents

{'X': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ..., 
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8), 'Y': array([[9],
        [5],
        [9],
        ..., 
        [4],
        [0],
        [9]], dtype=uint8), '__globals__': [], '__header__': b'MATLAB 5.0 MAT-file, Platform: PCWIN64, Created on: Sat May 30 00:42:33 2015', '__version__': '1.0'}

In [18]:
import numpy as np
X = np.asarray(mat_contents['X'], dtype=np.int32) # Data Matrix
Y = np.asarray(mat_contents['Y'], dtype=np.int32)# Labels
X.shape

(10000, 784)

### Preprocessing dataset X

In [19]:
## Normalize data

import math
def mean(numbers):
    return sum(numbers)/float(len(numbers))

def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
    return math.sqrt(variance)

def normalize_dataset(dataset):
    columns = dataset.shape[1]
    temp_dataset = []
    for i in range(columns):
        temp = list(dataset[:, i])
        minimum = min(temp)
        maximum = max(temp)
        for j in range(len(temp)):
            temp[j] = (temp[j] - minimum) / (maximum - minimum)
        temp_dataset.append(temp)

    normalized_dataset = np.array(temp_dataset).transpose()
    return normalized_dataset


In [20]:
# First, prepare the training set and the testing

import random, numpy

numpy.random.shuffle(X)
rows = X.shape[0] * 0.8
training, trainingLabels = X[:rows,:], Y[:rows,:]
testing, testingLabels = X[rows:,:], Y[rows:,:]




In [21]:
# Separate data points by labels(class)
def separateByClass(dataset, labels):
    separated = {}
    for i in range(dataset.shape[0]):
        vector = dataset[i]
        if (labels[i][0] not in separated):
            separated[labels[i][0]] = []
        separated[labels[i][0]].append(vector)
    return separated
                   
#resultsSeparatedByClass = separateByClass(X,Y)
#resultsSeparatedByClass

### Sumarize dataset

In [22]:
import heapq
import numpy as np

def summarize(dataset):
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    return summaries

summaries = summarize(X)
variances = np.empty(784)
for i in range(len(summaries)):
    variances[i] = summaries[i][1]

In [23]:
best_feature_indice = (variances).argsort()[-200:][::-1]
#print(best_feature_indice)

## After selecting the most valuable features
trainingSet = training[:, best_feature_indice]
testingSet = testing[:, best_feature_indice]

normalized_trainingSet = normalize_dataset(trainingSet)
normalized_testingSet = normalize_dataset(testingSet)

optimal_summaries = summarize(trainingSet)
#optimal_summaries

In [24]:
## Create Covariance Matrix

covariance = np.cov(normalized_trainingSet)
print(covariance)

[[ 0.12123941 -0.02887693 -0.02461541 ...,  0.01914157 -0.02058662
  -0.02200935]
 [-0.02887693  0.17093263 -0.01358619 ...,  0.00157945  0.05696716
   0.09074545]
 [-0.02461541 -0.01358619  0.18657565 ...,  0.02062971 -0.00111427
   0.00755157]
 ..., 
 [ 0.01914157  0.00157945  0.02062971 ...,  0.20104968 -0.00024646
  -0.00322851]
 [-0.02058662  0.05696716 -0.00111427 ..., -0.00024646  0.17162478
   0.10579937]
 [-0.02200935  0.09074545  0.00755157 ..., -0.00322851  0.10579937
   0.16984081]]


In [25]:
def summarizeByClass(dataset, labels):
    separated = separateByClass(dataset, labels)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries

summaries = summarizeByClass(normalized_trainingSet, trainingLabels)
summaries

{0: [(0.44940280680800382, 0.44668077977998455),
  (0.52989449586941539, 0.44735405357746316),
  (0.47802826714442254, 0.43894502678721115),
  (0.45669354036030735, 0.4454508048838727),
  (0.50779834776550348, 0.4429174639924557),
  (0.46207325569821928, 0.44621100554525644),
  (0.50225938090972566, 0.4433323055381414),
  (0.48607046879665672, 0.44233842475972285),
  (0.50185129889519353, 0.4389206461418506),
  (0.562351945854485, 0.43711384498503353),
  (0.53121827411167721, 0.43395035009082045),
  (0.46058524932815859, 0.4407405967244411),
  (0.47317607245944221, 0.4371227806977543),
  (0.42684881059022667, 0.44263548972844086),
  (0.46066487508709175, 0.4392551090528806),
  (0.44963670747486939, 0.43498754777619497),
  (0.47786901562655615, 0.439226070133173),
  (0.47840648949935405, 0.4378968095257751),
  (0.47820742510202141, 0.44076776223338243),
  (0.49266447695829713, 0.4462453362917378),
  (0.49190305563850006, 0.4385189399568686),
  (0.49093759331143744, 0.4374350119016489),


In [26]:
### Calculate probability for each data point

In [27]:
import math
def calculateProbability(x, mean, stdev):
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

In [28]:
def calculateClassProbabilities(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities

In [29]:
# Given an input Vector
def predict(summaries, inputVector):
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

In [30]:
# predict

# print(predict(summaries, normalized_testingSet[8]))
# print(testLabels[8])

def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

predictions = getPredictions(summaries, normalized_testingSet)

In [31]:
### get Accuracy

def getAccuracy(labels, predictions):
    correct = 0
    for x in range(len(labels)):
        if labels[x] == predictions[x]:
            correct += 1
    return (correct/float(len(labels))) * 100.0

getAccuracy(testingLabels, predictions)

9.55