In [42]:
import os,urllib.request
datapath = '../Data/MNISTData/'
if not os.path.exists(datapath):
    os.makedirs(datapath)

urls = ['http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
       'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
       'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
       'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz']

for url in urls:
    filename = url.split('/')[-1]
    if os.path.exists(datapath+filename):
        print(filename, ' already exists')
    else:
        print('Downloading ',filename)
        urllib.request.urlretrieve (url, datapath+filename)
     
print('All files are downloaded for MNIST dataset')




Downloading  train-images-idx3-ubyte.gz
Downloading  train-labels-idx1-ubyte.gz
Downloading  t10k-images-idx3-ubyte.gz
Downloading  t10k-labels-idx1-ubyte.gz
All files are downloaded for MNIST dataset


In [43]:
import os,gzip,shutil

datapath = '../Data/MNISTData/'  
files = os.listdir(datapath)
for file in files:
    if file.endswith('gz'):
        print('Extracting ',file)
        with gzip.open(datapath+file, 'rb') as f_in:
            with open(datapath+file.split('.')[0], 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
print('Extraction Complete')

for file in files:
    print('Removing ',file)
    os.remove(datapath+file)
print ('All archives removed')


Extracting  train-images-idx3-ubyte.gz
Extracting  train-labels-idx1-ubyte.gz
Extracting  t10k-labels-idx1-ubyte.gz
Extracting  t10k-images-idx3-ubyte.gz
Extraction Complete
Removing  train-images-idx3-ubyte.gz
Removing  train-labels-idx1-ubyte.gz
Removing  t10k-labels-idx1-ubyte.gz
Removing  t10k-images-idx3-ubyte.gz
All archives removed


In [44]:
import os,codecs
import numpy as np

datapath = '../Data/MNISTData/'
files = os.listdir(datapath)

def get_int(b):
    return int(codecs.encode(b, 'hex'), 16)

data_dict = {}
for file in files:
    if file.endswith('ubyte'):
        print('Reading ',file)
        with open (datapath+file,'rb') as f:
            data = f.read()
            type = get_int(data[:4])
            length = get_int(data[4:8])
            if (type == 2051):
                category = 'images'
                num_rows = get_int(data[8:12])
                num_cols = get_int(data[12:16])
                parsed = np.frombuffer(data,dtype = np.uint8, offset = 16)
                #parsed = parsed.reshape(length,num_rows,num_cols)
                parsed = parsed.reshape(length,num_rows*num_cols)
            elif(type == 2049):
                category = 'labels'
                parsed = np.frombuffer(data, dtype=np.uint8, offset=8)
                parsed = parsed.reshape(length)
            if (length==10000):
                set = 'test'
            elif (length==60000):
                set = 'train'
            data_dict[set+'_'+category] = parsed

print(data_dict.keys())
#print(np.size(data_dict['train_images']))

Reading  train-images-idx3-ubyte
Reading  t10k-images-idx3-ubyte
Reading  train-labels-idx1-ubyte
Reading  t10k-labels-idx1-ubyte
dict_keys(['train_images', 'test_images', 'train_labels', 'test_labels'])


In [45]:

train_images = data_dict['train_images']
train_labels = data_dict['train_labels']
no_of_samples = train_images.shape[0]

#train0 = []
#train1 = []
#train2 = []
#train3 = []
#train4 = []
#train5 = []
#train6 = []
#train7 = []
#train8 = []
#train9 = []


train0 = train_images[np.where(train_labels[:] == 0)]
train1 = train_images[np.where(train_labels[:] == 1)]
train2 = train_images[np.where(train_labels[:] == 2)]
train3 = train_images[np.where(train_labels[:] == 3)]
train4 = train_images[np.where(train_labels[:] == 4)]
train5 = train_images[np.where(train_labels[:] == 5)]
train6 = train_images[np.where(train_labels[:] == 6)]
train7 = train_images[np.where(train_labels[:] == 7)]
train8 = train_images[np.where(train_labels[:] == 8)]
train9 = train_images[np.where(train_labels[:] == 9)]
print('Number of images trained as dfadsfa 0: ', np.size(train0)/784)
print('Number of images trained as dfadsfa 1: ', np.size(train1)/784)
print('Number of images trained as dfadsfa 2: ', np.size(train2)/784)
print('Number of images trained as dfadsfa 3: ', np.size(train3)/784)
print('Number of images trained as dfadsfa 4: ', np.size(train4)/784)
print('Number of images trained as dfadsfa 5: ', np.size(train5)/784)
print('Number of images trained as dfadsfa 6: ', np.size(train6)/784)
print('Number of images trained as dfadsfa 7: ', np.size(train7)/784)
print('Number of images trained as dfadsfa 8: ', np.size(train8)/784)
print('Number of images trained as dfadsfa 9: ', np.size(train9)/784)

print(np.size(train9[0]))



Number of images trained as dfadsfa 0:  5923.0
Number of images trained as dfadsfa 1:  6742.0
Number of images trained as dfadsfa 2:  5958.0
Number of images trained as dfadsfa 3:  6131.0
Number of images trained as dfadsfa 4:  5842.0
Number of images trained as dfadsfa 5:  5421.0
Number of images trained as dfadsfa 6:  5918.0
Number of images trained as dfadsfa 7:  6265.0
Number of images trained as dfadsfa 8:  5851.0
Number of images trained as dfadsfa 9:  5949.0
784


In [46]:
P_0 = np.log((np.size(train0)/784)/no_of_samples)
P_1 = np.log((np.size(train1)/784)/no_of_samples)
P_2 = np.log((np.size(train2)/784)/no_of_samples)
P_3 = np.log((np.size(train3)/784)/no_of_samples)
P_4 = np.log((np.size(train4)/784)/no_of_samples)
P_5 = np.log((np.size(train5)/784)/no_of_samples)
P_6 = np.log((np.size(train6)/784)/no_of_samples)
P_7 = np.log((np.size(train7)/784)/no_of_samples)
P_8 = np.log((np.size(train8)/784)/no_of_samples)
P_9 = np.log((np.size(train9)/784)/no_of_samples)


print('Probability of 0 is: ', P_0)
print('Probability of 1: ', P_1)
print('Probability of 2: ', P_2)
print('Probability of 3: ', P_3)
print('Probability of 4: ', P_4)
print('Probability of 5: ', P_5)
print('Probability of 6: ', P_6)
print('Probability of 7: ', P_7)
print('Probability of 8: ', P_8)
print('Probability of 9: ', P_9)

Probability of 0 is:  -2.315501484927273
Probability of 1:  -2.185987945410232
Probability of 2:  -2.3096097079310103
Probability of 3:  -2.28098669344155
Probability of 4:  -2.3292713582565514
Probability of 5:  -2.4040642619438994
Probability of 6:  -2.3163460082440825
Probability of 7:  -2.2593659738740692
Probability of 8:  -2.3277319754160017
Probability of 9:  -2.3111214240163322


In [47]:
import numpy as np

"""
t0 = np.asarray(train0)
t1 = np.asarray(train1)
t2 = np.asarray(train2)
t3 = np.asarray(train3)
t4 = np.asarray(train4)
t5 = np.asarray(train5)
t6 = np.asarray(train6)
t7 = np.asarray(train7)
t8 = np.asarray(train8)
t9 = np.asarray(train9)


mean_0 = []
mean_1 = []
mean_2 = []
mean_3 = []
mean_4 = []
mean_5 = []
mean_6 = []
mean_7 = []
mean_8 = []
mean_9 = []

var_0 = []
var_1 = []
var_2 = []
var_3 = []
var_4 = []
var_5 = []
var_6 = []
var_7 = []
var_8 = []
var_9 = []
"""
mean_0 = (np.mean(train0, axis =0))
mean_1 = (np.mean(train1, axis =0))
mean_2 = (np.mean(train2, axis =0))
mean_3 = (np.mean(train3, axis =0))
mean_4 = (np.mean(train4, axis =0))
mean_5 = (np.mean(train5, axis =0))
mean_6 = (np.mean(train6, axis =0))
mean_7 = (np.mean(train7, axis =0))
mean_8 = (np.mean(train8, axis =0))
mean_9 = (np.mean(train9, axis =0))
var_0 = (np.std(train0, axis =0))
var_1 = (np.std(train1, axis =0))
var_2 = (np.std(train2, axis =0))
var_3 = (np.std(train3, axis =0))
var_4 = (np.std(train4, axis =0))
var_5 = (np.std(train5, axis =0))
var_6 = (np.std(train6, axis =0))
var_7 = (np.std(train7, axis =0))
var_8 = (np.std(train8, axis =0))
var_9 = (np.std(train9, axis =0))

nonzerovarrow0 = np.where(var_0[:]!=0) 
nonzerovarrow1 = np.where(var_1[:]!=0) 
nonzerovarrow2 = np.where(var_2[:]!=0) 
nonzerovarrow3 = np.where(var_3[:]!=0) 
nonzerovarrow4 = np.where(var_4[:]!=0) 
nonzerovarrow5 = np.where(var_5[:]!=0) 
nonzerovarrow6 = np.where(var_6[:]!=0) 
nonzerovarrow7 = np.where(var_7[:]!=0) 
nonzerovarrow8 = np.where(var_8[:]!=0)
nonzerovarrow9 = np.where(var_9[:]!=0) 

"""
    if(np.var(train0[:,i]) > 0):
        var_0.append(np.var(train0[:,i]))
    else:
        var_0.append(0.000000000000001)
    if(np.var(train1[:,i]) > 0):
        var_1.append(np.var(train1[:,i]))
    else:
        var_1.append(0.000000000000001)
    if(np.var(train2[:,i]) > 0):
        var_2.append(np.var(train2[:,i]))
    else:
        var_2.append(0.000000000000001)
    if(np.var(train3[:,i]) > 0):
        var_3.append(np.var(train3[:,i]))
    else:
        var_3.append(0.000000000000001)
    if(np.var(train4[:,i]) > 0):
        var_4.append(np.var(train4[:,i]))
    else:
        var_4.append(0.000000000000001)
    if(np.var(train5[:,i]) > 0):
        var_5.append(np.var(train5[:,i]))
    else:
        var_5.append(0.000000000000001)
    if(np.var(train6[:,i]) > 0):
        var_6.append(np.var(train6[:,i]))
    else:
        var_6.append(0.000000000000001)
    if(np.var(train7[:,i]) > 0):
        var_7.append(np.var(train7[:,i]))
    else:
        var_7.append(0.000000000000001)
    if(np.var(train8[:,i]) > 0):
        var_8.append(np.var(train8[:,i]))
    else:
        var_8.append(0.000000000000001)
    if(np.var(train9[:,i]) > 0):
        var_9.append(np.var(train9[:,i]))
    else:
        var_9.append(0.000000000000001)

sd_0 = map(np.sqrt,var_0) 
sd_1 = map(np.sqrt,var_1) 
sd_2 = map(np.sqrt,var_2) 
sd_3 = map(np.sqrt,var_3) 
sd_4 = map(np.sqrt,var_4) 
sd_5 = map(np.sqrt,var_5) 
sd_6 = map(np.sqrt,var_6) 
sd_7 = map(np.sqrt,var_7) 
sd_8 = map(np.sqrt,var_8) 
sd_9 = map(np.sqrt,var_9) 
"""
print('Mean and Variance generated successfully !')


Mean and Variance generated successfully !


In [54]:
from scipy.stats import norm

test_images = data_dict['test_images']
test_labels = data_dict['test_labels']
no_of_tests = test_images.shape[0]

"""
images = []

for i in range (no_of_tests):
    each_image = []
    for j in range (len(test_images[i])):
        for k in range (len(test_images[i])):
            each_image.append(test_images[i][j][k])
    images.append(each_image)
"""      
#TEST_IMAGES = np.asarray(images)
TEST_IMAGES = test_images


print('Number of Tests: ', np.size(TEST_IMAGES[0]))

accuracy = 0

def getHighestProbability(probs):
    max = probs[0]
    res = 0
    for i in range(1,10):
        if probs[i] > max:
            max = probs[i]
            res = i
    return res

for i in range(0,no_of_tests):
    x = TEST_IMAGES[i,:]
    
    
    prob0 =P_0+sum(norm.logpdf(x[nonzerovarrow0], mean_0[nonzerovarrow0], var_0[nonzerovarrow0]))
    prob1 =P_1+sum(norm.logpdf(x[nonzerovarrow1], mean_1[nonzerovarrow1], var_1[nonzerovarrow1]))
    prob2 =P_2+sum(norm.logpdf(x[nonzerovarrow2], mean_2[nonzerovarrow2], var_2[nonzerovarrow2]))
    prob3 =P_3+sum(norm.logpdf(x[nonzerovarrow3], mean_3[nonzerovarrow3], var_3[nonzerovarrow3]))
    prob4 =P_4+sum(norm.logpdf(x[nonzerovarrow4], mean_4[nonzerovarrow4], var_4[nonzerovarrow4]))
    prob5 =P_5+sum(norm.logpdf(x[nonzerovarrow5], mean_5[nonzerovarrow5], var_5[nonzerovarrow5]))
    prob6 =P_6+sum(norm.logpdf(x[nonzerovarrow6], mean_6[nonzerovarrow6], var_6[nonzerovarrow6]))
    prob7 =P_7+sum(norm.logpdf(x[nonzerovarrow7], mean_7[nonzerovarrow7], var_7[nonzerovarrow7]))
    prob8 =P_8+sum(norm.logpdf(x[nonzerovarrow8], mean_8[nonzerovarrow8], var_8[nonzerovarrow8]))
    prob9 =P_9+sum(norm.logpdf(x[nonzerovarrow9], mean_9[nonzerovarrow9], var_9[nonzerovarrow9]))    

    """
    prob0+=sum(map(norm.logpdf,x[nonzerovarrow0], mean_0[nonzerovarrow0], var_0[nonzerovarrow0]))
    prob1+=sum(map(norm.logpdf,x[nonzerovarrow1], mean_1[nonzerovarrow1], var_1[nonzerovarrow1]))
    prob2+=sum(map(norm.logpdf,x[nonzerovarrow2], mean_2[nonzerovarrow2], var_2[nonzerovarrow2]))
    prob3+=sum(map(norm.logpdf,x[nonzerovarrow3], mean_3[nonzerovarrow3], var_3[nonzerovarrow3]))
    prob4+=sum(map(norm.logpdf,x[nonzerovarrow4], mean_4[nonzerovarrow4], var_4[nonzerovarrow4]))
    prob5+=sum(map(norm.logpdf,x[nonzerovarrow5], mean_5[nonzerovarrow5], var_5[nonzerovarrow5]))
    prob6+=sum(map(norm.logpdf,x[nonzerovarrow6], mean_6[nonzerovarrow6], var_6[nonzerovarrow6]))
    prob7+=sum(map(norm.logpdf,x[nonzerovarrow7], mean_7[nonzerovarrow7], var_7[nonzerovarrow7]))
    prob8+=sum(map(norm.logpdf,x[nonzerovarrow8], mean_8[nonzerovarrow8], var_8[nonzerovarrow8]))
    prob9+=sum(map(norm.logpdf,x[nonzerovarrow9], mean_9[nonzerovarrow9], var_9[nonzerovarrow9]))    
    
    
    #print(TEST_IMAGES[i,:])
    prob0+=sum(map(norm.logpdf,TEST_IMAGES[i,:], mean_0, sd_0))
    
    prob1+=sum(map(norm.logpdf,TEST_IMAGES[i,:], mean_1, sd_1))
    prob1+=sum(map(norm.logpdf,TEST_IMAGES[i,:], mean_1, sd_1))
    prob2+=sum(map(norm.logpdf,TEST_IMAGES[i,:], mean_2, sd_2))
    prob3+=sum(map(norm.logpdf,TEST_IMAGES[i,:], mean_3, sd_3))
    prob4+=sum(map(norm.logpdf,TEST_IMAGES[i,:], mean_4, sd_4))
    prob5+=sum(map(norm.logpdf,TEST_IMAGES[i,:], mean_5, sd_5))
    prob6+=sum(map(norm.logpdf,TEST_IMAGES[i,:], mean_6, sd_6))
    prob7+=sum(map(norm.logpdf,TEST_IMAGES[i,:], mean_7, sd_7))
    prob7+=sum(map(norm.logpdf,TEST_IMAGES[i,:], mean_8, sd_8))
    prob9+=sum(map(norm.logpdf,TEST_IMAGES[i,:], mean_9, sd_9))
    """          
    res = getHighestProbability([prob0, prob1, prob2, prob3, prob4, prob5, prob6, prob7, prob8, prob9])
  
    if(res == test_labels[i]):
        accuracy += 1
    if i%1000 == 0:
        print(i)

print('Accuracy: ', accuracy/no_of_tests)

Number of Tests:  784
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
Accuracy:  0.6483
