In [1]:
import os,urllib.request
import gzip, shutil, codec

datapath = '../Data/MNISTData/'
if not os.path.exists(datapath):
    os.makedirs(datapath)

urls = ['http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
       'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
       'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
       'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz']

for url in urls:
    filename = url.split('/')[-1]
    if os.path.exists(datapath+filename):
        print(filename, ' already exists')
    else:
        print('Downloading ',filename)
        urllib.request.urlretrieve (url, datapath+filename)
     
print('All files are downloaded for MNIST dataset')


ModuleNotFoundError: ignored

**Unzip the Downloaded Files**

In [0]:
datapath = '../Data/MNISTData/'  
files = os.listdir(datapath)
for file in files:
    if file.endswith('gz'):
        print('Extracting ',file)
        with gzip.open(datapath+file, 'rb') as f_in:
            with open(datapath+file.split('.')[0], 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
print('Extraction Complete')

for file in files:
    print('Removing ',file)
    os.remove(datapath+file)
print ('All archives removed')

**Read the Data**

In [0]:
import numpy as np

datapath = '../Data/MNISTData/'
files = os.listdir(datapath)

def get_int(b):
  return int(codecs.encode(b, 'hex'), 16)

data_dict = {}
for file in files:
    if file.endswith('ubyte'):
        print('Reading ',file)
        with open (datapath+file,'rb') as f:
            data = f.read()
            type = get_int(data[:4])
            length = get_int(data[4:8])
            if (type == 2051):
                category = 'images'
                num_rows = get_int(data[8:12])
                num_cols = get_int(data[12:16])
                parsed = np.frombuffer(data,dtype = np.uint8, offset = 16)
                parsed = parsed.reshape(length,num_rows*num_cols)
            elif(type == 2049):
                category = 'labels'
                parsed = np.frombuffer(data, dtype=np.uint8, offset=8)
                parsed = parsed.reshape(length)
            if (length==10000):
                set = 'test'
            elif (length==60000):
                set = 'train'
            data_dict[set+'_'+category] = parsed

print(data_dict.keys())

**Create Stretched Image for Training Data**

In [0]:
from PIL import Image
#from PIL import ImageDraw
#import matplotlib.pyplot as plt
import cv2

  
train_images = data_dict['train_images']
train_labels = data_dict['train_labels']

train0 = train_images[np.where(train_labels[:] == 0)]
train1 = train_images[np.where(train_labels[:] == 1)]
train2 = train_images[np.where(train_labels[:] == 2)]
train3 = train_images[np.where(train_labels[:] == 3)]
train4 = train_images[np.where(train_labels[:] == 4)]
train5 = train_images[np.where(train_labels[:] == 5)]
train6 = train_images[np.where(train_labels[:] == 6)]
train7 = train_images[np.where(train_labels[:] == 7)]
train8 = train_images[np.where(train_labels[:] == 8)]
train9 = train_images[np.where(train_labels[:] == 9)]

no_of_samples = train_images.shape[0]

STRETCHED_TRAIN_IMAGES = train_images.reshape(no_of_samples, 28, 28)

for i in range (no_of_samples):
  
  min_j, min_k, max_j, max_k = 255,255,0,0;
  y=[]

  for j in range (28):
    for k in range (28):
      if(STRETCHED_TRAIN_IMAGES[i][j][k] > 0):
        if(j < min_j):
          min_j = j
        if(j > max_j):
           max_j = j
        if(k < min_k):
          min_k = k
        if(k > max_k):
          max_k = k
  
  for j in range (min_j, max_j+1):
    x=[]
    for k in range (min_k, max_k+1):
       x.append(STRETCHED_TRAIN_IMAGES[i][j][k])
    y.append(x)
  
  arr = np.array(y)
  resized = cv2.resize(arr, (20,20), interpolation = cv2.INTER_NEAREST)

  #img = Image.fromarray(resized)
  #THIS FUNCTION NOT WORKING
  #img.resize((20,20))
  #a = np.asarray(img)
  #print(np.size(a))
  #plt.imshow(img)
  
  if i == 0:
    train_stretched = resized.flatten().reshape(1,400)
  else:
    train_stretched = np.append(train_stretched, resized.flatten().reshape(1,400), axis = 0)

    
train0_stretched = train_stretched[np.where(train_labels[:] == 0)]
train1_stretched = train_stretched[np.where(train_labels[:] == 1)]
train2_stretched = train_stretched[np.where(train_labels[:] == 2)]
train3_stretched = train_stretched[np.where(train_labels[:] == 3)]
train4_stretched = train_stretched[np.where(train_labels[:] == 4)]
train5_stretched = train_stretched[np.where(train_labels[:] == 5)]
train6_stretched = train_stretched[np.where(train_labels[:] == 6)]
train7_stretched = train_stretched[np.where(train_labels[:] == 7)]
train8_stretched = train_stretched[np.where(train_labels[:] == 8)]
train9_stretched = train_stretched[np.where(train_labels[:] == 9)]
    
    
print('Untouched and stretched images generated successfully !')


**Create Stretched Image for Testing Data**

In [0]:
test_images = data_dict['train_images']
test_labels = data_dict['train_labels']

no_of_tests = test_images.shape[0]

STRETCHED_TEST_IMAGES = test_images.reshape(no_of_tests, 28, 28)

for i in range (no_of_tests):
  
  min_j, min_k, max_j, max_k = 255,255,0,0;
  y=[]

  for j in range (28):
    for k in range (28):
      if(STRETCHED_TEST_IMAGES[i][j][k] > 0):
        if(j < min_j):
          min_j = j
        if(j > max_j):
           max_j = j
        if(k < min_k):
          min_k = k
        if(k > max_k):
          max_k = k
  
  for j in range (min_j, max_j+1):
    x=[]
    for k in range (min_k, max_k+1):
       x.append(STRETCHED_TEST_IMAGES[i][j][k])
    y.append(x)
  
  arr = np.array(y)
  resized = cv2.resize(arr, (20,20), interpolation = cv2.INTER_NEAREST)

  #img = Image.fromarray(resized)
  #THIS FUNCTION NOT WORKING
  #img.resize((20,20))
  #a = np.asarray(img)
  #print(np.size(a))
  #plt.imshow(img)
  
  if i == 0:
    test_stretched = resized.flatten().reshape(1,400)
  else:
    test_stretched = np.append(test_stretched, resized.flatten().reshape(1,400), axis = 0)
  
    
    
print('Untouched and stretched images generated successfully !')


**Generate Prior for Untouched Image**

In [0]:
image_size = np.size(train0[0])

PU_0 = np.log((np.size(train0)/image_size)/no_of_samples)
PU_1 = np.log((np.size(train1)/image_size)/no_of_samples)
PU_2 = np.log((np.size(train2)/image_size)/no_of_samples)
PU_3 = np.log((np.size(train3)/image_size)/no_of_samples)
PU_4 = np.log((np.size(train4)/image_size)/no_of_samples)
PU_5 = np.log((np.size(train5)/image_size)/no_of_samples)
PU_6 = np.log((np.size(train6)/image_size)/no_of_samples)
PU_7 = np.log((np.size(train7)/image_size)/no_of_samples)
PU_8 = np.log((np.size(train8)/image_size)/no_of_samples)
PU_9 = np.log((np.size(train9)/image_size)/no_of_samples)


print('Successfully generated probabilities !')

**Generate Prior for Stretched Image**

In [0]:
PS_0 = np.log((np.size(train0_stretched)/image_size)/no_of_samples)
PS_1 = np.log((np.size(train1_stretched)/image_size)/no_of_samples)
PS_2 = np.log((np.size(train2_stretched)/image_size)/no_of_samples)
PS_3 = np.log((np.size(train3_stretched)/image_size)/no_of_samples)
PS_4 = np.log((np.size(train4_stretched)/image_size)/no_of_samples)
PS_5 = np.log((np.size(train5_stretched)/image_size)/no_of_samples)
PS_6 = np.log((np.size(train6_stretched)/image_size)/no_of_samples)
PS_7 = np.log((np.size(train7_stretched)/image_size)/no_of_samples)
PS_8 = np.log((np.size(train8_stretched)/image_size)/no_of_samples)
PS_9 = np.log((np.size(train9_stretched)/image_size)/no_of_samples)


print('Successfully generated probabilities (Stretched) !')

**Generate Gausian Parameter for Untouched Image**

In [0]:
mean_0 = (np.mean(train0, axis =0))
mean_1 = (np.mean(train1, axis =0))
mean_2 = (np.mean(train2, axis =0))
mean_3 = (np.mean(train3, axis =0))
mean_4 = (np.mean(train4, axis =0))
mean_5 = (np.mean(train5, axis =0))
mean_6 = (np.mean(train6, axis =0))
mean_7 = (np.mean(train7, axis =0))
mean_8 = (np.mean(train8, axis =0))
mean_9 = (np.mean(train9, axis =0))
var_0 = (np.std(train0, axis =0))
var_1 = (np.std(train1, axis =0))
var_2 = (np.std(train2, axis =0))
var_3 = (np.std(train3, axis =0))
var_4 = (np.std(train4, axis =0))
var_5 = (np.std(train5, axis =0))
var_6 = (np.std(train6, axis =0))
var_7 = (np.std(train7, axis =0))
var_8 = (np.std(train8, axis =0))
var_9 = (np.std(train9, axis =0))

nonzerovarrow0 = np.where(var_0[:]!=0) 
nonzerovarrow1 = np.where(var_1[:]!=0) 
nonzerovarrow2 = np.where(var_2[:]!=0) 
nonzerovarrow3 = np.where(var_3[:]!=0) 
nonzerovarrow4 = np.where(var_4[:]!=0) 
nonzerovarrow5 = np.where(var_5[:]!=0) 
nonzerovarrow6 = np.where(var_6[:]!=0) 
nonzerovarrow7 = np.where(var_7[:]!=0) 
nonzerovarrow8 = np.where(var_8[:]!=0)
nonzerovarrow9 = np.where(var_9[:]!=0) 

print('Mean and Variance generated successfully !')

**Generate Gaussian Parameters for Stretched Image**

In [0]:
meanS_0 = (np.mean(train0_stretched, axis =0))
meanS_1 = (np.mean(train1_stretched, axis =0))
meanS_2 = (np.mean(train2_stretched, axis =0))
meanS_3 = (np.mean(train3_stretched, axis =0))
meanS_4 = (np.mean(train4_stretched, axis =0))
meanS_5 = (np.mean(train5_stretched, axis =0))
meanS_6 = (np.mean(train6_stretched, axis =0))
meanS_7 = (np.mean(train7_stretched, axis =0))
meanS_8 = (np.mean(train8_stretched, axis =0))
meanS_9 = (np.mean(train9_stretched, axis =0))
varS_0 = (np.std(train0_stretched, axis =0))
varS_1 = (np.std(train1_stretched, axis =0))
varS_2 = (np.std(train2_stretched, axis =0))
varS_3 = (np.std(train3_stretched, axis =0))
varS_4 = (np.std(train4_stretched, axis =0))
varS_5 = (np.std(train5_stretched, axis =0))
varS_6 = (np.std(train6_stretched, axis =0))
varS_7 = (np.std(train7_stretched, axis =0))
varS_8 = (np.std(train8_stretched, axis =0))
varS_9 = (np.std(train9_stretched, axis =0))

nonzerovarrowS0 = np.where(varS_0[:]!=0) 
nonzerovarrowS1 = np.where(varS_1[:]!=0) 
nonzerovarrowS2 = np.where(varS_2[:]!=0) 
nonzerovarrowS3 = np.where(varS_3[:]!=0) 
nonzerovarrowS4 = np.where(varS_4[:]!=0) 
nonzerovarrowS5 = np.where(varS_5[:]!=0) 
nonzerovarrowS6 = np.where(varS_6[:]!=0) 
nonzerovarrowS7 = np.where(varS_7[:]!=0) 
nonzerovarrowS8 = np.where(varS_8[:]!=0)
nonzerovarrowS9 = np.where(varS_9[:]!=0) 

print('Mean and Variance generated (for Stretched) successfully !')

**Helping function to get max probability for Gausian Distribution**

In [0]:
from scipy.stats import norm

def getHighestProbability(probs):
  max = probs[0]
  res = 0
  for i in range(1,10):
    if probs[i] > max:
      max = probs[i]
      res = i
  return res



**Prediction on Training Set Using Gausian on Untouched Images**

In [0]:
accuracy = 0
for i in range(0,no_of_samples):
  x = train_images[i,:]
  
  prob0 =PU_0+sum(norm.logpdf(x[nonzerovarrow0], mean_0[nonzerovarrow0], var_0[nonzerovarrow0]))
  prob1 =PU_1+sum(norm.logpdf(x[nonzerovarrow1], mean_1[nonzerovarrow1], var_1[nonzerovarrow1]))
  prob2 =PU_2+sum(norm.logpdf(x[nonzerovarrow2], mean_2[nonzerovarrow2], var_2[nonzerovarrow2]))
  prob3 =PU_3+sum(norm.logpdf(x[nonzerovarrow3], mean_3[nonzerovarrow3], var_3[nonzerovarrow3]))
  prob4 =PU_4+sum(norm.logpdf(x[nonzerovarrow4], mean_4[nonzerovarrow4], var_4[nonzerovarrow4]))
  prob5 =PU_5+sum(norm.logpdf(x[nonzerovarrow5], mean_5[nonzerovarrow5], var_5[nonzerovarrow5]))
  prob6 =PU_6+sum(norm.logpdf(x[nonzerovarrow6], mean_6[nonzerovarrow6], var_6[nonzerovarrow6]))
  prob7 =PU_7+sum(norm.logpdf(x[nonzerovarrow7], mean_7[nonzerovarrow7], var_7[nonzerovarrow7]))
  prob8 =PU_8+sum(norm.logpdf(x[nonzerovarrow8], mean_8[nonzerovarrow8], var_8[nonzerovarrow8]))
  prob9 =PU_9+sum(norm.logpdf(x[nonzerovarrow9], mean_9[nonzerovarrow9], var_9[nonzerovarrow9])) 

  res = getHighestProbability([prob0, prob1, prob2, prob3, prob4, prob5, prob6, prob7, prob8, prob9])

  if(res == train_labels[i]):
    accuracy += 1
    
  if i%1000 == 0:
    print(i)

print(accuracy/no_of_samples)
      


**Prediction on Testing Set Using Gausian on Untouched Images**

In [0]:

accuracy = 0
for i in range(0,no_of_tests):
  x = test_images[i,:]
  
  prob0 =PU_0+sum(norm.logpdf(x[nonzerovarrow0], mean_0[nonzerovarrow0], var_0[nonzerovarrow0]))
  prob1 =PU_1+sum(norm.logpdf(x[nonzerovarrow1], mean_1[nonzerovarrow1], var_1[nonzerovarrow1]))
  prob2 =PU_2+sum(norm.logpdf(x[nonzerovarrow2], mean_2[nonzerovarrow2], var_2[nonzerovarrow2]))
  prob3 =PU_3+sum(norm.logpdf(x[nonzerovarrow3], mean_3[nonzerovarrow3], var_3[nonzerovarrow3]))
  prob4 =PU_4+sum(norm.logpdf(x[nonzerovarrow4], mean_4[nonzerovarrow4], var_4[nonzerovarrow4]))
  prob5 =PU_5+sum(norm.logpdf(x[nonzerovarrow5], mean_5[nonzerovarrow5], var_5[nonzerovarrow5]))
  prob6 =PU_6+sum(norm.logpdf(x[nonzerovarrow6], mean_6[nonzerovarrow6], var_6[nonzerovarrow6]))
  prob7 =PU_7+sum(norm.logpdf(x[nonzerovarrow7], mean_7[nonzerovarrow7], var_7[nonzerovarrow7]))
  prob8 =PU_8+sum(norm.logpdf(x[nonzerovarrow8], mean_8[nonzerovarrow8], var_8[nonzerovarrow8]))
  prob9 =PU_9+sum(norm.logpdf(x[nonzerovarrow9], mean_9[nonzerovarrow9], var_9[nonzerovarrow9])) 

  res = getHighestProbability([prob0, prob1, prob2, prob3, prob4, prob5, prob6, prob7, prob8, prob9])

  if(res == test_labels[i]):
    accuracy += 1
    
  if i%1000 == 0:
    print(i)

print(accuracy/no_of_tests)


**Prediction on Training Set Using Gausian on Stretched Images**

In [0]:
accuracy = 0
for i in range(0,no_of_samples):
  x = train_stretched[i,:]
  
  prob0 =PS_0+sum(norm.logpdf(x[nonzerovarrowS0], mean_0[nonzerovarrowS0], var_0[nonzerovarrowS0]))
  prob1 =PS_1+sum(norm.logpdf(x[nonzerovarrowS1], mean_1[nonzerovarrowS1], var_1[nonzerovarrowS1]))
  prob2 =PS_2+sum(norm.logpdf(x[nonzerovarrowS2], mean_2[nonzerovarrowS2], var_2[nonzerovarrowS2]))
  prob3 =PS_3+sum(norm.logpdf(x[nonzerovarrowS3], mean_3[nonzerovarrowS3], var_3[nonzerovarrowS3]))
  prob4 =PS_4+sum(norm.logpdf(x[nonzerovarrowS4], mean_4[nonzerovarrowS4], var_4[nonzerovarrowS4]))
  prob5 =PS_5+sum(norm.logpdf(x[nonzerovarrowS5], mean_5[nonzerovarrowS5], var_5[nonzerovarrowS5]))
  prob6 =PS_6+sum(norm.logpdf(x[nonzerovarrowS6], mean_6[nonzerovarrowS6], var_6[nonzerovarrowS6]))
  prob7 =PS_7+sum(norm.logpdf(x[nonzerovarrowS7], mean_7[nonzerovarrowS7], var_7[nonzerovarrowS7]))
  prob8 =PS_8+sum(norm.logpdf(x[nonzerovarrowS8], mean_8[nonzerovarrowS8], var_8[nonzerovarrowS8]))
  prob9 =PS_9+sum(norm.logpdf(x[nonzerovarrowS9], mean_9[nonzerovarrowS9], var_9[nonzerovarrowS9])) 
  
  res = getHighestProbability([prob0, prob1, prob2, prob3, prob4, prob5, prob6, prob7, prob8, prob9])

  if(res == train_labels[i]):
    accuracy += 1
    
  if i%1000 == 0:
    print(i)

print(accuracy/no_of_samples)
      


**Prediction on Testing Set Using Gausian on Stretched Images**

In [0]:
accuracy = 0
for i in range(0,no_of_tests):
  x = test_stretched[i,:]
  
  prob0 =PS_0+sum(norm.logpdf(x[nonzerovarrowS0], meanS_0[nonzerovarrowS0], varS_0[nonzerovarrowS0]))
  prob1 =PS_1+sum(norm.logpdf(x[nonzerovarrowS1], meanS_1[nonzerovarrowS1], varS_1[nonzerovarrowS1]))
  prob2 =PS_2+sum(norm.logpdf(x[nonzerovarrowS2], meanS_2[nonzerovarrowS2], varS_2[nonzerovarrowS2]))
  prob3 =PS_3+sum(norm.logpdf(x[nonzerovarrowS3], meanS_3[nonzerovarrowS3], varS_3[nonzerovarrowS3]))
  prob4 =PS_4+sum(norm.logpdf(x[nonzerovarrowS4], meanS_4[nonzerovarrowS4], varS_4[nonzerovarrowS4]))
  prob5 =PS_5+sum(norm.logpdf(x[nonzerovarrowS5], meanS_5[nonzerovarrowS5], varS_5[nonzerovarrowS5]))
  prob6 =PS_6+sum(norm.logpdf(x[nonzerovarrowS6], meanS_6[nonzerovarrowS6], varS_6[nonzerovarrowS6]))
  prob7 =PS_7+sum(norm.logpdf(x[nonzerovarrowS7], meanS_7[nonzerovarrowS7], varS_7[nonzerovarrowS7]))
  prob8 =PS_8+sum(norm.logpdf(x[nonzerovarrowS8], meanS_8[nonzerovarrowS8], varS_8[nonzerovarrowS8]))
  prob9 =PS_9+sum(norm.logpdf(x[nonzerovarrowS9], meanS_9[nonzerovarrowS9], varS_9[nonzerovarrowS9])) 
  
  res = getHighestProbability([prob0, prob1, prob2, prob3, prob4, prob5, prob6, prob7, prob8, prob9])

  if(res == test_labels[i]):
    accuracy += 1
    
  if i%1000 == 0:
    print(i)

print(accuracy/no_of_tests)
      


**Apply Threshold On Untouched Training Data**

In [0]:
Threshold_train_images = 