In [0]:
import os,urllib.request
datapath = '../Data/MNISTData/'
if not os.path.exists(datapath):
    os.makedirs(datapath)

urls = ['http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
       'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
       'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
       'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz']

for url in urls:
    filename = url.split('/')[-1]
    if os.path.exists(datapath+filename):
        print(filename, ' already exists')
    else:
        print('Downloading ',filename)
        urllib.request.urlretrieve (url, datapath+filename)
     
print('All files are downloaded for MNIST dataset')


Downloading  train-images-idx3-ubyte.gz
Downloading  train-labels-idx1-ubyte.gz
Downloading  t10k-images-idx3-ubyte.gz
Downloading  t10k-labels-idx1-ubyte.gz
All files are downloaded for MNIST dataset


In [0]:
import os,gzip,shutil

datapath = '../Data/MNISTData/'  
files = os.listdir(datapath)
for file in files:
    if file.endswith('gz'):
        print('Extracting ',file)
        with gzip.open(datapath+file, 'rb') as f_in:
            with open(datapath+file.split('.')[0], 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
print('Extraction Complete')

for file in files:
    print('Removing ',file)
    os.remove(datapath+file)
print ('All archives removed')

Extracting  train-images-idx3-ubyte.gz
Extracting  t10k-labels-idx1-ubyte.gz
Extracting  train-labels-idx1-ubyte.gz
Extracting  t10k-images-idx3-ubyte.gz
Extraction Complete
Removing  train-images-idx3-ubyte.gz
Removing  t10k-labels-idx1-ubyte.gz
Removing  train-labels-idx1-ubyte.gz
Removing  t10k-images-idx3-ubyte.gz
All archives removed


In [0]:
import os,codecs
import numpy as np

datapath = '../Data/MNISTData/'
files = os.listdir(datapath)

def get_int(b):
  return int(codecs.encode(b, 'hex'), 16)

data_dict = {}
for file in files:
    if file.endswith('ubyte'):
        print('Reading ',file)
        with open (datapath+file,'rb') as f:
            data = f.read()
            type = get_int(data[:4])
            length = get_int(data[4:8])
            if (type == 2051):
                category = 'images'
                num_rows = get_int(data[8:12])
                num_cols = get_int(data[12:16])
                parsed = np.frombuffer(data,dtype = np.uint8, offset = 16)
                parsed = parsed.reshape(length,num_rows*num_cols)
            elif(type == 2049):
                category = 'labels'
                parsed = np.frombuffer(data, dtype=np.uint8, offset=8)
                parsed = parsed.reshape(length)
            if (length==10000):
                set = 'test'
            elif (length==60000):
                set = 'train'
            data_dict[set+'_'+category] = parsed

print(data_dict.keys())

Reading  t10k-labels-idx1-ubyte
Reading  train-labels-idx1-ubyte
Reading  t10k-images-idx3-ubyte
Reading  train-images-idx3-ubyte
dict_keys(['test_labels', 'train_labels', 'test_images', 'train_images'])


In [0]:
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
import cv2

train_images = data_dict['train_images']
train_labels = data_dict['train_labels']

train0 = train_images[np.where(train_labels[:] == 0)]
train1 = train_images[np.where(train_labels[:] == 1)]
train2 = train_images[np.where(train_labels[:] == 2)]
train3 = train_images[np.where(train_labels[:] == 3)]
train4 = train_images[np.where(train_labels[:] == 4)]
train5 = train_images[np.where(train_labels[:] == 5)]
train6 = train_images[np.where(train_labels[:] == 6)]
train7 = train_images[np.where(train_labels[:] == 7)]
train8 = train_images[np.where(train_labels[:] == 8)]
train9 = train_images[np.where(train_labels[:] == 9)]

l = train_images.shape[0]

STRETCHED_TRAIN_IMAGES = train_images.reshape(l, 28, 28)

for i in range (l):
  
  min_j, min_k, max_j, max_k = 255,255,0,0;
  y=[]

  for j in range (28):
    for k in range (28):
      if(STRETCHED_TRAIN_IMAGES[i][j][k] > 0):
        if(j < min_j):
          min_j = j
        if(j > max_j):
           max_j = j
        if(k < min_k):
          min_k = k
        if(k > max_k):
          max_k = k
  
  for j in range (min_j, max_j+1):
    x=[]
    for k in range (min_k, max_k+1):
       x.append(STRETCHED_TRAIN_IMAGES[i][j][k])
    y.append(x)
  
  arr = np.array(y)
  resized = cv2.resize(arr, (20,20), interpolation = cv2.INTER_NEAREST)

  #img = Image.fromarray(resized)
  #THIS FUNCTION NOT WORKING
  #img.resize((20,20))
  #a = np.asarray(img)
  #print(np.size(a))
  #plt.imshow(img)
  
  if i == 0:
    each_image = resized.flatten().reshape(1,400)
  else:
    each_image = np.append(each_image, resized.flatten().reshape(1,400), axis = 0)
  
train0_stretched = each_image[np.where(train_labels[:] == 0)]
train1_stretched = each_image[np.where(train_labels[:] == 1)]
train2_stretched = each_image[np.where(train_labels[:] == 2)]
train3_stretched = each_image[np.where(train_labels[:] == 3)]
train4_stretched = each_image[np.where(train_labels[:] == 4)]
train5_stretched = each_image[np.where(train_labels[:] == 5)]
train6_stretched = each_image[np.where(train_labels[:] == 6)]
train7_stretched = each_image[np.where(train_labels[:] == 7)]
train8_stretched = each_image[np.where(train_labels[:] == 8)]
train9_stretched = each_image[np.where(train_labels[:] == 9)]
    
    
print('Untouched and stretched images generated successfully !')


[[ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ... 42  0  0]]
Untouched and stretched images generated successfully !


In [0]:
tim = data_dict['test_images']
trl = data_dict['test_labels']

l = tim.shape[0]

STRETCHED_TEST_IMAGES = tim.reshape(l, 28, 28)

for i in range (l):
  
  min_j, min_k, max_j, max_k = 255,255,0,0;
  y=[]

  for j in range (28):
    for k in range (28):
      if(STRETCHED_TEST_IMAGES[i][j][k] > 0):
        if(j < min_j):
          min_j = j
        if(j > max_j):
           max_j = j
        if(k < min_k):
          min_k = k
        if(k > max_k):
          max_k = k
  for j in range (min_j, max_j+1):
    x=[]
    for k in range (min_k, max_k+1):
       x.append(STRETCHED_TEST_IMAGES[i][j][k])
    y.append(x)
  
  #print(y)
  #print(np.size(np.asarray(y)))
  
  img = Image.fromarray(np.asarray(y))
  #THIS FUNCTION NOT WORKING
  img.resize((20,20))
  #a = np.asarray(img)
  #print(np.size(a))
  plt.imshow(img)
  
  if i == 0:
    test_stretched = np.asarray(img).flatten().reshape(400,1)
  else:
    np.append(test_stretched, np.asarray(img).flatten().reshape(400,1), axis = 0)
  
  #if np.size(np.asarray(y)) <400:
    #break




In [0]:
no_of_samples = train_images.shape[0]
image_size = np.size(train0[0])

P_0 = np.log((np.size(train0)/image_size)/no_of_samples)
P_1 = np.log((np.size(train1)/image_size)/no_of_samples)
P_2 = np.log((np.size(train2)/image_size)/no_of_samples)
P_3 = np.log((np.size(train3)/image_size)/no_of_samples)
P_4 = np.log((np.size(train4)/image_size)/no_of_samples)
P_5 = np.log((np.size(train5)/image_size)/no_of_samples)
P_6 = np.log((np.size(train6)/image_size)/no_of_samples)
P_7 = np.log((np.size(train7)/image_size)/no_of_samples)
P_8 = np.log((np.size(train8)/image_size)/no_of_samples)
P_9 = np.log((np.size(train9)/image_size)/no_of_samples)


print('Successfully generated probabilities !')

In [0]:

def generateMeanVar(train0, train1, train2, train3, train4, train5, train6, train7, train8, train9): 
  mean_0 = (np.mean(train0, axis =0))
  mean_1 = (np.mean(train1, axis =0))
  mean_2 = (np.mean(train2, axis =0))
  mean_3 = (np.mean(train3, axis =0))
  mean_4 = (np.mean(train4, axis =0))
  mean_5 = (np.mean(train5, axis =0))
  mean_6 = (np.mean(train6, axis =0))
  mean_7 = (np.mean(train7, axis =0))
  mean_8 = (np.mean(train8, axis =0))
  mean_9 = (np.mean(train9, axis =0))
  var_0 = (np.std(train0, axis =0))
  var_1 = (np.std(train1, axis =0))
  var_2 = (np.std(train2, axis =0))
  var_3 = (np.std(train3, axis =0))
  var_4 = (np.std(train4, axis =0))
  var_5 = (np.std(train5, axis =0))
  var_6 = (np.std(train6, axis =0))
  var_7 = (np.std(train7, axis =0))
  var_8 = (np.std(train8, axis =0))
  var_9 = (np.std(train9, axis =0))

  nonzerovarrow0 = np.where(var_0[:]!=0) 
  nonzerovarrow1 = np.where(var_1[:]!=0) 
  nonzerovarrow2 = np.where(var_2[:]!=0) 
  nonzerovarrow3 = np.where(var_3[:]!=0) 
  nonzerovarrow4 = np.where(var_4[:]!=0) 
  nonzerovarrow5 = np.where(var_5[:]!=0) 
  nonzerovarrow6 = np.where(var_6[:]!=0) 
  nonzerovarrow7 = np.where(var_7[:]!=0) 
  nonzerovarrow8 = np.where(var_8[:]!=0)
  nonzerovarrow9 = np.where(var_9[:]!=0) 
  
  print('Mean and Variance generated successfully !')
  
  return (mean_0, mean_1, mean_2, mean_3, mean_4, mean_5, mean_6, mean_7, mean_8, mean_9, 
          var_0, var_1, var_2, var_3, var_4, var_5, var_6, var_7, var_8, var_9,
         [nonzerovarrow0,nonzerovarrow1,nonzerovarrow2,nonzerovarrow3,nonzerovarrow4,
          nonzerovarrow5,nonzerovarrow6,nonzerovarrow7,nonzerovarrow8,nonzerovarrow9])


    
  

In [0]:
from scipy.stats import norm

test_images = data_dict['test_images']
test_labels = data_dict['test_labels']
no_of_tests = test_images.shape[0]


def getHighestProbability(probs):
  max = probs[0]
  res = 0
  for i in range(1,10):
    if probs[i] > max:
      max = probs[i]
      res = i
  return res

def getAccuracy(TEST_IMAGES, test_labels, distribution, mean_0, mean_1, mean_2, mean_3, mean_4, mean_5, mean_6, mean_7, mean_8, mean_9, 
                var_0, var_1, var_2, var_3, var_4, var_5, var_6, var_7, var_8, var_9, P,nonzerovarrow):
  accuracy = 0
  for i in range(0,no_of_tests):
    x = TEST_IMAGES[i,:]
    
    if(distribution == 'normal'):
      prob0 =P[0]+sum(norm.logpdf(x[nonzerovarrow[0]], mean_0[nonzerovarrow[0]], var_0[nonzerovarrow[0]]))
      prob1 =P[1]+sum(norm.logpdf(x[nonzerovarrow[1]], mean_1[nonzerovarrow[1]], var_1[nonzerovarrow[1]]))
      prob2 =P[2]+sum(norm.logpdf(x[nonzerovarrow[2]], mean_2[nonzerovarrow[2]], var_2[nonzerovarrow[2]]))
      prob3 =P[3]+sum(norm.logpdf(x[nonzerovarrow[3]], mean_3[nonzerovarrow[3]], var_3[nonzerovarrow[3]]))
      prob4 =P[4]+sum(norm.logpdf(x[nonzerovarrow[4]], mean_4[nonzerovarrow[4]], var_4[nonzerovarrow[4]]))
      prob5 =P[5]+sum(norm.logpdf(x[nonzerovarrow[5]], mean_5[nonzerovarrow[5]], var_5[nonzerovarrow[5]]))
      prob6 =P[6]+sum(norm.logpdf(x[nonzerovarrow[6]], mean_6[nonzerovarrow[6]], var_6[nonzerovarrow[6]]))
      prob7 =P[7]+sum(norm.logpdf(x[nonzerovarrow[7]], mean_7[nonzerovarrow[7]], var_7[nonzerovarrow[7]]))
      prob8 =P[8]+sum(norm.logpdf(x[nonzerovarrow[8]], mean_8[nonzerovarrow[8]], var_8[nonzerovarrow[8]]))
      prob9 =P[9]+sum(norm.logpdf(x[nonzerovarrow[9]], mean_9[nonzerovarrow[9]], var_9[nonzerovarrow[9]])) 

    res = getHighestProbability([prob0, prob1, prob2, prob3, prob4, prob5, prob6, prob7, prob8, prob9])

    if(res == test_labels[i]):
      accuracy += 1
      
    if i%1000 == 0:
      print(i)
      
  return accuracy/no_of_tests

(mean_0, mean_1, mean_2, mean_3, mean_4, mean_5, mean_6, mean_7, mean_8, mean_9, var_0, var_1, var_2, var_3, var_4, var_5, var_6, var_7, var_8, var_9, nonzerovarrow) = generateMeanVar(train0, train1, train2, train3, train4, train5, train6, train7, train8, train9)
accuracy = getAccuracy(test_images, test_labels, 'normal', mean_0, mean_1, mean_2, mean_3, mean_4, mean_5, mean_6, mean_7, mean_8, mean_9, 
                       var_0, var_1, var_2, var_3, var_4, var_5, var_6, var_7, var_8, var_9, 
                       [P_0, P_1, P_2, P_3, P_4, P_5, P_6, P_7, P_8, P_9], nonzerovarrow)
print('Accuracy: ', accuracy)
      


In [0]:


def generateShape(d, n):
  d.shape=(n, n)
  plt.imshow(d,cmap='gray')
  plt.show()
    
generateShape(mean_0, 28)
generateShape(mean_1, 28)
generateShape(mean_2, 28)
generateShape(mean_3, 28)
generateShape(mean_4, 28)
generateShape(mean_5, 28)
generateShape(mean_6, 28)
generateShape(mean_7, 28)
generateShape(mean_8, 28)
generateShape(mean_9, 28)  