In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle

# Config the matplotlib backend as plotting inline in IPython
%matplotlib inline

In [2]:
url = 'https://commondatastorage.googleapis.com/books1000/'
last_percent_reported = None
data_root = '.' # Change me to store data elsewhere

def download_progress_hook(count, blockSize, totalSize):
  """A hook to report the progress of a download. This is mostly intended for users with
  slow internet connections. Reports every 5% change in download progress.
  """
  global last_percent_reported
  percent = int(count * blockSize * 100 / totalSize)

  if last_percent_reported != percent:
    if percent % 5 == 0:
      sys.stdout.write("%s%%" % percent)
      sys.stdout.flush()
    else:
      sys.stdout.write(".")
      sys.stdout.flush()
      
    last_percent_reported = percent
        
def maybe_download(filename, expected_bytes, force=False):
  """Download a file if not present, and make sure it's the right size."""
  dest_filename = os.path.join(data_root, filename)
  if force or not os.path.exists(dest_filename):
    print('Attempting to download:', filename) 
    filename, _ = urlretrieve(url + filename, dest_filename, reporthook=download_progress_hook)
    print('\nDownload Complete!')
  statinfo = os.stat(dest_filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified', dest_filename)
  else:
    raise Exception(
      'Failed to verify ' + dest_filename + '. Can you get to it with a browser?')
  return dest_filename

train_filename = maybe_download('notMNIST_large.tar.gz', 247336696)
test_filename = maybe_download('notMNIST_small.tar.gz', 8458043)

Found and verified ./notMNIST_large.tar.gz
Found and verified ./notMNIST_small.tar.gz


In [3]:
num_classes = 10
np.random.seed(133)

def maybe_extract(filename, force=False):
  root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
  if os.path.isdir(root) and not force:
    # You may override by setting force=True.
    print('%s already present - Skipping extraction of %s.' % (root, filename))
  else:
    print('Extracting data for %s. This may take a while. Please wait.' % root)
    tar = tarfile.open(filename)
    sys.stdout.flush()
    tar.extractall(data_root)
    tar.close()
  data_folders = [
    os.path.join(root, d) for d in sorted(os.listdir(root))
    if os.path.isdir(os.path.join(root, d))]
  if len(data_folders) != num_classes:
    raise Exception(
      'Expected %d folders, one per class. Found %d instead.' % (
        num_classes, len(data_folders)))
  print(data_folders)
  return data_folders
  
train_folders = maybe_extract(train_filename)
test_folders = maybe_extract(test_filename)

./notMNIST_large already present - Skipping extraction of ./notMNIST_large.tar.gz.
['./notMNIST_large/A', './notMNIST_large/B', './notMNIST_large/C', './notMNIST_large/D', './notMNIST_large/E', './notMNIST_large/F', './notMNIST_large/G', './notMNIST_large/H', './notMNIST_large/I', './notMNIST_large/J']
./notMNIST_small already present - Skipping extraction of ./notMNIST_small.tar.gz.
['./notMNIST_small/A', './notMNIST_small/B', './notMNIST_small/C', './notMNIST_small/D', './notMNIST_small/E', './notMNIST_small/F', './notMNIST_small/G', './notMNIST_small/H', './notMNIST_small/I', './notMNIST_small/J']


In [4]:
# This Section defines two functions. the first function creates an array of images using all the image files 
# in a directory. The second function outputs a single random image seleced from a directory.

from os import listdir;
from PIL import Image;
import numpy as np

def imarray(impath):
# this function will output an array of sample images
    imdir = listdir(impath);
    imlist = np.zeros(len(imdir));
    tick = 0
    for im in imdir:
        try:
            imtemp = Image.open(impath + im);
            imcopy = imtemp.copy();
            imlist[tick] = imcopy;
            imtemp.close();
            tick += 1;
        except OSError:
            pass
    return imlist

def randomim(impath):
# this function will output a single sample image from a directory
    imdir = listdir(impath);
    k = np.random.randint(0,len(imdir),size=1);
    try:
        imtemp = Image.open(impath + imdir[k[0]]);
        imcopy = imtemp.copy();
        imtemp.close()
    except OSError:
        randomim(impath);
    return imcopy
        
    
trial = input('sample original images? (Y/N)');
if trial == 'Y':
    for i in range (ord('A'),ord('A')+10):
        impath = '/Users/Shardul/Desktop/Python/Projects/Untitled Folder/notMNIST_large/'+chr(i)+'/';
        im = randomim(impath);
        im.show();



sample original images? (Y/N)N


In [5]:
image_size = 28  # Pixel width and height.
pixel_depth = 255.0  # Number of levels per pixel.

def load_letter(folder, min_num_images):
  """Load the data for a single letter label."""
  image_files = os.listdir(folder)
  dataset = np.ndarray(shape=(len(image_files), image_size, image_size),
                         dtype=np.float32)
  print(folder)
  num_images = 0
  for image in image_files:
    image_file = os.path.join(folder, image)
    try:
      image_data = (ndimage.imread(image_file).astype(float) - 
                    pixel_depth / 2) / pixel_depth
      if image_data.shape != (image_size, image_size):
        raise Exception('Unexpected image shape: %s' % str(image_data.shape))
      dataset[num_images, :, :] = image_data
      num_images = num_images + 1
    except IOError as e:
      print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')
    
  dataset = dataset[0:num_images, :, :]
  if num_images < min_num_images:
    raise Exception('Many fewer images than expected: %d < %d' %
                    (num_images, min_num_images))
    
  print('Full dataset tensor:', dataset.shape)
  print('Mean:', np.mean(dataset))
  print('Standard deviation:', np.std(dataset))
  return dataset
        
def maybe_pickle(data_folders, min_num_images_per_class, force=False):
  dataset_names = []
  for folder in data_folders:
    set_filename = folder + '.pickle'
    dataset_names.append(set_filename)
    if os.path.exists(set_filename) and not force:
      # You may override by setting force=True.
      print('%s already present - Skipping pickling.' % set_filename)
    else:
      print('Pickling %s.' % set_filename)
      dataset = load_letter(folder, min_num_images_per_class)
      try:
        with open(set_filename, 'wb') as f:
          pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
      except Exception as e:
        print('Unable to save data to', set_filename, ':', e)
  
  return dataset_names

train_datasets = maybe_pickle(train_folders, 45000)
test_datasets = maybe_pickle(test_folders, 1800)

./notMNIST_large/A.pickle already present - Skipping pickling.
./notMNIST_large/B.pickle already present - Skipping pickling.
./notMNIST_large/C.pickle already present - Skipping pickling.
./notMNIST_large/D.pickle already present - Skipping pickling.
./notMNIST_large/E.pickle already present - Skipping pickling.
./notMNIST_large/F.pickle already present - Skipping pickling.
./notMNIST_large/G.pickle already present - Skipping pickling.
./notMNIST_large/H.pickle already present - Skipping pickling.
./notMNIST_large/I.pickle already present - Skipping pickling.
./notMNIST_large/J.pickle already present - Skipping pickling.
./notMNIST_small/A.pickle already present - Skipping pickling.
./notMNIST_small/B.pickle already present - Skipping pickling.
./notMNIST_small/C.pickle already present - Skipping pickling.
./notMNIST_small/D.pickle already present - Skipping pickling.
./notMNIST_small/E.pickle already present - Skipping pickling.
./notMNIST_small/F.pickle already present - Skipping pi

In [6]:
trial = input('sample refined images? (Y/N)');
if trial == 'Y':
    for filepath in train_datasets:
        f = open(filepath,'rb');
        P = pickle.load(f);
        r = np.random.randint(P.shape[0]);
        a = P[r,:,:];
        imgplot = plt.imshow(a);
        imgplot.set_cmap('Greys');
        plt.colorbar();
        print('\n\nLetter',str(filepath)[17],', sample #',r)
        plt.show();
    

sample refined images? (Y/N)N


In [7]:
def make_arrays(nb_rows, img_size):
  if nb_rows:
    dataset = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)
    labels = np.ndarray(nb_rows, dtype=np.int32)
  else:
    dataset, labels = None, None
  return dataset, labels

def merge_datasets(pickle_files, train_size, valid_size=0):
  num_classes = len(pickle_files)
  valid_dataset, valid_labels = make_arrays(valid_size, image_size)
  train_dataset, train_labels = make_arrays(train_size, image_size)
  vsize_per_class = valid_size // num_classes
  tsize_per_class = train_size // num_classes
    
  start_v, start_t = 0, 0
  end_v, end_t = vsize_per_class, tsize_per_class
  end_l = vsize_per_class+tsize_per_class
  for label, pickle_file in enumerate(pickle_files):       
    try:
      with open(pickle_file, 'rb') as f:
        letter_set = pickle.load(f)
        # let's shuffle the letters to have random validation and training set
        np.random.shuffle(letter_set)
        if valid_dataset is not None:
          valid_letter = letter_set[:vsize_per_class, :, :]
          valid_dataset[start_v:end_v, :, :] = valid_letter
          valid_labels[start_v:end_v] = label
          start_v += vsize_per_class
          end_v += vsize_per_class
                    
        train_letter = letter_set[vsize_per_class:end_l, :, :]
        train_dataset[start_t:end_t, :, :] = train_letter
        train_labels[start_t:end_t] = label
        start_t += tsize_per_class
        end_t += tsize_per_class
    except Exception as e:
      print('Unable to process data from', pickle_file, ':', e)
      raise
    
  return valid_dataset, valid_labels, train_dataset, train_labels
            
            
train_size = 200000
valid_size = 10000
test_size = 10000

valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
  train_datasets, train_size, valid_size)
_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)

print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)

Training: (200000, 28, 28) (200000,)
Validation: (10000, 28, 28) (10000,)
Testing: (10000, 28, 28) (10000,)


In [8]:
def randomize(dataset, labels):
  permutation = np.random.permutation(labels.shape[0])
  shuffled_dataset = dataset[permutation,:,:]
  shuffled_labels = labels[permutation]
  return shuffled_dataset, shuffled_labels
train_dataset, train_labels = randomize(train_dataset, train_labels)
test_dataset, test_labels = randomize(test_dataset, test_labels)
valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)

In [9]:
error = 0
for i in range(0,99000):
    a,b = np.sum(train_labels[i:i+1000])/1000,np.std(train_labels[i:i+1000]);
    if (4.3<=a<=4.7 or 2.7<=b<=3.0) == False:
        error +=1;
print('Number of suspected non-uniform intervals:',error);
    

Number of suspected non-uniform intervals: 0


In [10]:
pickle_file = os.path.join(data_root, 'notMNIST.pickle')

try:
  f = open(pickle_file, 'wb')
  save = {
    'train_dataset': train_dataset,
    'train_labels': train_labels,
    'valid_dataset': valid_dataset,
    'valid_labels': valid_labels,
    'test_dataset': test_dataset,
    'test_labels': test_labels,
    }
  pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
  f.close()
except Exception as e:
  print('Unable to save data to', pickle_file, ':', e)
  raise

In [11]:
statinfo = os.stat(pickle_file)
print('Compressed pickle size:', statinfo.st_size)

Compressed pickle size: 690800512


In [12]:
# Now we'll eliminate some of the repitition between the training dataset and the test and validation datasets
# Since comparing the data element wise is time and memory intensive, comparing a summation norm of each image
# will be compared. This may eliminate extra images but it will certainly eliminate all repetitions.

trainnorm = np.zeros(train_dataset.shape[0]);
tick = 0
for i in train_dataset:
    trainnorm[tick] = np.sum(i);
    tick += 1;
    
testnorm = np.zeros(test_dataset.shape[0]);
tick = 0
for i in test_dataset:
    testnorm[tick] = np.sum(i);
    tick += 1;
    
validnorm = np.zeros(valid_dataset.shape[0]);
tick = 0
for i in valid_dataset:
    validnorm[tick] = np.sum(i); 
    tick += 1;

In [13]:
# These are the indices of the repetitions of the validation and training norms in the training dataset.

indexarray = np.array([]);
tvintersect = np.intersect1d(trainnorm,validnorm);
indices = np.array([]);
for i in tvintersect:
    j = np.where(trainnorm==i);
    indices = np.append(indices,j[0]);

indices = indices.astype(int);


In [14]:
# train_modset is the training dataset with the overlaps with the validation set eliminated

trainrepetitions = train_dataset[indices];
train_modset = np.delete(train_dataset,indices,axis=0);
train_modlab = np.delete(train_labels,indices,axis=0);
train_modset.shape

(187820, 28, 28)

In [15]:
# these are the indices of the repetitions of the test and modified training norms in the modified training dataset

ftrainnorm = np.zeros(train_modset.shape[0]);
tick = 0
for i in train_modset:
    ftrainnorm[tick] = np.sum(i);  
    tick += 1;

indexarray = np.array([]);
ttintersect = np.intersect1d(ftrainnorm,testnorm);
indices = np.array([]);
for i in ttintersect:
    j = np.where(ftrainnorm==i);
    indices = np.append(indices,j[0]);

indices = indices.astype(int);

In [16]:
# train_finalset is the training dataset with all the overlap with the validation and testing sets eliminated

trainrepetitions = train_modset[indices];
train_finalset = np.delete(train_modset,indices,axis=0);
train_finallab = np.delete(train_modlab,indices,axis=0);
train_finalset.shape


(178977, 28, 28)

In [17]:
# No we check that there is no overlap in the norms of our final training dataset and the norms 
# of our validation and test datasets

finalnorm = np.zeros(train_finalset.shape[0]);
tick = 0
for i in train_finalset:
    finalnorm[tick] = np.sum(i); 
    tick += 1;
    
print(np.intersect1d(finalnorm,validnorm),np.intersect1d(finalnorm,testnorm));

[] []


In [18]:
# Now that all the data is properly edited we can start training our classifier on the data

# first we need to establish an initial state. We will draw weights from a gaussian distribution.

weights = np.random.normal(0,1/28,(10,28**2));


train_finallab.shape
train_finalset.shape

(178977, 28, 28)

In [19]:
def softmax(x):
        a = np.sum(np.exp(x), axis=0)
        b = np.exp(x) / a
        return b
    
def CrossEntropy(Soft,Label):
    D = -np.sum(np.multiply(Label,np.log(Soft)),axis=1);
    return D

In [20]:
# Test time for our softmax and cross entropy functions

Y = np.array([[0.1,0.1,0.2,0.6],[0.05,0.05,0.7,0.2],[0.8,0.01,0.01,0.18]]);

YS = softmax(Y);

L = np.zeros((3,4))

L[0,3]=1;
L[1,2]=1;
L[2,0]=1;

print(YS);
print(CrossEntropy(YS,L));

[[ 0.25220795  0.34902057  0.28771345  0.42967012]
 [ 0.23990762  0.33199864  0.47435928  0.28801649]
 [ 0.50788443  0.31898079  0.23792727  0.28231339]]
[ 0.84473753  0.74579026  0.67750135]


In [None]:
# Before implementing the model we have to transcribe the labels into the proper format to properly calculate cross-entropy

train_label_arrays = np.zeros((train_finallab.shape[0],10));

for i in range (0,train_label_arrays.shape[0]):
    train_label_arrays[i,train_finallab[i]]=1

for i in range (0,10):
    print(train_label_arrays[i*10000],train_finallab[i*10000]);


[ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.] 4
[ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.] 6
[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.] 1
[ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.] 7
[ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.] 6
[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.] 1
[ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.] 2
[ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.] 7
[ 0.  0.  0.  1.  0.  0.  0.  0.  0.  0.] 3
[ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.] 7


In [None]:
# This is the model based on sci kits off the shelf logistic classifier

X = train_finalset.reshape(train_finalset.shape[0],-1)

Model = LogisticRegression();
Model.fit(X,train_finallab);

In [None]:
Xprime = valid_dataset.reshape(valid_dataset.shape[0],784)

Validation_test_labels = Model.predict(Xprime)

In [None]:
tick = 0
for i in range (0,valid_dataset.shape[0]):
    if Validation_test_labels[i] == valid_labels[i]:
        tick += 1

print('current model accuracy =', tick/valid_dataset.shape[0])

In [None]:
coef, intercept = Model.coef_.copy(), Model.intercept_.copy()

f = open('optimal_coefficients.pickle', 'wb')
pickle.dump(coef, f)

f = open('optimal_intercept.pickle', 'wb')
pickle.dump(intercept, f)
