In [43]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import metrics 
from six.moves import cPickle as pickle

In [14]:
image_size = 28  # Pixel width and height.
pixel_depth = 255.0  # Number of levels per pixel.
train_folder_name = "notMNIST_large"
test_folder_name = "notMNIST_small"
letters = ['A','B','C','D','E','F','G','H','I','J']
train_pickle_filenames = []
test_pickle_filenames = []


for letter in letters:
    train_pickle_filenames.append(train_folder_name + '/' + letter + '.pickle')
    test_pickle_filenames.append(test_folder_name + '/' + letter + '.pickle')

def make_arrays(nb_rows, img_size):
  if nb_rows:
    dataset = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)
    labels = np.ndarray(nb_rows, dtype=np.int32)
  else:
    dataset, labels = None, None
  return dataset, labels

def merge_datasets(pickle_files, train_size, valid_size=0):
  num_classes = len(pickle_files)
  valid_dataset, valid_labels = make_arrays(valid_size, image_size)
  train_dataset, train_labels = make_arrays(train_size, image_size)
  vsize_per_class = valid_size // num_classes
  tsize_per_class = train_size // num_classes
    
  start_v, start_t = 0, 0
  end_v, end_t = vsize_per_class, tsize_per_class
  end_l = vsize_per_class+tsize_per_class
  for label, pickle_file in enumerate(pickle_files):       
    print('Processing ' + str(label))
    try:
      with open(pickle_file, 'rb') as f:
        letter_set = pickle.load(f)
        # let's shuffle the letters to have random validation and training set
        np.random.shuffle(letter_set)
        if valid_dataset is not None:
          valid_letter = letter_set[:vsize_per_class, :, :]
          valid_dataset[start_v:end_v, :, :] = valid_letter
          valid_labels[start_v:end_v] = label
          start_v += vsize_per_class
          end_v += vsize_per_class
                    
        train_letter = letter_set[vsize_per_class:end_l, :, :]
        train_dataset[start_t:end_t, :, :] = train_letter
        train_labels[start_t:end_t] = label
        start_t += tsize_per_class
        end_t += tsize_per_class
    except Exception as e:
      print('Unable to process data from', pickle_file, ':', e)
      raise
    
  return valid_dataset, valid_labels, train_dataset, train_labels
            

In [19]:
train_size = 200000
valid_size = 10000
test_size = 10000

valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
  train_pickle_filenames, train_size, valid_size)
_, _, test_dataset, test_labels = merge_datasets(test_pickle_filenames, test_size)

print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)

Processing 0
Processing 1
Processing 2
Processing 3
Processing 4
Processing 5
Processing 6
Processing 7
Processing 8
Processing 9
Processing 0
Processing 1
Processing 2
Processing 3
Processing 4
Processing 5
Processing 6
Processing 7
Processing 8
Processing 9
('Training:', (200000, 28, 28), (200000,))
('Validation:', (10000, 28, 28), (10000,))
('Testing:', (10000, 28, 28), (10000,))


In [24]:
mod_train = np.reshape(train_dataset,(train_size, (28*28)))

In [31]:
mod_val = np.reshape(valid_dataset,(valid_size, (28*28)))

In [32]:
mod_test = np.reshape(test_dataset,(test_size, (28*28)))

In [40]:
lr = LogisticRegression()
lr.fit(mod_train,train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [41]:
preds = lr.predict(mod_test)

In [45]:
metrics.confusion_matrix(preds, test_labels)



array([[900,   7,   2,   8,   6,   5,  10,  27,  17,   7],
       [  3, 882,   3,  16,  27,   2,  11,  10,   0,   4],
       [  8,   7, 934,   6,  43,   7,  37,   4,   4,   5],
       [  5,  34,   2, 920,   4,   1,  11,   6,  10,   5],
       [  8,  13,  12,   5, 844,   5,   6,  19,  12,   3],
       [  7,  17,  13,  13,  21, 930,  15,   8,  23,  22],
       [  9,   8,  15,   4,  15,   5, 876,   8,  14,   6],
       [ 28,   7,   6,   5,   7,   4,   6, 881,  13,   3],
       [ 10,  15,   8,  14,  26,  16,  17,  25, 848,  25],
       [ 22,  10,   5,   9,   7,  25,  11,  12,  59, 920]])

In [47]:
print(metrics.classification_report(preds, test_labels))

             precision    recall  f1-score   support

          0       0.90      0.91      0.90       989
          1       0.88      0.92      0.90       958
          2       0.93      0.89      0.91      1055
          3       0.92      0.92      0.92       998
          4       0.84      0.91      0.88       927
          5       0.93      0.87      0.90      1069
          6       0.88      0.91      0.89       960
          7       0.88      0.92      0.90       960
          8       0.85      0.84      0.85      1004
          9       0.92      0.85      0.88      1080

avg / total       0.89      0.89      0.89     10000

