# Convolutional Neural Network for Handwritten Digit Classification

**Team: Swaggle**

**Members:**

*   Rashik Habib
*   Josh Lui
*   Daniel Lutes

(cell #0) Necessary imports

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from google.colab import files

import numpy as np
import scipy.misc
import matplotlib as plt
import tensorflow as tf
import cv2

# Set TensorFlow's logging threshold to INFO.
tf.logging.set_verbosity(tf.logging.INFO)

(cell #1) Setup Google Drive with PyDrive to load data (if using Google Drive)

In [0]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# The Colab user is authenticated first; PyDrive is then given the
# application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client the later cells use to fetch files by ID.
drive = GoogleDrive(gauth)

(cell #2a) Load datasets, semi-preprocess and store them in local variables (if loading from Koustuv's website)

In [0]:
# Loading Data

# Base URL that the three CSV file names below are appended to.
URL_ENDPOINT = "http://cs.mcgill.ca/~ksinha4/datasets/kaggle/"

# Every file is parsed as comma-separated integers. The two image files are
# reshaped to (-1, 64, 64) further down in this cell.
train_x = np.loadtxt(URL_ENDPOINT+"train_x.csv", delimiter=",", dtype=int)
train_y = np.loadtxt(URL_ENDPOINT+"train_y.csv", delimiter=",", dtype=int)
test_x = np.loadtxt(URL_ENDPOINT+"test_x.csv", delimiter=",", dtype=int)

# Semi-preprocessing

from skimage import measure
from skimage import filters

# Truthy -> every image gets a one-pixel border via np.pad before labelling.
pad_on = 1

# Grey-level cut-off for binarising the images: pixels brighter than this are
# kept as digit strokes (255), darker pixels are cleared to 0.
separator_threshold = 235

# Pad the image with zeros to simplify DFS 
def pad_with(vector, pad_width, iaxis, kwargs):
    """Fill the padded ends of ``vector`` in place (callback for ``np.pad``).

    Args:
        vector: 1-D array that already includes the padding cells.
        pad_width: pair ``(before, after)`` with the number of padding cells
            at the start and at the end of ``vector``.
        iaxis: axis currently being padded (unused).
        kwargs: dict of extra options; ``kwargs['padder']`` is the fill value
            and defaults to 0.

    Returns:
        ``vector`` itself, after the in-place fill.
    """
    pad_value = kwargs.get('padder', 0)
    before, after = pad_width[0], pad_width[1]
    vector[:before] = pad_value
    # A zero trailing width must be skipped: ``vector[-0:]`` selects the
    # whole vector and would wipe the real data.
    if after > 0:
        vector[-after:] = pad_value
    return vector

# Return an integer giving the largest bounding square for a digit - O(64 x 64)
def bounding_square(image, digit_eq):
    """Return the squared side of the bounding square of one labelled region.

    Args:
        image: 2-D array of labels (any height and width).
        digit_eq: label value whose pixels are measured.

    Returns:
        int: the larger of the region's row extent and column extent
        (largest index minus smallest index), squared. Returns 0 when no
        pixel in ``image`` equals ``digit_eq``.
    """
    rows, cols = np.nonzero(np.asarray(image) == digit_eq)

    # An absent label has no extent; without this guard min()/max() would
    # raise on the empty index arrays.
    if rows.size == 0:
        return 0

    side = max(rows.max() - rows.min(), cols.max() - cols.min())
    return int(side) ** 2
                
# Find the integer representative for the largest digit in the image
def largest_digit_eq(image):
    """Return the label of the region with the largest bounding square.

    The smallest value present in ``image`` is treated as the background and
    is never returned (the zero padding applied by the caller ensures that
    this value is 0).

    Args:
        image: 2-D array of labels.

    Returns:
        int: the winning label, or -1 when the image holds fewer than two
        distinct values (no digit found). Ties go to the smallest label.
    """
    digit_eqs = np.unique(image)
    digits = digit_eqs.shape[0]     # includes the background

    # error: no digits recognized
    if digits < 2:
        return -1

    # only 1 digit, must be the largest
    if digits == 2:
        return int(digit_eqs[1])

    max_bounding_square = -1
    essential_digit = -1

    # Walk the label values that actually occur instead of assuming they are
    # exactly 1..digits-1, and measure each region only once.
    for digit_eq in digit_eqs[1:]:
        area = bounding_square(image, digit_eq)
        if area > max_bounding_square:
            max_bounding_square = area
            essential_digit = int(digit_eq)

    return essential_digit

# Apply the semi-preprocessing to the loaded examples

def _keep_largest_digit(example_x):
    """Return a copy of one image in which only the largest digit survives.

    The image is binarised at ``separator_threshold``, split into connected
    components, and every component except the one picked by
    ``largest_digit_eq`` is cleared. The result has the same shape as
    ``example_x`` and contains only 0 (background) and 255 (digit).
    """
    # The one-pixel zero border guarantees that label 0 (background) is
    # present; it is stripped again before returning.
    padded = np.pad(example_x, 1, pad_with) if pad_on else example_x

    # Pixels above the threshold become 255 and all the rest 0, which removes
    # the background image and keeps only the handwritten digits.
    binary = np.where(padded > separator_threshold, 255, 0)
    labelled = measure.label(binary, connectivity=1)

    # Keep only the largest digit; an image without digits (-1) comes out
    # entirely black.
    essential_digit = largest_digit_eq(labelled)
    cleaned = np.where(labelled == essential_digit, 255, 0)

    return cleaned[1:-1, 1:-1] if pad_on else cleaned


train_x = train_x.reshape(-1, 64, 64)
test_x = test_x.reshape(-1, 64, 64)

# Assign by index: rebinding the loop variable would leave the arrays
# untouched.
for i, example_x in enumerate(train_x):
    train_x[i] = _keep_largest_digit(example_x)

for i, example_x in enumerate(test_x):
    test_x[i] = _keep_largest_digit(example_x)

(cell #2b) Further pre-process the data by cropping the largest digit, applying blur and rescaling to 28 x 28

In [0]:
def crop_minAreaRect(img, rect):
    """Rotate ``img`` so that ``rect`` is axis-aligned and crop to it.

    Args:
        img: image array; its first two dimensions are rows and columns.
        rect: rotated rectangle as returned by ``cv2.minAreaRect``; ``rect[2]``
            is used as the rotation angle.

    Returns:
        The part of the rotated image inside the rotated rectangle. It can be
        empty when the rectangle has no area.
    """
    # rotate img about its centre by the rectangle's angle
    angle = rect[2]
    rows, cols = img.shape[0], img.shape[1]
    M = cv2.getRotationMatrix2D((cols/2, rows/2), angle, 1)
    img_rot = cv2.warpAffine(img, M, (cols, rows))

    # rotate the rectangle's corners with the same transform
    box = cv2.boxPoints(rect)
    pts = cv2.transform(np.array([box]), M)[0].astype(np.intp)
    pts[pts < 0] = 0

    # crop: take the extremes of the corners rather than specific corner
    # indices, so the result does not depend on the order in which boxPoints
    # lists them
    x_min, y_min = pts.min(axis=0)
    x_max, y_max = pts.max(axis=0)
    img_crop = img_rot[y_min:y_max, x_min:x_max]

    return img_crop

def _crop_and_resize(img):
    """Blur one image, crop it to its last-found contour and resize to 28 x 28."""
    # The OpenCV calls below need an 8-bit image; the arrays loaded with
    # dtype=int are not accepted.
    img = np.asarray(img).astype(np.uint8)

    # blur the image
    # NOTE(review): a 2x2 kernel divided by 25 sums to 4/25, so it also
    # darkens the image. Left unchanged so newly generated data matches data
    # produced earlier with this cell -- confirm whether /4 was intended.
    kernel = np.ones((2, 2), np.float32)/25
    dst = cv2.filter2D(img, -1, kernel)

    # find contours / rectangle
    # findContours returns 3 values on OpenCV 3 and 2 on OpenCV 4; the
    # contour list is second from the end in both.
    contours = cv2.findContours(dst, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)[-2]
    if len(contours) == 0:
        # blank image: nothing to crop
        return np.zeros((28, 28), dtype=np.uint8)

    rect = cv2.minAreaRect(contours[-1])

    # crop
    dst_croped = crop_minAreaRect(dst, rect)
    if dst_croped.size == 0:
        # a degenerate rectangle gives an empty crop that cannot be resized
        return np.zeros((28, 28), dtype=np.uint8)

    # rotate and flip image if needed
    if dst_croped.shape[1] > dst_croped.shape[0]:
        dst_croped = np.flipud(dst_croped.T)

    # resize image to 28x28
    return cv2.resize(np.ascontiguousarray(dst_croped), (28, 28))


def _preprocess_images(images):
    """Return all processed images stacked vertically as one (28 * N, 28) int array."""
    resized = [_crop_and_resize(img) for img in images]
    if not resized:
        return np.empty((0, 28), dtype=int)
    # A single concatenate at the end avoids re-copying the growing result
    # once per image.
    return np.concatenate(resized, axis=0).astype(int)


# test_x
test_x = _preprocess_images(test_x)

# train_x
train_x = _preprocess_images(train_x)

(cell #2c) Load the preprocessed files by ID and store them in local variables (if using Google Drive)

In [0]:
# resized : 1ay5-78XLNGEopXIg75Cazi3675tGnynA

# Fetch the training images from Google Drive by file ID, save them locally
# and parse the CSV as unsigned 8-bit integers.
trainset_x_id = "1ay5-78XLNGEopXIg75Cazi3675tGnynA"
downloaded = drive.CreateFile({'id': trainset_x_id})
downloaded.GetContentFile('resized_edited_train_x.csv') 
train_x = np.loadtxt('resized_edited_train_x.csv', delimiter=',', dtype=np.uint8)

# Same steps for the training labels, parsed as 32-bit integers.
trainset_y_id = '16mC77GcpoUb1KhGzAxfJMX2-kV1Mgl1E'
downloaded = drive.CreateFile({'id': trainset_y_id})
downloaded.GetContentFile('train_y.csv') 
train_y = np.loadtxt('train_y.csv', delimiter=',', dtype=np.int32)

(cell #3) Split the data (if validation required)

In [0]:
# Hold out the first 5000 examples for validation and train on examples
# 5000..49999. Both right-hand sides are taken from the unsplit arrays.
test_x, test_y = train_x[0:5000, :], train_y[0:5000]
train_x, train_y = train_x[5000:50000, :], train_y[5000:50000]


(cell #4) Convolutional Neural Network

In [0]:
"""
Best Performance: 92.5 %
Architecture: conv(32, 3x3) -> conv(32, 3x3) -> pool(2x2, 2) -> dropout(15%)
           -> conv(64, 5x5) -> conv(64, 5x5) -> pool(2x2, 2) -> dropout(15%)
           -> conv(128, 7x7) -> conv(128, 7x7) -> conv(128, 7x7) -> pool(2x2, 2) -> dropout(15%)
           -> flatten -> FC(128) -> FC(128) -> dropout(25%) -> logits
Batch size: 50
Steps: 20000
"""


def _conv_block(inputs, filters, kernel_size, depth, training):
    """Apply ``depth`` convolutions, then 2x2 max pooling, then 15% dropout.

    Every convolution uses "same" padding and ReLU, so only the pooling step
    changes the spatial size.
    """
    net = inputs
    for _ in range(depth):
        net = tf.layers.conv2d(
            inputs=net,
            filters=filters,
            kernel_size=kernel_size,
            padding="same",
            activation=tf.nn.relu)

    # 2x2 window, stride 2, default "valid" padding: width and height are
    # halved, rounding down.
    net = tf.layers.max_pooling2d(
        inputs=net,
        pool_size=[2, 2],
        strides=2)

    # 0.85 probability that an element is kept; only active while training
    return tf.layers.dropout(
        inputs=net,
        rate=0.15,
        training=training)


def CNN(features, labels, mode):
    """Model function for ``tf.estimator.Estimator``: a 10-class digit CNN.

    Args:
        features: dict whose ``"x"`` entry holds the images; it is reshaped to
            ``[-1, 28, 28, 1]`` and cast to float32.
        labels: class indices, one-hot encoded with depth 10 for the loss.
            Not used in PREDICT mode.
        mode: one of ``tf.estimator.ModeKeys``.

    Returns:
        A ``tf.estimator.EstimatorSpec``: predictions only in PREDICT mode,
        loss and train op in TRAIN mode, loss and accuracy metric otherwise.
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # Input Layer
    # Reshape training data to 4-D tensor and convert to float32
    input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])
    input_layer = tf.cast(input_layer, tf.float32)

    # Block #1: two 3x3 convolutions with 32 filters
    # [batch_size, 28, 28, 1] -> [batch_size, 14, 14, 32]
    dropout1 = _conv_block(input_layer, 32, [3, 3], 2, is_training)

    # Block #2: two 5x5 convolutions with 64 filters
    # [batch_size, 14, 14, 32] -> [batch_size, 7, 7, 64]
    # Reads the output of the first dropout; feeding the pooled tensor
    # directly would leave that dropout layer without any effect.
    dropout2 = _conv_block(dropout1, 64, [5, 5], 2, is_training)

    # Block #3: three 7x7 convolutions with 128 filters
    # [batch_size, 7, 7, 64] -> [batch_size, 3, 3, 128]
    dropout3 = _conv_block(dropout2, 128, [7, 7], 3, is_training)

    # Flatten tensor into a batch of vectors
    # [batch_size, 3, 3, 128] -> [batch_size, 3 * 3 * 128]
    pool3_flat = tf.reshape(dropout3, [-1, dropout3.shape[1] * dropout3.shape[2] * dropout3.shape[3]])

    # Dense Layer
    # Two densely connected layers with 128 neurons each
    # [batch_size, 3 * 3 * 128] -> [batch_size, 128]
    dense1 = tf.layers.dense(
        inputs=pool3_flat,
        units=128,
        activation=tf.nn.relu)

    dense2 = tf.layers.dense(
        inputs=dense1,
        units=128,
        activation=tf.nn.relu)

    # Add dropout operation; 0.75 probability that element will be kept
    dropout = tf.layers.dropout(
        inputs=dense2,
        rate=0.25,
        training=is_training)

    # Logits layer
    # [batch_size, 128] -> [batch_size, 10]
    logits = tf.layers.dense(inputs=dropout, units=10)

    predictions = {
        # Generate predictions (for PREDICT and EVAL mode)
        "classes": tf.argmax(input=logits, axis=1),
        # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
        # `logging_hook`.
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate Loss (for both TRAIN and EVAL modes)
    onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10)
    loss = tf.losses.softmax_cross_entropy(
        onehot_labels=onehot_labels, logits=logits)

    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(
            labels=labels, predictions=predictions["classes"])}
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss,
        eval_metric_ops=eval_metric_ops)


# One 28 x 28 image per example.
train_x = train_x.reshape(-1, 28, 28)

# Create the Estimator around the CNN model function
mnist_classifier = tf.estimator.Estimator(model_fn=CNN)

"""
# Set up logging for predictions
# Log the values in the "Softmax" tensor with label "probabilities"
tensors_to_log = {"probabilities": "softmax_tensor"}
logging_hook = tf.train.LoggingTensorHook(
    tensors=tensors_to_log, every_n_iter=50)
"""
# Train the model: shuffled batches of 50, repeating the data indefinitely
# (num_epochs=None), so the step count below is what ends training.
train_features = {"x": train_x}
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x=train_features, y=train_y, batch_size=50, num_epochs=None, shuffle=True)

mnist_classifier.train(input_fn=train_input_fn, steps=20000)


(cell #5) Evaluate the model and print results (if validation required)

In [0]:
# Feed the held-out examples once, unshuffled, then print the metrics
# returned by evaluate().
eval_features = {"x": test_x}
eval_input_fn = tf.estimator.inputs.numpy_input_fn(
    x=eval_features, y=test_y, num_epochs=1, shuffle=False)

eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
print(eval_results)

(cell #6) Import the test set images from Google Drive (if submitting to Kaggle and using Google Drive)

In [0]:
from google.colab import files

# Fetch the test images from Google Drive by file ID, save them locally and
# parse the CSV as unsigned 8-bit integers.
testset_x_id = "1JIFuYh2WuGfFy9IONaZx0puxfik9jfjF"
downloaded = drive.CreateFile({'id': testset_x_id})
downloaded.GetContentFile('resized_edited_test_x.csv') 
test_x = np.loadtxt('resized_edited_test_x.csv', delimiter=',', dtype=np.uint8)

# One 28 x 28 image per example.
test_x = test_x.reshape(-1, 28, 28)

(cell #7) Predict the results (if submitting to Kaggle)

In [0]:
# run the trained classifier model on the test set

eval_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": test_x},
    y=None,
    num_epochs=1,
    shuffle=False)
eval_results = mnist_classifier.predict(input_fn=eval_input_fn)

# Each item produced by predict() is a dict; "classes" holds the predicted
# digit. The ids are the positions in the prediction sequence, starting at 0.
predictions = np.array(
    [obj["classes"] for obj in eval_results], dtype=int).reshape(-1, 1)
Id = np.arange(predictions.shape[0]).reshape(-1, 1)
answers = np.concatenate((Id, predictions), axis=1)

# output predictions to a file

# comments='' stops np.savetxt from prefixing the header line with "# ",
# which would otherwise corrupt the "Id,Label" column header.
with open('pred_test_y.csv', 'w') as f:
    np.savetxt(f, answers, delimiter=',', header="Id,Label", comments='',
               fmt='%d')

files.download('pred_test_y.csv')