In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf # TtensorFlow
import matplotlib.pyplot as plt # MATLAB-like plotting framework

# Render the output of plotting commands directly below the code cell that produced it.
%matplotlib inline 


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Capture the raw `ls` output first, then decode it for display.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the training dataset from train.csv. 
# Shuffle the examples and divide them into train, dev and test in the ratio 8:1:1.

SPLIT_SEED = 0 # Fixed seed so the train/dev/test split is identical on every re-run.

raw_dataset = pd.read_csv("../input/train.csv").values
# Shuffle the rows in place with a seeded, local generator: the split becomes
# reproducible without altering NumPy's global random state.
np.random.RandomState(SPLIT_SEED).shuffle(raw_dataset)
all_labels = raw_dataset[:, 0] # First column holds the label.
all_features = raw_dataset[:, 1:] / 255 # Normalizes the pixel values.

num_examples = all_labels.size
# Row indices at 80% and 90% of the data: train = [0, a), dev = [a, b), test = [b, end).
a, b = num_examples * 8 // 10, num_examples * 9 // 10 # Ratios for the dataset split.

dataset = {
    'train_labels': all_labels[:a],
    'dev_labels': all_labels[a:b],
    'test_labels': all_labels[b:],
    'train_features': all_features[:a, :],
    'dev_features': all_features[a:b, :],
    'test_features': all_features[b:, :]
}

In [None]:
# Check what a digit from the test_features division looks like:

def plot_digit(pixels):
    """Display a flat array of 784 pixel values as a 28x28 image."""
    image = pixels.reshape(28, 28)
    plt.imshow(image)
    plt.show()

# Show the first example of the held-out test split.
plot_digit(dataset['test_features'][0])

**About the Model**

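The model consists of a single hidden layer with 200 ReLU units and a 10-unit output layer whose logits are passed through Softmax inside the cost function.
It has no regularization, uses the Adam optimizer, and performs one update per epoch on the entire training set (full-batch training).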

In [None]:
# Creates placeholder tensors for features and labels, as well as a set of 'Y' labels with
# one-hot encoding to be used in costs calculation.

def input_nodes(num_features, num_labels):
    """Create the graph inputs for the network.

    Returns a tuple (X, labels, Y):
      X      -- float32 placeholder of shape [None, num_features], named 'X'.
      labels -- int64 placeholder of shape [None], named 'labels'.
      Y      -- one-hot encoding of `labels` with depth `num_labels`.
    """
    feature_input = tf.placeholder(tf.float32, shape = [None, num_features], name = 'X')
    label_input = tf.placeholder(tf.int64, shape = [None], name = 'labels')

    # Group the encoding op under its own name scope.
    with tf.name_scope('labels_to_Y'):
        one_hot_labels = tf.one_hot(label_input, depth = num_labels, name = 'Y')

    return feature_input, label_input, one_hot_labels
    

In [None]:
# Creates the Linear layer. Uses Xavier initializer for weight initialization.
# This automatically determines the scale of initialization based on the number of input
# and output neurons.

def linear_layer(input_, in_size, out_size, name):
    """Apply an affine transform to `input_` and return the result.

    Creates (or fetches) two variables under the variable scope `name`:
    'W' with shape [out_size, in_size] (Xavier-initialized) and 'b' with
    shape [out_size] (zero-initialized). The output is input_ @ W^T + b.
    """
    with tf.variable_scope(name):
        weights = tf.get_variable(name = 'W',
                                  shape = [out_size, in_size],
                                  initializer = tf.contrib.layers.xavier_initializer())
        bias = tf.get_variable(name = 'b',
                               shape = [out_size],
                               initializer = tf.zeros_initializer())

    # W is stored as [out_size, in_size], hence the transpose in the product.
    projected = tf.matmul(input_, weights, transpose_b = True)
    return projected + bias

In [None]:
# Creates the ReLU layer from a Linear layer created from the given inputs.

def relu_layer(input_, in_size, out_size, name):
    """Build a linear layer from the given inputs and apply ReLU to its output."""
    pre_activation = linear_layer(input_, in_size, out_size, name)
    return tf.nn.relu(pre_activation)

In [None]:
# Carries out forward propagation through each hidden ReLU layer,
# then the final Linear layer, whose output is the logits.

def forward_prop(X, layers):
    """Chain the network's layers onto X and return the final layer's output.

    `layers` lists the layer sizes, input size first. Every entry except the
    first and last becomes a ReLU layer; the last entry becomes a plain linear
    layer. Layer i stores its variables under the scope 'weights<i>'.
    """
    last = len(layers) - 1
    activations = X

    # Hidden layers: indices 1 .. last - 1.
    for idx in range(1, last):
        with tf.name_scope('relu_layer' + str(idx)):
            activations = relu_layer(activations, layers[idx - 1], layers[idx],
                                     'weights' + str(idx))

    # Output layer: linear only, no activation.
    with tf.name_scope('linear_layer'):
        return linear_layer(activations, layers[last - 1], layers[last],
                            'weights' + str(last))

In [None]:
# Calculates the cost function for the given logits (predictions) and labels (actuals).
# Uses softmax cross entropy as the measure of probability error, used in classification
# tasks with mutually exclusive classes (can't be more than one digit).

def cost_function(logits, labels):
    """Return the mean softmax cross-entropy between `logits` and `labels`."""
    per_example_loss = tf.nn.softmax_cross_entropy_with_logits(labels = labels,
                                                               logits = logits)
    return tf.reduce_mean(per_example_loss)

In [None]:
# Converts logits to predicted labels and computes the accuracy of those predictions.

def logits_to_labels(logits, num_labels):
    """Return the index of the largest logit along axis 1.

    `num_labels` is accepted but not used by the computation.
    """
    predicted = tf.argmax(logits, axis = 1)
    return predicted

def accuracy(predictions, labels):
    """Return the fraction of positions where `predictions` equals `labels`."""
    # Element-wise matches are cast to 0.0 / 1.0 so their mean is the hit rate.
    hits = tf.cast(tf.equal(predictions, labels), tf.float32)
    return tf.reduce_mean(hits)

In [None]:
# Create and train the model!

def model(X_train, labels_train, X_dev, labels_dev, layers = [784, 200, 10], num_epochs = 100, 
          learning_rate = 0.01, writer_dir = 'tensorboard/model1/default', 
          checkpoint = 'checkpoints/model.ckpt'):
    """Build the network, train it with full-batch Adam and save a checkpoint.

    Args:
        X_train, labels_train: training features and labels, fed to the 'X'
            (float32, [None, layers[0]]) and 'labels' (int64, [None]) placeholders.
        X_dev, labels_dev: dev features and labels, only evaluated, never trained on.
        layers: layer sizes, input size first and number of classes last.
        num_epochs: number of training steps; each step uses the whole training set.
        learning_rate: learning rate passed to the Adam optimizer.
        writer_dir: directory under which the 'graph', 'train' and 'dev'
            TensorBoard event files are written.
        checkpoint: path prefix the trained variables are saved to.

    Returns:
        None. Results are the TensorBoard summaries, the printed costs
        (every 10th epoch) and the saved checkpoint.
    """
    import os # Local import: only needed to prepare the checkpoint directory.

    tf.reset_default_graph()
    
    X, labels, Y = input_nodes(layers[0], layers[-1]) # placeholder tensors
    
    # Forward prop:
    with tf.name_scope('forward_prop'):
        logits = forward_prop(X, layers)
    # Cost:
    with tf.name_scope('cost'):
        cost = cost_function(logits, Y)
    # Accuracy of predictions:
    with tf.name_scope('accuracy'):
        acc = accuracy(logits_to_labels(logits, layers[-1]), labels)
    # Summaries of cost and accuracy as scalar values:
    with tf.name_scope('summaries'):
        tf.summary.scalar('cost', cost)
        tf.summary.scalar('accuracy', acc)
        
    # Constructs a new Adam optimizer and trains using the learning rate and cost function
    with tf.name_scope('optimizer'):
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(cost)
        
    # Merge all summaries collected in the default graph.
    with tf.name_scope('summaries'):
        merge = tf.summary.merge_all()
        
    init = tf.global_variables_initializer() # Op to initialize global variables in the graph
    saver = tf.train.Saver() # Op to save and restore variables to and from checkpoints
    
    # Make sure the checkpoint's parent directory exists before training starts,
    # so a missing directory cannot make the final save fail after all epochs ran.
    checkpoint_dir = os.path.dirname(checkpoint)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok = True)
    
    # Summary writers, create an event file for each dataset in a given directory 
    # and add summaries and events to it
    graph_writer = tf.summary.FileWriter(writer_dir + '/graph')
    train_writer = tf.summary.FileWriter(writer_dir + '/train')
    dev_writer = tf.summary.FileWriter(writer_dir + '/dev')
    
    try:
        # Session setup:
        with tf.Session() as sess:
            graph_writer.add_graph(sess.graph)
            sess.run(init)
            
            # Loop through the epochs:
            for i in range(1, num_epochs + 1):
                # One optimizer step on the full training set, logging its summaries.
                summary, _, cost_val = sess.run([merge, train_step, cost], {X: X_train, labels: labels_train})
                train_writer.add_summary(summary, i)
                if i % 10 == 0:
                    print('{}. iteration: train cost = {}'.format(i, cost_val))
                
                # Evaluation only on the dev set: train_step is not run here.
                summary, cost_val = sess.run([merge, cost], { X: X_dev, labels: labels_dev })
                dev_writer.add_summary(summary, i)
                if i % 10 == 0:
                    print('dev cost = {}'.format(cost_val))
            saver.save(sess, checkpoint)
    finally:
        # Close the event files even if training or saving raised.
        graph_writer.close()
        train_writer.close()
        dev_writer.close()


In [None]:
# Trains the model, calculating the costs on the training and dev sets:

# Keyword arguments make explicit which split feeds which parameter.
model(X_train = dataset['train_features'],
      labels_train = dataset['train_labels'],
      X_dev = dataset['dev_features'],
      labels_dev = dataset['dev_labels'],
      num_epochs = 200,
      writer_dir = 'tensorboard/model1/1')

In [None]:
# Generates a submission CSV containing predictions on the test dataset:

def generate_submission(checkpoint, layers):
    """Predict labels for ../input/test.csv using a saved checkpoint.

    Args:
        checkpoint: path prefix of the checkpoint to restore the variables from.
        layers: layer sizes used to rebuild the network; these must match the
            sizes the checkpoint was trained with for the restore to succeed.

    Returns:
        A DataFrame with columns 'ImageId' (1-based row number) and 'Label'
        (predicted class index).
    """
    tf.reset_default_graph()
    # Apply the same /255 scaling that was applied to the training features;
    # feeding raw pixel values would give the network inputs on a different
    # scale than it was trained on.
    challenge = pd.read_csv('../input/test.csv').values / 255
    X = tf.placeholder(tf.float32, [None, layers[0]])
    logits = forward_prop(X, layers)
    pred = logits_to_labels(logits, layers[-1])
    saver = tf.train.Saver()
    
    with tf.Session() as sess:
        saver.restore(sess, checkpoint)
        # Keep the evaluated array under its own name instead of rebinding the tensor.
        pred_labels = pred.eval({X: challenge})
    df = pd.DataFrame(data = list(zip(range(1, pred_labels.size+1), pred_labels)), 
                      columns = ['ImageId', 'Label'])
    return df

In [None]:
# Rebuild the trained network from its checkpoint, predict, and write the submission file.
df = generate_submission(checkpoint = 'checkpoints/model.ckpt', layers = [784, 200, 10])
df.to_csv('submission.csv', header = True, index = False)
df