# A little bit about Google Colaboratory

In [0]:
# Show the memory statistics of the machine backing this runtime.
!cat /proc/meminfo

In [0]:
# Show the CPU details of the machine backing this runtime.
!cat /proc/cpuinfo

In [0]:
import tensorflow as tf
# Display the GPU device name as reported by TensorFlow (cell output).
tf.test.gpu_device_name()

In [0]:
from tensorflow.python.client import device_lib
# Display every local device TensorFlow can see (cell output).
device_lib.list_local_devices()

# Setup


In [0]:
# Download and unpack the ngrok binary; it is launched as ./ngrok in the
# TensorBoard cell to tunnel port 6006.
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip

In [0]:
# Clone the Fashion-MNIST repository; its utils/ directory is added to
# sys.path later so that mnist_reader can be imported.
!git clone https://github.com/zalandoresearch/fashion-mnist

In [0]:
import os
import math
import sys

import numpy as np
import tensorflow as tf

## We will visualize the training with TensorBoard

In [0]:
# Directory TensorBoard reads event files from.
TB_DIR = './Graph'

# Launch TensorBoard in the background, serving TB_DIR on port 6006.
get_ipython().system_raw(
    'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'
    .format(TB_DIR)
)

# Start an ngrok HTTP tunnel to port 6006, also in the background.
get_ipython().system_raw('./ngrok http 6006 &')

# Ask ngrok's local API (port 4040) for its tunnels and print the public URL
# of the first one.
# NOTE(review): ngrok was started in the background just above; if it is not
# ready yet this lookup may fail -- re-run the cell in that case.
! curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

## Downloading and understanding the dataset

In [0]:
# Download the four gzipped IDX files (train / t10k images and labels) into
# data/fashion, the directory the loading code reads from.
!wget http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz --directory-prefix=data/fashion
!wget http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz --directory-prefix=data/fashion
!wget http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz --directory-prefix=data/fashion
!wget http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz --directory-prefix=data/fashion

In [0]:
# Make the cloned repository's utils/ directory importable so that
# mnist_reader resolves.
sys.path.insert(0, 'fashion-mnist/utils')
import mnist_reader

# Load the 'train' and 't10k' splits from data/fashion as (images, labels) pairs.
X_train, y_train = mnist_reader.load_mnist('data/fashion', kind='train')
X_test, y_test = mnist_reader.load_mnist('data/fashion', kind='t10k')

In [0]:
# Shape of the training images as returned by the loader (before reshaping).
print(X_train.shape)

In [0]:
# Turn every image into a 28x28 array. A single vectorized reshape replaces
# the per-row Python loop; the cast keeps the float64 dtype that the previous
# np.zeros-based copy produced, so downstream statistics are unchanged.
X_train = X_train.reshape(len(X_train), 28, 28).astype(np.float64)
X_test = X_test.reshape(len(X_test), 28, 28).astype(np.float64)

In [0]:
# Report the shapes of the four arrays: train images/labels, then test
# images/labels.
for split_array in (X_train, y_train, X_test, y_test):
    print(split_array.shape)

### Image data

In [0]:
# Max and min pixel value of the first training image, followed by the mean
# and standard deviation over the whole training array.
# NOTE(review): these mean/std values are presumably the source of the
# normalization constants hard-coded in make_input_fn -- confirm.
print(np.max(X_train[0]), np.min(X_train[0]), np.mean(X_train), np.std(X_train))

### Labels


Each training and test example is assigned to one of the following labels:

| Label | Description |
| --- | --- |
| 0 | T-shirt/top |
| 1 | Trouser |
| 2 | Pullover |
| 3 | Dress |
| 4 | Coat |
| 5 | Sandal |
| 6 | Shirt |
| 7 | Sneaker |
| 8 | Bag |
| 9 | Ankle boot |

In [0]:
# Maps the integer class id stored in the label arrays to its human-readable
# description (same table as in the markdown cell above).
int_to_lbl = {
    0: 'T-shirt/top',
    1: 'Trouser',
    2: 'Pullover',
    3: 'Dress',
    4: 'Coat',
    5: 'Sandal',
    6: 'Shirt',
    7: 'Sneaker',
    8: 'Bag',
    9: 'Ankle boot',
}

# Show the distinct label values present in the training labels.
print(set(y_train))

### Visualizing images

In [0]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import random


# Show four randomly chosen training images in a 2x2 grid, each titled with
# its label.
for i in range(1, 5):
    # randrange excludes the upper bound; random.randint(0, len(X_train))
    # could return len(X_train) and raise an IndexError.
    idx = random.randrange(len(X_train))
    plt.subplot(2, 2, i)
    # Invert the pixel values so the garment is drawn dark on a light background.
    plt.imshow(255 - X_train[idx].reshape((28, 28)),
               cmap=plt.get_cmap('gray'), origin='upper')
    plt.setp(plt.title(int_to_lbl[y_train[idx]]), color='b')

    plt.grid(None)
    plt.axis('off')

# Code

In [0]:
# Same clone as in the Setup section, so this part can be run on its own.
# NOTE(review): if the fashion-mnist directory already exists, git is expected
# to report an error here and leave the existing checkout untouched.
!git clone https://github.com/zalandoresearch/fashion-mnist

import math
import os
import sys
sys.path.insert(0, 'fashion-mnist/utils')

import numpy as np
import tensorflow as tf

import mnist_reader

In [0]:
# Root directory for model checkpoints and TensorBoard summaries; each model
# writes to a sub-directory named after it.
TB_DIR = './Graph'
# NOTE(review): this seeds the current default graph; the Estimator builds its
# own graph for training, so confirm this seed actually reaches the models.
tf.set_random_seed(0)

def get_data():
    """Load the Fashion-MNIST train and test splits from data/fashion.

    Returns:
        Tuple (X_train, y_train, X_test, y_test) exactly as produced by
        mnist_reader.load_mnist for the 'train' and 't10k' kinds.
    """
    train_images, train_labels = mnist_reader.load_mnist(
        'data/fashion', kind='train')
    test_images, test_labels = mnist_reader.load_mnist(
        'data/fashion', kind='t10k')
    return train_images, train_labels, test_images, test_labels

  
def make_input_fn(X, y, batch_size, mode):
    """Create a zero-argument input function for a tf.estimator.Estimator.

    Args:
        X: images; cast to float32 and sliced along the first axis.
        y: labels; cast to int32 and sliced along the first axis.
        batch_size: number of examples per batch.
        mode: a tf.estimator.ModeKeys value. TRAIN shuffles and repeats
            indefinitely; any other mode makes a single pass.

    Returns:
        A callable that builds and returns the tf.data.Dataset of
        (image, label) batches.
    """
    def _standardize(pixels, target):
        # NOTE(review): 72.9403 / 90.0211 are presumably the pixel mean and
        # standard deviation of the training set -- confirm.
        scaled = (pixels - 72.9403) / 90.0211
        # Add the trailing channel axis expected by the image summary and the
        # convolutional models.
        return tf.reshape(scaled, shape=(28, 28, 1)), target

    def _input_fn():
        images = tf.cast(X, tf.float32)
        targets = tf.cast(y, tf.int32)
        dataset = tf.data.Dataset.from_tensor_slices((images, targets))

        if mode != tf.estimator.ModeKeys.TRAIN:
            # Evaluation: exactly one pass, in the original order.
            dataset = dataset.repeat(1)
        else:
            # Training: shuffle within a small buffer and repeat forever; the
            # number of steps is bounded by the caller.
            dataset = dataset.apply(
                tf.data.experimental.shuffle_and_repeat(
                    buffer_size=3 * batch_size, count=None))

        batches = dataset.apply(
            tf.data.experimental.map_and_batch(
                map_func=_standardize, batch_size=batch_size))
        return batches.prefetch(buffer_size=5000)

    return _input_fn


def train_and_evaluate(model_dir, hparams, model_fn_for_train):
    """Train and periodically evaluate a model with the Estimator API.

    Args:
        model_dir: directory for checkpoints and TensorBoard summaries.
        hparams: dict passed to the model function as `params`. Must contain
            "batch_size" and "epochs"; an optional "seed" (default 0) sets the
            graph-level random seed.
        model_fn_for_train: model function with the signature
            (features, labels, mode, params) -> tf.estimator.EstimatorSpec.
    """
    run_config = tf.estimator.RunConfig(
        # The Estimator builds its own graph, so a tf.set_random_seed() call
        # made on the notebook's default graph does not reach it; the seed has
        # to be handed over through the RunConfig.
        tf_random_seed=hparams.get("seed", 0),
        save_checkpoints_steps=2000,
        log_step_count_steps=1000)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn_for_train,
        params=hparams,
        config=run_config,
        model_dir=model_dir)

    X_train, y_train, X_test, y_test = get_data()

    # NOTE(review): the epoch size is hard-coded to 50000 examples, and the
    # model functions use the same constant for learning-rate decay. Confirm
    # it matches the training-set size before changing either place.
    train_spec = tf.estimator.TrainSpec(
        input_fn=make_input_fn(
            X_train, y_train,
            hparams['batch_size'],
            mode=tf.estimator.ModeKeys.TRAIN),
        max_steps=(50000 // hparams["batch_size"]) * hparams["epochs"])

    eval_spec = tf.estimator.EvalSpec(
        input_fn=make_input_fn(
            X_test, y_test,
            hparams['batch_size'],
            mode=tf.estimator.ModeKeys.EVAL),
        start_delay_secs=1,
        throttle_secs=1)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)


## 1.0 fully connected single layer

In [0]:
model_name = 'fashion_1.0'


def model_fn(features, labels, mode, params):
    """Single-layer softmax classifier: 784 inputs -> 10 classes.

    Args:
        features: image batch; flattened below to [-1, 784].
        labels: integer class ids (one-hot encoded to depth 10 for the loss);
            not used in PREDICT mode.
        mode: a tf.estimator.ModeKeys value.
        params: dict; this model reads params["learning_rate"].

    Returns:
        tf.estimator.EstimatorSpec with the predicted class ids under
        "classid", plus loss, train op and eval metrics where the mode
        requires them.
    """
    tf.summary.image('image', features)
    W = tf.Variable(tf.zeros([784, 10]))
    b = tf.Variable(tf.zeros([10]))

    XX = tf.reshape(features, [-1, 784])

    # The model
    Y = tf.nn.softmax(tf.matmul(XX, W) + b)

    tf.summary.histogram(W.name.replace(':', '_'), W)
    tf.summary.histogram(b.name.replace(':', '_'), b)

    # Needed in every mode: the EstimatorSpec below always reports it, and
    # PREDICT mode never enters the TRAIN/EVAL branch.
    predictions = tf.argmax(Y, 1)

    cross_entropy = None
    train_step = None
    evalmetrics = None

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
        # Hand-written cross-entropy, summed over the batch. It is not
        # protected against log(0); the later models compute the loss from
        # the logits instead.
        cross_entropy = -tf.reduce_sum(tf.one_hot(labels, 10) * tf.log(Y))
        # reduce_sum already produced a scalar, so this leaves the value as is.
        cross_entropy = tf.reduce_mean(cross_entropy)
        # % of correct answers found in batch
        accuracy = tf.metrics.accuracy(labels=labels, predictions=predictions)

        evalmetrics = {"accuracy/mnist": accuracy}
        if mode == tf.estimator.ModeKeys.TRAIN:
            # accuracy[1] is the metric's update op.
            tf.summary.scalar("accuracy/mnist", accuracy[1])
            tf.summary.scalar("learning_rate", params["learning_rate"])
            optimizer = tf.train.GradientDescentOptimizer(params["learning_rate"])
            train_step = optimizer.minimize(cross_entropy,
                                            global_step=tf.train.get_global_step())

    return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={"classid": predictions},
            loss=cross_entropy,
            train_op=train_step,
            eval_metric_ops=evalmetrics)


# Hyper-parameters for this run. "pkeep" is not read by this model; it is
# used by the later models that apply dropout.
params = {
    "batch_size": 100,
    "learning_rate": 0.000003,
    "pkeep": 0.70,
    "epochs": 50
}
train_and_evaluate(os.path.join(TB_DIR, model_name), params, model_fn)

## 2.0 fully connected going deep + stable cross entropy

In [0]:
# Raise the learning rate for the deeper models that follow.
params["learning_rate"] = 0.003

In [0]:
model_name = 'fashion_2.0'


def model_fn(features, labels, mode, params):
    """Five-layer fully connected network with sigmoid activations.

    Layer sizes are 784 -> 200 -> 100 -> 60 -> 30 -> 10. The loss is computed
    from the logits.

    Args:
        features: image batch; flattened below to [-1, 784].
        labels: integer class ids (one-hot encoded to depth 10 for the loss);
            not used in PREDICT mode.
        mode: a tf.estimator.ModeKeys value.
        params: dict; this model reads "batch_size" and "learning_rate".

    Returns:
        tf.estimator.EstimatorSpec with the predicted class ids under
        "classid", plus loss, train op and eval metrics where the mode
        requires them.
    """
    tf.summary.image('image', features)
    # five layers and their number of neurons (the last layer has 10 softmax neurons)
    L = 200
    M = 100
    N = 60
    O = 30
    # No stddev is passed to truncated_normal and the biases start at zero;
    # the next model revisits the initialisation.
    weights = {
        "W1" : tf.Variable(tf.truncated_normal([784, L])),  # 784 = 28 * 28
        "W2" : tf.Variable(tf.truncated_normal([L, M])),
        "W3" : tf.Variable(tf.truncated_normal([M, N])),
        "W4" : tf.Variable(tf.truncated_normal([N, O])),
        "W5" : tf.Variable(tf.truncated_normal([O, 10]))
    }
    biases = {
        "B1" : tf.Variable(tf.zeros([L])),
        "B2" : tf.Variable(tf.zeros([M])),
        "B3" : tf.Variable(tf.zeros([N])),
        "B4" : tf.Variable(tf.zeros([O])),
        "B5" : tf.Variable(tf.zeros([10]))
    }

    # The model
    XX = tf.reshape(features, [-1, 784])
    Y1 = tf.nn.sigmoid(tf.matmul(XX, weights["W1"]) + biases["B1"])
    Y2 = tf.nn.sigmoid(tf.matmul(Y1, weights["W2"]) + biases["B2"])
    Y3 = tf.nn.sigmoid(tf.matmul(Y2, weights["W3"]) + biases["B3"])
    Y4 = tf.nn.sigmoid(tf.matmul(Y3, weights["W4"]) + biases["B4"])
    Ylogits = tf.matmul(Y4, weights["W5"]) + biases["B5"]
    Y = tf.nn.softmax(Ylogits)

    for k, w in weights.items():
        tf.summary.histogram(k, w)
    for k, b in biases.items():
        tf.summary.histogram(k, b)

    # Needed in every mode: the EstimatorSpec below always reports it, and
    # PREDICT mode never enters the TRAIN/EVAL branch.
    predictions = tf.argmax(Y, 1)

    cross_entropy = None
    train_step = None
    evalmetrics = None

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
        # Computing the loss from the logits avoids the numerical instability
        # of log(0), which is NaN. The scalar weight scales the loss by the
        # batch size.
        cross_entropy = tf.losses.softmax_cross_entropy(
            weights=params["batch_size"],
            onehot_labels=tf.one_hot(labels, 10),
            logits=Ylogits)

        # % of correct answers found in batch
        accuracy = tf.metrics.accuracy(labels=labels, predictions=predictions)

        evalmetrics = {"accuracy/mnist": accuracy}
        if mode == tf.estimator.ModeKeys.TRAIN:
            tf.summary.scalar("accuracy/mnist", accuracy[1])
            tf.summary.scalar("learning_rate", params["learning_rate"])
            optimizer = tf.train.GradientDescentOptimizer(params["learning_rate"])
            train_step = optimizer.minimize(cross_entropy,
                                            global_step=tf.train.get_global_step())

    return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={"classid": predictions},
            loss=cross_entropy,
            train_op=train_step,
            eval_metric_ops=evalmetrics)


# Hyper-parameters used by this model and, since `params` is a notebook
# global, by every model trained after it.
params = {
    "batch_size": 200,
    "learning_rate": 0.003,
    "pkeep": 0.75,
    "epochs": 50
}

train_and_evaluate(os.path.join(TB_DIR, model_name), params, model_fn)

### 2.1 initialization + learning rate decay + relu

In [0]:
model_name = 'fashion_2.1'


def model_fn(features, labels, mode, params):
    """Five-layer fully connected ReLU network with learning-rate decay.

    Layer sizes are 784 -> 200 -> 100 -> 60 -> 30 -> 10. Weights are drawn
    with stddev 0.1 and biases start at 0.1.

    Args:
        features: image batch; flattened below to [-1, 784].
        labels: integer class ids (one-hot encoded to depth 10 for the loss);
            not used in PREDICT mode.
        mode: a tf.estimator.ModeKeys value.
        params: dict; this model reads "batch_size", "learning_rate" and
            "epochs".

    Returns:
        tf.estimator.EstimatorSpec with the predicted class ids under
        "classid", plus loss, train op and eval metrics where the mode
        requires them.
    """
    tf.summary.image('image', features)
    # five layers and their number of neurons (the last layer has 10 softmax neurons)
    L = 200
    M = 100
    N = 60
    O = 30
    # Weights initialised with small random values between -0.2 and +0.2
    # When using RELUs, make sure biases are initialised with small *positive* values for example 0.1 = tf.ones([K])/10
    weights = {
        "W1" : tf.Variable(tf.truncated_normal([784, L], stddev=0.1)),  # 784 = 28 * 28
        "W2" : tf.Variable(tf.truncated_normal([L, M], stddev=0.1)),
        "W3" : tf.Variable(tf.truncated_normal([M, N], stddev=0.1)),
        "W4" : tf.Variable(tf.truncated_normal([N, O], stddev=0.1)),
        "W5" : tf.Variable(tf.truncated_normal([O, 10], stddev=0.1))
    }
    biases = {
        "B1" : tf.Variable(tf.ones([L])/10),
        "B2" : tf.Variable(tf.ones([M])/10),
        "B3" : tf.Variable(tf.ones([N])/10),
        "B4" : tf.Variable(tf.ones([O])/10),
        "B5" : tf.Variable(tf.ones([10])/10)
    }

    # The model
    XX = tf.reshape(features, [-1, 784])
    Y1 = tf.nn.relu(tf.matmul(XX, weights["W1"]) + biases["B1"])
    Y2 = tf.nn.relu(tf.matmul(Y1, weights["W2"]) + biases["B2"])
    Y3 = tf.nn.relu(tf.matmul(Y2, weights["W3"]) + biases["B3"])
    Y4 = tf.nn.relu(tf.matmul(Y3, weights["W4"]) + biases["B4"])
    Ylogits = tf.matmul(Y4, weights["W5"]) + biases["B5"]
    Y = tf.nn.softmax(Ylogits)

    for k, w in weights.items():
        tf.summary.histogram(k, w)
    for k, b in biases.items():
        tf.summary.histogram(k, b)

    # Needed in every mode: the EstimatorSpec below always reports it, and
    # PREDICT mode never enters the TRAIN/EVAL branch.
    predictions = tf.argmax(Y, 1)

    cross_entropy = None
    train_step = None
    evalmetrics = None

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
        # Cross-entropy loss (= -sum(Y_i * log(Yi))) computed from the logits,
        # which avoids the numerical instability of log(0) (NaN). The scalar
        # weight scales the loss by the batch size.
        cross_entropy = tf.losses.softmax_cross_entropy(
            weights=params["batch_size"],
            onehot_labels=tf.one_hot(labels, 10),
            logits=Ylogits)

        # % of correct answers found in batch
        accuracy = tf.metrics.accuracy(labels=labels, predictions=predictions)

        evalmetrics = {"accuracy/mnist": accuracy}
        if mode == tf.estimator.ModeKeys.TRAIN:
            tf.summary.scalar("accuracy/mnist", accuracy[1])
            # Learning rate: 0.0001 + params["learning_rate"] * (1/e)^(step/max_steps),
            # i.e. the decaying part shrinks by a factor of e over the whole
            # run, on top of a constant floor of 0.0001.
            # NOTE(review): 50000 mirrors the constant used for max_steps in
            # train_and_evaluate; keep the two in sync.
            max_steps = (50000//params["batch_size"]) * params["epochs"]
            lr = 0.0001 + tf.train.exponential_decay(params["learning_rate"],
                tf.train.get_global_step(), max_steps, 1/math.e)
            tf.summary.scalar("learning_rate", lr)
            optimizer = tf.train.GradientDescentOptimizer(lr)
            train_step = optimizer.minimize(cross_entropy,
                global_step=tf.train.get_global_step())

    return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={"classid": predictions},
            loss=cross_entropy,
            train_op=train_step,
            eval_metric_ops=evalmetrics)


train_and_evaluate(os.path.join(TB_DIR, model_name), params, model_fn)

### 2.2 dropout

In [0]:
model_name = 'fashion_2.2'


def model_fn(features, labels, mode, params):
    """Five-layer fully connected ReLU network with dropout.

    Layer sizes are 784 -> 200 -> 100 -> 60 -> 30 -> 10. Dropout follows each
    hidden layer and is active in TRAIN mode only.

    Args:
        features: image batch; flattened below to [-1, 784].
        labels: integer class ids (one-hot encoded to depth 10 for the loss);
            not used in PREDICT mode.
        mode: a tf.estimator.ModeKeys value.
        params: dict; this model reads "batch_size", "learning_rate",
            "epochs" and "pkeep" (dropout keep probability).

    Returns:
        tf.estimator.EstimatorSpec with the predicted class ids under
        "classid", plus loss, train op and eval metrics where the mode
        requires them.
    """
    tf.summary.image('image', features)
    # five layers and their number of neurons (the last layer has 10 softmax neurons)
    L = 200
    M = 100
    N = 60
    O = 30
    # Weights initialised with small random values between -0.2 and +0.2
    # When using RELUs, make sure biases are initialised with small *positive* values for example 0.1 = tf.ones([K])/10
    weights = {
        "W1" : tf.Variable(tf.truncated_normal([784, L], stddev=0.1)),  # 784 = 28 * 28
        "W2" : tf.Variable(tf.truncated_normal([L, M], stddev=0.1)),
        "W3" : tf.Variable(tf.truncated_normal([M, N], stddev=0.1)),
        "W4" : tf.Variable(tf.truncated_normal([N, O], stddev=0.1)),
        "W5" : tf.Variable(tf.truncated_normal([O, 10], stddev=0.1))
    }
    biases = {
        "B1" : tf.Variable(tf.ones([L])/10),
        "B2" : tf.Variable(tf.ones([M])/10),
        "B3" : tf.Variable(tf.ones([N])/10),
        "B4" : tf.Variable(tf.ones([O])/10),
        "B5" : tf.Variable(tf.ones([10])/10)
    }

    # Keep every unit outside of training so that evaluation and prediction
    # use the full network.
    pkeep = params["pkeep"] if mode == tf.estimator.ModeKeys.TRAIN else 1.0

    # The model
    XX = tf.reshape(features, [-1, 784])
    Y1 = tf.nn.relu(tf.matmul(XX, weights["W1"]) + biases["B1"])
    Y1d = tf.nn.dropout(Y1, pkeep)

    Y2 = tf.nn.relu(tf.matmul(Y1d, weights["W2"]) + biases["B2"])
    Y2d = tf.nn.dropout(Y2, pkeep)

    Y3 = tf.nn.relu(tf.matmul(Y2d, weights["W3"]) + biases["B3"])
    Y3d = tf.nn.dropout(Y3, pkeep)

    Y4 = tf.nn.relu(tf.matmul(Y3d, weights["W4"]) + biases["B4"])
    Y4d = tf.nn.dropout(Y4, pkeep)
    Ylogits = tf.matmul(Y4d, weights["W5"]) + biases["B5"]
    Y = tf.nn.softmax(Ylogits)

    for k, w in weights.items():
        tf.summary.histogram(k, w)
    for k, b in biases.items():
        tf.summary.histogram(k, b)

    # Needed in every mode: the EstimatorSpec below always reports it, and
    # PREDICT mode never enters the TRAIN/EVAL branch.
    predictions = tf.argmax(Y, 1)

    cross_entropy = None
    train_step = None
    evalmetrics = None

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
        # Cross-entropy loss (= -sum(Y_i * log(Yi))) computed from the logits,
        # which avoids the numerical instability of log(0) (NaN). The scalar
        # weight scales the loss by the batch size.
        cross_entropy = tf.losses.softmax_cross_entropy(
            weights=params["batch_size"],
            onehot_labels=tf.one_hot(labels, 10),
            logits=Ylogits)

        # % of correct answers found in batch
        accuracy = tf.metrics.accuracy(labels=labels, predictions=predictions)

        evalmetrics = {"accuracy/mnist": accuracy}
        if mode == tf.estimator.ModeKeys.TRAIN:
            tf.summary.scalar("accuracy/mnist", accuracy[1])
            # Learning rate: 0.0001 + params["learning_rate"] * (1/e)^(step/max_steps),
            # i.e. the decaying part shrinks by a factor of e over the whole
            # run, on top of a constant floor of 0.0001.
            # NOTE(review): 50000 mirrors the constant used for max_steps in
            # train_and_evaluate; keep the two in sync.
            max_steps = (50000//params["batch_size"]) * params["epochs"]
            lr = 0.0001 + tf.train.exponential_decay(params["learning_rate"],
                tf.train.get_global_step(), max_steps, 1/math.e)
            tf.summary.scalar("learning_rate", lr)
            optimizer = tf.train.GradientDescentOptimizer(lr)
            train_step = optimizer.minimize(cross_entropy,
                                            global_step=tf.train.get_global_step())

    return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={"classid": predictions},
            loss=cross_entropy,
            train_op=train_step,
            eval_metric_ops=evalmetrics)


train_and_evaluate(os.path.join(TB_DIR, model_name), params, model_fn)

## 3.0 convolutional + AdamOptimizer



In [0]:
model_name = 'fashion_3.0'


def model_fn(features, labels, mode, params):
    """Three convolutional layers plus one dense layer, trained with Adam.

    Args:
        features: image batch fed directly to conv2d, so it must be 4-D
            (the input pipeline reshapes each image to 28x28x1).
        labels: integer class ids (one-hot encoded to depth 10 for the loss);
            not used in PREDICT mode.
        mode: a tf.estimator.ModeKeys value.
        params: dict; this model reads "batch_size", "learning_rate" and
            "epochs".

    Returns:
        tf.estimator.EstimatorSpec with the predicted class ids under
        "classid", plus loss, train op and eval metrics where the mode
        requires them.
    """
    tf.summary.image('image', features)

    # three convolutional layers with their channel counts, and a
    # fully connected layer (the last layer has 10 softmax neurons)
    L = 4  # first convolutional layer output depth
    M = 8  # second convolutional layer output depth
    N = 12  # third convolutional layer
    O = 200  # fully connected layer

    weights = {
        "W1" : tf.Variable(tf.truncated_normal([5, 5, 1, L], stddev=0.1)),  # 5x5 patch, 1 input channel, L output channels
        "W2" : tf.Variable(tf.truncated_normal([5, 5, L, M], stddev=0.1)),
        "W3" : tf.Variable(tf.truncated_normal([4, 4, M, N], stddev=0.1)),
        "W4" : tf.Variable(tf.truncated_normal([7 * 7 * N, O], stddev=0.1)),
        "W5" : tf.Variable(tf.truncated_normal([O, 10], stddev=0.1))
    }
    biases = {
        "B1" : tf.Variable(tf.ones([L])/10),
        "B2" : tf.Variable(tf.ones([M])/10),
        "B3" : tf.Variable(tf.ones([N])/10),
        "B4" : tf.Variable(tf.ones([O])/10),
        "B5" : tf.Variable(tf.ones([10])/10)
    }

    # The model
    stride = 1  # output is 28x28
    Y1 = tf.nn.relu(tf.nn.conv2d(features, weights["W1"], strides=[1, stride, stride, 1], padding='SAME') + biases["B1"])
    stride = 2  # output is 14x14
    Y2 = tf.nn.relu(tf.nn.conv2d(Y1, weights["W2"], strides=[1, stride, stride, 1], padding='SAME') + biases["B2"])
    stride = 2  # output is 7x7
    Y3 = tf.nn.relu(tf.nn.conv2d(Y2, weights["W3"], strides=[1, stride, stride, 1], padding='SAME') + biases["B3"])

    # reshape the output from the third convolution for the fully connected layer
    YY = tf.reshape(Y3, shape=[-1, 7 * 7 * N])

    Y4 = tf.nn.relu(tf.matmul(YY, weights["W4"]) + biases["B4"])
    Ylogits = tf.matmul(Y4, weights["W5"]) + biases["B5"]
    Y = tf.nn.softmax(Ylogits)

    for k, w in weights.items():
        tf.summary.histogram(k, w)
    for k, b in biases.items():
        tf.summary.histogram(k, b)

    # Needed in every mode: the EstimatorSpec below always reports it, and
    # PREDICT mode never enters the TRAIN/EVAL branch.
    predictions = tf.argmax(Y, 1)

    cross_entropy = None
    train_step = None
    evalmetrics = None

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
        # Cross-entropy loss (= -sum(Y_i * log(Yi))) computed from the logits,
        # which avoids the numerical instability of log(0) (NaN). The scalar
        # weight scales the loss by the batch size.
        cross_entropy = tf.losses.softmax_cross_entropy(
            weights=params["batch_size"],
            onehot_labels=tf.one_hot(labels, 10),
            logits=Ylogits)

        # % of correct answers found in batch
        accuracy = tf.metrics.accuracy(labels=labels, predictions=predictions)

        evalmetrics = {"accuracy/mnist": accuracy}
        if mode == tf.estimator.ModeKeys.TRAIN:
            tf.summary.scalar("accuracy/mnist", accuracy[1])
            # Learning rate: 0.0001 + params["learning_rate"] * (1/e)^(step/max_steps),
            # i.e. the decaying part shrinks by a factor of e over the whole
            # run, on top of a constant floor of 0.0001.
            # NOTE(review): 50000 mirrors the constant used for max_steps in
            # train_and_evaluate; keep the two in sync.
            max_steps = (50000//params["batch_size"]) * params["epochs"]
            lr = 0.0001 + tf.train.exponential_decay(params["learning_rate"],
                tf.train.get_global_step(), max_steps, 1/math.e)
            tf.summary.scalar("learning_rate", lr)
            optimizer = tf.train.AdamOptimizer(lr)
            train_step = optimizer.minimize(cross_entropy,
                                            global_step=tf.train.get_global_step())

    return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={"classid": predictions},
            loss=cross_entropy,
            train_op=train_step,
            eval_metric_ops=evalmetrics)


train_and_evaluate(os.path.join(TB_DIR, model_name), params, model_fn)

### 3.1 more neurons

In [0]:
model_name = 'fashion_3.1'


def model_fn(features, labels, mode, params):
    """Wider convolutional network with dropout on the dense layer.

    Same layout as model 3.0 but with more channels per convolution and a
    6x6 first patch. Dropout is active in TRAIN mode only.

    Args:
        features: image batch fed directly to conv2d, so it must be 4-D
            (the input pipeline reshapes each image to 28x28x1).
        labels: integer class ids (one-hot encoded to depth 10 for the loss);
            not used in PREDICT mode.
        mode: a tf.estimator.ModeKeys value.
        params: dict; this model reads "batch_size", "learning_rate",
            "epochs" and "pkeep" (dropout keep probability).

    Returns:
        tf.estimator.EstimatorSpec with the predicted class ids under
        "classid", plus loss, train op and eval metrics where the mode
        requires them.
    """
    tf.summary.image('image', features)

    # three convolutional layers with their channel counts, and a
    # fully connected layer (the last layer has 10 softmax neurons)
    L = 6  # first convolutional layer output depth
    M = 12  # second convolutional layer output depth
    N = 24  # third convolutional layer
    O = 200  # fully connected layer

    weights = {
        "W1" : tf.Variable(tf.truncated_normal([6, 6, 1, L], stddev=0.1)),  # 6x6 patch, 1 input channel, L output channels
        "W2" : tf.Variable(tf.truncated_normal([5, 5, L, M], stddev=0.1)),
        "W3" : tf.Variable(tf.truncated_normal([4, 4, M, N], stddev=0.1)),
        "W4" : tf.Variable(tf.truncated_normal([7 * 7 * N, O], stddev=0.1)),
        "W5" : tf.Variable(tf.truncated_normal([O, 10], stddev=0.1))
    }
    biases = {
        "B1" : tf.Variable(tf.ones([L])/10),
        "B2" : tf.Variable(tf.ones([M])/10),
        "B3" : tf.Variable(tf.ones([N])/10),
        "B4" : tf.Variable(tf.ones([O])/10),
        "B5" : tf.Variable(tf.ones([10])/10)
    }

    # Keep every unit outside of training so that evaluation and prediction
    # use the full network.
    pkeep = params["pkeep"] if mode == tf.estimator.ModeKeys.TRAIN else 1.0

    # The model
    stride = 1  # output is 28x28
    Y1 = tf.nn.relu(tf.nn.conv2d(features, weights["W1"], strides=[1, stride, stride, 1], padding='SAME') + biases["B1"])
    stride = 2  # output is 14x14
    Y2 = tf.nn.relu(tf.nn.conv2d(Y1, weights["W2"], strides=[1, stride, stride, 1], padding='SAME') + biases["B2"])
    stride = 2  # output is 7x7
    Y3 = tf.nn.relu(tf.nn.conv2d(Y2, weights["W3"], strides=[1, stride, stride, 1], padding='SAME') + biases["B3"])

    # reshape the output from the third convolution for the fully connected layer
    YY = tf.reshape(Y3, shape=[-1, 7 * 7 * N])

    Y4 = tf.nn.relu(tf.matmul(YY, weights["W4"]) + biases["B4"])
    Y4d = tf.nn.dropout(Y4, pkeep)
    Ylogits = tf.matmul(Y4d, weights["W5"]) + biases["B5"]
    Y = tf.nn.softmax(Ylogits)

    for k, w in weights.items():
        tf.summary.histogram(k, w)
    for k, b in biases.items():
        tf.summary.histogram(k, b)

    # Needed in every mode: the EstimatorSpec below always reports it, and
    # PREDICT mode never enters the TRAIN/EVAL branch.
    predictions = tf.argmax(Y, 1)

    cross_entropy = None
    train_step = None
    evalmetrics = None

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
        # Cross-entropy loss (= -sum(Y_i * log(Yi))) computed from the logits,
        # which avoids the numerical instability of log(0) (NaN). The scalar
        # weight scales the loss by the batch size.
        cross_entropy = tf.losses.softmax_cross_entropy(
            weights=params["batch_size"],
            onehot_labels=tf.one_hot(labels, 10),
            logits=Ylogits)

        # % of correct answers found in batch
        accuracy = tf.metrics.accuracy(labels=labels, predictions=predictions)

        evalmetrics = {"accuracy/mnist": accuracy}
        if mode == tf.estimator.ModeKeys.TRAIN:
            tf.summary.scalar("accuracy/mnist", accuracy[1])
            # Learning rate: 0.0001 + params["learning_rate"] * (1/e)^(step/max_steps),
            # i.e. the decaying part shrinks by a factor of e over the whole
            # run, on top of a constant floor of 0.0001.
            # NOTE(review): 50000 mirrors the constant used for max_steps in
            # train_and_evaluate; keep the two in sync.
            max_steps = (50000//params["batch_size"]) * params["epochs"]
            lr = 0.0001 + tf.train.exponential_decay(params["learning_rate"],
                tf.train.get_global_step(), max_steps, 1/math.e)
            tf.summary.scalar("learning_rate", lr)
            optimizer = tf.train.AdamOptimizer(lr)
            train_step = optimizer.minimize(cross_entropy,
                                            global_step=tf.train.get_global_step())

    return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={"classid": predictions},
            loss=cross_entropy,
            train_op=train_step,
            eval_metric_ops=evalmetrics)


train_and_evaluate(os.path.join(TB_DIR, model_name), params, model_fn)

## 4.0 Convolutional + batch normalization + leaky relu

In [0]:
model_name = 'fashion_4.0'


def model_fn(features, labels, mode, params):
    """Convolutional network with batch normalization, leaky ReLU and dropout.

    Three convolutions (24, 48 and 64 channels) are followed by a 200-unit
    dense layer and a 10-way output. Batch normalization uses batch statistics
    and dropout is active in TRAIN mode only.

    Args:
        features: image batch fed directly to conv2d, so it must be 4-D
            (the input pipeline reshapes each image to 28x28x1).
        labels: integer class ids (one-hot encoded to depth 10 for the loss);
            not used in PREDICT mode.
        mode: a tf.estimator.ModeKeys value.
        params: dict; this model reads "batch_size", "learning_rate",
            "epochs" and "pkeep" (dropout keep probability).

    Returns:
        tf.estimator.EstimatorSpec with the predicted class ids under
        "classid", plus loss, train op and eval metrics where the mode
        requires them.
    """
    tf.summary.image('image', features)

    # three convolutional layers with their channel counts, and a
    # fully connected layer (the last layer has 10 softmax neurons)
    L = 24
    M = 48
    N = 64
    O = 200

    training = mode == tf.estimator.ModeKeys.TRAIN
    # Dropout must only drop units while training; with a keep probability of
    # 1.0 evaluation and prediction see the full network, as in the earlier
    # dropout models.
    pkeep = params["pkeep"] if training else 1.0

    # Weights initialised with small random values between -0.2 and +0.2
    weights = {
        "W1" : tf.Variable(tf.truncated_normal([6, 6, 1, L], stddev=0.1)),  # 6x6 patch, 1 input channel, L output channels
        "W2" : tf.Variable(tf.truncated_normal([5, 5, L, M], stddev=0.1)),
        "W3" : tf.Variable(tf.truncated_normal([4, 4, M, N], stddev=0.1)),
        "W4" : tf.Variable(tf.truncated_normal([7 * 7 * N, O], stddev=0.1)),
        "W5" : tf.Variable(tf.truncated_normal([O, 10], stddev=0.1))
    }
    # Only the dense layers carry explicit biases; the convolutions are
    # followed directly by batch normalization.
    biases = {
        "B4" : tf.Variable(tf.ones([O])/10),
        "B5" : tf.Variable(tf.ones([10])/10)
    }

    def compatible_convolutional_noise_shape(Y):
        """Return the dropout noise shape [batch, 1, 1, channels] for 4-D Y."""
        noiseshape = tf.shape(Y)
        # Keep the first and last dimension, replace the two middle ones by 1.
        noiseshape = noiseshape * tf.constant([1,0,0,1]) + tf.constant([0,1,1,0])
        return noiseshape

    # The model
    stride = 1  # output is 28x28
    Y1l = tf.nn.conv2d(features, weights["W1"], strides=[1, stride, stride, 1], padding='SAME')
    Y1bn = tf.layers.batch_normalization(Y1l, training=training)
    Y1r = tf.nn.leaky_relu(Y1bn)
    Y1 = tf.nn.dropout(Y1r, pkeep, compatible_convolutional_noise_shape(Y1r))
    stride = 2  # output is 14x14
    Y2l = tf.nn.conv2d(Y1, weights["W2"], strides=[1, stride, stride, 1], padding='SAME')
    Y2bn = tf.layers.batch_normalization(Y2l, training=training)
    Y2r = tf.nn.leaky_relu(Y2bn)
    Y2 = tf.nn.dropout(Y2r, pkeep, compatible_convolutional_noise_shape(Y2r))
    stride = 2  # output is 7x7
    Y3l = tf.nn.conv2d(Y2, weights["W3"], strides=[1, stride, stride, 1], padding='SAME')
    Y3bn = tf.layers.batch_normalization(Y3l, training=training)
    Y3r = tf.nn.leaky_relu(Y3bn)
    Y3 = tf.nn.dropout(Y3r, pkeep, compatible_convolutional_noise_shape(Y3r))

    # reshape the output from the third convolution for the fully connected layer
    YY = tf.reshape(Y3, shape=[-1, 7 * 7 * N])

    Y4l = tf.matmul(YY, weights["W4"]) + biases["B4"]
    Y4bn = tf.layers.batch_normalization(Y4l, training=training)
    # NOTE(review): the dense layer uses a plain ReLU while the convolutions
    # use leaky ReLU -- confirm this is intended.
    Y4r = tf.nn.relu(Y4bn)
    Y4 = tf.nn.dropout(Y4r, pkeep)
    Ylogits = tf.matmul(Y4, weights["W5"]) + biases["B5"]
    Y = tf.nn.softmax(Ylogits)

    # Needed in every mode: the EstimatorSpec below always reports it, and
    # PREDICT mode never enters the TRAIN/EVAL branch.
    predictions = tf.argmax(Y, 1)

    cross_entropy = None
    train_step = None
    evalmetrics = None

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
        # Cross-entropy loss (= -sum(Y_i * log(Yi))) computed from the logits,
        # which avoids the numerical instability of log(0) (NaN). The scalar
        # weight scales the loss by the batch size.
        cross_entropy = tf.losses.softmax_cross_entropy(
            weights=params["batch_size"],
            onehot_labels=tf.one_hot(labels, 10),
            logits=Ylogits)

        # % of correct answers found in batch
        accuracy = tf.metrics.accuracy(labels=labels, predictions=predictions)

        evalmetrics = {"accuracy/mnist": accuracy}
        if training:
            tf.summary.scalar("accuracy/mnist", accuracy[1])
            # Learning rate: 0.0001 + params["learning_rate"] * (1/e)^(step/max_steps),
            # i.e. the decaying part shrinks by a factor of e over the whole
            # run, on top of a constant floor of 0.0001.
            # NOTE(review): 50000 mirrors the constant used for max_steps in
            # train_and_evaluate; keep the two in sync.
            max_steps = (50000//params["batch_size"]) * params["epochs"]
            lr = 0.0001 + tf.train.exponential_decay(params["learning_rate"],
                tf.train.get_global_step(), max_steps, 1/math.e)
            tf.summary.scalar("learning_rate", lr)
            optimizer = tf.train.AdamOptimizer(lr)
            # this is needed for batch normalization, but has no effect otherwise
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                train_step = optimizer.minimize(cross_entropy,
                                                global_step=tf.train.get_global_step())

    return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={"classid": predictions},
            loss=cross_entropy,
            train_op=train_step,
            eval_metric_ops=evalmetrics)


train_and_evaluate(os.path.join(TB_DIR, model_name), params, model_fn)

# clean all

In [0]:
# Delete everything under Graph/, the directory TB_DIR points to (all model
# checkpoints and TensorBoard summaries written by the cells above).
!rm -r Graph/*

In [0]:
# List what is left in Graph/ after the cleanup.
!ls Graph/