In [2]:
import torch.nn as nn
import torch
import torchvision
import torch.optim as optim
import torch.nn.functional as F
import sys
sys.path.append('../mnist')
import matplotlib.pyplot as plt

import cnn_model as model
from torch.autograd import Variable


To build a model you will need:
- a model architecture with model parameters
- an optimizer with optimizer parameters
- a training and a testing dataset


The training and testing datasets are fed to the model in batches (of the sizes specified below) using data loaders.

In [3]:
batch_size_train = 256
batch_size_test = 256

# Per-channel normalization constants applied to every image; presumably the
# MNIST training-set mean and standard deviation -- TODO confirm.
MNIST_MEAN = (0.1307,)
MNIST_STD = (0.3081,)

# Pinned host memory only speeds up host-to-GPU copies, so request it only
# when a CUDA device is actually present.
PIN_MEMORY = torch.cuda.is_available()


def make_mnist_loader(train, batch_size, shuffle):
    """Build a DataLoader over one split of MNIST.

    Both splits share the same preprocessing (tensor conversion followed by
    normalization), so it is defined once here instead of being repeated.

    Args:
        train: True for the training split, False for the test split.
        batch_size: number of samples per batch.
        shuffle: whether the loader reshuffles the samples every epoch.

    Returns:
        A torch.utils.data.DataLoader over the requested split. The dataset is
        downloaded to './files/' if it is not already there.
    """
    transform = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(MNIST_MEAN, MNIST_STD),
    ])
    dataset = torchvision.datasets.MNIST(
        './files/', train=train, download=True, transform=transform)
    return torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle, pin_memory=PIN_MEMORY)


train_loader = make_mnist_loader(train=True, batch_size=batch_size_train, shuffle=True)
# The evaluation output reports loss/accuracy aggregated over the whole test
# set, which does not depend on sample order, so the test split is left
# unshuffled to keep evaluation deterministic.
test_loader = make_mnist_loader(train=False, batch_size=batch_size_test, shuffle=False)

The model and its training are configured through a parameters dictionary. Very little validation or sanity checking is performed on the parameters at this point (TODO). If no parameters are specified, a default set is used.

In [7]:
# Baseline configuration handed to model.Model: one group of settings per
# layer family, followed by the optimizer settings.
parameters = dict(
    # max-pooling settings
    max_pool_kernel_size=2,
    max_pool_stride=2,
    max_pool_padding=0,

    # first convolution
    conv1_kernel_size=3,
    conv1_stride=1,
    conv1_padding=1,
    conv1_in_channels=1,
    conv1_out_channels=32,

    # second convolution
    conv2_kernel_size=3,
    conv2_stride=1,
    conv2_padding=1,
    conv2_in_channels=32,
    conv2_out_channels=64,

    # third convolution
    conv3_kernel_size=3,
    conv3_stride=1,
    conv3_padding=1,
    conv3_in_channels=64,
    conv3_out_channels=256,

    # linear layer output sizes
    linear1_output=1000,
    linear2_output=500,

    # optimizer settings
    learning_rate=0.005,
    momentum=0.5,
)

The model architecture and the optimizer can be defined and built with the provided constructors. Training and evaluation are done by calling the relevant functions.

In [None]:
num_epochs = 1
use_cuda = False

net = model.Model(parameters)
if use_cuda:
    # Move the model to the GPU *before* building the optimizer: the
    # torch.optim documentation advises constructing optimizers only after
    # the model has been moved with .cuda().
    net = net.cuda()

# Read the optimizer settings from the same dictionary that configures the
# model, so that editing `parameters` cannot silently diverge from the
# values actually used for training.
optimizer = model.default_optimizer(
    net,
    learning_rate=parameters['learning_rate'],
    momentum=parameters['momentum'],
)

# Wrap for multi-GPU execution only after the optimizer has been created
# from the underlying model.
if use_cuda and torch.cuda.device_count() > 1:
    net = nn.DataParallel(net)

model.train_model(net, num_epochs, optimizer, train_loader,
                  log_interval=50, use_cuda=use_cuda)
model.eval_model(net, test_loader, use_cuda=use_cuda)



Optimal parameters (where optimality depends on the chosen metric) can be found by hyperparameter tuning. This is done by evaluating a set of candidate parameters sequentially (not yet in parallel). In the example below, we optimize the number of output channels of the third convolutional layer.

In [4]:
# Start from the baseline `parameters` and replace only the entry being
# searched, instead of repeating every setting. Overriding an existing key
# keeps its position, so the resulting dictionary has the same keys, order
# and values as a hand-written copy. The recorded log shows one run per
# entry of the list (32, then 64, ...).
parameters_to_tune = {
    **parameters,
    'conv3_out_channels': [32, 64, 128, 256],
}

# NOTE(review): the test loader is handed to the tuner and the log prints
# "Test set" scores for each candidate. If candidates are ranked on that
# split, the reported best accuracy is optimistically biased -- consider
# tuning on a separate validation split. Confirm against cnn_model.
optimal = model.hyperpameter_tuning(test_loader, train_loader, parameters_to_tune, 'accuracy')

{'max_pool_kernel_size': 2, 'max_pool_stride': 2, 'max_pool_padding': 0, 'conv1_kernel_size': 3, 'conv1_stride': 1, 'conv1_padding': 1, 'conv1_in_channels': 1, 'conv1_out_channels': 32, 'conv2_kernel_size': 3, 'conv2_stride': 1, 'conv2_padding': 1, 'conv2_in_channels': 32, 'conv2_out_channels': 64, 'conv3_kernel_size': 3, 'conv3_stride': 1, 'conv3_padding': 1, 'conv3_in_channels': 64, 'conv3_out_channels': [32, 64, 128, 256], 'linear1_output': 1000, 'linear2_output': 500, 'learning_rate': 0.005, 'momentum': 0.5}
current parameters:  {'max_pool_kernel_size': 2, 'max_pool_stride': 2, 'max_pool_padding': 0, 'conv1_kernel_size': 3, 'conv1_stride': 1, 'conv1_padding': 1, 'conv1_in_channels': 1, 'conv1_out_channels': 32, 'conv2_kernel_size': 3, 'conv2_stride': 1, 'conv2_padding': 1, 'conv2_in_channels': 32, 'conv2_out_channels': 64, 'conv3_kernel_size': 3, 'conv3_stride': 1, 'conv3_padding': 1, 'conv3_in_channels': 64, 'conv3_out_channels': 32, 'linear1_output': 1000, 'linear2_output': 500, 

  so = F.log_softmax(o)



Test set: Avg. loss: 2.3016, Accuracy: 1160/10000 (11%)


Test set: Avg. loss: 2.0542, Accuracy: 6227/10000 (62%)

current parameters:  {'max_pool_kernel_size': 2, 'max_pool_stride': 2, 'max_pool_padding': 0, 'conv1_kernel_size': 3, 'conv1_stride': 1, 'conv1_padding': 1, 'conv1_in_channels': 1, 'conv1_out_channels': 32, 'conv2_kernel_size': 3, 'conv2_stride': 1, 'conv2_padding': 1, 'conv2_in_channels': 32, 'conv2_out_channels': 64, 'conv3_kernel_size': 3, 'conv3_stride': 1, 'conv3_padding': 1, 'conv3_in_channels': 64, 'conv3_out_channels': 64, 'linear1_output': 1000, 'linear2_output': 500, 'learning_rate': 0.005, 'momentum': 0.5}

Test set: Avg. loss: 2.3022, Accuracy: 974/10000 (9%)


Test set: Avg. loss: 2.1910, Accuracy: 5063/10000 (50%)

current parameters:  {'max_pool_kernel_size': 2, 'max_pool_stride': 2, 'max_pool_padding': 0, 'conv1_kernel_size': 3, 'conv1_stride': 1, 'conv1_padding': 1, 'conv1_in_channels': 1, 'conv1_out_channels': 32, 'conv2_kernel_size': 3, 'conv2_stride': 

In [5]:
# The recorded output shows `optimal` as a (score, parameters) pair; unpack it
# so the score and the winning configuration are shown separately instead of
# as one very long line.
# NOTE(review): the recorded run reports 0.5063 (64 channels) as the optimum,
# while the tuning log shows 6227/10000 for 32 channels -- verify how
# hyperpameter_tuning selects the best candidate.
best_score, best_parameters = optimal
print('best accuracy:', best_score)
best_parameters

(0.5063, {'max_pool_kernel_size': 2, 'max_pool_stride': 2, 'max_pool_padding': 0, 'conv1_kernel_size': 3, 'conv1_stride': 1, 'conv1_padding': 1, 'conv1_in_channels': 1, 'conv1_out_channels': 32, 'conv2_kernel_size': 3, 'conv2_stride': 1, 'conv2_padding': 1, 'conv2_in_channels': 32, 'conv2_out_channels': 64, 'conv3_kernel_size': 3, 'conv3_stride': 1, 'conv3_padding': 1, 'conv3_in_channels': 64, 'conv3_out_channels': 64, 'linear1_output': 1000, 'linear2_output': 500, 'learning_rate': 0.005, 'momentum': 0.5})
