# The AllConv Net

A parameter-efficient ConvNet as a baseline.

In [1]:
import os

import numpy as np
from numpy.random import choice
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import transforms
from torchvision import datasets
import torchvision

import sys
sys.path.append('../utils')
sys.path.append('../models')
import utils
import all_conv_net

In [2]:
# Folder the CIFAR-100 files are read from (and downloaded to if missing).
data_dir = '../data'
batchsize=64

# values for normalisation (one mean / standard deviation per colour channel)
channel_means = (0.5071, 0.4865, 0.4409)
channel_standard_devs = (0.2673, 0.2564, 0.2762)

# Pre-processing pipeline: image -> tensor -> channel-wise normalisation.
to_tensor = transforms.ToTensor()
normalise = transforms.Normalize(channel_means, channel_standard_devs)
transformation = transforms.Compose([to_tensor, normalise])

# Official test split (train=False), served in its stored order.
test_set = datasets.CIFAR100(data_dir, train=False, transform=transformation, download=True)
test_loader = DataLoader(test_set, batch_size=batchsize)

# Official training split, divided by the project helper into a training
# and a validation loader.
# NOTE(review): presumably fraction=0.1 is the share held out for
# validation -- confirm against utils.train_validation_split.
rest = datasets.CIFAR100(data_dir, train=True, transform=transformation, download=True)
train_loader, validation_loader = utils.train_validation_split(
    rest,
    fraction=0.1,
    batchsize=batchsize,
)

# Summary of the loader sizes, one figure per line.
print(
    f'training batches: {len(train_loader)}',
    f'validation batches: {len(validation_loader)}',
    f'test batches: {len(test_loader)}',
    f'batch size: {batchsize}',
    sep='\n',
)

Files already downloaded and verified
Files already downloaded and verified
training batches: 704
validation batches: 79
test batches: 157
batch size: 64


In [3]:
# Disabled debugging aid: collects the shape of the input tensor of every training batch.
#[x.shape for x, y in train_loader]

In [4]:
# Disabled debugging aid: collects the shape of the input tensor of every validation batch.
#[x.shape for x, y in validation_loader]

### Dimensions

#### convolutional layer

$$W_{output} = \Bigl\lfloor \frac{W_{input} - F + 2*P}{S} \Bigr\rfloor+ 1$$

#### pooling layer

$$W_{output} = \Bigl\lfloor \frac{W_{input} - F}{S} \Bigr\rfloor + 1$$

with:

- W: width
- F: filter width
- P: padding
- S: stride

... the same goes for the height (the images are square anyway)

In [5]:
# Disabled scratch calculation: the convolution formula above with W=16, F=3, P=1, S=2
# (written without the floor, so the raw expression is 8.5; flooring the quotient gives 8).
#( (16 - 3 + 2*1 ) / 2 ) + 1

## The all-convolutional net

From ["Striving for Simplicity: The All Convolutional Net" by Springenberg et al.](https://arxiv.org/abs/1412.6806). This architecture simply downsamples the input with a couple of convolutions, as in regular convolutional nets. Then, it arrives at a number of channels that is equal to the number of classes. Taking the mean of each channel (a feature map of very small height and width) gives a vector of the same length as the number of classes - the same output as in a fully connected layer. In this implementation, the architecture has only ~0.3 million parameters when starting with 32 channels. Downsampling is done with stride=2 convolutions.

It's easy to extend the architecture with optional residual shortcuts. This makes it possible to compare the residual version with the non-residual version of the same net.

Interestingly, Adam doesn't work at all with this net. Hence, I use only SGD with momentum.

In [None]:
# Baseline run: the all-convolutional net without residual shortcuts.
print('MODEL: all-convolution net')
net = all_conv_net.AllCNN(c=64, residuals=False)
print(f'parameters: {utils.count_parameters(net, in_millions=True):.2f} million\n')

# Plain SGD with momentum; weight decay is switched off
# (the author left 0.00002 next to it as an alternative value).
sgd_settings = dict(lr=0.1, momentum=0.9, weight_decay=0)
optimizer = torch.optim.SGD(net.parameters(), **sgd_settings)

# Multiply the learning rate by 0.75 once the monitored quantity has
# plateaued for more than `patience` epochs.
plateau_settings = dict(factor=0.75, patience=5, threshold=0.01, verbose=True)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, **plateau_settings)

# NOTE(review): the validation accuracy logged by this run only takes
# multiples of 12.5 %, which would fit an accuracy computed on a single
# 8-sample batch (79 validation batches of 64 are consistent with 5000
# images, leaving 8 in the last batch) -- TODO check utils.train_model.
utils.train_model(
    net,
    train_loader,
    validation_loader,
    optimizer,
    scheduler,
    max_epoch=500,
)

MODEL: all-convolution net
parameters: 1.387044 million

epoch	training-CE	validation-CE	validation-accuracy (%)
0	4.57		4.47		12.50
1	4.47		4.37		0.00
2	4.40		4.28		0.00
3	4.32		4.10		12.50
4	4.24		3.98		0.00
5	4.17		3.90		0.00
6	4.11		3.82		0.00
7	4.05		3.72		0.00
8	3.97		3.64		0.00
9	3.92		3.54		0.00
10	3.87		3.45		12.50
11	3.80		3.34		25.00
12	3.75		3.26		12.50
13	3.69		3.14		12.50
14	3.64		3.08		12.50
15	3.60		3.03		25.00
16	3.54		2.99		25.00
17	3.51		2.91		12.50
18	3.44		2.85		12.50
19	3.41		2.79		25.00
20	3.38		2.68		12.50
21	3.32		2.67		25.00
22	3.28		2.61		25.00
23	3.26		2.58		25.00
24	3.24		2.53		25.00
25	3.20		2.51		25.00
26	3.16		2.45		25.00
27	3.14		2.38		25.00
28	3.10		2.37		25.00
29	3.09		2.37		37.50
30	3.06		2.34		25.00
31	3.03		2.28		50.00
32	3.00		2.26		37.50
33	2.98		2.25		50.00
34	2.97		2.19		25.00
35	2.94		2.17		37.50
36	2.92		2.17		50.00
37	2.91		2.15		50.00
38	2.88		2.13		37.50
39	2.86		2.12		37.50
40	2.86		2.09		25.00
41	2.84		2.06		25.00
42	2.81		2.03		12.50
43

In [None]:
# Plot the error curves stored on the utils module under `error_stats`.
recorded_errors = utils.error_stats
utils.plot_error_curves(recorded_errors, error_name='cross-entropy loss')

# Evaluate on the test set with the model moved to the GPU
# (net.cuda() requires a CUDA device to be available).
net_on_gpu = net.cuda()
utils.test_set_evaluation(net_on_gpu, test_loader, just_print=True)

Weird observation:  
Validation error is consistently lower than training error.

## The all convolutional ResNet

Compare the above results with the same architecture (and the same optimization scheme) with the addition of residual shortcuts.

In [None]:
# Comparison run: identical architecture, but with residual shortcuts enabled.
print('MODEL: all-convolution ResNet')
net = all_conv_net.AllCNN(c=64, residuals=True)  # same net with residual shortcuts!
print(f'parameters: {utils.count_parameters(net, in_millions=True):.2f} million\n')

# Same optimisation scheme as for the non-residual net: SGD with momentum,
# weight decay switched off (0.00002 was left as an alternative value).
sgd_settings = dict(lr=0.1, momentum=0.9, weight_decay=0)
optimizer = torch.optim.SGD(net.parameters(), **sgd_settings)

# Multiply the learning rate by 0.75 once the monitored quantity has
# plateaued for more than `patience` epochs.
plateau_settings = dict(factor=0.75, patience=5, threshold=0.01, verbose=True)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, **plateau_settings)

utils.train_model(
    net,
    train_loader,
    validation_loader,
    optimizer,
    scheduler,
    max_epoch=500,
)