Note: make sure to restart the runtime after running the next cell
If any changes are made to the residuals library on git, this cell must be run and the runtime must be restarted to get the latest version

In [None]:
!rm -fr ResidualNets
!git clone https://github.com/rohin-dasari/ResidualNets.git
!cd ResidualNets && pip install -e . && cd ../

In [1]:
import torch
from residual import ResidualConfig
from residual.utils import Trainer

# Model Creation Strategy:
- Start with a single layer residual model
- Create N models, each with a set of parameters from a defined search space
- To limit the size of the search space, we will only search the convolutional filters and kernel sizes:
  - [32, 64, 128] x [3, 5, 7]

- Train 9 different models for 30 epochs
- Get a list of max performances for each model:
  - [64, 83, 85, 95, 96, 96, 96, 96, 97]
- Set a threshold (might be the 80th percentile)
- Choose the model with the lowest number of parameters after thresholding
- Add a new layer; keep parameters of previous model; add all combinations in search space
  - skip models that have more than 5M parameters
  - train these models for another 30 epochs
  - keep the max

In [2]:
import torchvision
import torchvision.transforms as transforms

transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [None]:
from itertools import product
from copy import deepcopy
import numpy as np

layers = 4
filters = [32, 64, 128]
kernel_sizes = [3, 5, 7]
histories = {}
best_config = []
best_performance = 0
for layer in range(layers):
  layer_configs = []
  for filter_size, kernel_size in product(filters, kernel_sizes):
    config = deepcopy(best_config)
    config.append(ResidualConfig(channels=filter_size, conv_kernel_size=kernel_size, skip_kernel_size=1))
    trainer = Trainer(
          model_config=config,
          n_classes=10,
          optimizer='Adam',
          criterion=torch.nn.CrossEntropyLoss(),
          trainset=testset,
          testset=testset,
          verbosity=0,
          checkpoint_path=f'{layer}-{filter_size}-{kernel_size}'
      )
    if trainer.model.get_parameter_count() >= 5e6:
      continue
    history = trainer.train(30 + (10*(layer+1)))
    histories[(layer, filter_size, kernel_size)] = {}
    histories[(layer, filter_size, kernel_size)]['parameters'] = trainer.model.get_parameter_count()
    histories[(layer, filter_size, kernel_size)]['history'] = history
    layer_configs.append((filter_size, kernel_size, history['val']['acc'][-1], trainer.model.get_parameter_count()))

    print(config, history['val']['acc'][-1])

  performance = [c[-2] for c in layer_configs]
  top_thresh = np.percentile(performance, 80)
  print(f'thresh: {top_thresh}')
  top_perf = list(filter(lambda a: a[-2] >= top_thresh, layer_configs))
  best_layer = min(top_perf, key=lambda element: element[-1])
  if best_layer[-2] > best_performance:
    best_performance = best_layer[-2]
    best_config.append(ResidualConfig(channels=best_layer[0], conv_kernel_size=best_layer[1], skip_kernel_size=1))
    print('current config: ')
    print(best_config)
  else:
    print('reached best performance')
    print(best_config)
    break
    

[ResidualConfig(channels=32, conv_kernel_size=3, in_size=(1, 64, 8, 8), skip_kernel_size=1, stride=1, n_blocks=2)] 0.6886
[ResidualConfig(channels=32, conv_kernel_size=5, in_size=(1, 64, 8, 8), skip_kernel_size=1, stride=1, n_blocks=2)] 0.7381
[ResidualConfig(channels=32, conv_kernel_size=7, in_size=(1, 64, 8, 8), skip_kernel_size=1, stride=1, n_blocks=2)] 0.7405
[ResidualConfig(channels=64, conv_kernel_size=3, in_size=(1, 64, 8, 8), skip_kernel_size=1, stride=1, n_blocks=2)] 0.752
[ResidualConfig(channels=64, conv_kernel_size=5, in_size=(1, 64, 8, 8), skip_kernel_size=1, stride=1, n_blocks=2)] 0.7971
[ResidualConfig(channels=64, conv_kernel_size=7, in_size=(1, 64, 8, 8), skip_kernel_size=1, stride=1, n_blocks=2)] 0.8606
[ResidualConfig(channels=128, conv_kernel_size=3, in_size=(1, 64, 8, 8), skip_kernel_size=1, stride=1, n_blocks=2)] 0.787
[ResidualConfig(channels=128, conv_kernel_size=5, in_size=(1, 64, 8, 8), skip_kernel_size=1, stride=1, n_blocks=2)] 0.8534
[ResidualConfig(channels

In [3]:
config = [
      ResidualConfig(channels=64, conv_kernel_size=7, in_size=None, skip_kernel_size=1),
      ResidualConfig(channels=32, conv_kernel_size=5, in_size=None, skip_kernel_size=1),
      ResidualConfig(channels=64, conv_kernel_size=3, in_size=None, skip_kernel_size=1)
    ]
trainer = Trainer(
          model_config=config,
          n_classes=10,
          optimizer='Adam',
          criterion=torch.nn.CrossEntropyLoss(),
          trainset=testset,
          testset=testset,
          verbosity=1
      )
trainer.model.get_parameter_count()

1089418

In [None]:
histories = trainer.train(30)