## Imports

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.utils.data as Data
import math
import time

#tds
import torchvision

#git
#from __future__ import print_function
#import argparse
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
from torchvision.datasets import MNIST
from torchvision.transforms import Compose, ToTensor, Normalize, Resize
from torch.utils.data import DataLoader

import json

## Loads the data

In [2]:
def get_data_loaders(train_batch_size, test_batch_size, size=(224,224)):
    """
    Build the MNIST training and test data loaders.

    Every image is resized to `size`, converted to a tensor and normalized with
    the mean and standard deviation of the raw training pixels (divided by 255).

    :param train_batch_size: batch size of the shuffled training loader.
    :param test_batch_size: batch size of the unshuffled test loader.
    :param size: target size handed to Resize.
    :return: tuple (train_loader, test_loader).
    """
    # The statistics are read before any loader exists, so this first access
    # must be allowed to download; with download=False a run against an empty
    # 'data/' directory failed before the download below was ever reached.
    raw_train_pixels = MNIST('data/', download=True, train=True).train_data.float()
    pixel_mean = raw_train_pixels.mean() / 255
    pixel_std = raw_train_pixels.std() / 255

    data_transform = Compose([Resize(size), ToTensor(), Normalize((pixel_mean,), (pixel_std,))])

    train_loader = DataLoader(MNIST('data/', download=True, transform=data_transform, train=True),
                              batch_size=train_batch_size, shuffle=True)

    test_loader = DataLoader(MNIST('data/', download=False, transform=data_transform, train=False),
                             batch_size=test_batch_size, shuffle=False)

    return train_loader, test_loader


## Help functions for ResNet

In [3]:
# counts trainable weights in a model
def count_params(model):
    """Return the summed element count of all parameters of `model` that require gradients."""
    n_trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            n_trainable += param.numel()
    return n_trainable

# Wants images of size 224x224
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """Bias-free 3x3 convolution whose padding is set equal to its dilation."""
    conv_options = dict(
        kernel_size=3,
        stride=stride,
        padding=dilation,
        dilation=dilation,
        groups=groups,
        bias=False,
    )
    return nn.Conv2d(in_planes, out_planes, **conv_options)

def conv1x1(in_planes, out_planes, stride=1):
    """Bias-free 1x1 convolution with the given stride."""
    pointwise = nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=1,
        stride=stride,
        bias=False,
    )
    return pointwise


class BasicBlock(nn.Module):
    """
    Residual block built from two 3x3 convolutions, each followed by batch
    normalization. The block input, optionally passed through `downsample`,
    is added to the result before the final ReLU.
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        """
        :param inplanes: number of channels entering the block.
        :param planes: number of filters used by both convolutions.
        :param stride: stride of the first convolution.
        :param downsample: optional module applied to the input on the shortcut path.
        """
        super().__init__()

        # Only the first convolution receives the stride; the second one keeps
        # the resolution produced by the first.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Apply conv-bn-relu, conv-bn, add the shortcut and finish with ReLU."""
        branch = self.relu(self.bn1(self.conv1(x)))
        branch = self.bn2(self.conv2(branch))

        # Without a downsample module the shortcut is the untouched input.
        shortcut = x if self.downsample is None else self.downsample(x)

        branch += shortcut
        return self.relu(branch)


## Translates HyperMapper json to pyTorch ResNet module

In [4]:
class json2ResNet(nn.Module):
    """
    ResNet-style classifier assembled from HyperMapper search parameters.

    The network takes single-channel images and emits 10 class scores. It is
    made of an initial convolution, an optional max pooling, up to
    len(blocks) residual layers and a pooling + fully connected head.
    """
    def __init__(self, block, filters, filter_upd, blocks, kernel_size=0, pool=0, reduce=0):
        """
        :param block: residual block class; it must expose `expansion` and accept
            (inplanes, planes, stride, downsample).
        :param filters: number of filters of the initial convolution and of the
            layer at position 0 of `blocks`.
        :param filter_upd: factor the filter count is multiplied with (then rounded)
            before every active layer that is not at position 0.
        :param blocks: number of residual blocks per layer; a 0 skips that layer.
        :param kernel_size: 0 selects a 3x3 stride-1 first convolution, anything
            else a 7x7 stride-2 one.
        :param pool: truthy adds a 3x3 stride-2 max pooling after the first convolution.
        :param reduce: falsy ends with a global average pooling to 1x1. Truthy keeps
            a spatial map instead; the value is then used to accumulate the total
            downsampling factor, and the size of the fully connected layer is
            derived from it under the assumption of 32x32 input images.
        """
        super(json2ResNet, self).__init__()

        num_classes = 10
        norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer
        self.inplanes = filters

        # Filter before residual layers
        if kernel_size == 0:
            self.conv1 = nn.Conv2d(1, self.inplanes, kernel_size=3, stride=1, padding=1,
                                   bias=False)
        else:
            # The stride-2 convolution halves the spatial dimension
            reduce *= 2
            self.conv1 = nn.Conv2d(1, self.inplanes, kernel_size=7, stride=2, padding=3,
                                   bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)

        # The pooling layer is built once here and registered as a submodule,
        # instead of being re-created on every forward pass.
        if pool:
            self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
            # 3x3 pooling with stride 2 halves the spatial dimension
            reduce *= 2
        else:
            self.maxpool = None

        # Residual layers. Layers are numbered consecutively over the active
        # entries of `blocks`, so a skipped entry leaves no gap in the names.
        reslays = 0
        for i, b in enumerate(blocks):
            if b > 0:
                reslays += 1
                key = 'layer' + str(reslays)
                if i == 0:
                    lay = self._make_layer(block, filters, b)
                else:
                    # Every later layer halves the resolution and rescales the filters
                    reduce *= 2
                    filters = int(np.round(filters * filter_upd))
                    lay = self._make_layer(block, filters, b, stride=2)

                setattr(self, key, lay)

        self.reslays = reslays

        # Head. `pixels` is the number of spatial positions per channel that
        # reach the fully connected layer.
        pixels = 1
        if reduce:
            if reduce <= 16:
                # A 2x2 average pooling halves the map once more; the remaining
                # side length is computed for a 32x32 input.
                reduce *= 2
                pixels = int((32/reduce)**2)
                self.avgpool = nn.AvgPool2d(kernel_size=2)
            else:
                # Nothing left to pool; a kernel of size 1 is a no-op.
                self.avgpool = nn.AvgPool2d(kernel_size=1)
        else:
            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        self.fc = nn.Linear(filters * block.expansion * pixels, num_classes)

        # Weight initialization: Kaiming normal for convolutions, unit scale and
        # zero shift for normalization layers.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initializing the last BN of each residual branch
        # (https://arxiv.org/abs/1706.02677) is intentionally not done here.

    def _make_layer(self, block, planes, blocks, stride=1):
        """
        Stack `blocks` residual blocks with `planes` filters.

        Only the first block receives `stride`; it also gets a 1x1 convolution
        shortcut when the stride or the channel count changes.
        """
        norm_layer = self._norm_layer
        downsample = None

        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion  # block expansion = 1 for basic block

        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def _forward_impl(self, x):
        """Run stem, residual layers and head; returns unnormalized class scores."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        if self.maxpool is not None:
            x = self.maxpool(x)

        # Residual layers in the order they were created
        for idx in range(self.reslays):
            key = 'layer' + str(idx+1)
            x = getattr(self, key)(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        # We do not need softmax since it is done inside nn.CrossEntropyLoss()
        return x

    def forward(self, x):
        """Forward pass; see _forward_impl."""
        return self._forward_impl(x)
    

## json scenario 

In [5]:
def create_scenario2(RS, BO=0, big = False, output_path = "stupid.csv", scenario_dir=None):
    """
    Write the HyperMapper scenario file "example_resnet_scenario2.json".

    The file is written to an explicit path; the working directory of the
    process is left untouched (no %cd inside the function).

    :param RS: number of random samples of the design-of-experiment phase.
    :param BO: number of optimization iterations; 0 means random sampling only.
    :param big: accepted for compatibility with existing calls; not used.
    :param output_path: value stored as "output_data_file" in the scenario.
    :param scenario_dir: directory the JSON file is written to. Defaults to
        "Documents/kurser/exjobb/samuel_nas/my_hypermapper/example_scenarios/quick_start"
        below the user's home directory.
    :raises OSError: if the target directory does not exist or is not writable.
    """
    import os

    if scenario_dir is None:
        scenario_dir = os.path.join(
            os.path.expanduser("~"),
            "Documents", "kurser", "exjobb", "samuel_nas", "my_hypermapper",
            "example_scenarios", "quick_start",
        )

    def ordinal(values):
        # Each parameter gets its own list so no two entries share state.
        return {"parameter_type": "ordinal", "values": list(values)}

    def categorical(values):
        return {"parameter_type": "categorical", "values": list(values)}

    # number of blocks in a layer, n={3, 5, 7, 9, 18} gives {20, 32, 44, 56, 110} which is used in the paper
    block_counts = [0, 1, 2, 3, 5, 7, 9, 18]

    # These are the parameters that will be optimized
    input_parameters = {}
    # number of filters in the first blocks
    input_parameters["n_filters"] = ordinal([1, 2, 4, 8, 12, 16, 24, 32])    # 8 options
    # how to update the number of filters between the layers. Can never reach zero
    input_parameters["filter_upd"] = ordinal([0.5001, 1, 1.5, 2])            # 4 options
    for layer_idx in range(1, 5):
        input_parameters["n_blocks" + str(layer_idx)] = ordinal(block_counts)  # 8 options each
    # size of first conv
    input_parameters["conv0"] = categorical(['3', '7'])                      # 2 options
    # pool before residual layers
    input_parameters["pool"] = categorical(['no', 'yes'])                    # 2 options
    # pool to size 1 after residual layers
    input_parameters["reduce"] = categorical(['yes', 'no'])                  # 2 options

    # Key order matches the previously generated files.
    scenario = {}
    scenario["application_name"] = "resnet"
    scenario["optimization_objectives"] = ["Value"]
    scenario["input_parameters"] = input_parameters
    scenario["output_data_file"] = output_path
    scenario["design_of_experiment"] = {
        "doe_type": "random sampling",
        "number_of_samples": RS,
    }
    scenario["optimization_iterations"] = BO

    scenario_path = os.path.join(scenario_dir, "example_resnet_scenario2.json")
    with open(scenario_path, "w") as scenario_file:
        json.dump(scenario, scenario_file, indent=4)


In [6]:
"""
def create_scenario(RS, BO=0, big = False, output_path = "stupid.csv"):
    scenario = {}
    scenario["application_name"] = "resnet"
    scenario["optimization_objectives"] = ["Value"]
    scenario["input_parameters"] = {}
    scenario["output_data_file"] = output_path
    scenario["design_of_experiment"] = {}

    # doe: nbr of random samples
    # BO: nbr op BO iters. BO = 0 means only random sampling
    scenario["optimization_iterations"] = BO
    scenario["design_of_experiment"]["doe_type"] = "random sampling"
    scenario["design_of_experiment"]["number_of_samples"] = RS

    
    #fil = [64, 64,128,254,512]
    #blk = [1,1,1,1]
    #layer_data = (fil,blk)
    
    # number of layers containing residual blocks
    n_layers = {}
    n_layers["parameter_type"] = "ordinal"
    if big:
        n_layers["values"] = [1, 2, 3, 4]
    else:
        n_layers["values"] = [1,2]
    
    # number of filters in the blocks in layer i and the initial conv
    n_filters0 = {}
    n_filters0["parameter_type"] = "ordinal"
    n_filters0["values"] = [2**i for i in range(1, int(math.log(64, 2)) + 1)] # 2-64

    n_filters1 = {}
    n_filters1["parameter_type"] = "ordinal"
    n_filters1["values"] = [2**i for i in range(1, int(math.log(64, 2)) + 1)] # 2-64
    
    n_filters2 = {}
    n_filters2["parameter_type"] = "ordinal"
    n_filters2["values"] = [2**i for i in range(1, int(math.log(128, 2)) + 1)] # 2-128
    
    if big:
        n_filters3 = {}
        n_filters3["parameter_type"] = "ordinal"
        n_filters3["values"] = [2**i for i in range(1, int(math.log(256, 2)) + 1)] # 2-256

        n_filters4 = {}
        n_filters4["parameter_type"] = "ordinal"
        n_filters4["values"] = [2**i for i in range(1, int(math.log(512, 2)) + 1)] # 2-512

    # number of blocks in layer i
    n_blocks1 = {}
    n_blocks1["parameter_type"] = "ordinal"
    n_blocks1["values"] = [1,2]
    
    n_blocks2 = {}
    n_blocks2["parameter_type"] = "ordinal"
    n_blocks2["values"] = [1,2]
    
    if big:
        n_blocks3 = {}
        n_blocks3["parameter_type"] = "ordinal"
        n_blocks3["values"] = [1,2]

        n_blocks4 = {}
        n_blocks4["parameter_type"] = "ordinal"
        n_blocks4["values"] = [1,2]

    # This is the parameters that will be optimized
    
    # number of layers containing residual blocks
    scenario["input_parameters"]["n_layers"] = n_layers

    # number of filters in the blocks in layer i and the initial conv
    scenario["input_parameters"]["n_filters0"] = n_filters0
    scenario["input_parameters"]["n_filters1"] = n_filters1
    scenario["input_parameters"]["n_filters2"] = n_filters2
    if big:
        scenario["input_parameters"]["n_filters3"] = n_filters3
        scenario["input_parameters"]["n_filters4"] = n_filters4

    # number of blocks in layer i
    scenario["input_parameters"]["n_blocks1"] = n_blocks1
    scenario["input_parameters"]["n_blocks2"] = n_blocks2
    if big:
        scenario["input_parameters"]["n_blocks3"] = n_blocks3
        scenario["input_parameters"]["n_blocks4"] = n_blocks4


    %cd
    # different paths for differnt computers
    # %cd "PycharmProjects/samuel_nas/my_hypermapper/example_scenarios/quick_start"
    %cd "Documents/kurser/exjobb/samuel_nas/my_hypermapper/example_scenarios/quick_start"
    with open("example_resnet_scenario.json", "w") as scenario_file:
        json.dump(scenario, scenario_file, indent=4)
"""

'\ndef create_scenario(RS, BO=0, big = False, output_path = "stupid.csv"):\n    scenario = {}\n    scenario["application_name"] = "resnet"\n    scenario["optimization_objectives"] = ["Value"]\n    scenario["input_parameters"] = {}\n    scenario["output_data_file"] = output_path\n    scenario["design_of_experiment"] = {}\n\n    # doe: nbr of random samples\n    # BO: nbr op BO iters. BO = 0 means only random sampling\n    scenario["optimization_iterations"] = BO\n    scenario["design_of_experiment"]["doe_type"] = "random sampling"\n    scenario["design_of_experiment"]["number_of_samples"] = RS\n\n    \n    #fil = [64, 64,128,254,512]\n    #blk = [1,1,1,1]\n    #layer_data = (fil,blk)\n    \n    # number of layers containing residual blocks\n    n_layers = {}\n    n_layers["parameter_type"] = "ordinal"\n    if big:\n        n_layers["values"] = [1, 2, 3, 4]\n    else:\n        n_layers["values"] = [1,2]\n    \n    # number of filters in the blocks in layer i and the initial conv\n    n_f

## Trains a network and returns validation performance

In [7]:
def trainer(network, train_data, test_data, epochs=1):
    """
    Train `network` with Adam and cross entropy, then score it on `test_data`.

    :param network: the module to train; it is updated in place.
    :param train_data: iterable of (inputs, labels) batches used for training.
    :param test_data: iterable of (images, labels) batches used for validation.
    :param epochs: number of passes over `train_data`.
    :return: total / correct, i.e. the inverse accuracy (lower is better);
        float('inf') when no validation sample is classified correctly.
    """
    # Always uses cross entropy as loss function
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(network.parameters())

    # Remember the caller's mode so it can be restored after validation.
    was_training = network.training
    network.train()

    t0 = time.perf_counter()
    for epoch in range(epochs):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(train_data):
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = network(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 100 == 99:    # print every 100 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 100))
                running_loss = 0.0

    print('Finished Training it took ', (time.perf_counter() - t0)/60, ' minutes to train')

    # Validates performance on unseen data. Evaluation mode makes layers such
    # as BatchNorm use their running statistics and stops them from being
    # updated with validation batches.
    network.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_data:
            outputs = network(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    network.train(was_training)

    # Zero correct predictions would divide by zero; report the worst
    # possible score for a minimizer instead.
    if correct == 0:
        return float('inf')

    return total / correct # inverse accuracy for minimization

## Objective function for HyperMapper to optimize

In [8]:
def ResNet_function(X):
    """
    Objective function handed to HyperMapper.

    Builds the network described by X and pushes one training batch through it.
    NOTE: the call to `trainer` is currently disabled, so the returned value is
    the constant placeholder 1 and not a measured inverse accuracy.

    :param X: dictionary containing the hyperparameters describing a network
        (keys 'n_filters', 'filter_upd', 'n_blocks1'..'n_blocks4', 'conv0',
        'pool' and 'reduce').
    :return: the value to minimize (currently always 1).
    """

    # Do the proper preprocessing they do in section 4.2 in resnetpaper
    batch_size_train = 128
    batch_size_test = 1000
    size = (32,32)     # ResNet is made for 224, Mnist is 28, Cifar-10 is 32
    train_loader, test_loader = get_data_loaders(batch_size_train, batch_size_test, size=size)

    # One block count per residual layer; a count of 0 disables that layer.
    blocks = [X['n_blocks' + str(idx + 1)] for idx in range(4)]

    my_net = json2ResNet(BasicBlock, X['n_filters'], X['filter_upd'], blocks,
                         kernel_size=X['conv0'], pool=X['pool'], reduce=X['reduce'])
    # print(my_net)
    # print('parmas: ', count_params(my_net))
    # loss = trainer(my_net, train_loader, test_loader, epochs=1)

    # Smoke test: a single forward pass checks that the generated architecture
    # is consistent. The built-in next() works on every iterator, whereas the
    # .next() method is not available on current DataLoader iterators.
    images, labels = next(iter(train_loader))
    with torch.no_grad():  # no backward pass follows, so no graph is needed
        outputs = my_net(images)
    loss = 1

    print('accuracy: ', 1/loss)
    return loss

## Basically the main method. Optimizes the given function based on the given scenario. 

### Stores the results in the CSV file named by `op`

In [9]:
# Creates a problem scenario and saves it as example_resnet_scenario2.json
# rs: number of random samples, bo: number of optimization iterations (0 = random sampling only)
rs = 10
bo = 0
# op is stored in the scenario as "output_data_file"
op = "stupid.csv"
#op = 'mnist_BO_10+40.csv'
create_scenario2(rs, BO = bo, big=False, output_path=op)

# It is a bit messy when I jump between folders. I must admit
# NOTE(review): hypermapper is imported right after changing into the scripts folder,
# presumably so that the import resolves from the working directory - confirm.
%cd
# %cd "PycharmProjects/samuel_nas/my_hypermapper/scripts"
%cd "Documents/kurser/exjobb/samuel_nas/my_hypermapper/scripts"
import hypermapper
# Step back up one level; parameters_file below is a relative path
%cd ".."
parameters_file = "example_scenarios/quick_start/example_resnet_scenario2.json"

t_start = time.perf_counter()

# HyperMapper runs the optimization procedure with ResNet_function as objective and parameters_file as Search Space
hypermapper.optimize(parameters_file, ResNet_function)
print('this entire procedure took ', (time.perf_counter() - t_start) / 60, 'minutes')


/Users/samuel
/Users/samuel/Documents/kurser/exjobb/samuel_nas/my_hypermapper/example_scenarios/quick_start
/Users/samuel
/Users/samuel/Documents/kurser/exjobb/samuel_nas/my_hypermapper/scripts
/Users/samuel/Documents/kurser/exjobb/samuel_nas/my_hypermapper
Design of experiment phase, number of doe samples = 10 .......




accuracy:  1.0
accuracy:  1.0
accuracy:  1.0
accuracy:  1.0
accuracy:  1.0
accuracy:  1.0
accuracy:  1.0
accuracy:  1.0
accuracy:  1.0
accuracy:  1.0
n_filters,filter_upd,n_blocks1,n_blocks2,n_blocks3,n_blocks4,conv0,pool,reduce,Value,Timestamp
4,1.5,3,1,7,9,1,0,0,1,859
2,0.5001,1,0,0,18,0,1,0,1,1434
2,1.5,0,3,7,7,1,0,0,1,1973
12,0.5001,9,18,18,1,1,1,0,1,2593
2,0.5001,7,2,18,3,0,0,0,1,3413
2,0.5001,1,9,5,9,0,0,0,1,4007
24,1,18,18,18,9,0,1,0,1,5813
32,2,7,1,18,18,0,1,1,1,8998
4,1,18,1,9,2,0,0,0,1,10115
8,1.5,7,1,3,18,0,1,1,1,10723


End of doe phase, the number of new configuration runs is: 10

End of Random Scalarizations
### End of the hypermapper script.
this entire procedure took  0.17973203461539622 minutes
