# CNN Training Loop Refactoring (Simultaneous Hyperparameter Testing)

When we [last](https://saptarshidatta.in/2020/10/06/PyTorch_CIFAR10_TB.html) trained our network, we built out quite a lot of functionality that allowed us to experiment with many different parameters and values, and we also made the calls needed inside our training loop to get our results into TensorBoard.

All of this work has helped, but our training loop is quite crowded now. In this exercise, we're going to clean up our training loop and set the stage for more experimentation by using the `RunBuilder` class that we built last time and by building a new class called `RunManager`.

I also find this way of hyperparameter tuning more intuitive than TensorBoard. Also, as our number of parameters and runs gets larger, TensorBoard will start to break down as a viable solution for reviewing our results.

However, calls to TensorBoard have been made inside our `RunManager` class, so it can be used as an added functionality. For reference on how to use TensorBoard with PyTorch inside Google Colab, please refer [here](https://saptarshidatta.in/2020/10/06/PyTorch_CIFAR10_TB.html).

The code also generates results in `csv` and `json` format, which can be used for further analysis.

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms

from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from IPython.display import display, clear_output
import pandas as pd
import time
import json

from itertools import product
from collections import namedtuple
from collections import OrderedDict

## Designing the Neural Network

In [8]:
class Network(nn.Module):
    """Small LeNet-style CNN: two conv/pool stages followed by three linear layers.

    The flatten step hard-codes 16*5*5 features per sample, which is what the
    two conv(5x5) + max-pool(2x2) stages produce for 3x32x32 inputs.
    The final layer emits 10 raw scores (no softmax is applied here).
    """

    def __init__(self):
        super().__init__()

        # Convolutional feature extractor.
        self.conv1 = nn.Conv2d(3, 6, kernel_size=5)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5)

        # Fully connected classifier head.
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.out = nn.Linear(84, 10)

    def forward(self, t):
        """Run a batch of images through the network and return the class scores."""
        # Conv stage 1: conv -> relu -> 2x2 max-pool (6x14x14 per sample for 32x32 input).
        features = F.max_pool2d(F.relu(self.conv1(t)), kernel_size=2, stride=2)

        # Conv stage 2: conv -> relu -> 2x2 max-pool (16x5x5 per sample for 32x32 input).
        features = F.max_pool2d(F.relu(self.conv2(features)), kernel_size=2, stride=2)

        # Flatten each sample to a 400-element vector for the linear layers.
        flat = features.reshape(-1, 16 * 5 * 5)

        # Hidden layers: (batch, 120) then (batch, 84).
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))

        # Output layer: (batch, 10) unnormalised scores.
        return self.out(hidden)

## `RunBuilder` class

In [9]:
class RunBuilder:
    """Expands a hyperparameter grid into one ``Run`` tuple per combination."""

    @staticmethod
    def get_runs(params):
        """Return a list with one ``Run`` namedtuple per combination of values.

        ``params`` maps each hyperparameter name to an iterable of candidate
        values. The namedtuple fields follow the key order of ``params`` and
        the combinations follow the order produced by ``itertools.product``.
        """
        Run = namedtuple('Run', params.keys())
        return [Run(*combo) for combo in product(*params.values())]

## `RunManager` class

In [10]:
class RunManager():
    """Collects per-epoch metrics across training runs and reports them.

    Intended call order for each run:
    ``begin_run`` -> (``begin_epoch`` -> ``track_loss`` / ``track_num_correct``
    per batch -> ``end_epoch``) per epoch -> ``end_run``; ``save`` writes the
    accumulated results of every run to disk.

    Results are logged to TensorBoard, shown as a DataFrame in the notebook,
    and kept in ``run_data`` (one ordered dict per finished epoch).
    """

    def __init__(self):

        # Per-epoch accumulators; reset in begin_epoch().
        self.epoch_count = 0
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        self.epoch_start_time = None

        # Per-run state; run_data keeps growing across all runs.
        self.run_params = None
        self.run_count = 0
        self.run_data = []
        self.run_start_time = None

        # Objects belonging to the run currently in progress.
        self.network = None
        self.loader = None
        self.tb = None

    def begin_run(self, run, network, loader):
        """Start a new run: store its objects and open a TensorBoard writer.

        Args:
            run: namedtuple of hyperparameters for this run; its string form is
                used as the writer's ``comment``. An optional ``device`` field
                selects where the sample batch is moved for graph tracing.
            network: the model trained in this run.
            loader: the data loader iterated in this run.
        """
        self.run_start_time = time.time()

        self.run_params = run
        self.run_count += 1

        self.network = network
        self.loader = loader
        self.tb = SummaryWriter(comment=f'-{run}')

        # Log one sample batch as an image grid and use it to trace the graph.
        images, labels = next(iter(self.loader))
        grid = torchvision.utils.make_grid(images)

        self.tb.add_image('images', grid)
        self.tb.add_graph(
            self.network,
            images.to(getattr(run, 'device', 'cpu')),
        )

    def end_run(self):
        """Close the TensorBoard writer and reset the epoch counter."""
        self.tb.close()
        self.epoch_count = 0

    def begin_epoch(self):
        """Start timing a new epoch and reset the epoch accumulators."""
        self.epoch_start_time = time.time()

        self.epoch_count += 1
        self.epoch_loss = 0
        self.epoch_num_correct = 0

    def end_epoch(self):
        """Summarise the epoch: log to TensorBoard, record a row, redraw the table."""
        # One clock reading so both durations refer to the same instant.
        now = time.time()
        epoch_duration = now - self.epoch_start_time
        run_duration = now - self.run_start_time

        # Both metrics are averaged over the full dataset size.
        dataset_size = len(self.loader.dataset)
        loss = self.epoch_loss / dataset_size
        accuracy = self.epoch_num_correct / dataset_size

        self.tb.add_scalar('Loss', loss, self.epoch_count)
        self.tb.add_scalar('Accuracy', accuracy, self.epoch_count)

        for tag, values in self._histogram_items():
            self.tb.add_histogram(tag, values, self.epoch_count)

        results = OrderedDict()
        results["run"] = self.run_count
        results["epoch"] = self.epoch_count
        results['loss'] = loss
        results["accuracy"] = accuracy
        results['epoch duration'] = epoch_duration
        results['run duration'] = run_duration
        # Append the run's hyperparameters as extra columns.
        results.update(self.run_params._asdict())
        self.run_data.append(results)

        # Replace the previously displayed table with the updated one.
        clear_output(wait=True)
        display(self._results_frame())

    def track_loss(self, loss, batch):
        """Add a batch loss to the epoch total, weighted by the batch size.

        ``loss`` is multiplied by the number of samples in ``batch[0]`` so that
        dividing by the dataset size in ``end_epoch`` gives a per-sample mean
        even when the last batch is smaller.
        """
        self.epoch_loss += loss.item() * batch[0].shape[0]

    def track_num_correct(self, preds, labels):
        """Add the number of correct predictions in this batch to the epoch total."""
        self.epoch_num_correct += self._get_num_correct(preds, labels)

    def _get_num_correct(self, preds, labels):
        """Return how many rows of ``preds`` have their arg-max equal to ``labels``."""
        return preds.argmax(dim=1).eq(labels).sum().item()

    def _histogram_items(self):
        """Yield ``(tag, tensor)`` pairs to log as histograms for the current network.

        Every named parameter is yielded; its gradient is yielded under
        ``'<name>.grad'`` only when one exists. Parameters that did not take
        part in a backward pass have ``grad`` set to ``None``, which cannot be
        logged as a histogram.
        """
        for name, param in self.network.named_parameters():
            yield name, param
            if param.grad is not None:
                yield f'{name}.grad', param.grad

    def _results_frame(self):
        """Build a DataFrame with one row per recorded epoch."""
        return pd.DataFrame.from_dict(self.run_data, orient='columns')

    def save(self, fileName):
        """Write all recorded results to ``<fileName>.csv`` and ``<fileName>.json``."""
        self._results_frame().to_csv(f'{fileName}.csv')

        with open(f'{fileName}.json', 'w', encoding='utf-8') as f:
            json.dump(self.run_data, f, ensure_ascii=False, indent=4)

## Loading the CIFAR-10 data and pre-processing

In [11]:
# CIFAR-10 training split, downloaded into ./data when it is not already there.
# ToTensor is the only pre-processing step applied to each image.
to_tensor = transforms.Compose([transforms.ToTensor()])
train_set = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=to_tensor,
)

Files already downloaded and verified


## Training the Neural Network

In [13]:
# Hyperparameter grid: RunBuilder produces one run per combination of values.
params = OrderedDict([
    ('lr', [.01, 0.001]),
    ('batch_size', [100, 1000]),
    ('shuffle', [True]),
])

m = RunManager()
for run in RunBuilder.get_runs(params):

    # Each run starts from a freshly initialised network and optimizer.
    network = Network()
    loader = DataLoader(
        train_set,
        batch_size=run.batch_size,
        shuffle=run.shuffle,
    )
    optimizer = optim.Adam(network.parameters(), lr=run.lr)

    m.begin_run(run, network, loader)
    for epoch in range(5):
        m.begin_epoch()
        for batch in loader:
            images, labels = batch

            # Clear gradients left over from the previous step.
            optimizer.zero_grad()
            # Forward pass and loss for this batch.
            preds = network(images)
            loss = F.cross_entropy(preds, labels)
            # Backward pass followed by the weight update.
            loss.backward()
            optimizer.step()

            # Feed the batch results into the epoch statistics.
            m.track_loss(loss, batch)
            m.track_num_correct(preds, labels)
        m.end_epoch()
    m.end_run()

# Persist the results of every run as results.csv and results.json.
m.save('results')

Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size,shuffle
0,1,1,1.766828,0.33092,16.010722,16.232718,0.01,100,True
1,1,2,1.519231,0.44906,16.379714,32.724761,0.01,100,True
2,1,3,1.456742,0.47634,16.447227,49.278241,0.01,100,True
3,1,4,1.433824,0.48748,16.522049,65.909568,0.01,100,True
4,1,5,1.387163,0.50962,16.549969,82.574042,0.01,100,True
5,2,1,2.154447,0.18388,14.927786,16.515787,0.01,1000,True
6,2,2,1.863616,0.30586,15.019241,31.637032,0.01,1000,True
7,2,3,1.653237,0.38942,14.963688,46.707006,0.01,1000,True
8,2,4,1.523028,0.44138,15.089503,61.910706,0.01,1000,True
9,2,5,1.445244,0.47474,15.259337,77.289368,0.01,1000,True
