In [4]:
# import libraries
import os
from collections import OrderedDict

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.tensorboard import SummaryWriter

from network import Network
from utils import device, get_num_correct, RunBuilder

In [5]:
# preprocessing pipeline: convert each image to a tensor, then normalize every
# channel with mean 0.5 and std 0.5
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# CIFAR-10 train split first, then the test split; both are downloaded to
# ./data/ if needed and share the same preprocessing
train_set, test_set = (
    torchvision.datasets.CIFAR10(
        root='./data/',
        train=is_train,
        download=True,
        transform=transform,
    )
    for is_train in (True, False)
)

# batches of the test set (iterated in dataset order, no shuffling requested)
test_loader = torch.utils.data.DataLoader(
    dataset=test_set,
    batch_size=64,
    num_workers=1,
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ./data/cifar-10-python.tar.gz to ./data/
Files already downloaded and verified


Before starting the training process, it is good practice to try to overfit a single batch of data. If the network can drive the loss on that one batch towards zero, we can be reasonably confident that the model and the training step are implemented correctly and that the network has enough capacity to be used for training.

In [6]:
# Sanity check: repeatedly fit ONE fixed batch of the training set.
# The batch size is defined once so the loader and the accuracy denominator
# can never drift apart.
overfit_batch_size = 32
loader = torch.utils.data.DataLoader(
    train_set, batch_size=overfit_batch_size, shuffle=True, num_workers=1
)

model = Network()  # initialize the NN
criterion = nn.CrossEntropyLoss()  # loss function (categorical cross-entropy)
optimizer = optim.Adam(model.parameters(), lr=0.01)  # specify the optimizer
images, labels = next(iter(loader))  # draw the single batch that is reused below

# NOTE: each "epoch" here is one optimization step on the same batch
for epoch in range(50):
    preds = model(images)  # forward pass
    loss = criterion(preds, labels)  # calculate loss
    optimizer.zero_grad()  # clear accumulated gradients from the previous pass
    loss.backward()  # backward pass
    optimizer.step()  # perform a single optimization step

    # number of correct predictions (made before this step's weight update)
    correct = get_num_correct(preds, labels)

    # print statistics; divide by the real batch length instead of a literal 32
    print(f'epoch: {epoch+1:2d}\tloss:{loss.item():2.4f}\tacc:{(correct/len(labels)):2.4f}')

epoch:  1	loss:2.3374	acc:0.0625
epoch:  2	loss:7.6343	acc:0.1875
epoch:  3	loss:6.3814	acc:0.1562
epoch:  4	loss:4.1736	acc:0.1250
epoch:  5	loss:2.8999	acc:0.1875
epoch:  6	loss:2.3920	acc:0.1875
epoch:  7	loss:2.1777	acc:0.1875
epoch:  8	loss:2.0633	acc:0.3750
epoch:  9	loss:1.9401	acc:0.3125
epoch: 10	loss:1.8869	acc:0.3438
epoch: 11	loss:1.9343	acc:0.3438
epoch: 12	loss:1.7509	acc:0.4062
epoch: 13	loss:1.7444	acc:0.3125
epoch: 14	loss:1.6313	acc:0.4688
epoch: 15	loss:1.6949	acc:0.3438
epoch: 16	loss:1.5455	acc:0.4062
epoch: 17	loss:1.4715	acc:0.4375
epoch: 18	loss:1.4672	acc:0.4688
epoch: 19	loss:1.2785	acc:0.5312
epoch: 20	loss:1.1871	acc:0.5938
epoch: 21	loss:1.2549	acc:0.4062
epoch: 22	loss:1.1221	acc:0.5312
epoch: 23	loss:0.8853	acc:0.6875
epoch: 24	loss:0.8454	acc:0.6875
epoch: 25	loss:0.7457	acc:0.7812
epoch: 26	loss:0.6501	acc:0.7500
epoch: 27	loss:0.7226	acc:0.6875
epoch: 28	loss:0.7279	acc:0.6875
epoch: 29	loss:0.5334	acc:0.8438
epoch: 30	loss:0.4670	acc:0.8438
epoch: 31	

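As we can see, the loss on this single batch keeps decreasing while the accuracy climbs, i.e. the model is able to overfit (memorise) the batch. This is a useful sanity check that the network and the training step are wired up correctly, so we can now continue with the actual training.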

In [6]:
valid_size = 0.2  # fraction of train_set held out for validation
split_seed = 0  # fixed seed: the same split after every kernel restart

# number of samples that go to the validation subset
num_train = len(train_set)
split = int(np.floor(valid_size * num_train))

# Shuffle the sample indices with a dedicated, seeded generator so that the
# train/validation split is reproducible and the global NumPy random state is
# left untouched. `.tolist()` keeps the indices as plain Python ints.
indices = np.random.default_rng(split_seed).permutation(num_train).tolist()
train_idx, valid_idx = indices[split:], indices[:split]

# define samplers for obtaining training and validation batches
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

In [7]:
# hyper-parameter search space: the training cell iterates over the cross
# product of these value lists (one run per combination)
params = OrderedDict(
    lr=[0.01, 0.003, 0.001],
    batch_size=[64, 128, 256, 512],
)

In [8]:
criterion = nn.CrossEntropyLoss()  # loss function (categorical cross-entropy)
num_epochs = 30  # number of epochs used for training each hyper-parameter combination

# torch.save does not create missing directories, so make sure the checkpoint folder exists
os.makedirs('./models', exist_ok=True)


def evaluate(net, loader):
    """Evaluate `net` on every batch of `loader` without tracking gradients.

    Side effect: `net` is switched to evaluation mode.

    Args:
        net: the model to evaluate (must already live on `device`).
        loader: an iterable of batches where batch[0] holds the inputs and
            batch[1] the labels.

    Returns:
        (mean_loss, accuracy): the per-sample average of `criterion` and
        `get_num_correct` summed over all batches divided by the number of
        samples seen (assumes `get_num_correct` returns a count - TODO confirm
        against utils).
    """
    net.eval()
    loss_sum, num_correct, num_seen = 0.0, 0, 0
    with torch.no_grad():
        for batch in loader:
            images, labels = batch[0].to(device), batch[1].to(device)
            preds = net(images)
            batch_len = labels.size(0)  # the last batch may be smaller than batch_size
            # criterion returns the batch mean, so weight it by the real batch length
            loss_sum += criterion(preds, labels).item() * batch_len
            num_correct += get_num_correct(preds, labels)
            num_seen += batch_len
    num_seen = max(num_seen, 1)  # avoid dividing by zero on an empty loader
    return loss_sum / num_seen, num_correct / num_seen


# iterate through the cross product of hyper-parameters defined in params
for run in RunBuilder.get_runs(params):
    print(f'\n{run}')
    model = Network().to(device)  # the network that is being trained
    # Separate instance used only to evaluate the best checkpoint on the test
    # set; loading the checkpoint into `model` itself would throw away the
    # progress of every epoch that did not improve the validation loss.
    best_model = Network().to(device)

    # load the train set
    train_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=run.batch_size,
        sampler=train_sampler,
        num_workers=1
    )
    # load the validation set
    valid_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=run.batch_size,
        sampler=valid_sampler,
        num_workers=1
    )
    optimizer = optim.Adam(model.parameters(), lr=run.lr)  # specify the optimizer

    comment = f'-{run}'  # will be used for naming the runs based on each run's hyper-parameters
    checkpoint_path = f'./models/model-{run}.ckpt'

    valid_loss_min = np.inf  # lowest validation loss seen so far in this run
    checkpoint_saved = False  # guards against loading a stale file from an earlier session

    # the context manager closes (and flushes) the writer even if training fails
    with SummaryWriter(comment=comment) as tb:
        for epoch in range(num_epochs):
            ###################
            # train the model #
            ###################
            train_loss_sum, train_correct, train_seen = 0.0, 0, 0
            model.train()  # set the model to train mode
            for batch in train_loader:
                images, labels = batch[0].to(device), batch[1].to(device)  # move the batch to cpu/gpu
                preds = model(images)  # forward pass
                loss = criterion(preds, labels)  # calculate loss
                optimizer.zero_grad()  # clear accumulated gradients from the previous pass
                loss.backward()  # backward pass
                optimizer.step()  # perform a single optimization step

                batch_len = labels.size(0)  # the last batch may be smaller than run.batch_size
                train_loss_sum += loss.item() * batch_len
                train_correct += get_num_correct(preds, labels)
                train_seen += batch_len

            # log per-sample averages so TensorBoard matches the printed values
            train_seen = max(train_seen, 1)
            train_loss = train_loss_sum / train_seen
            tb.add_scalar('Train Loss', train_loss, epoch)
            tb.add_scalar('Train Accuracy', train_correct / train_seen, epoch)

            ######################
            # validate the model #
            ######################
            valid_loss, valid_acc = evaluate(model, valid_loader)
            tb.add_scalar('Validation Loss', valid_loss, epoch)
            tb.add_scalar('Validation Accuracy', valid_acc, epoch)

            # print training/validation statistics
            print(f'Epoch {epoch+1:2d}: Training Loss: {train_loss:.6f} Validation Loss: {valid_loss:.6f}')

            # save model if validation loss has decreased
            if valid_loss <= valid_loss_min:
                print(f'\t  valid_loss decreased ({valid_loss_min:.6f} --> {valid_loss:.6f})  saving model...')
                torch.save(model.state_dict(), checkpoint_path)
                valid_loss_min = valid_loss
                checkpoint_saved = True

            ##################
            # test the model #
            ##################
            # evaluate the checkpoint with the least validation loss so far,
            # loaded into `best_model` so that `model` keeps training undisturbed
            if checkpoint_saved:
                best_model.load_state_dict(
                    torch.load(checkpoint_path, map_location=device)
                )
                test_loss, test_acc = evaluate(best_model, test_loader)
                tb.add_scalar('Test Loss', test_loss, epoch)
                tb.add_scalar('Test Accuracy', test_acc, epoch)

            # plot histograms of every parameter and its gradient to tensorboard
            # (helpful for checking if the model has a vanishing gradient problem)
            for name, weight in model.named_parameters():
                tb.add_histogram(name, weight, epoch)
                if weight.grad is not None:  # parameters without a gradient have nothing to plot
                    tb.add_histogram(f'{name}.grad', weight.grad, epoch)



run(lr=0.01, batch_size=64)
Epoch  1: Training Loss: 1.767838 Validation Loss: 1.437015
	  valid_loss decreased (inf --> 1.437015)  saving model...
Epoch  2: Training Loss: 1.430668 Validation Loss: 1.304457
	  valid_loss decreased (1.437015 --> 1.304457)  saving model...
Epoch  3: Training Loss: 1.287644 Validation Loss: 1.165301
	  valid_loss decreased (1.304457 --> 1.165301)  saving model...
Epoch  4: Training Loss: 1.204075 Validation Loss: 1.064403
	  valid_loss decreased (1.165301 --> 1.064403)  saving model...
Epoch  5: Training Loss: 1.136221 Validation Loss: 1.072901
Epoch  6: Training Loss: 1.138143 Validation Loss: 1.030662
	  valid_loss decreased (1.064403 --> 1.030662)  saving model...
Epoch  7: Training Loss: 1.091750 Validation Loss: 1.009098
	  valid_loss decreased (1.030662 --> 1.009098)  saving model...
Epoch  8: Training Loss: 1.045550 Validation Loss: 0.990777
	  valid_loss decreased (1.009098 --> 0.990777)  saving model...
Epoch  9: Training Loss: 1.016373 Validat