In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms

from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from IPython.display import display, clear_output
import pandas as pd
import time
import json

from itertools import product
from collections import namedtuple
from collections import OrderedDict
from tqdm import tqdm
import matplotlib.pyplot as plt
import math

In [2]:
# TODO: sequential models, batch normalization

Data standardization is a specific type of normalization technique. It is sometimes referred to as z-score normalization. The z-score, a.k.a. standard score, is the transformed value for each data point.

To normalize a dataset using standardization, we take every value
inside the dataset and transform it to its corresponding value using the following formula:

z = (x - mean)/std

After performing this computation on every value inside our dataset, we have a new normalized dataset of values. The mean and standard deviation values are with respect to the dataset as a whole. 

<i>It's important to note that when we normalize a dataset, we typically group these operations by feature. This means that the mean and standard deviation values are relative to each feature set that's being normalized. If we are working with images, the features are the RGB color channels, so we normalize each color channel with respect to the mean and standard deviation values calculated across all pixels in every image for the respective color channel. In our case we only need to
normalize a single color channel.</i>

In [3]:
# Normalization usually happens during the extract and transform stages of the ETL
# process. torchvision exposes it as a transform that takes per-channel statistics:
# torchvision.transforms.Normalize(
#       [meanOfChannel1, meanOfChannel2, meanOfChannel3]
#     , [stdOfChannel1, stdOfChannel2, stdOfChannel3]
# )
# We do not yet know the mean and std of the channel we are working with, so we
# first load the un-normalized training set and calculate them from the data.

train_set = torchvision.datasets.FashionMNIST(
    root='/home/slabban/machine_learning_courses/datasets',
    train=True,
    download=True,
    transform=transforms.Compose([transforms.ToTensor()]),
)

Moving forward we will start using the 'num_workers' argument in our dataloaders to increase the speed of our training runs.
In a nutshell, 'num_workers' specifies the number of subprocesses that can be used to read the data from disk while the main process runs.
In the deeplizard course, the biggest improvement came when 1 worker was added, with diminishing returns as the number increased.

This could be different for other cases, but we will stick to 1 num worker for the time being.

In [4]:
# When the whole dataset is small enough for our computer to handle in one go,
# we can load it as a single batch and let torch compute the statistics directly:

loader = DataLoader(dataset=train_set, batch_size=len(train_set), num_workers=1)
images, labels = next(iter(loader))
(images.mean(), images.std())

(tensor(0.2860), tensor(0.3530))

In [5]:
# Often we will be dealing with huge datasets. To handle that case we split the
# set into batches and implement the mean and std formulas ourselves.

loader = DataLoader(dataset=train_set, batch_size=1000, num_workers=1)

num_of_pixels = len(train_set) * 28 * 28

# First pass over the data: add up every pixel value to get the mean.
total_sum = 0
for images, labels in loader:
    total_sum = total_sum + images.sum()
mean = total_sum / num_of_pixels

# Second pass over the data: add up the squared deviations from that mean.
sum_of_squared_error = 0
for images, labels in loader:
    sum_of_squared_error = sum_of_squared_error + (images - mean).pow(2).sum()
std = torch.sqrt(sum_of_squared_error / num_of_pixels)

(mean, std)

(tensor(0.2860), tensor(0.3530))

In [6]:
# Let's now include normalization in our extract and transform steps, using the
# mean and std calculated from the data:

train_set = torchvision.datasets.FashionMNIST(
    root='/home/slabban/machine_learning_courses/datasets',
    train=True,
    download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ]),
)

In [7]:
# After normalization, the mean and std of the dataset come out as (approximately)
# 0 and 1 respectively.

loader = DataLoader(dataset=train_set, batch_size=len(train_set), num_workers=1)
data = next(iter(loader))
(data[0].mean(), data[0].std())

(tensor(-9.3774e-08), tensor(1.))


PyTorch allows us to seamlessly move data to and from our GPU as we perform computations inside our programs.

When we go to the GPU, we can use the cuda() method, and when we go to the CPU, we can use the cpu() method.

We can also use the to() method. To go to the GPU, we write to('cuda') and to go to the CPU, we write to('cpu'). The to() method is the preferred way mainly because it is more flexible. We'll see one example using the first two, and then we'll default to always using the to() variant.

CPU 	GPU
cpu() 	cuda()
to('cpu') 	to('cuda')

To make use of our GPU during the training process, there are two essential requirements. These requirements are as follows, the data must be moved to the GPU, and the network must be moved to the GPU.

    Data on the GPU
    Network on the GPU

In [8]:
# Let's pull in the very familiar RunManager implementation.
# We are going to add a simple change at the 'add_graph' call of the tensorboard 'SummaryWriter' instance that will make our
# RunManager class device agnostic. We use the getattr() built-in function to get the value of the device on the run object.
# If the run object doesn't have a device, then 'cpu' is returned. This makes the code backward compatible:
# it will still work if we don't specify a device for our run.

# I will also add some flexibility to the class to allow us to disable tensorboard for file management.

class RunManager():
    """Collects per-epoch training statistics across runs, with optional TensorBoard logging.

    Intended call order, as used by the training loop in this notebook:
    ``begin_run`` -> (``begin_epoch`` -> ``track_loss`` / ``track_num_correct``
    per batch -> ``end_epoch``) per epoch -> ``end_run``; ``save`` at the end.

    Args:
        tensorboard: when True, a ``SummaryWriter`` is created per run and
            images, the model graph, scalars and histograms are written to it.
            When False (default) nothing is written to disk by the run methods.
    """

    def __init__(self, tensorboard=False):
        # TODO: extract epoch && run variables into individual classes
        self.epoch_count = 0
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        self.epoch_start_time = None

        self.run_params = None
        self.run_count = 0
        self.run_data = []
        self.run_start_time = None

        self.network = None
        self.loader = None

        self.istb = tensorboard
        self.tb = None

        self.tqdm_epoch = None

    def begin_run(self, run, network, loader):
        """Start timing a new run and remember its parameters, network and loader.

        Args:
            run: the run's hyper-parameters; ``end_epoch`` calls ``run._asdict()``
                on it, so it is expected to be a namedtuple-like object.
            network: the model being trained.
            loader: the data loader that ``begin_epoch`` wraps in a progress bar.
        """
        self.run_start_time = time.time()
        self.run_params = run
        self.run_count += 1

        self.network = network
        self.loader = loader

        if self.istb:
            # A sample batch is only needed for the TensorBoard image grid and
            # graph, so don't create a loader iterator (and any worker
            # processes behind it) when TensorBoard is disabled.
            images, labels = next(iter(self.loader))
            grid = torchvision.utils.make_grid(images)
            self._create_tb(run, grid, images)

    def _create_tb(self, run, grid, images):
        """Open a SummaryWriter for this run and log the sample images and model graph."""
        self.tb = SummaryWriter(comment=f'-{run}')
        self.tb.add_image('images', grid)
        # Fall back to 'cpu' when the run has no `device` field, so runs defined
        # without a device keep working.
        self.tb.add_graph(self.network, images.to(getattr(run, 'device', 'cpu')))

    def end_run(self):
        """Close the TensorBoard writer (if any) and reset the epoch counter."""
        if self.istb:
            self._close_tb()
        self.epoch_count = 0

    def _close_tb(self):
        """Close the current writer; a no-op when no writer was ever opened."""
        if self.tb is not None:
            self.tb.close()
            self.tb = None

    def begin_epoch(self):
        """Reset the per-epoch accumulators and create a fresh progress bar over the loader."""
        self.epoch_start_time = time.time()
        self.epoch_count += 1
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        self.tqdm_epoch = tqdm(self.loader, unit="batch")

    def end_epoch(self):
        """Compute this epoch's loss/accuracy, record them, and display the results table."""
        epoch_duration = time.time() - self.epoch_start_time
        run_duration = time.time() - self.run_start_time

        # Both metrics are averaged over the full dataset size.
        num_samples = len(self.loader.dataset)
        loss = self.epoch_loss / num_samples
        accuracy = self.epoch_num_correct / num_samples

        if self.istb:
            self._plot_tb(loss, accuracy)

        results = OrderedDict()
        results["run"] = self.run_count
        results["epoch"] = self.epoch_count
        results['loss'] = loss
        results["accuracy"] = accuracy
        results['epoch duration'] = epoch_duration
        results['run duration'] = run_duration
        # Append the run's hyper-parameters as extra columns.
        for key, val in self.run_params._asdict().items():
            results[key] = val
        self.run_data.append(results)

        df = pd.DataFrame.from_dict(self.run_data, orient='columns')

        # Replace the previously displayed table instead of stacking new ones.
        clear_output(wait=True)
        display(df)

    def _plot_tb(self, loss, accuracy):
        """Write the epoch's scalars plus weight/gradient histograms to TensorBoard."""
        self.tb.add_scalar('Loss', loss, self.epoch_count)
        self.tb.add_scalar('Accuracy', accuracy, self.epoch_count)

        for name, param in self.network.named_parameters():
            self.tb.add_histogram(name, param, self.epoch_count)
            # Parameters that did not receive a gradient (e.g. frozen layers)
            # have `grad is None`, which cannot be turned into a histogram.
            if param.grad is not None:
                self.tb.add_histogram(f'{name}.grad', param.grad, self.epoch_count)

    def track_loss(self, loss, batch):
        """Accumulate the batch loss, weighted by the number of samples in ``batch[0]``."""
        self.epoch_loss += loss.item() * batch[0].shape[0]

    @torch.no_grad()
    def track_num_correct(self, preds, labels):
        """Accumulate the number of correct predictions for the current batch."""
        self.epoch_num_correct += self._get_num_correct(preds, labels)

    def _get_num_correct(self, preds, labels):
        """Return how many rows of ``preds`` have their argmax (dim=1) equal to ``labels``."""
        return preds.argmax(dim=1).eq(labels).sum().item()

    def save(self, fileName):
        """Write the collected results to ``<fileName>.csv`` and ``<fileName>.json``."""
        pd.DataFrame.from_dict(
            self.run_data, orient='columns'
        ).to_csv(f'{fileName}.csv')

        with open(f'{fileName}.json', 'w', encoding='utf-8') as f:
            json.dump(self.run_data, f, ensure_ascii=False, indent=4)
    
        

In [9]:
# Lets now also pull in our run builder class
class RunBuilder():
    """Expands a dictionary of hyper-parameter value lists into individual runs."""

    @staticmethod
    def get_runs(params):
        """Return one ``Run`` per combination of the given parameter values.

        Args:
            params: ordered mapping of parameter name -> iterable of values to
                try, e.g. ``OrderedDict(lr=[.01, .001], batch_size=[100])``.

        Returns:
            A list of ``Run`` namedtuples, one per element of the Cartesian
            product of the value lists, with fields named after the keys of
            ``params`` (so values are read as ``run.lr``, ``run.batch_size``, ...).
        """
        # This has to be a namedtuple (not an OrderedDict): each run is built
        # positionally with Run(*vals), read by attribute, and converted with
        # _asdict() by RunManager.end_epoch.
        Run = namedtuple('Run', params.keys())

        return [Run(*vals) for vals in product(*params.values())]

In [10]:
# Lets also pull in our previous Network

# Lets build on our Network class by implementing the 'forward' method, which accepts and returns a tensor
# We dont actually call this method ourselves as it is called via the __call__ function in our instantiated layers
class Network(nn.Module):
    """Small CNN: two conv + max-pool stages followed by three linear layers.

    The flattened size of 12*4*4 matches single-channel 28x28 inputs, which is
    the image shape used throughout this notebook.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)

        self.fc1 = nn.Linear(in_features=12 * 4 * 4, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=60)
        self.out = nn.Linear(in_features=60, out_features=10)

    def forward(self, t):
        """Map a batch of images to a batch of 10 un-normalized class scores."""
        # Convolutional stages: conv -> relu -> 2x2 max-pool, twice.
        t = F.max_pool2d(F.relu(self.conv1(t)), kernel_size=2, stride=2)
        t = F.max_pool2d(F.relu(self.conv2(t)), kernel_size=2, stride=2)

        # Flatten each sample before the fully connected layers.
        t = t.reshape(-1, 12 * 4 * 4)
        t = F.relu(self.fc1(t))
        t = F.relu(self.fc2(t))

        # No activation on the output layer: the raw scores are returned.
        return self.out(t)

In [11]:
# Here is the updated training loop with our normalized data and GPU support:
# the network is moved to the run's device once, and each batch of images and
# labels is moved to that same device before the forward pass.

parameters = OrderedDict(
    lr = [.01]
    ,batch_size = [1000]
    , num_workers = [1]
    # Only sweep over 'cuda' when a CUDA device is actually available, so the
    # notebook still runs end-to-end on a CPU-only machine.
    , device = ['cuda', 'cpu'] if torch.cuda.is_available() else ['cpu']
    , shuffle = [True, False]
    , epochs = [3]
)

manager = RunManager(tensorboard=False)

for run in RunBuilder.get_runs(parameters):
    device = torch.device(run.device)
    network = Network().to(device)
    # Pass num_workers through to the loader so that the value recorded in the
    # results table is the value that was actually used for the run.
    train_loader = DataLoader(
        train_set,
        batch_size=run.batch_size,
        shuffle=run.shuffle,
        num_workers=run.num_workers,
    )
    optimizer = optim.Adam(network.parameters(), lr=run.lr)

    total_epochs = range(run.epochs)

    manager.begin_run(run, network, train_loader)
    for epoch in total_epochs:
        manager.begin_epoch()
        # The description is constant for the whole epoch, so set it once
        # rather than on every batch.
        manager.tqdm_epoch.set_description(f"Epoch {manager.epoch_count} of {run.epochs}")

        for batch in manager.tqdm_epoch:
            images = batch[0].to(device)
            labels = batch[1].to(device)

            preds = network(images)
            loss = F.cross_entropy(preds, labels)
            optimizer.zero_grad()
            loss.backward() # calculate gradients
            optimizer.step() # update weights

            manager.track_loss(loss, batch)
            manager.track_num_correct(preds, labels)
        manager.end_epoch()
    manager.end_run()

# Commenting out to prevent file overcrowding
#manager.save('results')

Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size,num_workers,device,shuffle,epochs
0,1,1,0.913796,0.65795,2.734332,2.780869,0.01,1000,1,cuda,True,3
1,1,2,0.489622,0.81395,2.257453,5.044611,0.01,1000,1,cuda,True,3
2,1,3,0.403654,0.85085,2.353287,7.402783,0.01,1000,1,cuda,True,3
3,2,1,0.824287,0.693233,2.29425,2.33308,0.01,1000,1,cuda,False,3
4,2,2,0.433806,0.8396,2.254014,4.591393,0.01,1000,1,cuda,False,3
5,2,3,0.372124,0.8613,2.821443,7.41697,0.01,1000,1,cuda,False,3
6,3,1,0.924748,0.649367,7.425347,7.559085,0.01,1000,1,cpu,True,3
7,3,2,0.467581,0.824517,5.560944,13.128758,0.01,1000,1,cpu,True,3
8,3,3,0.381183,0.857817,2.306813,15.441303,0.01,1000,1,cpu,True,3
9,4,1,0.887272,0.663933,2.267377,2.305693,0.01,1000,1,cpu,False,3


### Sequential Models

The Sequential class allows us to build PyTorch neural networks on-the-fly without having to build an explicit class. This makes it much easier to rapidly build networks and allows us to skip over the step where we implement the forward() method. When we use the sequential way of building a PyTorch network, we construct the forward() method implicitly by defining our network's architecture sequentially.

A sequential module is a container or wrapper class that extends the nn.Module base class and allows us to compose modules together. We can compose any nn.Module within any other nn.Module.

We will go over the three ways that we can create and modify these sequential models. To keep the examples short, we will first use a simple network made up of 2 fully connected layers, and then convert our current network to a sequential model.

In [12]:
# Let's re-instantiate our normalized train set
# NOTE(review): this root differs from the one used by the earlier cells
# ('/home/slabban/machine_learning_courses/datasets') - confirm which is intended.

train_set = torchvision.datasets.FashionMNIST(
    root='/home/slabban/machine_learning/machine_learning_courses/datasets'
    ,train=True
    ,download=True
    ,transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std)])
)

# Grab a single sample: `image` is used by the following cell to size the
# input layer, and its shape is this cell's output.
image, label = train_set[0]
image.shape



torch.Size([1, 28, 28])

In [16]:
# Let's define the sizes of our two layers: the input layer takes one value per
# pixel, the hidden layer halves that count, and the final output layer has one
# unit per class label.
in_features = image.numel()
out_features = in_features // 2
out_classes = len(train_set.classes)

In [18]:
# Method 1: hand the layers straight to the Sequential constructor, in order.
network1 = nn.Sequential(
    nn.Flatten(start_dim=1),
    nn.Linear(in_features=in_features, out_features=out_features),
    nn.Linear(in_features=out_features, out_features=out_classes),
)
network1

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=392, bias=True)
  (2): Linear(in_features=392, out_features=10, bias=True)
)

In [20]:
# Method 2: create an ordered dictionary where each key labels a layer and each
# value is the nn layer itself. This gives us labelled indices and some
# modularity, i.e. we can now pass different types of models to the run builder!

layers = OrderedDict(
    flat=nn.Flatten(start_dim=1),
    hidden=nn.Linear(in_features=in_features, out_features=out_features),
    output=nn.Linear(in_features=out_features, out_features=out_classes),
)

network2 = nn.Sequential(layers)
network2

Sequential(
  (flat): Flatten(start_dim=1, end_dim=-1)
  (hidden): Linear(in_features=784, out_features=392, bias=True)
  (output): Linear(in_features=392, out_features=10, bias=True)
)

In [21]:
# Method 3: start from an empty Sequential and attach named nn.Module instances
# with add_module() after the network has already been initialized.

network3 = nn.Sequential()
network3.add_module(name='flat', module=nn.Flatten(start_dim=1))
network3.add_module(name='hidden', module=nn.Linear(in_features=in_features, out_features=out_features))
network3.add_module(name='output', module=nn.Linear(in_features=out_features, out_features=out_classes))
network3

Sequential(
  (flat): Flatten(start_dim=1, end_dim=-1)
  (hidden): Linear(in_features=784, out_features=392, bias=True)
  (output): Linear(in_features=392, out_features=10, bias=True)
)

In [22]:
# Lets quickly pull our infamous network down here for some ease of visibility
class Network(nn.Module):
    """The class-based CNN, repeated here for side-by-side comparison.

    Two conv -> relu -> max-pool stages feed three linear layers; the final
    layer returns 10 raw class scores.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)

        self.fc1 = nn.Linear(in_features=12 * 4 * 4, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=60)
        self.out = nn.Linear(in_features=60, out_features=10)

    def forward(self, t):
        """Run the forward pass and return the un-normalized class scores."""
        # Stage 1: conv1 -> relu -> 2x2 max-pool
        t = F.max_pool2d(F.relu(self.conv1(t)), kernel_size=2, stride=2)
        # Stage 2: conv2 -> relu -> 2x2 max-pool
        t = F.max_pool2d(F.relu(self.conv2(t)), kernel_size=2, stride=2)

        # Flatten to (batch, 12*4*4) for the linear layers.
        t = t.reshape(-1, 12 * 4 * 4)
        t = F.relu(self.fc1(t))
        t = F.relu(self.fc2(t))

        return self.out(t)

In [23]:
# Using method 1, let's create the sequential model equivalent to the Network class.
# Each functional call in Network.forward becomes its own module here.

sequential = nn.Sequential(
    # conv stage 1
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # conv stage 2
    nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # flatten per sample (takes the place of the reshape in forward)
    nn.Flatten(start_dim=1),
    # fully connected layers
    nn.Linear(in_features=12 * 4 * 4, out_features=120),
    nn.ReLU(),
    nn.Linear(in_features=120, out_features=60),
    nn.ReLU(),
    nn.Linear(in_features=60, out_features=10),
)

A pretty handy tool for controlling the randomness of our weight initialization is the 'torch.manual_seed' method, which allows us
to set the value of the seed that our random number generator uses.

It is important to note that this method has to be called before each network is initialized (before the layers are declared).

In [24]:
# Seed the random number generator with 50 right before the layers are created,
# so the sequential model's initial weights are reproducible.
torch.manual_seed(50)
sequential = nn.Sequential(
    # conv stage 1
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # conv stage 2
    nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # flatten per sample
    nn.Flatten(start_dim=1),
    # fully connected layers
    nn.Linear(in_features=12 * 4 * 4, out_features=120),
    nn.ReLU(),
    nn.Linear(in_features=120, out_features=60),
    nn.ReLU(),
    nn.Linear(in_features=60, out_features=10),
)