<a href="https://colab.research.google.com/github/s0phia-/AI_challenge_day/blob/main/nondeterminism_in_nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Implications of Non-Determinism in Neural Network Optimisation

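This notebook trains a small ResNet on CIFAR-10 and outlines experiments that isolate individual sources of non-determinism in training (parameter initialisation, data shuffling, data augmentation, stochastic regularisation and cuDNN). For each source it aims to report the variability of test accuracy and cross-entropy across runs, and the pairwise disagreement between the trained models.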

## Setup and Preparation

In [None]:
### Environment Setup ###

# Install the third-party packages used below into the kernel's environment.
# NOTE(review): no versions are pinned, so results may change between runs of
# this cell on different dates — consider pinning once versions are settled.
%pip install -q torch torchvision
%pip install matplotlib
%pip install numpy
%pip install tabulate

In [1]:
### Load Data ###

import torch
import torchvision
import torchvision.transforms as transforms


# Preprocessing applied to every image: convert to a tensor, then normalise
# each of the three channels with mean 0.5 and std 0.5 (imshow below undoes
# this with `img / 2 + 0.5`).
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# Mini-batch size shared by the train and test loaders.
batch_size = 4

# CIFAR-10 training split, downloaded into ./data when requested
# (download=True); batches are reshuffled by the loader (shuffle=True).
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=2)

# CIFAR-10 test split; iterated in a fixed order (shuffle=False).
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2)

# Human-readable class names, indexed by integer label (see visualise_model).
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
           'ship', 'truck')

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:01<00:00, 106MB/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [2]:
### Define ResNet model ###

import torch.nn as nn
import torch

# Choose the compute backend in order of preference: CUDA, then Apple MPS,
# falling back to the CPU when neither accelerator is available.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")


def conv3x3(in_channels, out_channels, stride=1):
  """Return a bias-free 3x3 ``nn.Conv2d`` that uses a padding of 1.

  Args:
    in_channels: Number of input channels.
    out_channels: Number of output channels.
    stride: Convolution stride (default 1).
  """
  conv_options = dict(kernel_size=3, stride=stride, padding=1, bias=False)
  return nn.Conv2d(in_channels, out_channels, **conv_options)


class BasicBlock(nn.Module):
  """Residual block made of two 3x3 convolutions, each followed by batch norm.

  The block computes ``relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut(x))``.
  Without ``downsample`` the shortcut is the input itself. With ``downsample``
  the shortcut is ``downsample(x)`` padded with zero-valued channels up to the
  channel count of the residual branch, so the two can be added.

  Args:
    inplanes: Number of channels of the block input.
    planes: Number of channels produced by both convolutions.
    stride: Stride of the first convolution (default 1).
    downsample: Optional module applied to the input on the shortcut path.
  """
  expansion = 1

  def __init__(self, inplanes, planes, stride=1, downsample=None):
    super(BasicBlock, self).__init__()
    # Both self.conv1 and self.downsample layers downsample the input when
    # stride != 1
    self.conv1 = conv3x3(inplanes, planes, stride)
    self.bn1 = nn.BatchNorm2d(planes)
    self.relu = nn.ReLU(inplace=True)
    self.conv2 = conv3x3(planes, planes)
    self.bn2 = nn.BatchNorm2d(planes)
    self.downsample = downsample

  def forward(self, x):
    """Apply the residual branch and add the (possibly padded) shortcut."""
    identity = x

    out = self.conv1(x)
    out = self.bn1(out)
    out = self.relu(out)

    out = self.conv2(out)
    out = self.bn2(out)

    if self.downsample is not None:
      identity = self.downsample(x)
      # Pad the shortcut with exactly as many zero channels as the residual
      # branch has in excess. Unconditionally doubling the channels only works
      # when planes == 2 * inplanes and breaks for any other stage widths
      # (including equal widths combined with stride 2).
      missing_channels = out.size(1) - identity.size(1)
      if missing_channels > 0:
        zero_channels = identity.new_zeros(
            (identity.size(0), missing_channels) + tuple(identity.shape[2:]))
        identity = torch.cat((identity, zero_channels), 1)

    out += identity
    out = self.relu(out)

    return out


class ResNet(nn.Module):
  """Residual network assembled from stages of ``block`` modules.

  The stem is a 3x3 convolution on a 3-channel input followed by batch norm
  and ReLU. One stage is then created per entry of ``layers`` /
  ``channels_per_block``; the first stage uses stride 1 and every later stage
  stride 2. The result is average-pooled to 1x1, flattened and passed through
  a linear layer with ``num_classes`` outputs.

  Args:
    block: Callable building one residual block. It is invoked as
      ``block(inplanes, planes, stride, downsample)`` for the first block of a
      stage and as ``block(planes, planes)`` for the remaining ones.
    layers: Number of blocks in each stage.
    channels_per_block: Number of channels of each stage; ``[16, 32, 64]``
      when omitted.
    num_classes: Number of outputs of the final linear layer.

  Raises:
    ValueError: If ``layers`` and ``channels_per_block`` differ in length.
  """

  def __init__(self, block, layers, channels_per_block=None, num_classes=10):
    super(ResNet, self).__init__()

    if channels_per_block is None:
      channels_per_block = [16, 32, 64]
    if len(layers) != len(channels_per_block):
      raise ValueError('number of layers and channels per block must be equal')
    self.num_layers = sum(layers)
    # Channel count fed into the next block; updated by _make_layer.
    self.inplanes = channels_per_block[0]
    self.conv1 = conv3x3(3, channels_per_block[0])
    self.bn1 = nn.BatchNorm2d(channels_per_block[0])
    self.relu = nn.ReLU(inplace=True)
    # Stages are kept in a plain list for iteration in forward() and are
    # additionally registered as submodules named 'layer0', 'layer1', ...
    self.layers = []
    for i, (num_channels, num_blocks) in enumerate(zip(channels_per_block,
                                                       layers)):
      self.layers.append(self._make_layer(block, num_channels, num_blocks,
                                          stride=(1 if i == 0 else 2)))
      self.add_module('layer%d' % i, self.layers[-1])

    self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
    self.fc = nn.Linear(channels_per_block[-1], num_classes)

    # Re-initialise convolutions (Kaiming normal, fan-out) and batch norms
    # (weight 1, bias 0), visiting modules sorted by name.
    # NOTE(review): presumably the sort fixes the order in which random
    # numbers are drawn for initialisation — confirm.
    for _, m in sorted(self.named_modules()):
      if isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
      elif isinstance(m, nn.BatchNorm2d):
        nn.init.constant_(m.weight, 1)
        nn.init.constant_(m.bias, 0)

    # Zero-initialize the last BN in each residual branch so that the residual
    # branch starts with zeros, and each residual block behaves like an
    # identity. This improves the model by 0.2~0.3% according to
    # https://arxiv.org/abs/1706.02677
    for _, m in sorted(self.named_modules()):
      if isinstance(m, BasicBlock):
        nn.init.constant_(m.bn2.weight, 0)

  def _make_layer(self, block, planes, blocks, stride=1):
    """Build one stage: an ``nn.Sequential`` of ``blocks`` residual blocks.

    Only the first block receives ``stride``. When ``stride != 1`` it also
    receives a shortcut module made of a 1x1 average pool with that stride
    followed by batch norm over the current ``self.inplanes`` channels.
    ``self.inplanes`` is set to ``planes`` once the first block is created.
    """
    downsample = None
    if stride != 1:
      downsample = nn.Sequential(
        nn.AvgPool2d(1, stride=stride),
        nn.BatchNorm2d(self.inplanes),
      )

    layers = []
    layers.append(block(self.inplanes, planes, stride, downsample))
    self.inplanes = planes
    for _ in range(1, blocks):
      layers.append(block(planes, planes))

    return nn.Sequential(*layers)

  def forward(self, x):
    """Run the network on ``x`` and return one row of scores per input.

    The output has shape ``(x.size(0), num_classes)``.
    NOTE(review): assumes ``x`` is a batch of 3-channel images, since the
    stem convolution takes 3 input channels — confirm against callers.
    """
    x = self.conv1(x)
    x = self.bn1(x)
    x = self.relu(x)

    for layer in self.layers:
      x = layer(x)

    x = self.avgpool(x)
    # Flatten the pooled (batch, channels, 1, 1) tensor to (batch, channels).
    x = x.view(x.size(0), -1)
    x = self.fc(x)

    return x

In [None]:
### Helper functions ###

import matplotlib.pyplot as plt
import numpy as np

# function to show an image
def imshow(img):
    """
    Shows a normalised image tensor with matplotlib.

    The tensor is expected in channel-first layout; it is un-normalised with
    ``img / 2 + 0.5`` and transposed to channel-last before plotting.
    """
    img = img / 2 + 0.5     # unnormalize
    # detach + cpu so tensors that live on an accelerator or are attached to
    # the autograd graph can be converted as well
    npimg = img.detach().cpu().numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()

def visualise_model(model, num_images=4):
    """
    Visualizes the model's predictions on a few test images.

    Takes the first batch of ``testloader``, shows up to ``num_images`` of its
    images in a grid and prints their ground-truth and predicted class names.
    ``num_images`` is capped at the batch size. The prediction runs in
    evaluation mode without gradient tracking, and the model's previous
    training/evaluation mode is restored afterwards.
    """

    # get test images
    images, labels = next(iter(testloader))

    # never index past the end of the batch
    num_images = min(num_images, images.size(0))
    images, labels = images[:num_images], labels[:num_images]

    # print images
    imshow(torchvision.utils.make_grid(images))

    # show ground truth labels
    print('GroundTruth: ', ' '.join(f'{classes[labels[j]]:5s}'
                                    for j in range(num_images)))

    # run the model on whatever device its parameters live on
    first_param = next(model.parameters(), None)
    inputs = images if first_param is None else images.to(first_param.device)

    # eval mode + no_grad: a forward pass in training mode would update the
    # BatchNorm running statistics as a side effect of merely visualising
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(inputs)
    finally:
        model.train(was_training)

    # show predicted labels
    _, predicted = torch.max(outputs, 1)
    predicted = predicted.cpu()
    print('Predicted: ', ' '.join(f'{classes[predicted[j]]:5s}'
                                for j in range(num_images)))
    
def train(model, epochs=1000):
    """
    Training loop using fixed hyper-parameters. Returns the trained model.

    Optimises ``model`` in place with SGD (lr=0.001, momentum=0.9) and a
    cross-entropy loss over the batches of the global ``trainloader``; every
    batch is moved to the global ``device``. The average loss is printed every
    2000 mini-batches.

    Args:
        model: Network to train; it is switched to training mode.
        epochs: Number of passes over ``trainloader``.

    Returns:
        The same ``model`` object after training.
    """

    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    # the loss module is stateless, so build it once instead of per batch
    criterion = nn.CrossEntropyLoss()

    # make sure BatchNorm uses batch statistics even if the caller left the
    # model in evaluation mode
    model.train()

    for epoch in range(epochs):  # loop over the dataset multiple times

        running_loss = 0.0
        for i, (inputs, labels) in enumerate(trainloader):
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999:    # print every 2000 mini-batches
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
                running_loss = 0.0

    print('Finished Training')

    # PATH = './cifar_net.pth'
    # torch.save(model.state_dict(), PATH)

    return model

### Create model

In [3]:
# Three stages of two BasicBlocks each, with 16, 32 and 64 channels,
# moved to the device selected above.
model = ResNet(BasicBlock, [2, 2, 2], [16, 32, 64]).to(device)

### optional: print model ###
# print(model)

Using cpu device


## Non-Determinism Functions

### Parameter Initialisation

*"When training a model, parameters without preset values are initialized randomly according to a given distribution, e.g. a zero-mean Gaussian with variance determined by the number of input connections to the layer (Glorot & Bengio, 2010; He et al., 2015)".*

In [None]:
# todo: control random initialisation of parameters

def param_init(seed):
    """Placeholder for controlling the random initialisation of parameters.

    Not implemented yet: ``seed`` is accepted but unused and ``None`` is
    returned.
    """
    return None

### Data Shuffling

*"In stochastic gradient descent, the gradient is approximated on a random subset of examples, commonly implemented by using small batches of data iteratively in a shuffled training dataset (Bottou, 2012). Shuffling may happen either once, before training, or in between each
epoch of training, the variant we use in this work".*

In [None]:
# todo: control randomness in data shuffling

def data_shuffle():
    """Placeholder for controlling the randomness of data shuffling.

    Not implemented yet: does nothing and returns ``None``.
    """
    return None

### Data Augmentation

*"A common practice, data augmentation refers to randomly altering each training example to artificially expand the training dataset (Shorten & Khoshgoftaar, 2019). For example, randomly flipping images encourages invariance to left/right orientation".*

In [None]:
# todo: control randomness in data augmentation

def data_augmentation():
    """Placeholder for controlling the randomness of data augmentation.

    Not implemented yet: does nothing and returns ``None``.
    """
    return None

### Stochastic Regularisation

*"Some types of regularization, such as Dropout (Srivastava et al., 2014), take the form of stochastic operations internal to a model during training. Other instances of this include DropConnect (Wan et al., 2013) and variable length backpropagation through time (Merity et al., 2017), among many others."*

In [None]:
# todo: control randomness in stochastic regularisation (e.g. dropout)

def stochastic_operations():
    """Placeholder for controlling stochastic operations inside the model.

    Not implemented yet: does nothing and returns ``None``.
    """
    return None

### Low-level Operations (cuDNN)

*"Often underlooked, many libraries that deep learning frameworks are built on, such as cuDNN (Chetlur et al., 2014), typically run nondeterministically in order to increase the speed of their operations. This nondeterminism is small when evaluated in the context of a single operation — in one test we performed it caused an output difference of 0.003%. In the case of cuDNN, the library we test, it is possible to disable nondeterministic behavior at a speed penalty on the order of ∼15%. However, unlike other nondeterminism sources, it is not possible to “seed” this; it is only possible to turn it on or off."*

In [None]:
# cuDNN cannot be seeded; its nondeterminism can only be switched on or off.

def cuDNN(deterministic=True):
    """Switch cuDNN between deterministic and nondeterministic execution.

    Args:
        deterministic: When true (the default), request deterministic cuDNN
            algorithms and disable the benchmark auto-tuner, which may select
            different algorithms from run to run. When false, allow
            nondeterministic algorithms again; the benchmark setting is left
            untouched.

    Returns:
        None. The function only changes the global ``torch.backends.cudnn``
        flags.
    """
    deterministic = bool(deterministic)
    torch.backends.cudnn.deterministic = deterministic
    if deterministic:
        torch.backends.cudnn.benchmark = False

In [None]:
# todo: change 1 bit in network

def one_bit_param_init():
    """Placeholder for changing a single bit in the network's parameters.

    Not implemented yet: does nothing and returns ``None``.
    """
    return None

In [None]:

def control_non_determinism(seed=None):
    """Placeholder for configuring the sources of non-determinism of one run.

    The experiment loop calls this function with the run's seed, so the
    parameter is accepted here; it stays optional so calls without arguments
    keep working.

    Args:
        seed: Seed of the run. Currently unused.

    Returns:
        None. TODO: the experiment loop passes the return value to ``train``
        as the model, so this still has to build and return one.
    """
    return None

## Evaluation

In [6]:
def pairwise_diagreement(model1, model2, dataloader=None):
    """Return the percentage of examples on which two models disagree.

    Both models are run in evaluation mode without gradient tracking, and
    their previous training/evaluation mode is restored afterwards. The
    function name keeps its original spelling so existing callers still work.

    Args:
        model1: First model.
        model2: Second model.
        dataloader: Iterable of ``(inputs, labels)`` batches; the labels are
            ignored. Defaults to the global ``testloader``.

    Returns:
        A float in ``[0, 100]``: the share of examples whose predicted class
        (argmax over dimension 1 of the model output) differs.

    Raises:
        ValueError: If ``dataloader`` yields no examples.
    """
    if dataloader is None:
        dataloader = testloader

    def _predict(model, inputs):
        # Run each model on the device its own parameters live on.
        first_param = next(model.parameters(), None)
        if first_param is not None:
            inputs = inputs.to(first_param.device)
        return model(inputs).argmax(dim=1).cpu()

    previous_modes = (model1.training, model2.training)
    model1.eval()
    model2.eval()
    num_differing = 0
    num_examples = 0
    try:
        with torch.no_grad():
            for inputs, _ in dataloader:
                differing = _predict(model1, inputs) != _predict(model2, inputs)
                num_differing += int(differing.sum().item())
                num_examples += inputs.size(0)
    finally:
        model1.train(previous_modes[0])
        model2.train(previous_modes[1])

    if num_examples == 0:
        raise ValueError('dataloader yielded no examples')
    return 100.0 * num_differing / num_examples

In [None]:
### Train and test loop (outline only) ###

def _evaluate_on_test_set(model):
    """Return (accuracy in %, mean cross-entropy, predictions) on ``testloader``."""
    first_param = next(model.parameters(), None)
    was_training = model.training
    model.eval()
    total_loss = 0.0
    predicted, targets = [], []
    try:
        with torch.no_grad():
            for inputs, labels in testloader:
                if first_param is not None:
                    inputs = inputs.to(first_param.device)
                outputs = model(inputs).cpu()
                total_loss += nn.functional.cross_entropy(
                    outputs, labels, reduction='sum').item()
                predicted.append(outputs.argmax(dim=1))
                targets.append(labels)
    finally:
        model.train(was_training)

    predicted = torch.cat(predicted)
    targets = torch.cat(targets)
    accuracy = 100.0 * (predicted == targets).double().mean().item()
    return accuracy, total_loss / targets.numel(), predicted


def _mean_and_std(values):
    """Return the mean and sample standard deviation of ``values`` as floats."""
    tensor = torch.tensor(values, dtype=torch.float64)
    std = tensor.std().item() if tensor.numel() > 1 else 0.0
    return tensor.mean().item(), std


def _mean_pairwise_disagreement(predictions):
    """Return the mean percentage of differing predictions over all model pairs."""
    stacked = torch.stack(predictions)
    if stacked.size(0) < 2:
        return 0.0
    # Row i is compared against every later row, so each pair counts once.
    rates = [(stacked[i + 1:] != stacked[i]).double().mean(dim=1)
             for i in range(stacked.size(0) - 1)]
    return 100.0 * torch.cat(rates).mean().item()


def non_determinism_experiment(*args, num_seeds=100):
    """Train one model per seed and summarise the variability between them.

    Args:
        *args: Names of the non-determinism sources under study.
            TODO: they are not yet forwarded to ``control_non_determinism``,
            which is only given the seed.
        num_seeds: Number of runs; seeds ``1 .. num_seeds`` are used.

    Returns:
        A tuple ``(accuracy_mean, accuracy_std, cross_entropy_mean,
        cross_entropy_std, pairwise_disagreement)`` computed on the global
        ``testloader``. Accuracy and disagreement are percentages; the
        disagreement is averaged over all pairs of trained models.
    """

    models = []

    for seed in range(1, num_seeds + 1):

        # Control for different aspects of non-determinism
        model = control_non_determinism(seed)

        # Train model
        model = train(model)

        # Save model for evaluation
        models.append(model)

    # Evaluate models
    accuracies, cross_entropies, predictions = [], [], []
    for model in models:
        accuracy, cross_entropy, predicted = _evaluate_on_test_set(model)
        accuracies.append(accuracy)
        cross_entropies.append(cross_entropy)
        predictions.append(predicted)

    accuracy_mean, accuracy_std = _mean_and_std(accuracies)
    cross_entropy_mean, cross_entropy_std = _mean_and_std(cross_entropies)
    pairwise_disagreement = _mean_pairwise_disagreement(predictions)

    return accuracy_mean, accuracy_std, cross_entropy_mean, cross_entropy_std, pairwise_disagreement


    

In [None]:
### Recreating the table from the paper ###

from tabulate import tabulate

# Run experiments and collect results.
# Each entry pairs a table row label with the non-determinism sources that are
# passed to non_determinism_experiment for that row.
experiments = [
    ("Parameter Initialization", ('param_init',)),
    ("Data Shuffling", ('data_shuffle',)),
    ("Data Augmentation", ('data_augmentation',)),
    ("cuDNN", ('cuDNN',)),
    ("Data Shuffling + cuDNN", ('data_shuffle', 'cuDNN')),
    ("Data Shuffling + Aug. + cuDNN",
     ('data_shuffle', 'data_augmentation', 'cuDNN')),
    ("All Nondeterminism Sources",
     ('param_init', 'data_shuffle', 'data_augmentation', 'cuDNN')),
]

results = []
for label, sources in experiments:
    accuracy_mean, accuracy_std, cross_entropy_mean, cross_entropy_std, pairwise_disagreement = non_determinism_experiment(*sources)
    results.append([label,
                    f"{accuracy_mean:.2f} ± {accuracy_std:.2f}",
                    f"{cross_entropy_mean:.4f} ± {cross_entropy_std:.4f}",
                    f"{pairwise_disagreement:.1f}"])

# Print the table
# NOTE(review): the first two headers say "SD" while the cells show
# "mean ± SD" — confirm which presentation is intended.
headers = ["Nondeterminism Source", "Accuracy SD (%)", "Cross-Entropy SD", "Pairwise Disagree (%)"]
print(tabulate(results, headers=headers, tablefmt="grid"))

In [None]:
### Additional sources of non-determinism or combinations ###