# Importing dataset and common libraries

In [1]:
import os
import random

import numpy as np
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from tqdm.auto import tqdm

# Ensure deterministic behavior
torch.backends.cudnn.deterministic = True
random.seed(hash("setting random seeds") % 2**32 - 1)
np.random.seed(hash("improves reproducibility") % 2**32 - 1)
torch.manual_seed(hash("by removing stochasticity") % 2**32 - 1)
torch.cuda.manual_seed_all(hash("so runs are repeatable") % 2**32 - 1)

# Device configuration
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# remove slow mirror from list of MNIST mirrors
torchvision.datasets.MNIST.mirrors = [mirror for mirror in torchvision.datasets.MNIST.mirrors
                                      if not mirror.startswith("http://yann.lecun.com")]

In [2]:
#Installing WB
!pip install wandb -Uq

[K     |████████████████████████████████| 1.9 MB 4.6 MB/s 
[K     |████████████████████████████████| 184 kB 56.4 MB/s 
[K     |████████████████████████████████| 174 kB 66.6 MB/s 
[K     |████████████████████████████████| 62 kB 1.2 MB/s 
[K     |████████████████████████████████| 173 kB 73.0 MB/s 
[K     |████████████████████████████████| 168 kB 75.2 MB/s 
[K     |████████████████████████████████| 168 kB 45.7 MB/s 
[K     |████████████████████████████████| 166 kB 61.4 MB/s 
[K     |████████████████████████████████| 166 kB 53.3 MB/s 
[K     |████████████████████████████████| 162 kB 15.7 MB/s 
[K     |████████████████████████████████| 162 kB 69.4 MB/s 
[K     |████████████████████████████████| 158 kB 55.8 MB/s 
[K     |████████████████████████████████| 157 kB 53.2 MB/s 
[K     |████████████████████████████████| 157 kB 71.1 MB/s 
[K     |████████████████████████████████| 157 kB 60.7 MB/s 
[K     |████████████████████████████████| 157 kB 56.7 MB/s 
[K     |██████████████████

In [3]:
#Importing and login to Wandb
import wandb

wandb.login()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# Define the Pipeline

Define the hyperparams

In [4]:
config = dict(
    epochs=5,
    classes=10,
    kernels=[16, 32],
    batch_size=128,
    learning_rate=0.005,
    dataset="MNIST",
    architecture="CNN")

Model Pipeline

In [5]:
def model_pipeline(hyperparam):
  #wandb start
  with wandb.init(project="wandb-demo", config=hyperparam):
    #to make the hyperparams we use for our model and the ones that get logged are the same
    config = wandb.config

    #getting the stuffs required for demo like model, train and test data, loss criteria and optimizer
    model, train_loader, test_loader, criterion, optimizer = make(config)
    print(model)

    #train the model
    train(model, train_loader, criterion, optimizer, config)

    #test the model
    test(model, test_loader)
  
  return model

In [6]:
from torchvision.datasets.rendered_sst2 import make_dataset
def make(config):

  #getting the data
  train, test = get_data(train=True), get_data(train=False)
  train_loader = make_loader(train, batch_size=config.batch_size)
  test_loader = make_loader(test, batch_size=config.batch_size)

  #making the model
  model = ConvNet(config.kernels, config.classes).to(device)

  #set the loss criteria and the type of optimizer
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr = config.learning_rate)

  return model, train_loader, test_loader, criterion ,optimizer

#Load the Data

In [7]:
def get_data(slice = 5, train=True):
  dataset = torchvision.datasets.MNIST(root='.',
                                       train = train,
                                       transform = transforms.ToTensor(),
                                       download=True)
  sub_dataset = torch.utils.data.Subset(dataset, indices = range(0, len(dataset),slice))

  return sub_dataset

def make_loader(dataset, batch_size):
  loader = torch.utils.data.DataLoader(dataset=dataset, 
                                       batch_size=batch_size,
                                       shuffle=True,
                                       pin_memory=True, 
                                       num_workers=2)
  
  return loader

#Defining the CNN Model

In [8]:
class ConvNet(nn.Module):
  def __init__(self, kernels, classes=10):
    super(ConvNet, self).__init__()

    self.layer1 = nn.Sequential( nn.Conv2d(1, kernels[0], kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
    self.layer2 = nn.Sequential(
            nn.Conv2d(16, kernels[1], kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
    
    self.fc = nn.Linear(7*7*kernels[-1], classes)

  def forward(self, x):
    out = self.layer1(x)
    out = self.layer2(out)
    out = out.reshape(out.size(0), -1)
    out = self.fc(out)
    return out

# Training LOOP to train the model

In [9]:
# It basically used wandb.watch and wandb.log

def train(model, loader, criterion, optimizer, config):
  # It will bascially log the gradients and model params every log_freq of the training 
  wandb.watch(model, criterion, log="all", log_freq=10)

  total_batches = len(loader) * config.epochs
  #examples seen
  example_ct = 0
  batch_ct = 0

  for epoch in tqdm(range(config.epochs)):
    for _,(images, labels) in enumerate(loader):
      loss = train_batch(images, labels, model, optimizer, criterion)
      example_ct += len(images)
      batch_ct += 1
    
      #report the metric every 25th batch
      if( (batch_ct + 1) % 25 == 0):
        train_log(loss, example_ct, epoch)
    

def train_batch(images, labels, model, optimizer, criterion):
  images, labels = images.to(device) , labels.to(device)

  #fwd propogation
  outputs = model(images)
  loss = criterion(outputs, labels)

  #bwd pass
  optimizer.zero_grad()
  loss.backward()

  #step with optimizer
  optimizer.step()

  return loss


We log the loss of a particular batch in a certain epoch rather than printing it using wandb.log

In [10]:
def train_log(loss, example_ct, epoch):
  wandb.log({"epoch": epoch, "loss":loss }, step=example_ct)
  print(f"Loss after {str(example_ct).zfill(5)} examples: {loss:.3f}")

# Defining the test logic

We will save our model using wandb.save that will save the model architecture and parameters

In [11]:
def test(model, test_data):
  model.eval()

  with torch.no_grad():
    correct, total = 0, 0

    for images, labels in test_data:
      images, labels = images.to(device), labels.to(device)
      outputs = model(images)
      _, predictions = torch.max(outputs.data, 1)
      total += labels.size(0)

      correct += (predictions== labels).sum().item()

      print(f"Accuracy of the model on the {total} " +
              f"test images: {correct / total:%}")
      
      wandb.log({"test_accuracy": correct / total})

  
  # saving the model in the onx format
  torch.onnx.export(model, images, "model.onnx")
  wandb.save("model.onnx")

# RUNNING THE WHOLE PIPELINE

In [12]:
model = model_pipeline(config)

[34m[1mwandb[0m: Currently logged in as: [33mlazy_bit[0m. Use [1m`wandb login --relogin`[0m to force relogin


Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting ./MNIST/raw/train-images-idx3-ubyte.gz to ./MNIST/raw

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting ./MNIST/raw/train-labels-idx1-ubyte.gz to ./MNIST/raw

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting ./MNIST/raw/t10k-images-idx3-ubyte.gz to ./MNIST/raw

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting ./MNIST/raw/t10k-labels-idx1-ubyte.gz to ./MNIST/raw

ConvNet(
  (layer1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (layer2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): Linear(in_features=1568, out_features=10, bias=True)
)


  0%|          | 0/5 [00:00<?, ?it/s]

Loss after 03072 examples: 0.449
Loss after 06272 examples: 0.243
Loss after 09472 examples: 0.126
Loss after 12640 examples: 0.169
Loss after 15840 examples: 0.064
Loss after 19040 examples: 0.148
Loss after 22240 examples: 0.144
Loss after 25408 examples: 0.055
Loss after 28608 examples: 0.087
Loss after 31808 examples: 0.061
Loss after 35008 examples: 0.062
Loss after 38176 examples: 0.098
Loss after 41376 examples: 0.030
Loss after 44576 examples: 0.007
Loss after 47776 examples: 0.025
Loss after 50944 examples: 0.020
Loss after 54144 examples: 0.060
Loss after 57344 examples: 0.026
Accuracy of the model on the 128 test images: 97.656250%
Accuracy of the model on the 256 test images: 97.265625%
Accuracy of the model on the 384 test images: 97.135417%
Accuracy of the model on the 512 test images: 97.265625%
Accuracy of the model on the 640 test images: 97.343750%
Accuracy of the model on the 768 test images: 97.526042%
Accuracy of the model on the 896 test images: 97.544643%
Accurac

0,1
epoch,▁▁▁▃▃▃▃▅▅▅▅▆▆▆▆███
loss,█▅▃▄▂▃▃▂▂▂▂▂▁▁▁▁▂▁
test_accuracy,▆▂▁▂▃▅▅▄▆▄▆▅▆▆▇█

0,1
epoch,4.0
loss,0.02635
test_accuracy,0.978
