# **mnist_convnet**
- ELEC 576 HW 1
- Robert Heeter
- 4 October 2023

## **Structure**:
1) Set PyTorch metadata
    - Seed
    - TensorFlow output
    - Whether to transfer to gpu (cuda)
2) Import data
    - Download data
    - Create data loaders with batchsize, transforms, scaling
3) Define model architecture, loss, and optimizer
4) Define test and training loops
    - Train:
        - Get next batch
        - Forward pass through model-
        - Calculate loss
        - Backward pass from loss (calculates the gradient for each parameter)
        - Optimizer: performs weight updates
5) Perform training over multiple epochs
    - Each epoch:
        - Call train loop
        - Call test loop

## **Acknowledgements**:
- https://colab.research.google.com/drive/1i9KpbQyFU4zfq8zLLns8a2Kd8PRMGsaZ
- https://github.com/motokimura/pytorch_tensorboard/blob/master/main.py
- https://colab.research.google.com/github/tensorflow/tensorboard/blob/master/docs/tensorboard_in_notebooks.ipynb#scrollTo=lpUO9HqUKP6z
- https://github.com/christianversloot/machine-learning-articles/blob/main/how-to-use-tensorboard-with-pytorch.md


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
import numpy as np

from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
import os

%load_ext tensorboard


2023-10-06 01:46:54.245715: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# 1. Set PyTorch metadata

batch_size = 64
test_batch_size = 1000
epochs = 10
lr = 0.01
try_cuda = True
seed = 1000
logging_interval = 10 # how many batches to wait before logging
logging_dir = None

# setting up the logging
log_dir = os.path.join(os.getcwd(),'log/MNIST', datetime.now().strftime('%b%d_%H-%M-%S'))
writer = SummaryWriter(log_dir=log_dir)

# deciding whether to send to the cpu or not if available
if torch.cuda.is_available() and try_cuda:
    cuda = True
    torch.cuda.mnaual_seed(seed)
else:
    cuda = False
    torch.manual_seed(seed)
    

In [3]:
# 2. Import data

transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.01307,), (0.3081,))])

train_loader = torch.utils.data.DataLoader(datasets.MNIST('data', train=True, download=True, transform=transform),
                                           batch_size=batch_size,
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(datasets.MNIST('data', train=False, download=True, transform=transform),
                                          batch_size=test_batch_size,
                                          shuffle=True)


In [4]:
# 3. Defining model architecture, loss, and optimizer

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.layers = nn.Sequential(nn.Conv2d(1, 10, kernel_size=5),
                                    nn.Tanh(),
                                    nn.MaxPool2d(2),
                                    nn.Conv2d(10, 20, kernel_size=5),
                                    nn.Tanh(),
                                    nn.MaxPool2d(2),
                                    nn.Flatten(),
                                    nn.Linear(320, 50),
                                    nn.Tanh(),
                                    nn.Dropout2d(0.5),
                                    nn.Linear(50, 10),
                                    nn.Softmax(dim=1)
                                   )
        
    def forward(self, x):
        '''Forward pass'''
        return self.layers(x)


#         self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
#         self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
#         self.conv2_drop = nn.Dropout2d()
#         self.fc1 = nn.Linear(320, 50)
#         self.fc2 = nn.Linear(50, 10)

#     def forward(self, x):
#         # original network architecture
#         # x = F.relu(F.max_pool2d(self.conv1(x), 2))
#         # x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
#         # x = x.view(-1, 320) # (batch_size, units)
#         # x = F.relu(self.fc1(x))
#         # x = F.dropout(x, training=self.training)
#         # x = self.fc2(x)
#         # x = F.softmax(x, dim=1)

#         # new network architecture
#         x = self.conv1(x)
#         x = F.relu(x)
#         x = F.max_pool2d(x, 2)
#         x = self.conv2(x)
#         x = F.relu(x)
#         x = F.max_pool2d(x, 2)
#         x = x.view(-1, 320)
#         x = self.fc1(x)
#         x = F.relu(x)
#         x = F.dropout(x, p=0.5)
#         x = self.fc2(x)
#         x = F.softmax(x, dim=1)

#         return x

model = Net()
print(model)

optimizer = optim.SGD(model.parameters(), lr=lr)
print(optimizer)


Net(
  (layers): Sequential(
    (0): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))
    (1): Tanh()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(10, 20, kernel_size=(5, 5), stride=(1, 1))
    (4): Tanh()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Flatten(start_dim=1, end_dim=-1)
    (7): Linear(in_features=320, out_features=50, bias=True)
    (8): Tanh()
    (9): Dropout2d(p=0.5, inplace=False)
    (10): Linear(in_features=50, out_features=10, bias=True)
    (11): Softmax(dim=1)
  )
)
SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    lr: 0.01
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)


In [5]:
def weight_histograms_conv2d(writer, n_iter, weights, biases, layer_number):
    weights_shape = weights.shape
    num_kernels = weights_shape[0]
    
    for k in range(num_kernels):
        
        flattened_weights = weights[k].flatten()
        tag = f"layer_{layer_number}/kernel_{k}_weight_histogram"
        writer.add_histogram(tag, flattened_weights, n_iter, bins='tensorflow')
        
        tag = f"layer_{layer_number}/kernel_{k}_weight_min"
        writer.add_scalar(tag, torch.min(flattened_weights), n_iter)
        tag = f"layer_{layer_number}/kernel_{k}_weight_max"
        writer.add_scalar(tag, torch.max(flattened_weights), n_iter)
        tag = f"layer_{layer_number}/kernel_{k}_weight_mean"
        writer.add_scalar(tag, torch.mean(flattened_weights), n_iter)
        tag = f"layer_{layer_number}/kernel_{k}_weight_stdev"
        writer.add_scalar(tag, torch.std(flattened_weights), n_iter)
        
        flattened_biases = biases[k].flatten()
        tag = f"layer_{layer_number}/kernel_{k}_biases_histogram"
        writer.add_histogram(tag, flattened_biases, n_iter, bins='tensorflow')
        
        tag = f"layer_{layer_number}/kernel_{k}_biases_min"
        writer.add_scalar(tag, torch.min(flattened_biases), n_iter)
        tag = f"layer_{layer_number}/kernel_{k}_biases_max"
        writer.add_scalar(tag, torch.max(flattened_biases), n_iter)
        tag = f"layer_{layer_number}/kernel_{k}_biases_mean"
        writer.add_scalar(tag, torch.mean(flattened_biases), n_iter)
        tag = f"layer_{layer_number}/kernel_{k}_biases_stdev"
        writer.add_scalar(tag, torch.std(flattened_biases), n_iter)
        
        break
        
def weight_histograms_linear(writer, n_iter, weights, biases, layer_number):
    flattened_weights = weights.flatten()
    tag = f"layer_{layer_number}_weight_histogram"
    writer.add_histogram(tag, flattened_weights, n_iter, bins='tensorflow')
    
    tag = f"layer_{layer_number}_weight_min"
    writer.add_scalar(tag, torch.min(flattened_weights), n_iter)
    tag = f"layer_{layer_number}_weight_max"
    writer.add_scalar(tag, torch.max(flattened_weights), n_iter)
    tag = f"layer_{layer_number}_weight_mean"
    writer.add_scalar(tag, torch.mean(flattened_weights), n_iter)
    tag = f"layer_{layer_number}_weight_stdev"
    writer.add_scalar(tag, torch.std(flattened_weights), n_iter)

    flattened_biases = biases.flatten()
    tag = f"layer_{layer_number}_biases_histogram"
    writer.add_histogram(tag, flattened_biases, n_iter, bins='tensorflow')

    tag = f"layer_{layer_number}_biases_min"
    writer.add_scalar(tag, torch.min(flattened_biases), n_iter)
    tag = f"layer_{layer_number}_biases_max"
    writer.add_scalar(tag, torch.max(flattened_biases), n_iter)
    tag = f"layer_{layer_number}_biases_mean"
    writer.add_scalar(tag, torch.mean(flattened_biases), n_iter)
    tag = f"layer_{layer_number}_biases_stdev"
    writer.add_scalar(tag, torch.std(flattened_biases), n_iter)

def weight_histograms(writer, n_iter, model):
    # iterate over all model layers
    for layer_number in range(len(model.layers)):
        
        # get layer
        layer = model.layers[layer_number]
        
        # compute weight histograms for appropriate layer
        if isinstance(layer, nn.Conv2d):
            weights = layer.weight
            biases = layer.bias
            weight_histograms_conv2d(writer, n_iter, weights, biases, layer_number)
            
        elif isinstance(layer, nn.Linear):
            weights = layer.weight
            biases = layer.bias
            weight_histograms_linear(writer, n_iter, weights, biases, layer_number)


In [7]:
# 4. Define test and training loops

eps = 1e-13

def train(epoch):
    model.train()

    # criterion = nn.CrossEntropyLoss()
    criterion = nn.NLLLoss(size_average=False)
    
    for batch_idx, (data, target) in enumerate(train_loader):
        if cuda:
            data, target = data.cuda(), target.cuda()

        optimizer.zero_grad()
        output = model(data) # forward pass
        loss = criterion(torch.log(output+eps), target) # = sum_k(-t_k * log(y_k))
        loss.backward() # backward pass
        optimizer.step()

        if batch_idx % logging_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.data)
            )

            # log train/loss to TensorBoard at every iteration
            n_iter = (epoch - 1) * len(train_loader) + batch_idx + 1
            writer.add_scalar('train/loss', loss.data, n_iter)
                    
    # visualize layer weights and biases
    weight_histograms(writer, n_iter, model)
    
    # log model parameters to TensorBoard at every epoch
    for name, param in model.named_parameters():
        layer, attr = os.path.splitext(name)
        attr = attr[1:]
        writer.add_histogram('{}/{}'.format(layer, attr), param.clone().cpu().data.numpy(), n_iter)

def test(epoch):
    model.eval()
    test_loss = 0
    correct = 0
    
    # criterion = nn.CrossEntropyLoss()
    # criterion = nn.CrossEntropyLoss(size_average = False)
    criterion = nn.NLLLoss(size_average = False)

    for data, target in test_loader:
        if cuda:
            data, target = data.cuda(), target.cuda()

        output = model(data)

        test_loss += criterion(torch.log(output+eps), target,).item() # sum up batch loss (later, averaged over all test samples)
        pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability
        correct += pred.eq(target.data.view_as(pred)).cpu().sum()

    test_loss /= len(test_loader.dataset)
    test_accuracy = 100. * correct / len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset), test_accuracy)
    )

    # log test/loss and test/accuracy to TensorBoard at every epoch
    n_iter = epoch * len(train_loader)
    writer.add_scalar('test/loss', test_loss, n_iter)
    writer.add_scalar('test/accuracy', test_accuracy, n_iter)
    

In [8]:
# 5. Perform training over multiple epochs

# start training
for epoch in range(1, epochs + 1):
    train(epoch)
    test(epoch)

writer.close()





Test set: Average loss: 0.0816, Accuracy: 9742/10000 (97.42%)




Test set: Average loss: 0.0692, Accuracy: 9783/10000 (97.83%)


Test set: Average loss: 0.0581, Accuracy: 9817/10000 (98.17%)




Test set: Average loss: 0.0544, Accuracy: 9806/10000 (98.06%)


Test set: Average loss: 0.0534, Accuracy: 9835/10000 (98.35%)




Test set: Average loss: 0.0500, Accuracy: 9841/10000 (98.41%)




Test set: Average loss: 0.0552, Accuracy: 9831/10000 (98.31%)


Test set: Average loss: 0.0528, Accuracy: 9842/10000 (98.42%)




Test set: Average loss: 0.0476, Accuracy: 9858/10000 (98.58%)


Test set: Average loss: 0.0473, Accuracy: 9865/10000 (98.65%)



In [9]:
%tensorboard --logdir log/MNIST --port=8008
