# Problem 3

Use this notebook to write your code for problem 3.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## 3D - Convolutional network

As in problem 2, we have provided code that loads and preprocesses the MNIST data for you.

In [None]:
# load MNIST data into PyTorch format
import torch
import torchvision
import torchvision.transforms as transforms

# Seed PyTorch's global random number generator so that the training shuffle
# order (and the random weight initialisation of models built in later cells)
# is the same on every Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

# set batch size
batch_size = 32

# load training data downloaded into data/ folder
# transforms.ToTensor() converts each image to a float tensor and rescales
# pixel values from 0-255 to 0.0-1.0; the DataLoader then stacks batch_size
# of those tensors into one 4-D batch.
mnist_training_data = torchvision.datasets.MNIST('data/', train=True, download=True,
                                                transform=transforms.ToTensor())
# shuffle the training samples every epoch
training_data_loader = torch.utils.data.DataLoader(mnist_training_data,
                                                  batch_size=batch_size,
                                                  shuffle=True)

# load test data (kept in its original order; used for validation below)
mnist_test_data = torchvision.datasets.MNIST('data/', train=False, download=True,
                                                transform=transforms.ToTensor())
test_data_loader = torch.utils.data.DataLoader(mnist_test_data,
                                                  batch_size=batch_size,
                                                  shuffle=False)

<torch.utils.data.dataloader.DataLoader object at 0x7b0da8d5b3d0>


In [None]:
# look at the number of batches per epoch for training and validation
print(f'{len(training_data_loader)} training batches')
# Read the sample count from the dataset itself: batches * batch_size
# overcounts whenever the last batch is only partially filled.
print(f'{len(training_data_loader.dataset)} training samples')
print(f'{len(test_data_loader)} validation batches')

1875 training batches
60000 training samples
313 validation batches


In [None]:
# sample model
import torch.nn as nn

# Two conv -> ReLU -> max-pool -> dropout stages, followed by a small
# fully connected classifier with 10 outputs.
model = nn.Sequential(
    # stage 1: 1 input channel -> 8 feature maps
    nn.Conv2d(in_channels=1, out_channels=8, kernel_size=3),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2),
    nn.Dropout(p=0.5),

    # stage 2: 8 -> 8 feature maps
    nn.Conv2d(in_channels=8, out_channels=8, kernel_size=3),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2),
    nn.Dropout(p=0.5),

    # classifier head on the flattened feature maps
    nn.Flatten(),
    nn.Linear(in_features=25 * 8, out_features=64),
    nn.ReLU(),
    nn.Linear(in_features=64, out_features=10),
    # no softmax layer: PyTorch's cross-entropy loss already includes it
)

In [None]:
# inspect the shape of every weight and bias tensor, in layer order
for weights in model.parameters():
    print(weights.shape)

torch.Size([8, 1, 3, 3])
torch.Size([8])
torch.Size([8, 8, 3, 3])
torch.Size([8])
torch.Size([64, 200])
torch.Size([64])
torch.Size([10, 64])
torch.Size([10])


In [None]:
# our model has some # of parameters:
# Tensor.numel() returns the element count of each parameter tensor directly,
# so no detour through numpy is needed.
count = sum(param.numel() for param in model.parameters())
print(f'total params: {count}')

total params: 14178


In [None]:
# Loss and optimizer for a multi-class classification problem
import torch.optim as optim

# cross-entropy loss, applied to the model's raw (pre-softmax) outputs
criterion = nn.CrossEntropyLoss()
# RMSprop over all model parameters, with the optimizer's default settings
optimizer = optim.RMSprop(params=model.parameters())

In [None]:
# Train the model for n_epochs epochs, iterating on the data in batches
n_epochs = 10

# store metrics (one row per epoch)
training_accuracy_history = np.zeros([n_epochs, 1])
training_loss_history = np.zeros([n_epochs, 1])
validation_accuracy_history = np.zeros([n_epochs, 1])
validation_loss_history = np.zeros([n_epochs, 1])

for epoch in range(n_epochs):
    # use n_epochs instead of a literal so the label stays correct if it changes
    print(f'Epoch {epoch+1}/{n_epochs}:', end='')
    train_total = 0
    train_correct = 0
    # train (puts dropout and any batch-norm layers in training mode)
    model.train()
    for i, data in enumerate(training_data_loader):
        images, labels = data
        optimizer.zero_grad()
        # forward pass
        output = model(images)
        # calculate categorical cross entropy loss (mean over the batch)
        loss = criterion(output, labels)
        # backward pass
        loss.backward()
        optimizer.step()

        # track training accuracy: predicted class = index of the largest output
        predicted = output.detach().argmax(dim=1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()
        # track training loss, weighted by batch size so that a smaller final
        # batch does not count as much as a full one
        training_loss_history[epoch] += loss.item() * labels.size(0)
        # progress update every 180 batches (~1/10 epoch for batch size 32)
        if i % 180 == 0: print('.',end='')
    # per-sample mean loss and accuracy over the whole epoch
    training_loss_history[epoch] /= train_total
    training_accuracy_history[epoch] = train_correct / train_total
    print(f'\n\tloss: {training_loss_history[epoch,0]:0.4f}, acc: {training_accuracy_history[epoch,0]:0.4f}',end='')

    # validate
    test_total = 0
    test_correct = 0
    # eval mode switches dropout off; no_grad skips gradient bookkeeping
    model.eval()
    with torch.no_grad():
        for i, data in enumerate(test_data_loader):
            images, labels = data
            # forward pass
            output = model(images)
            # find accuracy
            predicted = output.argmax(dim=1)
            test_total += labels.size(0)
            test_correct += (predicted == labels).sum().item()
            # find loss (weighted by batch size: the last test batch is smaller)
            loss = criterion(output, labels)
            validation_loss_history[epoch] += loss.item() * labels.size(0)
    validation_loss_history[epoch] /= test_total
    validation_accuracy_history[epoch] = test_correct / test_total
    print(f', val loss: {validation_loss_history[epoch,0]:0.4f}, val acc: {validation_accuracy_history[epoch,0]:0.4f}')

Epoch 1/10:...........
	loss: 0.6733, acc: 0.7771, val loss: 0.2138, val acc: 0.9384
Epoch 2/10:...........
	loss: 0.4324, acc: 0.8639, val loss: 0.1982, val acc: 0.9456
Epoch 3/10:...........
	loss: 0.4195, acc: 0.8689, val loss: 0.1896, val acc: 0.9473
Epoch 4/10:...........
	loss: 0.4075, acc: 0.8721, val loss: 0.1888, val acc: 0.9479
Epoch 5/10:...........
	loss: 0.3982, acc: 0.8748, val loss: 0.1824, val acc: 0.9503
Epoch 6/10:...........
	loss: 0.3965, acc: 0.8769, val loss: 0.1990, val acc: 0.9485
Epoch 7/10:...........
	loss: 0.3964, acc: 0.8765, val loss: 0.1912, val acc: 0.9474
Epoch 8/10:...........
	loss: 0.3958, acc: 0.8769, val loss: 0.1643, val acc: 0.9541
Epoch 9/10:...........
	loss: 0.3920, acc: 0.8793, val loss: 0.1727, val acc: 0.9494
Epoch 10/10:...........
	loss: 0.3896, acc: 0.8809, val loss: 0.1970, val acc: 0.9458


Above, we output the training loss/accuracy as well as the validation loss and accuracy. Not bad! Let's see if you can do better.

## **PART G**

In [None]:
import torch.nn as nn

# Part G network: three conv -> batch-norm -> ReLU stages (max-pooling after
# the first and third), light dropout after each stage, then a batch-normed
# fully connected classifier with 10 outputs.
model = nn.Sequential(
    # stage 1: 1 -> 32 feature maps, then 2x2 max-pool
    nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3),
    nn.BatchNorm2d(32),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2),
    nn.Dropout(p=0.1),

    # stage 2: 32 -> 16 feature maps, no pooling
    nn.Conv2d(in_channels=32, out_channels=16, kernel_size=3),
    nn.BatchNorm2d(16),
    nn.ReLU(),
    nn.Dropout(p=0.1),

    # stage 3: 16 -> 8 feature maps, then 2x2 max-pool
    nn.Conv2d(in_channels=16, out_channels=8, kernel_size=3),
    nn.BatchNorm2d(8),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2),
    nn.Dropout(p=0.1),

    # classifier head on the flattened feature maps
    nn.Flatten(),
    nn.Linear(in_features=128, out_features=64),
    nn.BatchNorm1d(64),
    nn.ReLU(),
    nn.Linear(in_features=64, out_features=10),
    # no softmax layer: PyTorch's cross-entropy loss already includes it
)
print(model)

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU()
  (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (4): Dropout(p=0.1, inplace=False)
  (5): Conv2d(32, 16, kernel_size=(3, 3), stride=(1, 1))
  (6): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (7): ReLU()
  (8): Dropout(p=0.1, inplace=False)
  (9): Conv2d(16, 8, kernel_size=(3, 3), stride=(1, 1))
  (10): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (11): ReLU()
  (12): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (13): Dropout(p=0.1, inplace=False)
  (14): Flatten(start_dim=1, end_dim=-1)
  (15): Linear(in_features=128, out_features=64, bias=True)
  (16): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (17): ReLU()
  (18): Linear(in_features=64, out_features=10, bias=True)
)

In [None]:
# inspect the shape of every weight and bias tensor, in layer order
for weights in model.parameters():
    print(weights.shape)

torch.Size([32, 1, 3, 3])
torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([16, 32, 3, 3])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([8, 16, 3, 3])
torch.Size([8])
torch.Size([8])
torch.Size([8])
torch.Size([64, 128])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([10, 64])
torch.Size([10])


In [None]:
# total number of parameters in the model:
# Tensor.numel() returns the element count of each parameter tensor directly,
# so no detour through numpy is needed.
count = sum(param.numel() for param in model.parameters())
print(f'total params: {count}')

total params: 15250


In [None]:
# Loss and optimizer for a multi-class classification problem
import torch.optim as optim

# cross-entropy loss, applied to the model's raw (pre-softmax) outputs
criterion = nn.CrossEntropyLoss()
# RMSprop over all model parameters, with the optimizer's default settings
optimizer = optim.RMSprop(params=model.parameters())

In [None]:
# Train the model for n_epochs epochs, iterating on the data in batches
n_epochs = 10

# store metrics (one row per epoch)
training_accuracy_history = np.zeros([n_epochs, 1])
training_loss_history = np.zeros([n_epochs, 1])
validation_accuracy_history = np.zeros([n_epochs, 1])
validation_loss_history = np.zeros([n_epochs, 1])

for epoch in range(n_epochs):
    # use n_epochs instead of a literal so the label stays correct if it changes
    print(f'Epoch {epoch+1}/{n_epochs}:', end='')
    train_total = 0
    train_correct = 0
    # train (puts dropout and batch-norm layers in training mode)
    model.train()
    for i, data in enumerate(training_data_loader):
        images, labels = data
        optimizer.zero_grad()
        # forward pass
        output = model(images)
        # calculate categorical cross entropy loss (mean over the batch)
        loss = criterion(output, labels)
        # backward pass
        loss.backward()
        optimizer.step()

        # track training accuracy: predicted class = index of the largest output
        predicted = output.detach().argmax(dim=1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()
        # track training loss, weighted by batch size so that a smaller final
        # batch does not count as much as a full one
        training_loss_history[epoch] += loss.item() * labels.size(0)
        # progress update every 180 batches (~1/10 epoch for batch size 32)
        if i % 180 == 0: print('.',end='')
    # per-sample mean loss and accuracy over the whole epoch
    training_loss_history[epoch] /= train_total
    training_accuracy_history[epoch] = train_correct / train_total
    print(f'\n\tloss: {training_loss_history[epoch,0]:0.4f}, acc: {training_accuracy_history[epoch,0]:0.4f}',end='')

    # validate
    test_total = 0
    test_correct = 0
    # eval mode switches dropout off and makes batch-norm use its running
    # statistics; no_grad skips gradient bookkeeping
    model.eval()
    with torch.no_grad():
        for i, data in enumerate(test_data_loader):
            images, labels = data
            # forward pass
            output = model(images)
            # find accuracy
            predicted = output.argmax(dim=1)
            test_total += labels.size(0)
            test_correct += (predicted == labels).sum().item()
            # find loss (weighted by batch size: the last test batch is smaller)
            loss = criterion(output, labels)
            validation_loss_history[epoch] += loss.item() * labels.size(0)
    validation_loss_history[epoch] /= test_total
    validation_accuracy_history[epoch] = test_correct / test_total
    print(f', val loss: {validation_loss_history[epoch,0]:0.4f}, val acc: {validation_accuracy_history[epoch,0]:0.4f}')

Epoch 1/10:...........
	loss: 0.0840, acc: 0.9734, val loss: 0.0431, val acc: 0.9857
Epoch 2/10:...........
	loss: 0.0728, acc: 0.9777, val loss: 0.0347, val acc: 0.9886
Epoch 3/10:...........
	loss: 0.0636, acc: 0.9806, val loss: 0.0279, val acc: 0.9897
Epoch 4/10:...........
	loss: 0.0579, acc: 0.9815, val loss: 0.0386, val acc: 0.9882
Epoch 5/10:...........
	loss: 0.0566, acc: 0.9820, val loss: 0.0368, val acc: 0.9871
Epoch 6/10:...........
	loss: 0.0543, acc: 0.9833, val loss: 0.0260, val acc: 0.9917
Epoch 7/10:...........
	loss: 0.0505, acc: 0.9848, val loss: 0.0342, val acc: 0.9898
Epoch 8/10:...........
	loss: 0.0517, acc: 0.9843, val loss: 0.0310, val acc: 0.9915
Epoch 9/10:...........
	loss: 0.0511, acc: 0.9838, val loss: 0.0253, val acc: 0.9918
Epoch 10/10:...........
	loss: 0.0471, acc: 0.9859, val loss: 0.0252, val acc: 0.9916


## Now that we have a working model, we will test out different Dropout probabilities from 0.1 to 1.0 (in steps of 0.1, training each model for one epoch)

In [None]:
def build_dropout_cnn(dropout_p):
    """Return a freshly initialised copy of the Part G CNN.

    dropout_p: drop probability used by every Dropout layer in the network.
    """
    return nn.Sequential(
        nn.Conv2d(1, 32, kernel_size=(3,3)),
        nn.BatchNorm2d(num_features=32),
        nn.ReLU(),
        nn.MaxPool2d(2),
        nn.Dropout(p=dropout_p),

        nn.Conv2d(32, 16, kernel_size=(3,3)),
        nn.BatchNorm2d(num_features=16),
        nn.ReLU(),
        nn.Dropout(p=dropout_p),

        nn.Conv2d(16, 8, kernel_size=(3,3)),
        nn.BatchNorm2d(num_features=8),
        nn.ReLU(),
        nn.MaxPool2d(2),
        nn.Dropout(p=dropout_p),

        nn.Flatten(),
        nn.Linear(128, 64),
        nn.BatchNorm1d(num_features=64),
        nn.ReLU(),
        nn.Linear(64, 10)
        # PyTorch implementation of cross-entropy loss includes softmax layer
    )


def run_epoch(model, data_loader, criterion, optimizer=None, show_progress=False):
    """Run one full pass over data_loader and return (mean loss, accuracy).

    model:         network to train or evaluate.
    data_loader:   iterable yielding (images, labels) batches.
    criterion:     loss function returning the mean loss of a batch.
    optimizer:     if given, the model is put in training mode and updated
                   after every batch; if None, the model is put in eval mode
                   and gradients are disabled.
    show_progress: print a '.' every 180 batches (~1/10 epoch for batch size 32).

    The returned loss is the per-sample mean: each batch loss is weighted by
    its batch size so a smaller final batch is not over-represented.
    """
    training = optimizer is not None
    model.train(training)
    total = 0
    correct = 0
    loss_sum = 0.0
    with torch.set_grad_enabled(training):
        for i, (images, labels) in enumerate(data_loader):
            if training:
                optimizer.zero_grad()
            # forward pass
            output = model(images)
            # calculate categorical cross entropy loss
            loss = criterion(output, labels)
            if training:
                # backward pass
                loss.backward()
                optimizer.step()

            # predicted class = index of the largest output
            n_samples = labels.size(0)
            total += n_samples
            correct += (output.detach().argmax(dim=1) == labels).sum().item()
            loss_sum += loss.item() * n_samples
            if show_progress and i % 180 == 0:
                print('.', end='')
    return loss_sum / total, correct / total


# Dropout probabilities to try: 0.1, 0.2, ..., 1.0.
# NOTE: the last value, p = 1.0, drops every activation during training, so
# that model cannot learn from its inputs; it is kept because the sweep is
# specified as 0.1 to 1.
dropout_ps = np.linspace(0.1, 1, 10)

# Train each model for 1 epoch, iterating on the data in batches
n_epochs = 1

# final-epoch metrics for every dropout probability, in sweep order
dropout_results = []

for p in dropout_ps:
    # plain Python float, so linspace rounding noise / numpy types stay out of the model
    dropout_p = float(p)
    model = build_dropout_cnn(dropout_p)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.RMSprop(model.parameters())

    # store metrics (one row per epoch) for the current dropout probability
    training_accuracy_history = np.zeros([n_epochs, 1])
    training_loss_history = np.zeros([n_epochs, 1])
    validation_accuracy_history = np.zeros([n_epochs, 1])
    validation_loss_history = np.zeros([n_epochs, 1])

    print(f'p = {p:.1f}:')
    for epoch in range(n_epochs):
        print(f'Epoch {epoch+1}/{n_epochs}:', end='')
        # train
        train_loss, train_acc = run_epoch(model, training_data_loader, criterion,
                                          optimizer=optimizer, show_progress=True)
        training_loss_history[epoch] = train_loss
        training_accuracy_history[epoch] = train_acc
        print(f'\n\tloss: {training_loss_history[epoch,0]:0.4f}, acc: {training_accuracy_history[epoch,0]:0.4f}',end='')

        # validate
        val_loss, val_acc = run_epoch(model, test_data_loader, criterion)
        validation_loss_history[epoch] = val_loss
        validation_accuracy_history[epoch] = val_acc
        print(f', val loss: {validation_loss_history[epoch,0]:0.4f}, val acc: {validation_accuracy_history[epoch,0]:0.4f}')

    # keep the last epoch's numbers so the sweep can be compared or plotted later
    dropout_results.append({
        'p': dropout_p,
        'loss': training_loss_history[-1, 0],
        'acc': training_accuracy_history[-1, 0],
        'val_loss': validation_loss_history[-1, 0],
        'val_acc': validation_accuracy_history[-1, 0],
    })


p = 0.1:
Epoch 1/1:...........
	loss: 0.1630, acc: 0.9486, val loss: 0.0477, val acc: 0.9842
p = 0.2:
Epoch 1/1:...........
	loss: 0.1925, acc: 0.9394, val loss: 0.0528, val acc: 0.9840
p = 0.30000000000000004:
Epoch 1/1:...........
	loss: 0.2515, acc: 0.9193, val loss: 0.0610, val acc: 0.9810
p = 0.4:
Epoch 1/1:...........
	loss: 0.3138, acc: 0.8993, val loss: 0.0779, val acc: 0.9762
p = 0.5:
Epoch 1/1:...........
	loss: 0.4329, acc: 0.8589, val loss: 0.0858, val acc: 0.9728
p = 0.6:
Epoch 1/1:...........
	loss: 0.5757, acc: 0.8116, val loss: 0.1459, val acc: 0.9569
p = 0.7000000000000001:
Epoch 1/1:...........
	loss: 0.8541, acc: 0.7152, val loss: 0.2156, val acc: 0.9414
p = 0.8:
Epoch 1/1:...........
	loss: 1.1120, acc: 0.6193, val loss: 0.5164, val acc: 0.8697
p = 0.9:
Epoch 1/1:...........
	loss: 1.6029, acc: 0.4387, val loss: 0.9492, val acc: 0.7899
p = 1.0:
Epoch 1/1:...........
	loss: 2.3027, acc: 0.1105, val loss: 6159115.8291, val acc: 0.0975
