<h1><b>MNIST with PyTorch</b></h1>

Start with common methods and classes that will be used in the subsequent experiments

<h2>Generic classes and functions</h2>

In [446]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
import torchvision
from collections import OrderedDict
from torchvision.transforms import ToTensor, Lambda, Compose
import matplotlib.pyplot as plt
import numpy as np

class NeuralNetwork(nn.Module):
    def __init__(self, sequence):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.nn_stack = nn.Sequential(sequence)

    def forward(self, x):
        x = self.flatten(x)
        return self.nn_stack(x)

# Stolen from Shweta's code
def one_hot(y):
    one_hot = torch.zeros([1, 10])
    one_hot[0][y] = 1
    return one_hot

def load_data(one_hot_transform = False, batch_size = 10):
    transformation = None
    if one_hot_transform:
        transformation = Lambda(lambda y: one_hot(y))
    
    training_data = datasets.MNIST(
        root="data",
        train=True,
        download=True,
        transform=ToTensor(),
        target_transform=transformation
    )

    test_data = datasets.MNIST(
        root="data",
        train=False,
        download=True,
        transform=ToTensor(),
        target_transform=transformation
    )

    train_dataloader = DataLoader(training_data, batch_size = batch_size)
    test_dataloader = DataLoader(test_data, batch_size = batch_size)

    return train_dataloader, test_dataloader

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using", device)

Using cpu


<h2>1 - Mean squared error loss function, sigmoid activation and 1 hidden layer<h2>

<strong>Network Summary</strong>

*   Input layer: 784 nodes
*   Hidden layer: 30 nodes
*   Output layer: 10 nodes
*   Activation between nodes: sigmoid
*   Output activation: sigmoid
*   Loss function: mean squared error
*   Stochastic gradient descent 10 mini-batches
*   5 epochs, learning rate = 0.3

<strong>Results</strong>

*   Training set: 95.3%
*   Testing set: 94.7%






In [440]:
def train(dataloader, model, loss_fn, optimizer):
    correct = 0
    size = len(dataloader.dataset)
    for X, y in dataloader.dataset:
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        correct += 1 if (pred.argmax(1) == y.argmax(1)) else 0
    correct /= size
    print(f"Training Accuracy: {(100*correct):>0.1f}%")

In [447]:
def test(dataloader, model):
    size = len(dataloader.dataset)
    correct = 0
    with torch.no_grad():
        for X, y in dataloader.dataset:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            correct += 1 if (pred.argmax(1) == y.argmax(1)) else 0
    correct /= size
    print(f"Test Accuracy: {(100*correct):>0.1f}%")

In [448]:
sequence = OrderedDict([
            ('linear1', nn.Linear(784, 30)),
            ('sigmoid1', nn.Sigmoid()),
            ('linear2', nn.Linear(30, 10)),
            ('sigmoid2', nn.Sigmoid())
])

epochs = 5
learning_rate = 0.3
train_dataloader, test_dataloader = load_data(one_hot_transform = True, batch_size = 10)
model = NeuralNetwork(sequence).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)
loss_fn = nn.MSELoss()


In [449]:
for t in range(epochs):
    print(f"\nEpoch {t+1}")
    print("-------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model)


Epoch 1
-------
Training Accuracy: 86.9%
Test Accuracy: 92.2%

Epoch 2
-------
Training Accuracy: 93.3%
Test Accuracy: 93.6%

Epoch 3
-------
Training Accuracy: 94.3%
Test Accuracy: 94.2%

Epoch 4
-------
Training Accuracy: 94.9%
Test Accuracy: 94.5%

Epoch 5
-------
Training Accuracy: 95.3%
Test Accuracy: 94.7%


<h2>2 - Using cross entrophy loss function</h2>

*   Input layer: 784 nodes
*   Hidden layer: 30 nodes
*   Output layer: 10 nodes
*   Activation between nodes: sigmoid
*   Output activation: sigmoid
*   Loss function: cross-entropy
*   Stochastic gradient descent 10 mini-batches
*   5 epochs, learning rate = 0.3

<strong>Results</strong>

*   Training set: 93.7%
*   Testing set: 93.5%
*   Less acurate than using mean squared error but significant faster to compute


In [450]:
def train(dataloader, model, loss_fn, optimizer):
    correct = 0
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    correct /= size
    print(f"Training Accuracy: {(100*correct):>0.1f}%")

In [451]:
def test(dataloader, model):
    size = len(dataloader.dataset)
    correct = 0
    with torch.no_grad():
        for  batch, (X, y) in enumerate(dataloader):
            X, y = X.to(device), y.to(device)
            pred = model(X)
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    correct /= size
    print(f"Test Accuracy: {(100*correct):>0.1f}%")

In [452]:
sequence = OrderedDict([
            ('linear1', nn.Linear(784, 30)),
            ('sigmoid1', nn.Sigmoid()),
            ('linear2', nn.Linear(30, 10)),
            ('sigmoid2', nn.Sigmoid())
])

epochs = 5
learning_rate = 0.3
train_dataloader, test_dataloader = load_data(one_hot_transform = False, batch_size = 10)
model = NeuralNetwork(sequence).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [453]:
for t in range(epochs):
    print(f"\nEpoch {t+1}")
    print("-------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model)


Epoch 1
-------
Training Accuracy: 83.8%
Test Accuracy: 90.2%

Epoch 2
-------
Training Accuracy: 91.4%
Test Accuracy: 92.2%

Epoch 3
-------
Training Accuracy: 92.5%
Test Accuracy: 92.8%

Epoch 4
-------
Training Accuracy: 93.2%
Test Accuracy: 93.2%

Epoch 5
-------
Training Accuracy: 93.7%
Test Accuracy: 93.5%


<h2>3 - Cross entrophy loss function, ReLU activation</h2>

*   Input layer: 784 nodes
*   Hidden layer: 30 nodes
*   Output layer: 10 nodes
*   Activation between nodes: relu
*   Output activation: relu
*   Loss function: cross-entropy
*   Stochastic gradient descent 10 mini-batches
*   5 epochs, learning rate = 0.3

<strong>Results</strong>

*   Training set: 95.4%
*   Testing set: 94.9%
*   Slightly accurate than the previous sigmoid experiment
*   Results close to the first experiment using mean squared error


In [458]:
sequence = OrderedDict([
            ('linear1', nn.Linear(784, 30)),
            ('relu1', nn.ReLU()),
            ('linear2', nn.Linear(30, 10)),
            ('relu2', nn.ReLU())
])

epochs = 5
learning_rate = 0.3
train_dataloader, test_dataloader = load_data(one_hot_transform = False, batch_size = 10)
model = NeuralNetwork(sequence).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [459]:
for t in range(epochs):
    print(f"\nEpoch {t+1}")
    print("-------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model)


Epoch 1
-------
Training Accuracy: 79.4%
Test Accuracy: 93.6%

Epoch 2
-------
Training Accuracy: 93.6%
Test Accuracy: 93.0%

Epoch 3
-------
Training Accuracy: 94.6%
Test Accuracy: 94.7%

Epoch 4
-------
Training Accuracy: 95.2%
Test Accuracy: 95.3%

Epoch 5
-------
Training Accuracy: 95.4%
Test Accuracy: 94.9%


<h2>4 - With batch size increased from 10 to 64</h2>

*   Input layer: 784 nodes
*   Hidden layer: 30 nodes
*   Output layer: 10 nodes
*   Activation between nodes: relu
*   Output activation: relu
*   Loss function: cross-entropy
*   Stochastic gradient descent 64 mini-batches
*   5 epochs, learning rate = 0.3

<strong>Results</strong>

*   Training set: 96.8%
*   Testing set: 96.0%
*   Both accuracy and performance increased


In [466]:
sequence = OrderedDict([
            ('linear1', nn.Linear(784, 30)),
            ('relu1', nn.ReLU()),
            ('linear2', nn.Linear(30, 10)),
            ('relu2', nn.ReLU())
])

epochs = 5
learning_rate = 0.3
train_dataloader, test_dataloader = load_data(one_hot_transform = False, batch_size = 64)
model = NeuralNetwork(sequence).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [467]:
for t in range(epochs):
    print(f"\nEpoch {t+1}")
    print("-------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model)


Epoch 1
-------
Training Accuracy: 76.3%
Test Accuracy: 84.8%

Epoch 2
-------
Training Accuracy: 93.3%
Test Accuracy: 94.3%

Epoch 3
-------
Training Accuracy: 95.7%
Test Accuracy: 95.1%

Epoch 4
-------
Training Accuracy: 96.4%
Test Accuracy: 95.5%

Epoch 5
-------
Training Accuracy: 96.8%
Test Accuracy: 96.0%


<h2>5 - Add softmax to the output layer</h2>

*   Input layer: 784 nodes
*   Hidden layer: 30 nodes
*   Output layer: 10 nodes
*   Activation between nodes: relu
*   Output activation: softmax
*   Loss function: cross-entropy
*   Stochastic gradient descent 64 mini-batches
*   5 epochs, learning rate = 0.3

<strong>Results</strong>

*   Training set: 85.4%
*   Testing set: 85.1%
*   Accuracy dropped by 10%. Results are unstable (?)


In [478]:
sequence = OrderedDict([
            ('linear1', nn.Linear(784, 30)),
            ('relu1', nn.ReLU()),
            ('linear2', nn.Linear(30, 10)),
            ('softmax', nn.Softmax(dim=1))
])

epochs = 5
learning_rate = 0.3
train_dataloader, test_dataloader = load_data(one_hot_transform = False, batch_size = 64)
model = NeuralNetwork(sequence).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [479]:
for t in range(epochs):
    print(f"\nEpoch {t+1}")
    print("-------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model)


Epoch 1
-------
Training Accuracy: 68.6%
Test Accuracy: 82.0%

Epoch 2
-------
Training Accuracy: 83.1%
Test Accuracy: 83.6%

Epoch 3
-------
Training Accuracy: 84.3%
Test Accuracy: 84.2%

Epoch 4
-------
Training Accuracy: 84.9%
Test Accuracy: 84.8%

Epoch 5
-------
Training Accuracy: 85.4%
Test Accuracy: 85.1%


<h2>6 - Add more hidden layers</h2>

*   Input layer: 784 nodes
*   Hidden layer: 128 nodes
*   Hidden layer 2: 64 nodes
*   Output layer: 10 nodes
*   Activation between nodes: relu
*   Output activation: relu
*   Loss function: cross-entropy
*   Stochastic gradient descent 64 mini-batches
*   5 epochs, learning rate = 0.3

<strong>Results</strong>

*   Training set: 98.7%
*   Testing set: 97.7%
*   Best results


In [485]:
sequence = OrderedDict([
            ('linear1', nn.Linear(784, 128)),
            ('relu1', nn.ReLU()),
            ('linear2', nn.Linear(128, 64)),
            ('relu2', nn.ReLU()),
            ('linear3', nn.Linear(64, 10)),
            ('relu3', nn.ReLU())
])

epochs = 5
learning_rate = 0.3
train_dataloader, test_dataloader = load_data(one_hot_transform = False, batch_size = 64)
model = NeuralNetwork(sequence).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [486]:
for t in range(epochs):
    print(f"\nEpoch {t+1}")
    print("-------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model)


Epoch 1
-------
Training Accuracy: 80.2%
Test Accuracy: 83.8%

Epoch 2
-------
Training Accuracy: 94.4%
Test Accuracy: 95.9%

Epoch 3
-------
Training Accuracy: 97.6%
Test Accuracy: 97.0%

Epoch 4
-------
Training Accuracy: 98.3%
Test Accuracy: 97.5%

Epoch 5
-------
Training Accuracy: 98.7%
Test Accuracy: 97.7%


<h2>7 - Hypertuning / Regularization</h2>

*   Input layer: 784 nodes
*   Hidden layer: 128 nodes
*   Hidden layer 2: 64 nodes
*   Output layer: 10 nodes
*   Activation between nodes: relu
*   Output activation: relu
*   Loss function: cross-entropy
*   Stochastic gradient descent 64 mini-batches
*   5 epochs, learning rate = 0.3
*   Added dropout on the first and second layer with 20% or probability

<strong>Results</strong>

*   Training set: 96.9%
*   Testing set: 96.2%
*   Tried adding momentum and weight decay but had very poor results

In [505]:
sequence = OrderedDict([
            ('linear1', nn.Linear(784, 128)),
            ('dropout1', nn.Dropout(0.2)),
            ('relu1', nn.ReLU()),
            ('linear2', nn.Linear(128, 64)),
            ('dropout2', nn.Dropout(0.2)),            
            ('relu2', nn.ReLU()),
            ('linear3', nn.Linear(64, 10)),
            ('relu3', nn.ReLU())
])

epochs = 5
learning_rate = 0.3
train_dataloader, test_dataloader = load_data(one_hot_transform = False, batch_size = 64)
model = NeuralNetwork(sequence).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate) # weight_decay = 0.1, momentum = 0.5
loss_fn = nn.CrossEntropyLoss()

In [506]:
for t in range(epochs):
    print(f"\nEpoch {t+1}")
    print("-------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model)


Epoch 1
-------
Training Accuracy: 85.8%
Test Accuracy: 92.9%

Epoch 2
-------
Training Accuracy: 94.7%
Test Accuracy: 94.7%

Epoch 3
-------
Training Accuracy: 95.9%
Test Accuracy: 95.4%

Epoch 4
-------
Training Accuracy: 96.4%
Test Accuracy: 95.5%

Epoch 5
-------
Training Accuracy: 96.9%
Test Accuracy: 95.7%
