In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
import torchvision
from collections import OrderedDict
from torchvision.transforms import ToTensor, Lambda, Compose
import matplotlib.pyplot as plt
import numpy as np

class NeuralNetwork(nn.Module):
    """Feed-forward network assembled from a caller-supplied layer mapping.

    Each input batch is flattened with ``nn.Flatten`` (default arguments) and
    then pushed through the supplied layers in order.
    """

    def __init__(self, sequence):
        """Create the network.

        Args:
            sequence: Ordered mapping of layer name -> module; it is handed to
                ``nn.Sequential`` unchanged.
        """
        super().__init__()
        self.flatten = nn.Flatten()
        self.nn_stack = nn.Sequential(sequence)

    def forward(self, x):
        """Return the layer stack's output for the flattened batch ``x``."""
        flattened = self.flatten(x)
        output = self.nn_stack(flattened)
        return output

def one_hot(y, num_classes=10):
    """Return a one-hot row vector for class index ``y``.

    Args:
        y: Integer class index (anything usable as a tensor index).
        num_classes: Length of the encoding. Defaults to 10, the value that
            was previously hard-coded.

    Returns:
        A float tensor of shape ``(1, num_classes)`` that is zero everywhere
        except for a 1 in column ``y``.
    """
    # Named `encoded` so the local no longer shadows the function itself.
    encoded = torch.zeros([1, num_classes])
    encoded[0][y] = 1
    return encoded

def load_data(one_hot_transform = False, batch_size = 10):
    """Build the MNIST training and test data loaders.

    The dataset is stored under ``data/`` and downloaded if it is missing.
    Images are converted with ``ToTensor()``.

    Args:
        one_hot_transform: When true, each label is passed through
            ``one_hot`` before being returned; otherwise labels are left as
            the dataset provides them.
        batch_size: Number of samples per batch for both loaders.

    Returns:
        ``(train_dataloader, test_dataloader)``.
    """
    # Pass the named function directly: it is equivalent to wrapping it in a
    # lambda, and unlike a lambda it can be pickled.
    target_transform = Lambda(one_hot) if one_hot_transform else None

    def mnist_split(train):
        # Both splits share every setting except the `train` flag.
        return datasets.MNIST(
            root="data",
            train=train,
            download=True,
            transform=ToTensor(),
            target_transform=target_transform
        )

    # Reshuffle the training samples every epoch so SGD does not see the
    # batches in the same fixed order each time. The test set order is
    # irrelevant for accuracy, so it is left sequential.
    train_dataloader = DataLoader(mnist_split(train=True), batch_size = batch_size, shuffle = True)
    test_dataloader = DataLoader(mnist_split(train=False), batch_size = batch_size)

    return train_dataloader, test_dataloader

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device}")

Using cpu


In [None]:
def train(dataloader, model, loss_fn, optimizer, show_info = True):
    """Run one training pass over ``dataloader`` and return the accuracy.

    Args:
        dataloader: Iterable of ``(inputs, labels)`` batches that exposes
            ``.dataset`` (its length is used as the sample count).
        model: Module to optimise; it is switched to training mode.
        loss_fn: Callable ``loss_fn(predictions, labels)`` returning a tensor
            that supports ``.backward()``.
        optimizer: Optimizer whose ``zero_grad()``/``step()`` are called once
            per batch.
        show_info: When true, print the accuracy as a percentage.

    Returns:
        Fraction (0.0-1.0) of samples whose arg-max prediction matched the
        label. Predictions are the ones made during the pass, i.e. before
        each batch's weight update. Returns 0.0 for an empty dataset.
    """
    # An evaluation pass may have left the model in eval mode; layers such as
    # Dropout must be active while training.
    model.train()
    # Move batches to wherever the model's weights live instead of relying on
    # a module-level `device` variable. A parameter-less model leaves the
    # batches where they are.
    first_param = next(model.parameters(), None)
    size = len(dataloader.dataset)
    correct = 0
    for X, y in dataloader:
        if first_param is not None:
            X, y = X.to(first_param.device), y.to(first_param.device)
        pred = model(X)
        loss = loss_fn(pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        correct += (pred.argmax(1) == y).sum().item()
    # Guard against ZeroDivisionError when the dataset is empty.
    accuracy = correct / size if size else 0.0
    if show_info:
        print(f"Training Accuracy: {(100*accuracy):>0.1f}%")

    return accuracy

In [None]:
def test(dataloader, model, show_info = True):
    size = len(dataloader.dataset)
    correct = 0
    with torch.no_grad():
        for  batch, (X, y) in enumerate(dataloader):
            X, y = X.to(device), y.to(device)
            pred = model(X)
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    correct /= size
    if show_info:
      print(f"Test Accuracy: {(100*correct):>0.1f}%")

    return correct

<h2>Without hyperparameter tuning</h2>

In [None]:
# Untuned reference model: 784 -> 30 -> 30 -> 10 with ReLU activations.
# The last layer emits raw logits (no Softmax layer) because
# nn.CrossEntropyLoss applies log-softmax itself.
sequence = OrderedDict()
sequence['linear1'] = nn.Linear(784, 30)
sequence['relu1'] = nn.ReLU()
sequence['linear2'] = nn.Linear(30, 30)
sequence['relu2'] = nn.ReLU()
sequence['linear3'] = nn.Linear(30, 10)

# Run settings.
epochs, learning_rate = 10, 0.3

# Data, model, loss and plain SGD (no momentum, no weight decay).
train_dataloader, test_dataloader = load_data(batch_size = 64, one_hot_transform = False)
model = NeuralNetwork(sequence).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

In [None]:
# Each epoch: one full training pass, then one evaluation on the test set.
for epoch in range(1, epochs + 1):
    print(f"\nEpoch {epoch}")
    print("-------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model)


Epoch 1
-------
Training Accuracy: 87.9%
Test Accuracy: 93.0%

Epoch 2
-------
Training Accuracy: 94.8%
Test Accuracy: 94.6%

Epoch 3
-------


KeyboardInterrupt: ignored

<h3>Conclusions</h3>

It seems the model starts overfitting after epoch 27: the training accuracy keeps increasing while the test accuracy starts to decrease.

<h2>L2 Regularization</h2>

In [None]:
# Wider model for the L2 experiment: 784 -> 128 -> 64 -> 10, ReLU in between.
sequence = OrderedDict()
sequence['linear1'] = nn.Linear(784, 128)
sequence['relu1'] = nn.ReLU()
sequence['linear2'] = nn.Linear(128, 64)
sequence['relu2'] = nn.ReLU()
sequence['linear3'] = nn.Linear(64, 10)

# Run settings.
epochs, learning_rate = 50, 0.3

train_dataloader, test_dataloader = load_data(batch_size = 64, one_hot_transform = False)
model = NeuralNetwork(sequence).to(device)
loss_fn = nn.CrossEntropyLoss()
# L2 regularisation is applied through SGD's weight_decay term.
# Previously tried: weight_decay = 0.1, momentum = 0.5.
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate, weight_decay = 0.01)

In [None]:
# Alternate a training pass and a test-set evaluation for every epoch.
for epoch in range(1, epochs + 1):
    print(f"\nEpoch {epoch}")
    print("-------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model)


Epoch 1
-------
Training Accuracy: 86.4%
Test Accuracy: 90.7%

Epoch 2
-------
Training Accuracy: 91.7%
Test Accuracy: 91.3%

Epoch 3
-------
Training Accuracy: 92.1%
Test Accuracy: 91.2%

Epoch 4
-------
Training Accuracy: 92.4%
Test Accuracy: 90.8%

Epoch 5
-------
Training Accuracy: 92.5%
Test Accuracy: 91.2%

Epoch 6
-------
Training Accuracy: 92.6%
Test Accuracy: 91.3%

Epoch 7
-------
Training Accuracy: 92.6%
Test Accuracy: 91.5%

Epoch 8
-------
Training Accuracy: 92.7%
Test Accuracy: 91.6%

Epoch 9
-------
Training Accuracy: 92.7%
Test Accuracy: 91.8%

Epoch 10
-------
Training Accuracy: 92.7%
Test Accuracy: 91.9%

Epoch 11
-------
Training Accuracy: 92.7%
Test Accuracy: 91.4%

Epoch 12
-------
Training Accuracy: 92.7%
Test Accuracy: 91.8%

Epoch 13
-------
Training Accuracy: 92.7%
Test Accuracy: 91.8%

Epoch 14
-------
Training Accuracy: 92.7%
Test Accuracy: 91.8%

Epoch 15
-------
Training Accuracy: 92.7%
Test Accuracy: 91.9%

Epoch 16
-------
Training Accuracy: 92.7%
Test A

<h2>Dropout in a deep neural network</h2>

In [None]:
# 784 -> 128 -> 128 -> 10 with a single Dropout layer (p = 0.2) applied to the
# second linear layer's output, before its ReLU.
sequence = OrderedDict()
sequence['linear1'] = nn.Linear(784, 128)
sequence['relu1'] = nn.ReLU()
sequence['linear2'] = nn.Linear(128, 128)
sequence['dropout'] = nn.Dropout(0.2)
sequence['relu2'] = nn.ReLU()
sequence['linear3'] = nn.Linear(128, 10)

# Run settings.
epochs, learning_rate = 50, 0.3

train_dataloader, test_dataloader = load_data(batch_size = 64, one_hot_transform = False)
model = NeuralNetwork(sequence).to(device)
loss_fn = nn.CrossEntropyLoss()
# SGD with momentum this time; no weight decay.
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate, momentum = 0.6)

In [None]:
# Train for one pass, then report test accuracy, once per epoch.
for epoch in range(1, epochs + 1):
    print(f"\nEpoch {epoch}")
    print("-------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model)


Epoch 1
-------
Training Accuracy: 90.9%
Test Accuracy: 94.9%

Epoch 2
-------
Training Accuracy: 96.1%
Test Accuracy: 95.3%

Epoch 3
-------
Training Accuracy: 97.0%
Test Accuracy: 96.0%

Epoch 4
-------
Training Accuracy: 97.6%
Test Accuracy: 96.6%

Epoch 5
-------
Training Accuracy: 97.8%
Test Accuracy: 96.1%

Epoch 6
-------
Training Accuracy: 98.1%
Test Accuracy: 96.5%

Epoch 7
-------
Training Accuracy: 98.3%
Test Accuracy: 96.1%

Epoch 8
-------
Training Accuracy: 98.5%
Test Accuracy: 97.0%

Epoch 9
-------
Training Accuracy: 98.4%
Test Accuracy: 97.2%

Epoch 10
-------
Training Accuracy: 98.7%
Test Accuracy: 97.3%

Epoch 11
-------
Training Accuracy: 98.7%
Test Accuracy: 97.0%

Epoch 12
-------
Training Accuracy: 98.8%
Test Accuracy: 96.5%

Epoch 13
-------
Training Accuracy: 98.8%
Test Accuracy: 97.5%

Epoch 14
-------
Training Accuracy: 99.0%
Test Accuracy: 96.8%

Epoch 15
-------
Training Accuracy: 98.9%
Test Accuracy: 97.0%

Epoch 16
-------
Training Accuracy: 98.9%
Test A

<h1>Deeper Neural Network with Dropout</h1>

In [None]:
# Nine linear layers (784 -> 256 -> 256 -> 128 -> 128 -> 64 -> 64 -> 32 -> 32 -> 10)
# with Dropout on every hidden layer from the second one onwards.
#
# NOTE(review): nn.Dropout's argument is the probability of *zeroing* an
# activation, so 0.8 drops 80% of the units at each of seven layers. If a
# keep-probability of 0.8 was intended, this should be 0.2 - confirm.
sequence = OrderedDict([
            ('linear1', nn.Linear(784, 256)),
            ('relu1', nn.ReLU()),
            ('linear2', nn.Linear(256, 256)),
            ('dropout2', nn.Dropout(0.8)),
            ('relu2', nn.ReLU()),
            ('linear3', nn.Linear(256, 128)),
            ('dropout3', nn.Dropout(0.8)),
            ('relu3', nn.ReLU()),
            ('linear4', nn.Linear(128, 128)),
            ('dropout4', nn.Dropout(0.8)),
            ('relu4', nn.ReLU()),
            ('linear5', nn.Linear(128, 64)),
            ('dropout5', nn.Dropout(0.8)),
            ('relu5', nn.ReLU()),
            ('linear6', nn.Linear(64, 64)),
            ('dropout6', nn.Dropout(0.8)),
            ('relu6', nn.ReLU()),
            ('linear7', nn.Linear(64, 32)),
            ('dropout7', nn.Dropout(0.8)),
            ('relu7', nn.ReLU()),
            ('linear8', nn.Linear(32, 32)),
            ('dropout8', nn.Dropout(0.8)),
            ('relu8', nn.ReLU()),
            # Output layer: raw logits go straight into nn.CrossEntropyLoss.
            # The ReLU that used to follow this layer clamped every negative
            # logit to zero, so it has been removed.
            ('linear9', nn.Linear(32, 10)),
])

epochs = 10
learning_rate = 0.3
train_dataloader, test_dataloader = load_data(one_hot_transform = False, batch_size = 64)
model = NeuralNetwork(sequence).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate, momentum = 0.6)
loss_fn = nn.CrossEntropyLoss()

In [None]:
# Per epoch: a training pass over the training set, then a test-set evaluation.
for epoch in range(1, epochs + 1):
    print(f"\nEpoch {epoch}")
    print("-------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model)


Epoch 1
-------
Training Accuracy: 11.0%
Test Accuracy: 9.8%

Epoch 2
-------
Training Accuracy: 9.9%
Test Accuracy: 9.8%

Epoch 3
-------
Training Accuracy: 9.9%
Test Accuracy: 9.8%

Epoch 4
-------
Training Accuracy: 9.9%
Test Accuracy: 9.8%

Epoch 5
-------
Training Accuracy: 9.9%
Test Accuracy: 9.8%

Epoch 6
-------
Training Accuracy: 9.9%
Test Accuracy: 9.8%

Epoch 7
-------
Training Accuracy: 9.9%
Test Accuracy: 9.8%

Epoch 8
-------
Training Accuracy: 9.9%
Test Accuracy: 9.8%

Epoch 9
-------
Training Accuracy: 9.9%
Test Accuracy: 9.8%

Epoch 10
-------
Training Accuracy: 9.9%
Test Accuracy: 9.8%


<h1>Deeper Neural Network Experiments</h1>
This experiment increases the number of layers, using a different number of nodes per layer each time. The code tries the following combinations:


*   From 1 to 10 hidden layers
*   Using 10, 30, 60, 100 and 200 nodes



In [None]:
def build_sequence(hidden_layers, nodes, in_features=784, out_features=10):
    """Build the ordered layer mapping for a fully connected ReLU network.

    The mapping always starts with an ``input`` linear layer and its
    ``input_activation`` ReLU, followed by ``hidden_layers`` pairs of
    ``linear<i>``/``relu<i>`` (each ``nodes`` -> ``nodes``), and ends with
    an ``output`` linear layer that has no activation.

    Args:
        hidden_layers: Number of ``nodes`` -> ``nodes`` linear/ReLU pairs to
            insert between the input and output layers.
        nodes: Width of the input layer's output and of every hidden layer.
        in_features: Size of each flattened input sample. Defaults to 784,
            the value that was previously hard-coded.
        out_features: Number of outputs. Defaults to 10, the value that was
            previously hard-coded.

    Returns:
        An ``OrderedDict`` of layer name -> module, in forward order.
    """
    sequence = OrderedDict()

    # input
    sequence['input'] = nn.Linear(in_features, nodes)
    sequence['input_activation'] = nn.ReLU()

    # hidden
    for i in range(hidden_layers):
        sequence[f'linear{i}'] = nn.Linear(nodes, nodes)
        sequence[f'relu{i}'] = nn.ReLU()

    # output
    sequence['output'] = nn.Linear(nodes, out_features)

    return sequence

OrderedDict([('input', Linear(in_features=784, out_features=10, bias=True)),
             ('input_activation', ReLU()),
             ('linear0', Linear(in_features=10, out_features=10, bias=True)),
             ('relu0', ReLU()),
             ('linear1', Linear(in_features=10, out_features=10, bias=True)),
             ('relu1', ReLU()),
             ('output', Linear(in_features=10, out_features=10, bias=True))])

In [None]:
# Settings and data loaders for the depth/width sweep.
epochs, learning_rate = 10, 0.3
train_dataloader, test_dataloader = load_data(batch_size = 64, one_hot_transform = False)

In [None]:
# Sweep over layer width and depth. For every combination a fresh model is
# trained for `epochs` epochs, and the accuracies from the final epoch are
# recorded in `results`.
results = []
result = {}
train_accuracy = test_accuracy = 0
loss_fn = nn.CrossEntropyLoss()
for width in [10, 30, 60, 100, 200]:
    for depth in range(1, 11):
        sequence = build_sequence(hidden_layers = depth, nodes = width)
        print(sequence)
        model = NeuralNetwork(sequence).to(device)
        optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)
        for _ in range(epochs):
            # Silent passes: only the per-configuration summary is printed.
            train_accuracy = train(train_dataloader, model, loss_fn, optimizer, False)
            test_accuracy = test(test_dataloader, model, False)
        result = {
            'hidden_layers': depth,
            'nodes': width,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy,
        }
        print(result)
        results.append(result)

OrderedDict([('input', Linear(in_features=784, out_features=10, bias=True)), ('input_activation', ReLU()), ('linear0', Linear(in_features=10, out_features=10, bias=True)), ('relu0', ReLU()), ('output', Linear(in_features=10, out_features=10, bias=True))])
{'hidden_layers': 1, 'nodes': 10, 'train_accuracy': 0.9396833333333333, 'test_accuracy': 0.9281}
OrderedDict([('input', Linear(in_features=784, out_features=10, bias=True)), ('input_activation', ReLU()), ('linear0', Linear(in_features=10, out_features=10, bias=True)), ('relu0', ReLU()), ('linear1', Linear(in_features=10, out_features=10, bias=True)), ('relu1', ReLU()), ('output', Linear(in_features=10, out_features=10, bias=True))])
{'hidden_layers': 2, 'nodes': 10, 'train_accuracy': 0.9383, 'test_accuracy': 0.9277}
OrderedDict([('input', Linear(in_features=784, out_features=10, bias=True)), ('input_activation', ReLU()), ('linear0', Linear(in_features=10, out_features=10, bias=True)), ('relu0', ReLU()), ('linear1', Linear(in_features=