# 6. Softmax Regression as a Neural Network

In [1]:
import time
import math
import random

import numpy as np
import pandas as pd
from IPython import display
import matplotlib.pyplot as plt

import torch
import torchvision
from torch import nn
from torch.utils import data
from torchvision import transforms

## Helper Functions

In [29]:
class Accumulator:
    '''
    Running sums over a fixed number of quantities.

    The instance holds `n` float counters in `self.data`. Each call to
    `add` contributes one value to every counter, and indexing the
    instance reads a counter back.
    '''

    def __init__(self, n):
        '''Create `n` counters, each starting at 0.0.'''
        self.data = [0.0] * n

    def add(self, *args):
        '''
        Add one value to each counter, position by position.

        Every value is converted with float() before being added.
        Exactly one value per counter is required: zip() would otherwise
        truncate silently, either shrinking `self.data` (too few values)
        or discarding the surplus (too many values).

        Raises ValueError if the number of values differs from the number
        of counters; the counters are left unchanged in that case.
        '''
        if len(args) != len(self.data):
            raise ValueError(
                f'expected {len(self.data)} values, got {len(args)}')
        self.data = [a + float(b) for a, b in zip(self.data, args)]

    def reset(self):
        '''Set every counter back to 0.0, keeping the number of counters.'''
        self.data = [0.0] * len(self.data)

    def __getitem__(self, idx):
        '''Return the counter(s) selected by `idx` (plain list indexing).'''
        return self.data[idx]

## Import Data

In [30]:
def get_fashion_mnist_labels(labels):
    '''
    Translate numeric class indices into their text names.

    Each element of `labels` is converted with int() and used as an index
    into the ten class names; the names are returned as a list in the same
    order as the input.
    '''
    class_names = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat',
                   'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
    names = []
    for label in labels:
        names.append(class_names[int(label)])
    return names

In [31]:
def load_data(batch_size, resize=None, num_workers=4, root='../data'):
    '''
    Function to create the Fashion-MNIST training and test data iterators.

    batch_size:  number of samples per mini-batch, used for both iterators.
    resize:      optional target size; when truthy, a transforms.Resize(resize)
                 step is placed ahead of the tensor conversion.
    num_workers: worker count handed to both DataLoaders (default 4, the value
                 that used to be hard-coded); pass 0 to load in the main process.
    root:        directory the dataset is read from / downloaded to
                 (default '../data', the location that used to be hard-coded).

    Returns (train_iter, test_iter); only the training iterator shuffles.
    '''
    steps = [transforms.ToTensor()]
    if resize:
        # Resize is placed first so it runs before the tensor conversion.
        steps.insert(0, transforms.Resize(resize))
    pipeline = transforms.Compose(steps)
    mnist_train = torchvision.datasets.FashionMNIST(root=root, train=True, transform=pipeline, download=True)
    mnist_test = torchvision.datasets.FashionMNIST(root=root, train=False, transform=pipeline, download=True)
    return (data.DataLoader(mnist_train, batch_size, shuffle=True, num_workers=num_workers),
            data.DataLoader(mnist_test, batch_size, shuffle=False, num_workers=num_workers))

In [32]:
# Number of samples per mini-batch; load_data uses it for both the training
# and the test iterator.
batch_size = 256
train_iter, test_iter = load_data(batch_size)

## Model

In [33]:
# Linear classifier: flatten each image, then map its 28*28 values to 10 outputs
# (one per class name). No softmax layer is added, so the model emits the raw
# linear outputs.
net = nn.Sequential(nn.Flatten(),             # flatten the image matrices
                    nn.Linear(28*28, 10)      # fully connected layer: 784 inputs -> 10 outputs
                   )

## Initialize Parameters

In [34]:
def init_weights(net):
    '''
    Initialise the weight matrix of a fully connected layer.

    When `net` is exactly an nn.Linear, its weights are redrawn in place from
    a normal distribution with mean 0 and standard deviation 0.01. Any other
    module is left untouched, and the bias is never modified.
    '''
    is_linear_layer = type(net) == nn.Linear  # exact type match; subclasses are skipped
    if not is_linear_layer:
        return
    nn.init.normal_(net.weight, mean=0.0, std=0.01)

In [35]:
# Module.apply calls init_weights on net and on each of its submodules, so the
# nn.Linear layer gets its weights initialised.
net.apply(init_weights)

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=10, bias=True)
)

## Loss Function

The **softmax function** is given as:

$$\hat y_j = \frac{\exp(o_j)}{\sum_k \exp(o_k)}$$

To prevent potential **overflow** caused by the exponential term, we can **subtract $\max(o_k)$ from $o_k$** before computing softmax:

$$
\begin{aligned}
\hat y_j & =  \frac{\exp(o_j - \max(o_k))\exp(\max(o_k))}{\sum_k \exp(o_k - \max(o_k))\exp(\max(o_k))} \\
& = \frac{\exp(o_j - \max(o_k))}{\sum_k \exp(o_k - \max(o_k))}.
\end{aligned}
$$

which is equivalent to the original softmax function.

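Now, to prevent potential **underflow**: when $o_j - \max(o_k)$ is very negative, $\exp(o_j - \max(o_k))$ rounds to zero, so $\hat y_j = 0$ and $\log(\hat y_j)$ evaluates to $-\infty$. We can **avoid computing the exponential and the logarithm separately** by combining them: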

$$
\begin{aligned}
\log{(\hat y_j)} & = \log\left( \frac{\exp(o_j - \max(o_k))}{\sum_k \exp(o_k - \max(o_k))}\right) \\
& = \log{(\exp(o_j - \max(o_k)))}-\log{\left( \sum_k \exp(o_k - \max(o_k)) \right)} \\
& = o_j - \max(o_k) -\log{\left( \sum_k \exp(o_k - \max(o_k)) \right)}.
\end{aligned}
$$

By doing this, we can keep the **softmax function** for outputting probabilities while using the **original linear outputs** directly when calculating the loss function.

In [36]:
# reduction='none' leaves the loss unreduced; train_epoch applies .mean() for
# the optimizer step and .sum() for the running total itself.
loss = nn.CrossEntropyLoss(reduction='none')

## Optimizer

In [37]:
# SGD over the parameters of net with a learning rate of 0.1.
trainer = torch.optim.SGD(net.parameters(), lr=0.1)

## Evaluation

In [41]:
def accuracy(y_hat, y):
    '''
    Count how many predictions in `y_hat` match the labels in `y`.

    If `y_hat` has more than one dimension and more than one column, each row
    is reduced to the index of its largest entry first; otherwise `y_hat` is
    taken as the predictions directly. The count is returned as a float.
    '''
    predictions = y_hat
    if y_hat.dim() > 1 and y_hat.size(1) > 1:
        predictions = y_hat.argmax(dim=1)
    # Cast to the label dtype so both sides of the comparison share a type.
    matches = predictions.to(y.dtype) == y
    return float(matches.to(y.dtype).sum())

In [42]:
def evaluate_accuracy(net, data_iter):
    '''
    Compute the accuracy of `net` accumulated over every mini-batch of `data_iter`.

    A torch.nn.Module is switched to evaluation mode first (and left in that
    mode), and all forward passes run without gradient tracking. Returns the
    number of correct predictions divided by the number of labels seen.
    '''
    if isinstance(net, torch.nn.Module):
        net.eval()  # set the model to evaluation mode
    tally = Accumulator(2)  # correct predictions, total predictions
    with torch.no_grad():
        for features, labels in data_iter:
            n_correct = accuracy(net(features), labels)
            tally.add(n_correct, labels.numel())
    correct_total, seen_total = tally[0], tally[1]
    return correct_total / seen_total

## Training

In [38]:
def train_epoch(net, train_iter, loss, updater):
    '''
    Run one pass over `train_iter`, updating `net` after every mini-batch.

    `updater` is either a torch.optim.Optimizer, which is stepped on the mean
    of the batch loss, or a callable that is invoked with the batch size after
    the summed batch loss has been back-propagated.

    Returns the tuple (summed loss / samples seen, correct predictions / samples seen).
    NOTE(review): the loss average presumes `loss` returns one value per sample
    (as with reduction='none') - confirm for other loss objects.
    '''
    if isinstance(net, torch.nn.Module):
        net.train()  # switch the model to training mode
    uses_torch_optimizer = isinstance(updater, torch.optim.Optimizer)
    totals = Accumulator(3)  # summed loss, correct predictions, samples seen
    for features, labels in train_iter:
        outputs = net(features)
        batch_loss = loss(outputs, labels)
        if uses_torch_optimizer:
            updater.zero_grad()
            batch_loss.mean().backward()
            updater.step()
        else:
            batch_loss.sum().backward()
            updater(features.shape[0])
        totals.add(float(batch_loss.sum()), accuracy(outputs, labels), labels.numel())
    loss_sum, n_correct, n_samples = totals[0], totals[1], totals[2]
    return loss_sum / n_samples, n_correct / n_samples

In [39]:
def train(net, train_iter, test_iter, loss, num_epochs, updater):
    '''
    Train `net` for `num_epochs` epochs, reporting metrics after each one.

    Every epoch runs train_epoch on `train_iter`, evaluates the accuracy on
    `test_iter`, and prints the training loss, training accuracy and test
    accuracy. After the last epoch the final metrics are sanity-checked with
    assertions (training loss below 0.5, both accuracies in (0.7, 1]).

    Raises ValueError if `num_epochs` is smaller than 1: with no epoch run
    there are no metrics to check, and the code used to fail with an
    UnboundLocalError on `train_metrics` instead.
    '''
    if num_epochs < 1:
        raise ValueError(f'num_epochs must be at least 1, got {num_epochs}')
    for epoch in range(num_epochs):
        train_metrics = train_epoch(net, train_iter, loss, updater)
        test_acc = evaluate_accuracy(net, test_iter)
        print(f'Epoch {epoch+1}')
        print(f'Training loss: {train_metrics[0]}')
        print(f'Training accuracy: {train_metrics[1]}')
        print(f'Test accuracy: {test_acc}')
    # Sanity checks on the metrics of the final epoch; the offending value is
    # attached as the assertion message.
    train_loss, train_acc = train_metrics
    assert train_loss < 0.5, train_loss
    assert train_acc <= 1 and train_acc > 0.7, train_acc
    assert test_acc <= 1 and test_acc > 0.7, test_acc

In [43]:
# Train for 10 epochs; train() prints the metrics of every epoch and asserts
# on the metrics of the last one.
num_epochs = 10
train(net, train_iter, test_iter, loss, num_epochs, trainer)

Epoch 1
Training loss: 0.7777179353713989
Training accuracy: 0.7530833333333333
Test accuracy: 0.7947
Epoch 2
Training loss: 0.5702055841445923
Training accuracy: 0.813
Test accuracy: 0.8085
Epoch 3
Training loss: 0.5256161816279094
Training accuracy: 0.8250333333333333
Test accuracy: 0.8145
Epoch 4
Training loss: 0.5008067871729532
Training accuracy: 0.83285
Test accuracy: 0.8271
Epoch 5
Training loss: 0.48495729923248293
Training accuracy: 0.8365
Test accuracy: 0.8253
Epoch 6
Training loss: 0.4740870672225952
Training accuracy: 0.8408166666666667
Test accuracy: 0.8221
Epoch 7
Training loss: 0.46608039538065593
Training accuracy: 0.8425666666666667
Test accuracy: 0.8224
Epoch 8
Training loss: 0.45856751906077065
Training accuracy: 0.8443333333333334
Test accuracy: 0.8303
Epoch 9
Training loss: 0.45250986830393475
Training accuracy: 0.8466833333333333
Test accuracy: 0.8226
Epoch 10
Training loss: 0.447976597849528
Training accuracy: 0.8468833333333333
Test accuracy: 0.8303
