<a href="https://colab.research.google.com/github/scaomath/wustl-math450/blob/main/Lectures/Math_450_Notebook_6_(GD).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Workaround kept from March 2021: dataset loading malfunctioned after an
# update, and although a hotfix had been issued upstream it was apparently
# not yet part of the Colab docker image as of Mar 5, 2021.
# Install a process-wide urllib opener that sends a browser-like User-agent.
# The standard-library urllib is used directly; the `six` compatibility shim
# is not needed on Python 3.
import urllib.request

opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)

# Coding lecture 6 of Math 450

## Last two weeks

- Explore MNIST dataset.
- Generator, iterator, `iter()`, `next()`, `enumerate()`, `try: except:` flow control.
- Matrix-vector multiplications and "broadcastability".
- `loss.backward()` vs hand computation.

## Today
- Why `with torch.no_grad():` is necessary.
- Build simple neural network using `torch.nn.Sequential()`
- Gradient descent for a binary classification problem.
- (if time allows) Class and object-oriented programming primer. `constructor`, inheritance, `super`.

In [None]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("dark")

import torch
import numpy as np

import warnings
warnings.filterwarnings("ignore")

# MNIST
- Load the data. Both train and validation (test) data.
- Extract only 0, 1 labeled data.
- Write a loss function to compute the cross entropy.
- Apply the gradient descent.

In [None]:
# Load MNIST from ./data, downloading it first when it is not there yet.
# train=True selects the training split, train=False the validation (test) split;
# the same ToTensor transform is attached to both.
train = datasets.MNIST('./data', train=True, download=True,
                       transform=transforms.ToTensor())
valid = datasets.MNIST('./data', train=False, download=True,
                       transform=transforms.ToTensor())

In [None]:
# Boolean mask selecting the training samples whose label is 0 or 1.
idx_tr = (train.targets == 0) | (train.targets == 1)
target_tr = train.targets[idx_tr]
# `.data` is the current name of the image tensor (`train_data` is its
# deprecated alias) and matches the `.targets` attribute used above.
# Dividing by 255.0 already yields a new float tensor, so no clone is needed.
train_new = train.data[idx_tr] / 255.0

In [None]:
# Boolean mask selecting the validation samples whose label is 0 or 1.
idx_valid = (valid.targets == 0) | (valid.targets == 1)
target_val = valid.targets[idx_valid]
# `.data` is the current name of the image tensor (`test_data` is its
# deprecated alias) and matches the `.targets` attribute used above.
# Dividing by 255.0 already yields a new float tensor, so no clone is needed.
valid_new = valid.data[idx_valid] / 255.0

In [None]:
# Sanity check: how many 0/1 samples were kept for training and for validation.
print(train_new.shape[0], valid_new.shape[0])

In [None]:
# Show 20 randomly drawn training images in a 4x5 grid, each titled with its label.
fig, axes = plt.subplots(4, 5, figsize=(12, 10))
axes = axes.flatten()
indices = np.random.randint(0, len(train_new), size=20)
for i, idx in enumerate(indices):
    # train_new was already divided by 255.0 when it was built,
    # so the image is used as is instead of being rescaled a second time.
    X = train_new[idx]
    y = target_tr[idx]
    axes[i].imshow(X, cmap='gray')
    axes[i].axis('off')  # hide the axes ticks
    axes[i].set_title(str(int(y)), color='black', fontsize=25)
plt.show()

In [None]:
# cross-entropy loss

def cross_entropy_loss(yhat, y, eps=1e-7):
    '''
    Compute the mean binary cross entropy of yhat against y
      - yhat: the sigmoid of the output of an NN, values in [0, 1]
      - y: 0 or 1, true target
      - eps: yhat is clamped to [eps, 1 - eps] before the logs are taken

    yhat and y should have the same shape; otherwise the element-wise
    products below broadcast instead of pairing sample i with target i.
    '''
    # log(0) = -inf and 0 * (-inf) = nan, so a single saturated prediction
    # would turn the whole mean into nan; clamping keeps both logs
    # (and their gradients) finite.
    yhat = torch.clamp(yhat, eps, 1 - eps)
    loss = - y * torch.log(yhat) - (1-y) * torch.log(1-yhat)
    return loss.mean()

def sigmoid(yhat):
    '''
    Element-wise logistic function 1/(1 + exp(-yhat))
      - yhat: tensor of raw NN outputs

    torch.sigmoid is used instead of the literal formula: there
    torch.exp(-yhat) overflows to inf for very negative inputs, and
    autograd then returns nan gradients for those entries.
    '''
    return torch.sigmoid(yhat)

In [None]:
# Why the loss above is naturally vectorized: it is written with element-wise
# tensor operations only, so a whole batch goes through it at once.
# Random scores passed through the sigmoid stand in for yhat here.
# The dimensions of yhat and y have to be consistent: 10 entries each.
yhat = sigmoid(torch.randn(10))
y = target_tr[:10]

In [None]:
# No .mean() here: 10 samples in, one cross-entropy value per sample out.
-(y * torch.log(yhat)) - ((1 - y) * torch.log(1 - yhat))

In [None]:
## this would yield bad results:
## yhat is a (5, 1) column while y is 1-D with 5 entries, so element-wise
## operations broadcast them instead of pairing sample i with target i.
yhat = sigmoid(torch.randn(5, 1))
y = target_tr[:5]

# `nn.Sequential()`

- An NN container for LEGO'ing layers.
- Good for beginners like us.

In [None]:
import torch.nn as nn
from torchsummary import summary

In [None]:
# Fully connected network 784 -> 128 -> 32 -> 1, with a ReLU after each of
# the first two linear layers; nn.Sequential applies the layers in order.
model = nn.Sequential(*[
    nn.Linear(784, 128), nn.ReLU(),
    nn.Linear(128, 32), nn.ReLU(),
    nn.Linear(32, 1),
])

In [None]:
# demonstrate summary

In [None]:
# class implementation which we will cover in next class
class MLP(nn.Module):
    '''
    Fully connected network 784 -> 128 -> 32 -> 1 written as a class,
    with a ReLU after each of the first two linear layers.
    '''

    def __init__(self):
        # run the constructor of the parent class nn.Module first,
        # then attach the layers to this object
        super().__init__()
        stack = [
            nn.Linear(784, 128),
            nn.ReLU(),
            nn.Linear(128, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        '''
        Flatten every sample of the batch x, then apply the layers
          - x: batch of samples, e.g. train data of shape (-1, 28, 28)
        '''
        n_samples = x.size(0)
        flat = x.view(n_samples, -1)  # (-1, 28, 28) --> (-1, 28*28)
        return self.layers(flat)

# Gradient descent
For $k=0,1,\dots$, update $W_{k+1} = W_k - \alpha \nabla_W L(W_k)$, where $W$ stands for the parameters of the NN, $\alpha > 0$ is the learning rate, and $L$ is the loss function.

In [None]:
# Hyper-parameters, model initialization and data preparation.

numEpochs = 20        # 1 epoch means the model sweeps the train data set once
learning_rate = 1e-2  # step size used in the gradient descent update

# Network with one hidden layer: 784 -> 128 (ReLU) -> 1.
model = nn.Sequential(*[
    nn.Linear(784, 128),
    nn.ReLU(),
    nn.Linear(128, 1),
])

# Flatten every training image into one row; the labels are used unchanged.
X = train_new.view(len(train_new), -1)
y = target_tr
print(X.size(), y.size())

In [None]:
### cpu code

for epoch in range(numEpochs):

    # Forward pass. The last Linear layer returns shape (N, 1) while y has
    # shape (N,); drop the trailing axis so that the loss and the accuracy
    # pair sample i with target i instead of broadcasting to an (N, N) table.
    y_hat = sigmoid(model(X)).squeeze(-1)

    loss = cross_entropy_loss(y_hat, y)

    print(f"cross entropy loss after {epoch}",
          f"iterations is {loss.item()}",)

    # accuracy: predict label 1 when the output probability exceeds 0.5
    preds = (y_hat > 0.5).detach()
    acc = (preds == y).float().mean()
    print(f"accuracy: {100*acc:.2f} \n")

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # autograd to do backprop
    loss.backward()

    # GD: in-place parameter update; no_grad keeps the update itself
    # from being recorded by autograd
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad


In [None]:
# preparation for GPU code

In [None]:
# GPU training