# 9 - Deep Learning

We will implement a (shallow) neural network, i.e., logistic regression, in PyTorch.

Tutorial [optional]: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html

In [None]:
#@title Run this cell to download preprocessed data (features + labels). { display-mode: "form" }
# Install (or upgrade) the `wget` Python package that is imported below.
!pip install -U wget
# Start from an empty `preprocessed/` directory so an earlier download is never reused.
!rm -rf preprocessed
!mkdir preprocessed

import wget
# Fetch the preprocessed archive and store it as preprocessed/data.npz.
wget.download('https://github.com/shengpu1126/BDSI2019-ML/raw/master/preprocessed/data.npz', 'preprocessed/data.npz')

In [None]:
# Create the directory the training loop saves checkpoints into (-p: no error if it already exists).
!mkdir -p checkpoint

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn import metrics, exceptions
import os, random

In [None]:
#@title Run this cell to run preprocessing. { display-mode: "form" }
#@markdown - `impute_missing_values(X)`
#@markdown - `normalize_feature_matrix(X)`

# Pull the feature matrix, labels, and feature names out of the downloaded archive.
# The context manager closes the .npz file as soon as the arrays are extracted.
with np.load('preprocessed/data.npz') as archive:
    X, y, feature_names = (archive[key] for key in ('X', 'y', 'feature_names'))

def impute_missing_values(X):
    """
    Replace every missing entry (np.nan) with the mean of its feature column.

    The column means are computed from the values present in X itself.

    Args:
        X: np.array, shape (N, d). May contain missing values (np.nan).
    Returns:
        X: np.array with the missing values filled in.
           NOTE(review): expected shape is (N, d); scikit-learn's SimpleImputer
           may drop columns that are entirely missing -- confirm that no such
           column exists in the data.
    """
    from sklearn.impute import SimpleImputer

    # 'mean' is SimpleImputer's default strategy; it is spelled out for readability.
    mean_imputer = SimpleImputer(strategy='mean')
    return mean_imputer.fit_transform(X)

def normalize_feature_matrix(X):
    """
    Min-max scale each feature column of X to the range [0, 1].

    The per-column minimum and maximum are taken from X itself.

    Args:
        X: np.array, shape (N, d).
    Returns:
        X: np.array, shape (N, d). Values are rescaled column by column.
    """
    from sklearn.preprocessing import MinMaxScaler

    # (0, 1) is MinMaxScaler's default feature_range; it is spelled out for readability.
    scaler = MinMaxScaler(feature_range=(0, 1))
    return scaler.fit_transform(X)

# Fill in missing values first, then rescale every feature to [0, 1].
# NOTE(review): both steps are fit on the full dataset, before the
# train/validation/test split performed later in this notebook, so held-out
# rows contribute to the imputation means and scaling ranges -- confirm this
# is acceptable for the analysis.
X = impute_missing_values(X)
X = normalize_feature_matrix(X)

In [None]:
# Display the shapes of X and y as a quick sanity check after preprocessing.
X.shape, y.shape

In [None]:
# N = number of rows (examples) in X, d = number of feature columns.
N, d = X.shape

In [None]:
import torch
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_size = 64  # number of examples per mini-batch

# Seed PyTorch, NumPy, and Python's `random` module so runs are repeatable.
torch.random.manual_seed(0)
np.random.seed(0)
random.seed(0)

n_epochs = 500  # number of passes over the training set in the training loop
learning_rate = 1e-3  # step size handed to the optimizer

## (A) Data loading & batching

- Reference: **Section 1** of https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#loading-and-normalizing-cifar10
- Documentation: `Dataset` and `DataLoader` classes at https://pytorch.org/docs/stable/data.html


TODOs:
1. Create a subclass of `torch.utils.data.Dataset`, which is a wrapper for our dataset. 
> The subclass should override `__len__`, which returns the size of the dataset, and `__getitem__`, which supports integer indexing from `0` up to (but not including) `len(self)`.
2. Create a `Dataset` instance for each of train/val/test sets. 
3. Use the dataset to create a `DataLoader`. 
> The data loader provides an "iterator" that allows us to iterate through the dataset in batches, and also handles shuffling at the end of each epoch. 

In [None]:
# Split data into train (70%), validation (15%) and test (15%):
# the first call keeps 70% of the rows for training, the second call divides
# the remaining 30% evenly between validation and test.
# Both splits are stratified on the labels and use a fixed random_state.
from sklearn.model_selection import train_test_split
Xtr, X__, ytr, y__ = train_test_split(X,   y,   train_size=0.7, stratify=y,   random_state=0)
Xva, Xte, yva, yte = train_test_split(X__, y__, test_size=0.5, stratify=y__, random_state=0)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class SimpleDataset(Dataset):
    """
    Map-style dataset that wraps a feature matrix and its labels.

    Args:
        X: array-like, shape (N, d). One example per row.
        y: array-like holding N labels, shape (N,) or (N, 1).
    Raises:
        ValueError: if X and y do not describe the same number of examples.
    """

    def __init__(self, X, y):
        super().__init__()
        # Convert once up front so that __getitem__ is a plain index operation.
        # float32 is the default dtype of nn.Linear weights, and BCELoss needs
        # floating-point targets.
        self.X = torch.as_tensor(X, dtype=torch.float32)
        # Keep labels as a column so a batch of labels has shape (B, 1), the
        # same shape a single-output linear layer produces.
        self.y = torch.as_tensor(y, dtype=torch.float32).reshape(-1, 1)
        if len(self.X) != len(self.y):
            raise ValueError(
                'X and y must have the same number of examples, got {} and {}'.format(
                    len(self.X), len(self.y)))

    def __getitem__(self, idx):
        """Return (features, label) for example `idx`: tensors of shape (d,) and (1,)."""
        return self.X[idx], self.y[idx]

    def __len__(self):
        """Return the number of examples in the dataset."""
        return len(self.X)

In [None]:
# Define datasets and data loaders for training, validation, and test set
batch_size = 64

# Shuffle for train set
# (a new example order each epoch changes the mini-batches seen by the optimizer)
tr = SimpleDataset(Xtr, ytr)
tr_loader = DataLoader(tr, batch_size=batch_size, shuffle=True)

# Do not shuffle for validation set
# (evaluation only reads the data, so a fixed order keeps it repeatable)
va = SimpleDataset(Xva, yva)
va_loader = DataLoader(va, batch_size=batch_size, shuffle=False)

# Do not shuffle for test set
te = SimpleDataset(Xte, yte)
te_loader = DataLoader(te, batch_size=batch_size, shuffle=False)

## (B) Architecture definition

- Reference: **Section 2** of https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#define-a-convolutional-neural-network
- Documentation: `nn.Module` at https://pytorch.org/docs/stable/nn.html

TODOs:
- Create a single layer neural network with sigmoid activation. 

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class LogisticRegression(nn.Module):
    """
    Logistic regression written as a single-layer neural network.

    One linear layer maps the `input_size` features to a single value, and a
    sigmoid squashes that value into (0, 1).

    Args:
        input_size: number of input features per example.
    """

    def __init__(self, input_size):
        super().__init__()
        # input_size weights plus one bias feeding a single output unit.
        self.fc = nn.Linear(input_size, 1)

    def forward(self, x):
        """
        Args:
            x: float tensor, shape (B, input_size).
        Returns:
            z: float tensor, shape (B, 1), with every value in (0, 1).
        """
        # Sigmoid is applied here because BCELoss expects probabilities, not raw scores.
        z = torch.sigmoid(self.fc(x))
        return z

In [None]:
# Build the model with one input per feature column (d comes from X.shape).
model = LogisticRegression(d)

In [None]:
#@title How many learnable parameters does our model have? { display-mode: "form" }
# Replace 0 with your own count, then compare it with the value printed by the next cell.
answer = 0 #@param {type:"raw"}

In [None]:
def count_parameters(model):
    """Return the total number of scalar entries across the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Parameters with requires_grad=False are frozen, so they are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

print('Number of learnable parameters:', count_parameters(model))

## (C) Define training loop

- Reference: **Section 3** of https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#train-the-network

In [None]:
# Define loss function and optimizer
# Move the model to the selected device before handing its parameters to the optimizer.
model = model.to(device)
# Binary cross-entropy computed on probabilities: BCELoss expects inputs in [0, 1].
criterion = torch.nn.BCELoss()
# Adam updates every model parameter using the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
def _train_epoch(data_loader, model, criterion, optimizer):
    """
    Train the `model` for one epoch of data from `data_loader`
    Use `optimizer` to optimize the specified `criterion`

    Args:
        data_loader: iterable yielding (X, y) mini-batches of tensors.
        model: the nn.Module being trained; it is updated in place.
        criterion: loss function called as criterion(output, y).
        optimizer: optimizer built over `model`'s parameters.
    Returns:
        None. The model parameters are modified as a side effect.
    """
    # Put the model in training mode (evaluation code switches it to eval mode).
    model.train()
    for X, y in data_loader:
        # Move the batch to the same device as the model (module-level `device`).
        X, y = X.to(device), y.to(device)

        # clear parameter gradients
        # (PyTorch accumulates gradients across backward() calls otherwise)
        optimizer.zero_grad()

        # forward + backward + optimize
        output = model(X)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()

## (D) Train the network

In [None]:
#@title Some helper functions... { display-mode: "form" }
def _evaluate_loader(data_loader, model, criterion):
    """Return (loss, AUROC) of `model` over all batches of `data_loader`."""
    y_true, y_score = [], []
    running_loss = []
    # Gradients are not needed for evaluation; disabling them also allows the
    # outputs to be converted to NumPy arrays directly.
    with torch.no_grad():
        for X, y in data_loader:
            X, y = X.to(device), y.to(device)
            output = model(X)
            y_true.append(y.cpu().numpy())
            y_score.append(output.cpu().numpy())
            running_loss.append(criterion(output, y).item())

    y_true, y_score = np.concatenate(y_true), np.concatenate(y_score)
    # The loss is the mean of the per-batch losses; the AUROC is computed once
    # over the predictions of the whole loader.
    return np.mean(running_loss), metrics.roc_auc_score(y_true, y_score)


def _evaluate_epoch(tr_loader, va_loader, model, criterion):
    """
    Evaluate `model` on the training and validation loaders and print the results.

    Args:
        tr_loader: iterable yielding (X, y) training mini-batches.
        va_loader: iterable yielding (X, y) validation mini-batches.
        model: the nn.Module to evaluate; it is switched to eval mode.
        criterion: loss function called as criterion(output, y).
    Returns:
        (train_loss, val_loss, train_score, val_score), where the scores are AUROC values.
    """
    model.eval()

    # Evaluate on train
    train_loss, train_score = _evaluate_loader(tr_loader, model, criterion)
    print('tr loss', train_loss, 'tr AUROC', train_score)

    # Evaluate on validation
    val_loss, val_score = _evaluate_loader(va_loader, model, criterion)
    print('va loss', val_loss, 'va AUROC', val_score)

    return train_loss, val_loss, train_score, val_score

def save_checkpoint(model, epoch, checkpoint_dir):
    """
    Save the model's parameters for the given epoch to `checkpoint_dir`.

    The file is named 'epoch=<epoch>.checkpoint.pth.tar' and holds a dict with
    the keys 'epoch' and 'state_dict'.

    Args:
        model: the nn.Module whose state_dict() is saved.
        epoch: epoch number, stored in the file and used in the file name.
        checkpoint_dir: directory to write into; created if it does not exist.
    """
    # Create the directory on demand so saving does not depend on it having
    # been created beforehand. An empty path means the current directory.
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    state = {
        'epoch': epoch,
        'state_dict': model.state_dict(),
    }

    filename = os.path.join(checkpoint_dir, 'epoch={}.checkpoint.pth.tar'.format(epoch))
    torch.save(state, filename)

In [None]:
# Re-initialize model, loss function and optimizer
# A new LogisticRegression instance is built so training starts from newly
# initialized weights rather than from the model object created earlier.
model = LogisticRegression(d)
model = model.to(device)
criterion = torch.nn.BCELoss()
# The optimizer is rebuilt as well because it must reference the new model's parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# One (train_loss, val_loss, train_score, val_score) tuple per evaluation.
outputs = []
# Epoch 0: evaluate the model before any training to record a baseline.
print('Epoch', 0)
out = _evaluate_epoch(tr_loader, va_loader, model, criterion)
outputs.append(out)

for epoch in range(0, n_epochs):
    # `epoch` counts from 0; it is reported and saved as epoch+1.
    print('Epoch', epoch+1)
    # Train model
    _train_epoch(tr_loader, model, criterion, optimizer)

    # Evaluate model
    out = _evaluate_epoch(tr_loader, va_loader, model, criterion)
    outputs.append(out)

    # Save model parameters
    # (one checkpoint file is written per epoch)
    save_checkpoint(model, epoch+1, 'checkpoint/')

In [None]:
# Regroup the per-epoch result tuples into one sequence per metric.
train_losses, val_losses, train_scores, val_scores = zip(*outputs)

# Take the x-axis from the recorded history instead of `n_epochs`, so the plots
# still work if training was interrupted or `n_epochs` was changed after the
# run. After a complete run this equals range(n_epochs + 1).
epochs = range(len(outputs))

# AUROC per epoch (epoch 0 is the untrained model).
fig, ax = plt.subplots(figsize=(5,5))
ax.plot(epochs, train_scores, '--o', label='Train')
ax.plot(epochs, val_scores, '--o', label='Validation')
ax.set_xlabel('epoch')
ax.set_ylabel('AUROC')
ax.legend()
fig.savefig('auroc.png', dpi=300)

# Loss per epoch.
fig, ax = plt.subplots(figsize=(5,5))
ax.plot(epochs, train_losses, '--o', label='Train')
ax.plot(epochs, val_losses, '--o', label='Validation')
ax.set_xlabel('epoch')
ax.set_ylabel('Loss (binary cross entropy)')
ax.legend()
fig.savefig('loss.png', dpi=300)