# Develop notebook
Development notebook for investigating stalled training in the from-scratch networks of the Udacity [Deep Learning nanodegree](https://udacity.com/course/deep-learning-nanodegree--nd101) [dog-breed classifier](https://github.com/udacity/deep-learning-v2-pytorch/tree/master/project-dog-classification) project.

## Setup

#### Imports

In [1]:
import os
import re
import numpy as np
from glob import glob
from PIL import ImageFile
# Set PIL to be tolerant of image files that are truncated.
ImageFile.LOAD_TRUNCATED_IMAGES = True

import torch
import torch.optim as optim
import torch.nn.init as init
import torch.nn as nn
import torch.nn.functional as F

from torchvision import datasets
import torchvision.transforms as transforms
import torchvision.models as models

### Data loading

#### Set up variables

In [2]:
# DataLoader settings: samples per batch and number of loader worker processes
batch_size, num_workers = 20, 4
# Number of outcome classes; used when building the networks
n_classes = 133
# Registry of the networks to be trained, filled in by the cells further down
networks = dict()
# Run the test pass inside the training loop (for debugging/testing only)
TEST_IN_TRAIN = True

#### Load data

In [3]:
# load filenames for human and dog images
#human_files = np.array(glob("lfw/*/*"))
#dog_files = np.array(glob("dogImages/*/*/*"))

image_size = 224 # Image size fed to the networks

# Per-channel normalization shared by every split
# (presumably the ImageNet channel statistics -- TODO confirm)
normalize = transforms.Normalize((0.485, 0.456, 0.406),
                                 (0.229, 0.224, 0.225))

# Training transforms: resize, then random augmentation
train_transforms = transforms.Compose([transforms.Resize(image_size),
                                       transforms.RandomCrop(image_size),
                                       transforms.RandomRotation(10),
                                       transforms.RandomHorizontalFlip(p=0.4),
                                       transforms.RandomVerticalFlip(p=0.4),
                                       transforms.RandomGrayscale(p=0.2),
                                       transforms.RandomAffine(30),
                                       transforms.ToTensor(),
                                       normalize])
# Transform for test and validation sets. The crop is a CenterCrop (not a
# RandomCrop) so that evaluating the same model twice gives the same numbers.
test_transforms = transforms.Compose([transforms.Resize(image_size),
                                      transforms.CenterCrop(image_size),
                                      transforms.ToTensor(),
                                      normalize])


# Check if CUDA is available
use_cuda = torch.cuda.is_available()

# Data location
data_dir = "dogImages/"
# Train, validation and test data location
train_dir = os.path.join(data_dir,"train/")
valid_dir = os.path.join(data_dir,"valid/")
test_dir = os.path.join(data_dir,"test/")
# Load and transform data.
# Accuracy was poor, so the training folder is loaded three times and
# concatenated: every image is seen three times per epoch, each time with a
# freshly drawn random augmentation.
train_data = (datasets.ImageFolder(train_dir, transform=train_transforms)
              + datasets.ImageFolder(train_dir, transform=train_transforms)
              + datasets.ImageFolder(train_dir, transform=train_transforms))
valid_data = datasets.ImageFolder(valid_dir, transform=test_transforms)
test_data = datasets.ImageFolder(test_dir, transform=test_transforms)

loaders_scratch = {}
loaders_scratch["train"] = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=num_workers)
# Evaluation splits do not need shuffling
loaders_scratch["valid"] = torch.utils.data.DataLoader(valid_data, batch_size=batch_size, shuffle=False, num_workers=num_workers)
loaders_scratch["test"] = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=num_workers)

### Helper functions for initialization and testing architectures - `weight_init_normal`, `select_criterion`
`weight_init_normal` (from [here](https://gist.github.com/jeasinema/ed9236ce743c8efaf30fa2ff732749f5)) initializes the network layer weights from a normal distribution, and is optional. `select_criterion` enables fast switching between different loss functions without having to modify the network architectures.

In [4]:
def weight_init_normal(model):
    '''
    Initialize the parameters of a single layer from normal distributions.
    From: https://gist.github.com/jeasinema/ed9236ce743c8efaf30fa2ff732749f5

    Intended to be handed to ``nn.Module.apply``, which calls it once for
    every submodule. Only these layer types are touched:

    * ``nn.Conv2d`` / ``nn.Linear``: Xavier-normal weights; the bias is drawn
      with ``init.normal_`` using its default arguments.
    * ``nn.BatchNorm2d``: weights drawn from N(mean=1, std=0.02), bias set to 0.

    Layers created without a bias (``bias=False``) or without affine
    parameters (``affine=False``) have no such tensors; those are skipped.

    Args:
        model: a module, typically a single layer passed in by ``apply``

    Returns:
        None

    Usage:
        model = Model()
        model.apply(weight_init_normal)
    '''
    if isinstance(model, (nn.Conv2d, nn.Linear)):
        init.xavier_normal_(model.weight.data)
        # NOTE(review): a unit-variance bias is large compared to the Xavier
        # weights -- confirm this is intended when debugging stalled training.
        if model.bias is not None:
            init.normal_(model.bias.data)
    elif isinstance(model, nn.BatchNorm2d):
        # weight and bias are None when the layer was built with affine=False
        if model.weight is not None:
            init.normal_(model.weight.data, mean=1, std=0.02)
        if model.bias is not None:
            init.constant_(model.bias.data, 0)

In [None]:
def select_criterion(lossfn="cross_entropy"):
    '''
    Pick a loss criterion together with the matching output preprocessing.

    Some losses expect the network output in a particular form, e.g. NLLLoss
    expects LogSoftmax values while CrossEntropyLoss applies LogSoftmax itself.
    Returning the preprocessing step alongside the criterion makes it possible
    to switch loss functions without rewriting the network architectures.

    Args:
        lossfn: criterion name as in torch.nn.functional.<lossfn>. Recognized
            values are "cross_entropy", "mse_loss" and "nll_loss"; anything
            else prints a warning and falls back to "cross_entropy".

    Returns:
        Criterion: the torch.nn loss module instance for ``lossfn``
        preprocess: function applied to the network output before it is passed
            to the criterion. Defaults to the identity function.

    Example:
        >>> criterion, preprocess = select_criterion("nll_loss")
        >>> train(n_epochs, loaders_scratch, model, optimizer,
                  criterion, use_cuda, checkpoints[network], lr_scheduler, preprocess)
    '''
    # Identity function by default
    preprocess = lambda self : self
    if lossfn == 'cross_entropy':
        Criterion = nn.CrossEntropyLoss()
    elif lossfn == 'mse_loss':
        Criterion = nn.MSELoss()
    elif lossfn == 'nll_loss':
        Criterion = nn.NLLLoss()
        # NLLLoss expects input to be LogSoftMax values
        preprocess = lambda x : F.log_softmax(x, dim=1)
    else:
        # Report the name that was actually passed in (the parameter is `lossfn`)
        print('{} not recognized. Check input or add loss criterion to method. Selected default "cross_entropy".'.format(lossfn))
        Criterion = nn.CrossEntropyLoss()

    return Criterion, preprocess

### Implementations - `train`, `test`

In [None]:
def train(n_epochs, loaders, model, optimizer, criterion=nn.CrossEntropyLoss(),
                                               use_cuda=True,
                                               save_path='model_train_default.pt',
                                               lr_scheduler=None,
                                               preprocess = lambda self: self):
    """
    Model trainer.

    Runs ``n_epochs`` passes over ``loaders['train']``, evaluates on
    ``loaders['valid']`` after every pass and writes the model's state_dict to
    ``save_path`` whenever the validation loss reaches a new minimum.

    Args:
        n_epochs: number of passes over the complete training set
        loaders: mapping with 'train' and 'valid' iterables of (data, target)
            batches ('test' is also needed when in-training testing is on)
        model: model to train
        optimizer: optimizer that updates the model parameters
        criterion: training criterion
        use_cuda: if batches should be moved to the GPU
        save_path: save path for the best model; missing folders are created
        lr_scheduler: optional scheduler; its ``step`` is called with the
            epoch's validation loss
        preprocess: applied to the network output before the criterion

    Returns: trained model
    """
    # initialize tracker for minimum validation loss
    valid_loss_min = float('inf')

    # torch.save does not create missing folders
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    # Short name for the log line; fall back to the full path when the
    # checkpoint is not stored below a 'checkpoints/' folder
    name_match = re.search('checkpoints/(.*)', save_path)
    checkpoint_name = name_match.group(1) if name_match else save_path

    for epoch in range(1, n_epochs+1):
        # initialize variables to monitor training and validation loss
        train_loss = 0.0
        valid_loss = 0.0

        ###################
        # train the model #
        ###################
        model.train()
        for batch_idx, (data, target) in enumerate(loaders['train']):
            # move to GPU
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            # Reset gradients for training batch
            optimizer.zero_grad()
            # Forward prop, then preprocess the output for the criterion
            output = preprocess(model(data))
            # Compute loss
            loss = criterion(output, target)
            # Compute backprop
            loss.backward()
            # Take step
            optimizer.step()
            # Running mean of the batch losses
            train_loss += (loss.item() - train_loss) / (batch_idx + 1)

        ######################
        # validate the model #
        ######################
        model.eval()
        # No gradients are needed for validation
        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(loaders['valid']):
                # move to GPU
                if use_cuda:
                    data, target = data.cuda(), target.cuda()
                # Forward pass first, then preprocess this batch's output
                output = preprocess(model(data))
                loss = criterion(output, target)
                # Running mean of the validation loss
                valid_loss += (loss.item() - valid_loss) / (batch_idx + 1)

        # print training/validation statistics
        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
            ))

        ## Save the model if validation loss has decreased
        if valid_loss < valid_loss_min:
            torch.save(model.state_dict(), save_path)
            print('Current validation Loss: {:.6f} \tPrevious min validation Loss: {:.6f}'.format(
            valid_loss,
            valid_loss_min
            ))
            valid_loss_min = valid_loss
            print('Lowest validation score so far. Saving current model to: \t ' + checkpoint_name)

        # Optional learning rate scheduler. It is given the current validation
        # loss so that it can detect a plateau itself.
        if lr_scheduler is not None:
            lr_scheduler.step(valid_loss)

        # Test to see training progress. Wasteful here; only enable for testing.
        # TEST_IN_TRAIN is a notebook-level switch; treat it as off when the
        # cell defining it has not been run.
        if globals().get('TEST_IN_TRAIN', False) == True:
            if (epoch % 5 == 0):
                print(' ################## \n ## Test at ' + str(epoch) + ' epochs:  ...')
                test(loaders, model, criterion, use_cuda, preprocess)
                print(' ################## ')

    # return trained model
    return model

In [None]:
def test(loaders, model, criterion=nn.CrossEntropyLoss(),
                         use_cuda=True,
                         preprocess = lambda self: self):
    """
    Test model accuracy and print results.

    Iterates over ``loaders['test']`` in eval mode without gradients and
    prints the mean batch loss and the classification accuracy.

    Args:
        loaders: mapping with a 'test' iterable of (data, target) batches
        model: model to test
        criterion: criterion used to compute the test loss
        use_cuda: if batches should be moved to the GPU
        preprocess: applied to the network output before the criterion
            (used with criterion switching)

    Returns: None
    """
    # monitor test loss and accuracy
    test_loss = 0.
    correct = 0
    total = 0

    model.eval()
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(loaders['test']):
            # move to GPU
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            # forward pass, then preprocess the output for the criterion
            output = preprocess(model(data))
            # calculate the loss
            loss = criterion(output, target)
            # update running mean of the test loss
            test_loss += (loss.item() - test_loss) / (batch_idx + 1)
            # predicted class = index of the largest output per sample
            pred = output.argmax(dim=1)
            # compare predictions to true label
            correct += (pred == target.view_as(pred)).sum().item()
            total += data.size(0)

    print('Test Loss: {:.6f}\n'.format(test_loss))

    # An empty test loader would otherwise divide by zero
    accuracy = 100. * correct / total if total else 0.
    print('Test Accuracy: %2d%% (%2d/%2d)\n' % (
        accuracy, correct, total))

## CNN's

### Scratch CNN 1

In [None]:
# Network for testing #VGG16 - based but simplified by dropping layers at the end of the network
class Net_1(nn.Module):
    """
    VGG-style scratch network built from stacked 3x3 convolutions.

    ``features`` holds four convolution stages, each closed by a max-pool
    (2x2 for the first three stages, 4x4 for the last). ``classifier`` is a
    stack of Linear/ReLU/Dropout layers ending in a Linear layer with
    ``n_classes`` outputs. The network returns raw scores; no softmax is
    applied.
    """

    def __init__(self, n_classes = 133): ## 133 classes in this problem
        super().__init__()

        # (conv output channels of the stage, pool size closing the stage)
        stages = (
            ((64, 64), 2),
            ((128, 128), 2),
            ((256, 256), 2),
            ((512, 512, 512, 512), 4),
        )
        conv_layers = []
        channels_in = 3
        for widths, pool in stages:
            for channels_out in widths:
                conv_layers.append(nn.Conv2d(channels_in, channels_out, 3, padding=1))
                conv_layers.append(nn.ReLU())
                channels_in = channels_out
            conv_layers.append(nn.MaxPool2d(pool, pool))
        # Feature layers
        self.features = nn.Sequential(*conv_layers)

        # Hidden fully connected layers, each followed by ReLU and Dropout
        sizes = (25088, 4096, 4096, 4096, 2048)
        dense_layers = []
        for fan_in, fan_out in zip(sizes[:-1], sizes[1:]):
            dense_layers.extend([nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout()])
        # Output layer (no LogSoftmax; the criterion side handles that)
        dense_layers.append(nn.Linear(sizes[-1], n_classes))
        self.classifier = nn.Sequential(*dense_layers)

    def forward(self, x):
        """Run the feature layers, flatten to 25088 columns, then classify."""
        flat = self.features(x).view(-1, 25088)
        return self.classifier(flat)
    
#-#-# You do NOT have to modify the code below this line. #-#-#

# Build Scratch CNN 1 and give it normally distributed weights
# (apply() hands back the module it was called on)
model_scratch = Net_1(n_classes).apply(weight_init_normal)

# Register it for the sequential training run
networks.update(SNet1=model_scratch)

### Scratch CNN 2
Test network inspired by a [Udacity Student Hub thread](https://study-hall.udacity.com/rooms/community:nd101:633452-project-300/community:thread-11891619222-594683?contextType=room) by user _Mahmoud H_ and by the code referenced in this [Pastebin](https://pastebin.com/MBSxfqqy).

In [None]:
# Simple network for testing. Just for testing becuse I am getting no training... :(
class Net_2(nn.Module):
    """
    Small scratch network with batch normalization.

    ``features`` consists of four Conv/ReLU/MaxPool(2)/BatchNorm groups
    followed by a Conv/ReLU/BatchNorm/MaxPool(4) group. ``classifier`` has two
    hidden Linear layers of 500 units (each with ReLU and Dropout) and a
    Linear output layer with ``n_classes`` outputs. The network returns raw
    scores; no softmax is applied.
    """

    def __init__(self, n_classes = 133):  #n_classes = 133 for my problem
        super().__init__()

        blocks = []
        channels_in = 3
        # Four groups that halve the resolution: conv -> relu -> pool -> norm
        for channels_out in (64, 128, 256, 512):
            blocks += [
                nn.Conv2d(channels_in, channels_out, 3, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(2,2),
                nn.BatchNorm2d(channels_out),
            ]
            channels_in = channels_out
        # Last group normalizes before the larger 4x4 pool
        blocks += [
            nn.Conv2d(channels_in, 512, 3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(512),
            nn.MaxPool2d(4,4),
        ]
        # Feature layers
        self.features = nn.Sequential(*blocks)

        # Linear classifier (no LogSoftmax; the criterion side handles that)
        hidden = 500
        self.classifier = nn.Sequential(
            nn.Linear(4608, hidden),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(hidden, n_classes),
        )

    def forward(self, x):
        """Run the convolutions, flatten to 4608 columns, then classify."""
        flat = self.features(x).view(-1, 4608)
        return self.classifier(flat)
    
#-#-# You do NOT have to modify the code below this line. #-#-#

# Build Scratch CNN 2 and give it normally distributed weights
# (apply() hands back the module it was called on)
model_scratch = Net_2(n_classes).apply(weight_init_normal)

# Register it for the sequential training run
networks.update(SNet2=model_scratch)

### Untrained VGG16

In [None]:
# Untrained VGG16 architecture from torchvision
model_scratch = models.vgg16(pretrained=False)

# Same layout as the stock VGG16 classifier, but with n_classes outputs.
# No LogSoftmax at the end: the network returns raw scores.
classifier = nn.Sequential(
    nn.Linear(25088, 4096, bias=True),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(4096, 4096, bias=True),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(4096, n_classes, bias=True),
)

# Swap in the new classifier, then draw normally distributed weights for the
# whole network (apply() hands back the module it was called on)
model_scratch.classifier = classifier
networks['VGG16'] = model_scratch.apply(weight_init_normal)

### Untrained ResNet18

In [None]:
# ResNet18
model_scratch = models.resnet18(pretrained=False)

from collections import OrderedDict
# Classification head; input width 512 matches the 512 features the head receives.
# No LogSoftmax at the end: the network returns raw scores.
classifier = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(512, 512)),
    ('relu1', nn.ReLU()),
    ('drop1', nn.Dropout()),
    ('fc2', nn.Linear(512, 200)),
    ('relu2', nn.ReLU()),
    ('drop2', nn.Dropout()),
    ('fc3', nn.Linear(200, n_classes))
    #('output', nn.LogSoftmax(dim=1))
]))

# Replace model last layer with this classifier. Without this assignment the
# network keeps its stock output layer instead of the n_classes-way head.
model_scratch.fc = classifier
# normally distributed weights (now also covers the new head)
model_scratch.apply(weight_init_normal)

networks['ResNet18'] = model_scratch

## Train all untrained models sequentially

In [None]:
n_epochs = 20
TEST_IN_TRAIN = True  # If testing should be done in the training loop. Used for debugging/testing
# Stores checkpoint locations
checkpoints = {}
# Select loss function and preprocess function for training and testing loops
loss_fn = "cross_entropy"

# torch.save does not create missing folders
os.makedirs("checkpoints", exist_ok=True)

for network in networks:
    print('\n ########################################## \n #### \n ####  Working on ' + network + ' network. \n #### \n ########################################## \n')
    model = networks[network]
    # Move model to GPU if possible
    if use_cuda:
        model = model.cuda()

    # Alternative optimizer/scheduler setup, kept for reference:
    #optimizer = optim.SGD(model.parameters(), lr = 0.01, momentum = 0.9, weight_decay = 0.0005, nesterov=True)
    #lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', verbose=True)
    #criterion = nn.CrossEntropyLoss()

    # Preprocess output and criterion functions
    criterion, preprocess = select_criterion(loss_fn)
    # Optimizer. It must be built from the parameters of the model trained in
    # THIS iteration; using `model_scratch` (the last network defined above)
    # would leave every other network's weights untouched.
    optimizer = optim.Adam(model.parameters(), lr = 0.01)
    # Change learning rate if the validation loss plateaus
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', verbose=True)

    # Checkpoint for best model (validation) for this network and number of epochs
    checkpoints[network] = "checkpoints/model_scratch_" + network + '-' + loss_fn +  '-' + str(n_epochs) + '_epochs.pt'

    model = train(n_epochs, loaders_scratch, model, optimizer,
                  criterion, use_cuda, checkpoints[network], lr_scheduler, preprocess)

    # Clean up. `networks` still references the model, so `del` alone cannot
    # release its GPU memory: move it back to the CPU first, then drop the
    # cached blocks.
    if use_cuda:
        model.cpu()
    del model, preprocess, criterion, optimizer, lr_scheduler
    if use_cuda:
        torch.cuda.empty_cache()


 ########################################## 
 #### 
 ####  Working on SNet1 network. 
 #### 
 ########################################## 



## Test trained models sequentially

In [None]:
# Evaluate every registered network with its best (lowest validation loss) weights
for network, model in networks.items():
    print('\n ########################################## \n ####  Test ' + network + ' network. \n ########################################## \n')
    # Checkpoint written for this network by the training cell
    checkpoint = checkpoints[network]

    if use_cuda:
        model = model.cuda()

    # The loss name is encoded between the first and the last dash of the checkpoint name
    loss_fn = re.search('-(.*)-', checkpoint).group(1)

    # Rebuild the criterion/preprocess pair the network was trained with
    criterion, preprocess = select_criterion(loss_fn)

    # Restore the saved weights, then run the test pass
    saved_state = torch.load(checkpoint)
    model.load_state_dict(saved_state)
    test(loaders_scratch, model, criterion, use_cuda, preprocess)

    # Drop per-network references before the next iteration
    del saved_state
    del model, preprocess, criterion

## Train and testing transfer learning - ResNet18

In [None]:
import torchvision.models as models
from collections import OrderedDict

# Pretrained ResNet18 is the base architecture for transfer learning
model_transfer = models.resnet18(pretrained=True)

# Freeze every pretrained parameter; only the new head below will train
for param in model_transfer.parameters():
    param.requires_grad = False

# Classification head for the dog breed data (raw scores, no LogSoftmax)
classifier = nn.Sequential(OrderedDict(
    fc1=nn.Linear(512, 512),
    relu1=nn.ReLU(),
    fc2=nn.Linear(512,133),
))

# Draw normally distributed weights for the head, then install it as the
# model's last layer
classifier.apply(weight_init_normal)
model_transfer.fc = classifier

In [None]:
# Run settings: test every few epochs to follow training progress
TEST_IN_TRAIN = True
n_epochs = 20

if use_cuda:
    model_transfer = model_transfer.cuda()

# The transfer model reuses the scratch data loaders
loaders_transfer = loaders_scratch

# Criterion and matching output preprocessing for the current loss_fn
# (e.g. nll_loss pairs nn.NLLLoss with a log-softmax preprocessing step)
criterion_transfer, preprocess = select_criterion(loss_fn)

# Only the fc head is optimized; all other parameters are frozen
optimizer_transfer = optim.Adam(model_transfer.fc.parameters(), lr=0.01)
# Lower the learning rate when the validation loss plateaus
lr_scheduler_transfer = optim.lr_scheduler.ReduceLROnPlateau(optimizer_transfer, 'min', verbose=True)

# Where the best weights of this run are stored
checkpoint = "checkpoints/model_transfer_pre_trained_ResNet18-{}-{}_epochs.pt".format(loss_fn, n_epochs)

# Train the head
model_transfer = train(
    n_epochs,
    loaders_transfer,
    model_transfer,
    optimizer_transfer,
    criterion_transfer,
    use_cuda,
    checkpoint,
    lr_scheduler_transfer,
    preprocess,
)

# Reload the weights with the lowest validation loss and test them
model_transfer.load_state_dict(torch.load(checkpoint))
test(loaders_transfer, model_transfer, criterion_transfer, use_cuda, preprocess)