In [60]:
# %mkdir checkpoint best_model 
# creating directories to store checkpoint and best model

In [61]:
# importing Libraries and creating helper functions
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
import shutil
import matplotlib.pyplot as plt
import torch
import numpy as np
import torch.nn.functional as F
from torch import nn
from torch import optim
from torchvision import datasets, transforms

In [62]:
# check if CUDA is available
use_cuda = torch.cuda.is_available

In [63]:
# Saving Function 
# save_ckp is created to save checkpoint, the latest one and the best one.

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    '''
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model

    '''
    f_path = checkpoint_path
    # save checkpoint data to the given path, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    if is_best:
        best_fpath = best_model_path
        # copy that checkpoint file to best path given, best_model_path
        shutil.copyfile(f_path, best_fpath)

epoch: a measure of the number of times all of the training vectors are used once to update the weights.

valid_loss_min: the minimum validation loss, this is needed so that when we continue the training, we can start with this rather than np.Inf value.

state_dict: model architecture information. It includes the parameter matrices for each of the layers.

optimizer: You meed to save optimizer patameters especially when you are using Adam as your optimizer. Adam is adaptive learning rate method, which means, it conputes individual learning rates for different parameters which we woulld need if we want to continue our training from where we left off.


# Loading Function 

In [64]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into       
    optimizer: optimizer we defined in previous training
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # initialize valid_loss_min from checkpoint to valid_loss_min
    valid_loss_min = checkpoint['valid_loss_min']
    # return model, optimizer, epoch value, min validation loss 
    return model, optimizer, checkpoint['epoch'], valid_loss_min.item()

load_chkp is created for loading model. It takes:
-location of the saved checkpoint
-model instance that you want to load the state to
-the optimizer

In [65]:
# Define a transform to normalize the data
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
    ])

# Download and load the training data
trainset = datasets.FashionMNIST('F_MNIST_data/', download=True, train=True, transform=transform)

# Download and load the test data
testset = datasets.FashionMNIST('F_MNIST_data/', download=True, train=False, transform=transform)

loaders = {
    'train' : torch.utils.data.DataLoader(trainset,batch_size = 64,shuffle=True),
    'test'  : torch.utils.data.DataLoader(testset,batch_size = 64,shuffle=True),
}

Defining and creating a model

In [66]:
# Define your network ( Simple Example )
class FashionClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        input_size = 784
        self.fc1 = nn.Linear(input_size, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64,10)
        self.dropout = nn.Dropout(p=0.2)
        
    def forward(self, x):
        x = x.view(x.shape[0], -1)
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.dropout(F.relu(self.fc2(x)))
        x = self.dropout(F.relu(self.fc3(x)))
        x = self.dropout(F.relu(self.fc4(x)))
        x = F.log_softmax(self.fc5(x), dim=1)
        return x

In [67]:

# Create the network, define the criterion and optimizer
model = FashionClassifier()

# move model to GPU if CUDA is available
if use_cuda:
    model = model.cuda()
    
print(model)

FashionClassifier(
  (fc1): Linear(in_features=784, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=128, bias=True)
  (fc4): Linear(in_features=128, out_features=64, bias=True)
  (fc5): Linear(in_features=64, out_features=10, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


Training the network and saving the model

The train function gives us the ability to set the number of epochs, the learning rate, and other parameters.
define loss function and optimizer
Below, we are using an Adam optimizer and cross entropy loss since we are looking at character class scores as output. We calculate the loss and perform back-propagation.

In [68]:
#define loss function and optimizer
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

Define train method

In [69]:
def train(start_epochs, n_epochs, valid_loss_min_input, loaders, model, optimizer, criterion, use_cuda, checkpoint_path, best_model_path):
    """
    Keyword arguments:
    start_epochs -- the real part (default 0.0)
    n_epochs -- the imaginary part (default 0.0)
    valid_loss_min_input
    loaders
    model
    optimizer
    criterion
    use_cuda
    checkpoint_path
    best_model_path
    
    returns trained model
    """
    # initialize tracker for minimum validation loss
    valid_loss_min = valid_loss_min_input 
    
    for epoch in range(start_epochs, n_epochs+1):
        # initialize variables to monitor training and validation loss
        train_loss = 0.0
        valid_loss = 0.0
        
        ###################
        # train the model #
        ###################
        model.train()
        for batch_idx, (data, target) in enumerate(loaders['train']):
            # move to GPU
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            ## find the loss and update the model parameters accordingly
            # clear the gradients of all optimized variables
            optimizer.zero_grad()
            # forward pass: compute predicted outputs by passing inputs to the model
            output = model(data)
            # calculate the batch loss
            loss = criterion(output, target)
            # backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()
            # perform a single optimization step (parameter update)
            optimizer.step()
            ## record the average training loss, using something like
            ## train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.data - train_loss))
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.data - train_loss))
        
        ######################    
        # validate the model #
        ######################
        model.eval()
        for batch_idx, (data, target) in enumerate(loaders['test']):
            # move to GPU
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            ## update the average validation loss
            # forward pass: compute predicted outputs by passing inputs to the model
            output = model(data)
            # calculate the batch loss
            loss = criterion(output, target)
            # update average validation loss 
            valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.data - valid_loss))
            
        # calculate average losses
        train_loss = train_loss/len(loaders['train'].dataset)
        valid_loss = valid_loss/len(loaders['test'].dataset)

        # print training/validation statistics 
        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
            epoch, 
            train_loss,
            valid_loss
            ))
        
        # create checkpoint variable and add important data
        checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
        
        # save checkpoint
        save_ckp(checkpoint, False, checkpoint_path, best_model_path)
        
        ## TODO: save the model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
            # save checkpoint as best model
            save_ckp(checkpoint, True, checkpoint_path, best_model_path)
            valid_loss_min = valid_loss
            
    # return trained model
    return model

Train the model

In [70]:
trained_model = train(1, 3, np.Inf, loaders, model, optimizer, criterion, use_cuda, "./checkpoint/current_checkpoint.pt", "./best_model/best_model.pt")

Epoch: 1 	Training Loss: 0.000010 	Validation Loss: 0.000047
Validation loss decreased (inf --> 0.000047).  Saving model ...
Epoch: 2 	Training Loss: 0.000007 	Validation Loss: 0.000039
Validation loss decreased (0.000047 --> 0.000039).  Saving model ...
Epoch: 3 	Training Loss: 0.000006 	Validation Loss: 0.000037
Validation loss decreased (0.000039 --> 0.000037).  Saving model ...


a few parameters we used above:

start_epoch: value start of the epoch for the training

n_epochs: value end of the epoch for the training

valid_loss_min_input = np.Inf

checkpoint_path: full path to save state of latest checkpoint of the training

best_model_path: full path to best state of latest checkpoint of the training

Verify if the model are saved
List down all files in best_model directory

In [71]:
%ls ./best_model/

Parameter format not correct - "best_model".


List down all files in checkpoint directory

In [72]:
%ls ./checkpoint/

Parameter format not correct - "checkpoint".


Loading the model

Reconstruct the model

In [73]:

model = FashionClassifier()

# move model to GPU if CUDA is available
if use_cuda:
    model = model.cuda()
    
print(model)

FashionClassifier(
  (fc1): Linear(in_features=784, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=128, bias=True)
  (fc4): Linear(in_features=128, out_features=64, bias=True)
  (fc5): Linear(in_features=64, out_features=10, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


Define the optimizer and checkpoint file path

In [74]:
# define optimzer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# define checkpoint saved path
ckp_path = "./checkpoint/current_checkpoint.pt"

Load the model using load_ckp function

In [75]:
# load the saved checkpoint
model, optimizer, start_epoch, valid_loss_min = load_ckp(ckp_path, model, optimizer)

print out the values that we get from load_ckp just to make sure everything is correct

In [76]:
print("model = ", model)
print("optimizer = ", optimizer)
print("start_epoch = ", start_epoch)
print("valid_loss_min = ", valid_loss_min)
print("valid_loss_min = {:.6f}".format(valid_loss_min))

model =  FashionClassifier(
  (fc1): Linear(in_features=784, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=128, bias=True)
  (fc4): Linear(in_features=128, out_features=64, bias=True)
  (fc5): Linear(in_features=64, out_features=10, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)
optimizer =  Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
)
start_epoch =  4
valid_loss_min =  3.6626544897444546e-05
valid_loss_min = 0.000037


After we load all the information we need, we can continue training, start_epoch = 4. Previously, we train the model from 1 to 3

Continue Training and/or Inference
Continue training
We can continue to train our model using the train function and provide the values of checkpoint we get from the load_ckp function above

In [77]:
trained_model = train(start_epoch, 6, valid_loss_min, loaders, model, optimizer, criterion, use_cuda, "./checkpoint/current_checkpoint.pt", "./best_model/best_model.pt")


Epoch: 4 	Training Loss: 0.000006 	Validation Loss: 0.000037
Epoch: 5 	Training Loss: 0.000006 	Validation Loss: 0.000036
Validation loss decreased (0.000037 --> 0.000036).  Saving model ...
Epoch: 6 	Training Loss: 0.000005 	Validation Loss: 0.000034
Validation loss decreased (0.000036 --> 0.000034).  Saving model ...


In [78]:
trained_model.eval()

FashionClassifier(
  (fc1): Linear(in_features=784, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=128, bias=True)
  (fc4): Linear(in_features=128, out_features=64, bias=True)
  (fc5): Linear(in_features=64, out_features=10, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [79]:
test_acc = 0.0
for samples, labels in loaders['test']:
    with torch.no_grad():
        samples, labels = samples.cuda(), labels.cuda()
        output = trained_model(samples)
        # calculate accuracy
        pred = torch.argmax(output, dim=1)
        correct = pred.eq(labels)
        test_acc += torch.mean(correct.float())
print('Accuracy of the network on {} test images: {}%'.format(len(testset), round(test_acc.item()*100.0/len(loaders['test']), 2)))


Accuracy of the network on 10000 test images: 87.98%


In [81]:
trained_model = train(start_epoch, 9, valid_loss_min, loaders, model, optimizer, criterion, use_cuda, "./checkpoint/current_checkpoint.pt", "./best_model/best_model.pt")

Epoch: 4 	Training Loss: 0.000005 	Validation Loss: 0.000032
Validation loss decreased (0.000037 --> 0.000032).  Saving model ...
Epoch: 5 	Training Loss: 0.000005 	Validation Loss: 0.000033
Epoch: 6 	Training Loss: 0.000005 	Validation Loss: 0.000034
Epoch: 7 	Training Loss: 0.000004 	Validation Loss: 0.000033
Epoch: 8 	Training Loss: 0.000004 	Validation Loss: 0.000032
Validation loss decreased (0.000032 --> 0.000032).  Saving model ...
Epoch: 9 	Training Loss: 0.000004 	Validation Loss: 0.000033
