In [1]:
# PyTorch imports

import torch

import torchvision
from torchvision import transforms, utils, datasets

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data.sampler import SubsetRandomSampler


# Numpy and matplotlib


import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt


# For file operations
import os

# For logging metrics to csv file
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
DRIVE_DIRECTORY = '/content/gdrive/My Drive/Kaggle-IFT6135'
DRIVE_PATH = DRIVE_DIRECTORY + '/%s'

In [4]:
BATCH_SIZE = 128

In [5]:
def test_model(model):
  x = torch.randn(128,1,28,28)
  y = model(x)
  print(y.size())

In [6]:
def params(model):
  print(sum(p.numel() for p in model.parameters())/10e6)

In [7]:
class CNN(nn.Module):
  def __init__(self):
    super(CNN, self).__init__()
    
    self.convnet = nn.Sequential(*[
    nn.Conv2d(1, 64, kernel_size=3, padding=1),
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Conv2d(64, 128, kernel_size=3, padding=1),
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Conv2d(128, 256, kernel_size=3, padding=1),
    nn.ReLU(inplace=True),
    nn.Conv2d(256, 256, kernel_size=3, padding=1),
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=2, stride=2),    
    nn.Conv2d(256, 512, kernel_size=3, padding=1),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Conv2d(512, 512, kernel_size=3, padding=1),
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=2, stride=2),    
    nn.Conv2d(512, 512, kernel_size=3, padding=1),
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.AvgPool2d(kernel_size=1, stride=1)])

    self.fc = nn.Linear(512, 10)
    
  def forward(self, x):
    out = self.convnet(x)
    out = self.fc(x)
    return out
    
    
    

In [8]:
params(CNN())

0.686465


In [9]:
test_model(CNN())

torch.Size([128, 10])


In [10]:
def get_train_valid_loaders(batch_size=BATCH_SIZE, split_ratio = 5/6, transform = transforms.ToTensor(), shuffle = True):
 
    train_dataset = datasets.MNIST(root=".", train=True, transform=transform, download=True)
    test_dataset = datasets.MNIST(root=".", train=False, transform=transform, download=True)

    num_samples = len(train_dataset)
    indices = list(range(num_samples))
    split_index = int(np.floor(split_ratio * num_samples))
    
    if shuffle:
        np.random.seed(5)
        np.random.shuffle(indices)

    train_index, valid_index = indices[:split_index], indices[split_index:]
    train_sampler = SubsetRandomSampler(train_index)
    valid_sampler = SubsetRandomSampler(valid_index)
    
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler,
        num_workers=1, pin_memory=True,
    )
    
    valid_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=valid_sampler,
        num_workers=1, pin_memory=True,
    )
    
    print(" Training set  : {} batches = {} samples".format(len(train_loader),len(train_index)))
    print("Validation set : {} batches = {} samples".format(len(valid_loader),len(valid_index)))
    
    return train_loader, valid_loader   

In [11]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [12]:
def evaluate_metrics(model,data_loader,criterion):
  
  correct = 0.
  total = 0.
  
  loss = 0.
  
  model.eval()
  with torch.no_grad():
    for data in data_loader:
      
      images, labels = data
      outputs = model(images.to(device))
      
      mini_batch_size = labels.size(0)
      
      loss += mini_batch_size * criterion(outputs, labels.to(device))
                
      _, predicted = torch.max(outputs.data, 1)
      total += mini_batch_size
      correct += (predicted == labels.to(device)).sum().item()
      
  loss = loss/total
  accuracy = correct/total  
  
  return loss, accuracy

In [13]:
def train(experiment_name, train_loader, valid_loader, model, optimizer, criterion, num_epochs,start_epoch):
        
    torch.cuda.manual_seed(10)
    
    model = model.to(device)
    criterion = criterion.to(device)
    #optimizer = optim.SGD(model.parameters(), lr=0.001) 
    

    train_accuracy = []
    valid_accuracy = []
    train_loss = []
    valid_loss = []
    
    state_buffer = []
    
    print("Training for epochs #", num_epochs)
    
    for epoch in range(start_epoch,start_epoch + num_epochs):
        if epoch==31:
            optimizer = optim.SGD(model.parameters(), lr=0.01) 
        if epoch==121: 
            optimizer = optim.SGD(model.parameters(), lr=0.005) 
        if epoch==151:
            optimizer = optim.SGD(model.parameters(), lr=0.001)
            
        # Training set    
            
        model.train()
        for i, data in enumerate(train_loader, 0):
            
            inputs, labels = data
            optimizer.zero_grad()
            
            
            
            outputs = model(inputs.to(device))
            
            loss = criterion(outputs, labels.to(device))
            loss.backward()
            optimizer.step()
            
            
        loss_metric, accuracy_metric = evaluate_metrics(model,train_loader,criterion)
        
        train_accuracy.append(accuracy_metric)
        train_loss.append(loss_metric)
        
        # Validation set 
        
        loss_metric, accuracy_metric = evaluate_metrics(model,valid_loader,criterion)
        
                
        valid_accuracy.append(accuracy_metric)
        valid_loss.append(loss_metric)
        
        
        print("======================================================================")
        print('Epoch:', epoch)
        print(' Training  :: Accuracy = %.4f, Loss = %.6f' % (train_accuracy[-1]*100, train_loss[-1]))
        print('Validation :: Accuracy = %.4f, Loss = %.6f' % (valid_accuracy[-1]*100, valid_loss[-1]))
        
        state = {
           "epoch" : epoch,
           "train_accuracy" : train_accuracy[-1]*100,
           "valid_accuracy" : valid_accuracy[-1]*100,
           "train_loss" : train_loss[-1],
           "valid_loss" : valid_loss[-1],            
        }
        
        state_buffer.append(state)
        
        if epoch%5 == 0 or epoch == num_epochs-1 :
          path = os.path.join("{}/models/{}".format(DRIVE_DIRECTORY,experiment_name), "{}-e{}.pth".format(experiment_name,epoch))
          torch.save(model.state_dict(), path)
          
          df = pd.DataFrame(state_buffer)
          
          state_file_name = '{}/results/{}.csv'.format(DRIVE_DIRECTORY,experiment_name)
          
          if not os.path.isfile(state_file_name):
            df.to_csv(state_file_name, index = False)
          else: 
            df.to_csv(state_file_name, mode='a', header=False, index = False)
                      
          state_buffer.clear()
          
        
    return model #, train_accuracy, train_loss, valid_accuracy, valid_loss   

In [14]:
def run_experiment(name, learning_rate, num_epochs, model, batch_size = BATCH_SIZE,start_epoch=0):
  
  experiment_name = "%s[lr=%.3f][e=%d][b=%d]" % (name,learning_rate,num_epochs,batch_size)
  
  print("Running experiment : ",experiment_name)
  
  model_directory_name = "{}/models/{}".format(DRIVE_DIRECTORY,experiment_name)
  
  if not os.path.exists(model_directory_name):
        os.mkdir(model_directory_name)
  
  train_loader, valid_loader = get_train_valid_loaders(batch_size = batch_size)
  
  criterion = nn.CrossEntropyLoss()
  optimizer = optim.SGD(model.parameters(), lr=learning_rate)     
  
  model = train(experiment_name, train_loader, valid_loader, model, optimizer, criterion, num_epochs,start_epoch)

  return model

In [15]:
model = run_experiment("CNN",0.1,100,CNN())

Running experiment :  CNN[lr=0.100][e=100][b=128]
 Training set  : 391 batches = 50000 samples
Validation set : 79 batches = 10000 samples
Training for epochs # 100
Epoch: 0
 Training  :: Accuracy = 11.1980, Loss = 2.300599
Validation :: Accuracy = 11.4300, Loss = 2.300683
Epoch: 1
 Training  :: Accuracy = 17.3880, Loss = 2.219637
Validation :: Accuracy = 17.5900, Loss = 2.219054
Epoch: 2
 Training  :: Accuracy = 92.3060, Loss = 0.239595
Validation :: Accuracy = 91.9200, Loss = 0.244834
Epoch: 3
 Training  :: Accuracy = 96.7920, Loss = 0.101746
Validation :: Accuracy = 96.3700, Loss = 0.117804
Epoch: 4
 Training  :: Accuracy = 97.6660, Loss = 0.072182
Validation :: Accuracy = 97.4400, Loss = 0.086756
Epoch: 5
 Training  :: Accuracy = 98.6600, Loss = 0.043841
Validation :: Accuracy = 98.0600, Loss = 0.060815
Epoch: 6
 Training  :: Accuracy = 97.2960, Loss = 0.080801
Validation :: Accuracy = 96.8500, Loss = 0.103083
Epoch: 7
 Training  :: Accuracy = 99.0800, Loss = 0.028346
Validation ::

Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/queues.py", line 240, in _feed
    send_bytes(obj)
  File "/usr/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/usr/lib/python3.6/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/usr/lib/python3.6/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe


KeyboardInterrupt: ignored