In [None]:
#CIFAR10 with Knowledge Transfer

#Imports
%matplotlib inline
from copy import deepcopy
from collections import OrderedDict
import gc
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import SGD,Adam,lr_scheduler
from torch.utils.data import random_split
import torchvision
from torchvision import transforms,models
import time
import torch.optim as optim


In [None]:
#Load CIFAR 10 Dataset
transform = transforms.Compose([
    transforms.Resize(224),
    transforms.RandomHorizontalFlip(p=.40),
    transforms.RandomRotation(30),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

traindata = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainset,valset = random_split(traindata,[42000,8000])
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64,shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=64,shuffle=True)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=64,shuffle=True)

classes = ('plane', 'car', 'bird', 'cat','deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [None]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        base = models.resnet18(pretrained=True)
        self.base = nn.Sequential(*list(base.children())[:-1])
        in_features = base.fc.in_features
        self.drop = nn.Dropout()
        self.final = nn.Linear(in_features,10)
    
    def forward(self,x):
        x = self.base(x)
        x = self.drop(x.view(-1,self.final.in_features))
        return self.final(x)
    
teacher = Model().cuda()
#state_dict = torch.load('checkpoint.pth')
#teacher.load_state_dict(state_dict)



In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self, params):
        super(Net, self).__init__()
        self.num_channels = params["num_channels"]
        
        self.conv1 = nn.Conv2d(3, self.num_channels, 3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(self.num_channels)
        self.conv2 = nn.Conv2d(self.num_channels, self.num_channels*2, 3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(self.num_channels*2)
        self.conv3 = nn.Conv2d(self.num_channels*2, self.num_channels*4, 3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(self.num_channels*4)

        # 2 fully connected layers to transform the output of the convolution layers to the final output
        self.fc1 = nn.Linear(4*4*self.num_channels*4, self.num_channels*4)
        self.fcbn1 = nn.BatchNorm1d(self.num_channels*4)
        self.fc2 = nn.Linear(4*4*self.num_channels*4*49, 10)      
        self.dropout_rate = params["dropout_rate"]

    def forward(self, s):
        #                                                  -> batch_size x 3 x 32 x 32
        s = self.bn1(self.conv1(s))                         # batch_size x num_channels x 32 x 32
        s = F.relu(F.max_pool2d(s, 2))                      # batch_size x num_channels x 16 x 16
        s = self.bn2(self.conv2(s))                         # batch_size x num_channels*2 x 16 x 16
        s = F.relu(F.max_pool2d(s, 2))                      # batch_size x num_channels*2 x 8 x 8
        s = self.bn3(self.conv3(s))                         # batch_size x num_channels*4 x 8 x 8
        s = F.relu(F.max_pool2d(s, 2))                      # batch_size x num_channels*4 x 4 x 4
        s = s.view(-1, 4*4*self.num_channels*4*49)          # batch_size x 4*4*num_channels*4
        s = self.fc2(s)                                     # batch_size x 10

        return s


def loss_fn(outputs, labels):
    return nn.CrossEntropyLoss()(outputs, labels)


def loss_fn_kd(outputs, labels, teacher_outputs, params):
    alpha = params["alpha"]
    T = 0.01 #params["temperature"]
    KD_loss = nn.KLDivLoss()(F.log_softmax(outputs/T, dim=1),
                             F.softmax(teacher_outputs/T, dim=1)) * (alpha * T * T) + \
              F.cross_entropy(outputs, labels) * (1. - alpha)

    return KD_loss


def accuracy(outputs, labels):
    outputs = np.argmax(outputs, axis=1)
    return np.sum(outputs==labels)/float(labels.size)


# maintain all metrics required in this dictionary- these are used in the training and evaluation loops
metrics = {
    'accuracy': accuracy,
    # could add more metrics such as accuracy for each token type
}

import json
f = """{
    "model_version": "cnn_distill",
    "subset_percent": 1.0,
    "augmentation": "yes",
    "teacher": "resnet18",
    "alpha": 0.9,
    "temperature": 20,
    "learning_rate": 1e-3,
    "batch_size": 128,
    "num_epochs": 30,
    "dropout_rate": 0.5, 
    "num_channels": 32,
    "save_summary_steps": 100,
    "num_workers": 4
}"""
params = json.loads(f)
params["cuda"] = device
print(params)
#student = net.Net(params).cuda()

{'model_version': 'cnn_distill', 'subset_percent': 1.0, 'augmentation': 'yes', 'teacher': 'resnet18', 'alpha': 0.9, 'temperature': 20, 'learning_rate': 0.001, 'batch_size': 128, 'num_epochs': 30, 'dropout_rate': 0.5, 'num_channels': 32, 'save_summary_steps': 100, 'num_workers': 4, 'cuda': device(type='cuda', index=0)}


In [None]:
student1 = Net(params).cuda()
#state_dict = torch.load('checkpoint_5_CNN.pth')
#student1.load_state_dict(state_dict)
#print(student1)
student2 = Net(params).cuda()
print(student2)

Net(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=2048, out_features=128, bias=True)
  (fcbn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=100352, out_features=10, bias=True)
)


In [None]:
criterion = nn.CrossEntropyLoss()
optimizer1 = optim.Adam(student1.parameters(), lr=params["learning_rate"])
lr_scheduler1 = lr_scheduler.StepLR(optimizer1, step_size=100, gamma=0.1)

optimizer2 = optim.Adam(student2.parameters(), lr=params["learning_rate"])
lr_scheduler2 = lr_scheduler.StepLR(optimizer2, step_size=150, gamma=0.1)
states = {}

In [None]:
def train_model(model, trainloader, optimizer):
  best_val_acc = -1000
  best_val_model = None
  for epoch in range(10):  
      print("Epoch -", epoch)
      model.train(True)
      running_loss = 0.0
      running_acc = 0
      for i, data in enumerate(trainloader, 0):
          if i%100 == 0:
            print("Iteration - ", i)
          inputs, labels = data
          inputs, labels = inputs.cuda(),labels.cuda()

          optimizer.zero_grad()
          outputs = model(inputs)
          loss = loss_fn(outputs, labels)
          loss.backward()
          optimizer.step()

          # print statistics
          running_loss += loss.item() * inputs.size(0)
          out = torch.argmax(outputs.detach(),dim=1)
          assert out.shape==labels.shape
          running_acc += (labels==out).sum().item()
          if i%100 == 0:
            print(f"Train loss {epoch+1}: {running_loss/len(trainset)},Train Acc:{running_acc*100/len(trainset)}%")
      
      correct = 0
      model.train(False)
      with torch.no_grad():
          for inputs,labels in valloader:
              out = model(inputs.cuda()).cpu()
              out = torch.argmax(out,dim=1)
              acc = (out==labels).sum().item()
              correct += acc
      print(f"Val accuracy:{correct*100/len(valset)}%")
      if correct>best_val_acc:
          best_val_acc = correct
          best_val_model = deepcopy(model.state_dict())
      lr_scheduler1.step()
      
  print('Finished Training')
  return model
#student1 = train_model(student1, trainloader, optimizer1)

In [None]:
def get_predicted_model(trainloader, device, model):
  result = []
  for i, (images, targets) in enumerate(trainloader, 0):
      if i%100 == 0:
          print(i)
      images: torch.Tensor = images.to(device)
      targets: torch.Tensor = targets.to(device)
      result += [model.forward(images).detach().cpu().numpy()]
  return result

teacher_train_ans = get_predicted_model(trainloader, device, teacher)
teacher_val_ans = get_predicted_model(valloader, device, teacher)

0
100
200
300
400
500
600
0
100


In [None]:
def train_model_student(model, trainloader, optimizer, teacher_train_ans, params):
  best_val_acc = -1000
  best_val_model = None
  for epoch in range(10):  
      print("Epoch -", epoch)
      model.train(True)
      running_loss = 0.0
      running_acc = 0
      for i, data in enumerate(trainloader, 0):
          if i%100 == 0:
            print("Iteration - ", i)
          inputs, labels = data
          inputs, labels = inputs.cuda(),labels.cuda()

          optimizer.zero_grad()
          outputs = model(inputs)
          loss = loss_fn_kd(outputs, labels, torch.from_numpy(teacher_train_ans[i]).cuda(), params)
          loss.backward()
          optimizer.step()

          # print statistics
          running_loss += loss.item() * inputs.size(0)
          out = torch.argmax(outputs.detach(),dim=1)
          assert out.shape==labels.shape
          running_acc += (labels==out).sum().item()
          if i%100 == 0:
            print(f"Train loss {epoch+1}: {running_loss/len(trainset)},Train Acc:{running_acc*100/len(trainset)}%")
      
      correct = 0
      model.train(False)
      with torch.no_grad():
          for inputs,labels in valloader:
              out = model(inputs.cuda()).cpu()
              out = torch.argmax(out,dim=1)
              acc = (out==labels).sum().item()
              correct += acc
      print(f"Val accuracy:{correct*100/len(valset)}%")
      if correct>best_val_acc:
          best_val_acc = correct
          best_val_model = deepcopy(model.state_dict())
      lr_scheduler2.step()
      
  print('Finished Training')
  return model
student2 = train_model_student(student2, trainloader, optimizer2, teacher_train_ans, params)

Epoch - 0
Iteration -  0
Train loss 1: 0.00036475372314453123,Train Acc:0.02142857142857143%
Iteration -  100
Train loss 1: 0.13970682364418394,Train Acc:3.8047619047619046%
Iteration -  200
Train loss 1: 0.17424881912413098,Train Acc:8.911904761904761%
Iteration -  300
Train loss 1: 0.2096248653275626,Train Acc:14.14047619047619%
Iteration -  400
Train loss 1: 0.23815665955770585,Train Acc:20.195238095238096%
Iteration -  500
Train loss 1: 0.26504785505930584,Train Acc:26.607142857142858%
Iteration -  600
Train loss 1: 0.29068755199795676,Train Acc:33.20952380952381%
Val accuracy:42.85%
Epoch - 1
Iteration -  0
Train loss 2: 0.00025976258232480006,Train Acc:0.07142857142857142%
Iteration -  100
Train loss 2: 0.024659708954039076,Train Acc:6.761904761904762%
Iteration -  200
Train loss 2: 0.048418007600875126,Train Acc:13.733333333333333%
Iteration -  300
Train loss 2: 0.07180464199611118,Train Acc:20.892857142857142%
Iteration -  400
Train loss 2: 0.09444942508425032,Train Acc:28.2642

In [None]:
torch.save(student2.state_dict(), 'checkpoint_5_CNN_distilled_T_0_0_1.pth')

In [None]:
def get_model_accuracy(model, valloader):
  correct = 0
  model.train(False)
  with torch.no_grad():
    for inputs,labels in valloader:
      out = model(inputs.cuda()).cpu()
      out = torch.argmax(out,dim=1)
      acc = (out==labels).sum().item()
      correct += acc
  print(f"Val accuracy:{correct*100/len(valset)}%")

In [None]:
get_model_accuracy(teacher, valloader)
#get_model_accuracy(student1, valloader)
get_model_accuracy(student2, valloader)

Val accuracy:96.8125%
Val accuracy:67.2%


In [None]:
#Get Teacher Accuracy
param_size = 0
for param in teacher.parameters():
    param_size += param.nelement() * param.element_size()
buffer_size = 0
for buffer in teacher.buffers():
    buffer_size += buffer.nelement() * buffer.element_size()

size_all_mb = (param_size + buffer_size) / 1024**2
print('model size: {:.3f}MB'.format(size_all_mb))

model size: 42.691MB


In [None]:
param_size = 0
for param in student1.parameters():
    param_size += param.nelement() * param.element_size()
buffer_size = 0
for buffer in student1.buffers():
    buffer_size += buffer.nelement() * buffer.element_size()

size_all_mb = (param_size + buffer_size) / 1024**2
print('model size: {:.3f}MB'.format(size_all_mb))

model size: 5.190MB


In [None]:
param_size = 0
for param in student2.parameters():
    param_size += param.nelement() * param.element_size()
buffer_size = 0
for buffer in student2.buffers():
    buffer_size += buffer.nelement() * buffer.element_size()

size_all_mb = (param_size + buffer_size) / 1024**2
print('model size: {:.3f}MB'.format(size_all_mb))

model size: 5.190MB


In [None]:
torch.save(student2.state_dict(), 'checkpoint_5_CNN2.pth')

In [None]:
%%time
#Latency Test
#KD loaded from checkpoint wit T=0.01

model = Net(params).cuda()
state_dict = torch.load('checkpoint_5_CNN_distilled_T_0_0_1.pth')
model.load_state_dict(state_dict)
print(model)

starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
repetitions = len(testset)
timings=np.zeros((repetitions,1))

epoch=0

correct = 0
model.train(False)
with torch.no_grad():
    starter.record()
    for inputs,labels in testloader:
        out = model(inputs.cuda()).cpu()
        out = torch.argmax(out,dim=1)
        acc = (out==labels).sum().item()
        correct += acc
    ender.record()
    torch.cuda.synchronize()
    curr_time = starter.elapsed_time(ender)
    timings[epoch] = curr_time
    epoch+=1
    
    
mean_syn = np.sum(timings) / repetitions
std_syn = np.std(timings)
print(f'mean prediction latency: {mean_syn}')
print(f"Test accuracy: {correct*100/len(testset)}%")

Net(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=2048, out_features=128, bias=True)
  (fcbn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=100352, out_features=10, bias=True)
)
mean prediction latency: 2.4872451171875
Test accuracy: 66.67%
CPU times: user 24.4 s, sys: 642 ms, total: 25 s
Wall time: 24.9 s
