In [1]:
%matplotlib inline
from copy import deepcopy
from collections import OrderedDict
import gc
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import SGD,Adam,lr_scheduler
from torch.utils.data import random_split
import torchvision
from torchvision import transforms,models
import os


In [2]:
cuda_device = torch.device("cuda:0")
cpu_device = torch.device("cpu:0")

In [3]:
train_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.RandomHorizontalFlip(p=.40),
    transforms.RandomRotation(30),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

test_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

traindata = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=train_transform)
trainset,valset = random_split(traindata,[42000,8000])
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64,shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=64,shuffle=False)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=test_transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=64,shuffle=False)

classes = ('plane', 'car', 'bird', 'cat','deer', 'dog', 'frog', 'horse', 'ship', 'truck')
     

In [4]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        base = models.resnet18(pretrained=True)
        self.base = nn.Sequential(*list(base.children())[:-1])
        in_features = base.fc.in_features
        self.drop = nn.Dropout()
        self.final = nn.Linear(in_features,10)
    
    def forward(self,x):
        x = self.base(x)
        x = self.drop(x.view(-1,self.final.in_features))
        return self.final(x)
    
model = Model().cuda()
[x for x,y in model.named_children()]

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 131MB/s]


['base', 'drop', 'final']

In [5]:
criterion = nn.CrossEntropyLoss()
param_groups = [
    {'params':model.base.parameters(),'lr':.0001},
    {'params':model.final.parameters(),'lr':.001}
]
optimizer = Adam(param_groups)
lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
states = {}

In [6]:
%%time
best_val_acc = -1000
best_val_model = None
for epoch in range(10):  
    model.train(True)
    running_loss = 0.0
    running_acc = 0
    for i, data in enumerate(trainloader, 0):
        inputs, labels = data
        inputs, labels = inputs.cuda(),labels.cuda()

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item() * inputs.size(0)
        out = torch.argmax(outputs.detach(),dim=1)
        assert out.shape==labels.shape
        running_acc += (labels==out).sum().item()
    print(f"Train loss {epoch+1}: {running_loss/len(trainset)},Train Acc:{running_acc*100/len(trainset)}%")
    
    correct = 0
    model.train(False)
    with torch.no_grad():
        for inputs,labels in valloader:
            out = model(inputs.cuda()).cpu()
            out = torch.argmax(out,dim=1)
            acc = (out==labels).sum().item()
            correct += acc
    print(f"Val accuracy:{correct*100/len(valset)}%")
    if correct>best_val_acc:
        best_val_acc = correct
        best_val_model = deepcopy(model.state_dict())
    lr_scheduler.step()
#     break
print('Finished Training') 

Train loss 1: 0.5050055582977476,Train Acc:82.69047619047619%
Val accuracy:89.2875%
Train loss 2: 0.22471497406846,Train Acc:92.38095238095238%
Val accuracy:93.4%
Train loss 3: 0.19279915895348504,Train Acc:93.57142857142857%
Val accuracy:93.275%
Train loss 4: 0.1886163093390919,Train Acc:93.69761904761904%
Val accuracy:93.575%
Train loss 5: 0.1867631315929549,Train Acc:93.75%
Val accuracy:93.4%
Train loss 6: 0.19008417707397823,Train Acc:93.65%
Val accuracy:93.1125%
Train loss 7: 0.19087958645252953,Train Acc:93.62142857142857%
Val accuracy:93.575%
Train loss 8: 0.18828718749682108,Train Acc:93.72619047619048%
Val accuracy:93.3%
Train loss 9: 0.18838404362258457,Train Acc:93.5904761904762%
Val accuracy:93.525%
Train loss 10: 0.190041633838699,Train Acc:93.68571428571428%
Val accuracy:93.225%
Finished Training
CPU times: user 40min 13s, sys: 2min 54s, total: 43min 8s
Wall time: 27min 4s


In [7]:
model

Model(
  (base): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    

In [8]:
# obtain how large it is
torch.save(model.state_dict(), './resnet18_ori.pth')
size_model = os.path.getsize('./resnet18_ori.pth')
print('model size: {:.3f}MB'.format(size_model/1024**2))

model size: 42.730MB


In [9]:
starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
repetitions = len(testset)
timings=np.zeros((repetitions,1))

In [10]:
%%time
correct = 0
model.load_state_dict(best_val_model)
model.train(False)
with torch.no_grad():
    starter.record()
    for inputs,labels in testloader:
        out = model(inputs.cuda()).cpu()
        out = torch.argmax(out,dim=1)
        acc = (out==labels).sum().item()
        correct += acc
    ender.record()
    torch.cuda.synchronize()
    curr_time = starter.elapsed_time(ender)
    timings[epoch] = curr_time
    # correct += acc
mean_syn = np.sum(timings) / repetitions
std_syn = np.std(timings)
print(f'mean prediction latency: {mean_syn}')
print(f"Test accuracy: {correct*100/len(testset)}%")

mean prediction latency: 1.9767248046875
Test accuracy: 94.15%
CPU times: user 32 s, sys: 3.51 s, total: 35.5 s
Wall time: 19.8 s


# Dynamic quantization

In [11]:
import copy
# state_dict = torch.load('./checkpoint.pth', map_location="cpu")
# print(state_dict.keys())
# model.load_state_dict(state_dict)
model.to(cpu_device)
# Make a copy of the model for layer fusion
fused_model = copy.deepcopy(model)

model.eval()
# The model has to be switched to evaluation mode before any layer fusion.
# Otherwise the quantization will not work correctly.
fused_model.eval()

Model(
  (base): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    

In [12]:
# create a quantized model instance
model_int8 = torch.quantization.quantize_dynamic(
    model,  # the original model
    {torch.nn.Sequential, torch.nn.Linear},  # a set of layers to dynamically quantize
    dtype=torch.qint8)  # the target dtype for quantized weights
model_int8.to(cpu_device)

Model(
  (base): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    

In [13]:
# trainloader.__len__()
# len(trainset)
# testloader.__len__()

In [14]:
len(testset)

10000

In [15]:
%%time
batches=100
number_of_testing_data=(len(testset)/testloader.__len__())*batches
correct = 0
# model.load_state_dict(best_val_model)
model_int8.train(False)
i=0
with torch.no_grad():
    for inputs,labels in testloader:
        if i ==batches: break
        out = model_int8(inputs).cpu()
        out = torch.argmax(out,dim=1)
        acc = (out==labels).sum().item()
        correct += acc
        i+=1
print(f"Test accuracy: {correct*100/number_of_testing_data}%")

Test accuracy: 94.6396%
CPU times: user 5min 27s, sys: 1min 40s, total: 7min 8s
Wall time: 3min 34s


In [16]:
torch.save(model_int8.state_dict(), './resnet18_aft_dy_int8.pth')
size_int8=os.path.getsize('./resnet18_aft_dy_int8.pth')
print('model size: {:.3f}MB'.format(size_int8/1024**2))

model size: 42.715MB


# Static quantization

In [17]:
import torch
from torch.ao.quantization import QConfigMapping
import torch.quantization.quantize_fx as quantize_fx
import copy

model_to_quantize = copy.deepcopy(model).to(cpu_device)
qconfig_mapping = QConfigMapping().set_global(torch.quantization.get_default_qconfig('fbgemm'))
model_to_quantize.eval()
# prepare
model_prepared = quantize_fx.prepare_fx(model_to_quantize, qconfig_mapping, input).to(cpu_device)
# calibrate (not shown)
# quantize
model_quantized = quantize_fx.convert_fx(model_prepared).to(cpu_device)

model_quantized



GraphModule(
  (base): Module(
    (0): QuantizedConvReLU2d(3, 64, kernel_size=(7, 7), stride=(2, 2), scale=1.0, zero_point=0, padding=(3, 3))
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Module(
      (0): Module(
        (conv1): QuantizedConvReLU2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=1.0, zero_point=0, padding=(1, 1))
        (conv2): QuantizedConv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=1.0, zero_point=0, padding=(1, 1))
      )
      (1): Module(
        (conv1): QuantizedConvReLU2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=1.0, zero_point=0, padding=(1, 1))
        (conv2): QuantizedConv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=1.0, zero_point=0, padding=(1, 1))
      )
    )
    (5): Module(
      (0): Module(
        (conv1): QuantizedConvReLU2d(64, 128, kernel_size=(3, 3), stride=(2, 2), scale=1.0, zero_point=0, padding=(1, 1))
        (conv2): QuantizedConv2d(128, 128, kernel_size=(3, 3),

In [18]:
%%time
batches=100
number_of_testing_data=(len(testset)/testloader.__len__())*batches
correct = 0

model_quantized.train(False)
i=0
with torch.no_grad():
    for inputs,labels in testloader:
        if i ==batches: break
        out = model_quantized(inputs).cpu()
        out = torch.argmax(out,dim=1)
        acc = (out==labels).sum().item()
        correct += acc
        i+=1
print(f"Test accuracy: {correct*850/number_of_testing_data}%")

Test accuracy: 93.9488%
CPU times: user 3min 21s, sys: 5.9 s, total: 3min 27s
Wall time: 1min 43s


In [19]:
import os
torch.save(model_quantized.state_dict(), './checkpoint_static_quantized.pth')
size_static_quantized_in_mb=os.path.getsize("./checkpoint_static_quantized.pth")/1024**2
print('model size: {:.3f}MB'.format(size_static_quantized_in_mb))

model size: 10.790MB
