In [1]:
import copy
import math
import random
import time
from collections import OrderedDict, defaultdict
from typing import Union, List

import numpy as np
import torch
import torch.nn as nn
from matplotlib import pyplot as plt
from torch.optim import *
from torch.optim.lr_scheduler import *
from torch.utils.data import DataLoader
from torchprofile import profile_macs
from torchvision.datasets import *
from torchvision.transforms import *
from tqdm.auto import tqdm
import torchvision.models as models

from torchprofile import profile_macs

assert torch.cuda.is_available(), \
"CUDA support is not available."

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x7f9780528d90>

In [3]:
# def download_url(url, model_dir='.', overwrite=False):
#     import os, sys
#     from urllib.request import urlretrieve
#     target_dir = url.split('/')[-1]
#     model_dir = os.path.expanduser(model_dir)
#     try:
#         if not os.path.exists(model_dir):
#             os.makedirs(model_dir)
#         model_dir = os.path.join(model_dir, target_dir)
#         cached_file = model_dir
#         if not os.path.exists(cached_file) or overwrite:
#             sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file))
#             urlretrieve(url, cached_file)
#         return cached_file
#     except Exception as e:
#         # remove lock file so download can be executed next time.
#         os.remove(os.path.join(model_dir, 'download.lock'))
#         sys.stderr.write('Failed to download from url %s' % url + '\n' + str(e) + '\n')
#         return None

In [4]:
# class VGG(nn.Module):
#   ARCH = [64, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M']

#   def __init__(self) -> None:
#     super().__init__()

#     layers = []
#     counts = defaultdict(int)

#     def add(name: str, layer: nn.Module) -> None:
#       layers.append((f"{name}{counts[name]}", layer))
#       counts[name] += 1

#     last_layer_negative_slope = nn.Parameter(torch.tensor(0.5), requires_grad=False)
#     in_channels = 3
#     i = 0
#     for x in self.ARCH:
#       if x != 'M':
#         # conv-bn-relu
#         add("conv", nn.Conv2d(in_channels, x, 3, padding=1, bias=False))
#         add("bn", nn.BatchNorm2d(x))
#         if i == 9 or i == 6:
#           add("reluleaky", nn.LeakyReLU(negative_slope=last_layer_negative_slope, inplace=True))
#         else:
#           add("relu", nn.ReLU(inplace=True))
#         in_channels = x
#       else:
#         # maxpool
#         add("pool", nn.MaxPool2d(2))
#       i += 1

#     self.backbone = nn.Sequential(OrderedDict(layers))
#     self.classifier = nn.Linear(512, 10)

#   def forward(self, x: torch.Tensor) -> torch.Tensor:
#     # backbone: [N, 3, 32, 32] => [N, 512, 2, 2]
#     x = self.backbone(x)

#     # avgpool: [N, 512, 2, 2] => [N, 512]
#     x = x.mean([2, 3])

#     # classifier: [N, 512] => [N, 10]
#     x = self.classifier(x)
#     return x
  
#   @torch.no_grad()
#   def load_my_state_dict(self, state_dict):
 
#         own_state = self.state_dict()
#         for name, param in state_dict.items():
#             if name not in own_state:
#                  continue
#             if isinstance(param, nn.Parameter):
#                 # backwards compatibility for serialized parameters
#                 param = param.data
#             own_state[name].copy_(param)

#   @torch.no_grad()
#   def change_negative_slope(self, negative_slope: float) -> None:
#     for m in self.modules():
#       if isinstance(m, nn.LeakyReLU):
#         m.negative_slope = nn.Parameter(torch.tensor(negative_slope), requires_grad=False)

In [5]:
# class FinetuneClassifier(nn.Module):
#     def __init__(self, backbone: nn.Module, num_classes: int, num_layers, layer_size, input_dim) -> None:
#         super().__init__()
#         self.backbone = backbone
#         self.layers = [(f'classifier_linear{0}', nn.Linear(input_dim, layer_size))]
#         self.layers.append((f'LeakyReLU{0}', nn.LeakyReLU(negative_slope=nn.Parameter(torch.tensor(0.01), requires_grad=False), inplace=True)))
#         self.num_layers = num_layers
#         self.layer_size = layer_size
#         for i in range(1, num_layers-1):
#             self.layers.append((f'classifier_linear{i}', nn.Linear(layer_size, layer_size)))
#             negative_slope = nn.Parameter(torch.tensor(0.01), requires_grad=False)
#             self.layers.append((f'LeakyReLu{i}', nn.LeakyReLU(negative_slope=negative_slope, inplace=True)))

#         self.final_layer = nn.Linear(layer_size, num_classes)

#         self.classifier = nn.Sequential(OrderedDict(self.layers))

#         self.model = nn.Sequential(OrderedDict([('backbone', self.backbone), ('classifier', self.classifier), ('final_layer', self.final_layer)]))
    
#     def forward(self, x: torch.Tensor) -> torch.Tensor:
#         x = self.model(x)
#         return x
    
#     @torch.no_grad()
#     def change_negative_slope(self, negative_slope: float, layer) -> None:
#         for name, m in self.named_modules():
#             if name == 'LeakyReLU0':
#                 m.negative_slope = nn.Parameter(torch.tensor(negative_slope), requires_grad=False)
#             elif name == 'LeakyReLU1':
#                 m.negative_slope = nn.Parameter(torch.tensor(negative_slope), requires_grad=False)


In [6]:
def train(
  model: nn.Module,
  dataloader: DataLoader,
  criterion: nn.Module,
  optimizer: Optimizer,
  scheduler: LambdaLR,
  callbacks = None
) -> None:
  model.train()

  for inputs, targets in tqdm(dataloader, desc='train', leave=False):
    # Move the data from CPU to GPU
    inputs = inputs.cuda()
    targets = targets.cuda()

    # Reset the gradients (from the last iteration)
    optimizer.zero_grad()

    # Forward inference
    outputs = model(inputs)
    loss = criterion(outputs, targets)

    # Backward propagation
    loss.backward()

    # Update optimizer and LR scheduler
    optimizer.step()
    scheduler.step()

    if callbacks is not None:
        for callback in callbacks:
            callback()

In [7]:
@torch.inference_mode()
def evaluate(
  model: nn.Module,
  dataloader: DataLoader, 
  verbose=True,
) -> float:
  # model.eval()

  num_samples = 0
  num_correct = 0

  for inputs, targets in tqdm(dataloader, desc="eval", leave=False, 
                              disable=not verbose):
    # Move the data from CPU to GPU
    inputs = inputs.cuda()
    targets = targets.cuda()

    # Inference
    outputs = model(inputs)

    # Convert logits to class indices
    outputs = outputs.argmax(dim=1)

    # Update metrics
    num_samples += targets.size(0)
    num_correct += (outputs == targets).sum()

  return (num_correct / num_samples * 100).item()

In [8]:
def get_model_macs(model, inputs) -> int:
    return profile_macs(model, inputs)


def get_sparsity(tensor: torch.Tensor) -> float:
    """
    calculate the sparsity of the given tensor
        sparsity = #zeros / #elements = 1 - #nonzeros / #elements
    """
    return 1 - float(tensor.count_nonzero()) / tensor.numel()


def get_model_sparsity(model: nn.Module) -> float:
    """
    calculate the sparsity of the given model
        sparsity = #zeros / #elements = 1 - #nonzeros / #elements
    """
    num_nonzeros, num_elements = 0, 0
    for param in model.parameters():
        num_nonzeros += param.count_nonzero()
        num_elements += param.numel()
    return 1 - float(num_nonzeros) / num_elements

def get_num_parameters(model: nn.Module, count_nonzero_only=False) -> int:
    """
    calculate the total number of parameters of model
    :param count_nonzero_only: only count nonzero weights
    """
    num_counted_elements = 0
    for param in model.parameters():
        if count_nonzero_only:
            num_counted_elements += param.count_nonzero()
        else:
            num_counted_elements += param.numel()
    return num_counted_elements


def get_model_size(model: nn.Module, data_width=32, count_nonzero_only=False) -> int:
    """
    calculate the model size in bits
    :param data_width: #bits per element
    :param count_nonzero_only: only count nonzero weights
    """
    return get_num_parameters(model, count_nonzero_only) * data_width

Byte = 8
KiB = 1024 * Byte
MiB = 1024 * KiB
GiB = 1024 * MiB

In [9]:
# checkpoint_url = "https://hanlab.mit.edu/files/course/labs/vgg.cifar.pretrained.pth"
# checkpoint = torch.load(download_url(checkpoint_url), map_location="cpu")
# print(f"=> loading checkpoint '{checkpoint_url}'")
# model = VGG().cuda()
# model.load_my_state_dict(checkpoint['state_dict'])
model = models.vgg16(pretrained=True)



"backbone.relu6.negative_slope"

In [10]:
image_size = 32
transforms = {
    "train": Compose([
        RandomCrop(image_size, padding=4),
        RandomHorizontalFlip(),
        ToTensor(),
    ]),
    "test": ToTensor(),
}
dataset = {}
for split in ["train", "test"]:
  dataset[split] = CIFAR10(
    root="data/cifar10",
    train=(split == "train"),
    download=True,
    transform=transforms[split],
  )
dataloader = {}
for split in ['train', 'test']:
  dataloader[split] = DataLoader(
    dataset[split],
    batch_size=512,
    shuffle=(split == 'train'),
    num_workers=0,
    pin_memory=True,
  )

Files already downloaded and verified
Files already downloaded and verified


In [11]:
# model.change_negative_slope(0.0)
# model.cpu()
# freeze model
# for param in model.parameters():
#     param.requires_grad = False

In [12]:
# finetuneModel = FinetuneClassifier(model, num_classes=10, num_layers=3, layer_size=512, input_dim=1000).cuda()
finetuneModel = nn.Sequential(OrderedDict([('backbone', model), ('classifier', nn.Linear(1000, 10))])).cuda()

In [13]:
print(finetuneModel)

Sequential(
  (backbone): VGG(
    (features): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (6): ReLU(inplace=True)
      (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): ReLU(inplace=True)
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): ReLU(inplace=True)
      (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (13): ReLU(inplace=True)
      (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (15): ReLU(inplace=True)
      

In [14]:
print(model)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [15]:
num_finetune_epochs = 50
optimizer = torch.optim.SGD(finetuneModel.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, num_finetune_epochs)
criterion = nn.CrossEntropyLoss()

best_sparse_model_checkpoint = dict()
best_accuracy = 0
print(f'Finetuning Fine-grained Pruned Sparse Model')
for epoch in range(num_finetune_epochs):
    # At the end of each train iteration, we have to apply the pruning mask 
    #    to keep the model sparse during the training
    train(finetuneModel, dataloader['train'], criterion, optimizer, scheduler,
          callbacks=None)
    accuracy = evaluate(finetuneModel, dataloader['test'])
    is_best = accuracy > best_accuracy
    if is_best:
        best_sparse_model_checkpoint['state_dict'] = copy.deepcopy(finetuneModel.state_dict())
        best_accuracy = accuracy
    print(f'    Epoch {epoch+1} Accuracy {accuracy:.2f}% / Best Accuracy: {best_accuracy:.2f}%')

Finetuning Fine-grained Pruned Sparse Model


                                                      

    Epoch 1 Accuracy 59.70% / Best Accuracy: 59.70%


                                                      

    Epoch 2 Accuracy 75.24% / Best Accuracy: 75.24%


                                                      

    Epoch 3 Accuracy 78.73% / Best Accuracy: 78.73%


                                                      

    Epoch 4 Accuracy 81.82% / Best Accuracy: 81.82%


                                                      

    Epoch 5 Accuracy 82.26% / Best Accuracy: 82.26%


                                                      

    Epoch 6 Accuracy 84.88% / Best Accuracy: 84.88%


                                                      

    Epoch 7 Accuracy 84.82% / Best Accuracy: 84.88%


                                                      

    Epoch 8 Accuracy 85.47% / Best Accuracy: 85.47%


                                                      

    Epoch 9 Accuracy 86.28% / Best Accuracy: 86.28%


                                                      

    Epoch 10 Accuracy 86.43% / Best Accuracy: 86.43%


                                                      

    Epoch 11 Accuracy 87.66% / Best Accuracy: 87.66%


                                                      

    Epoch 12 Accuracy 87.65% / Best Accuracy: 87.66%


                                                      

    Epoch 13 Accuracy 88.21% / Best Accuracy: 88.21%


                                                      

    Epoch 14 Accuracy 88.45% / Best Accuracy: 88.45%


                                                      

    Epoch 15 Accuracy 88.54% / Best Accuracy: 88.54%


                                                      

    Epoch 16 Accuracy 89.00% / Best Accuracy: 89.00%


                                                      

    Epoch 17 Accuracy 89.36% / Best Accuracy: 89.36%


                                                      

    Epoch 18 Accuracy 89.33% / Best Accuracy: 89.36%


                                                      

    Epoch 19 Accuracy 89.32% / Best Accuracy: 89.36%


                                                      

    Epoch 20 Accuracy 89.44% / Best Accuracy: 89.44%


                                                      

    Epoch 21 Accuracy 89.70% / Best Accuracy: 89.70%


                                                      

    Epoch 22 Accuracy 89.69% / Best Accuracy: 89.70%


                                                      

    Epoch 23 Accuracy 89.75% / Best Accuracy: 89.75%


                                                      

    Epoch 24 Accuracy 89.83% / Best Accuracy: 89.83%


                                                      

    Epoch 25 Accuracy 89.86% / Best Accuracy: 89.86%


                                                      

    Epoch 26 Accuracy 89.83% / Best Accuracy: 89.86%


                                                      

    Epoch 27 Accuracy 90.46% / Best Accuracy: 90.46%


                                                      

    Epoch 28 Accuracy 90.17% / Best Accuracy: 90.46%


                                                      

    Epoch 29 Accuracy 89.77% / Best Accuracy: 90.46%


                                                      

    Epoch 30 Accuracy 90.19% / Best Accuracy: 90.46%


                                                      

    Epoch 31 Accuracy 89.88% / Best Accuracy: 90.46%


                                                      

    Epoch 32 Accuracy 90.13% / Best Accuracy: 90.46%


                                                      

    Epoch 33 Accuracy 89.89% / Best Accuracy: 90.46%


                                                      

    Epoch 34 Accuracy 90.02% / Best Accuracy: 90.46%


                                                      

    Epoch 35 Accuracy 89.59% / Best Accuracy: 90.46%


                                                      

    Epoch 36 Accuracy 89.28% / Best Accuracy: 90.46%


                                                      

    Epoch 37 Accuracy 89.71% / Best Accuracy: 90.46%


                                                      

    Epoch 38 Accuracy 89.16% / Best Accuracy: 90.46%


                                                      

    Epoch 39 Accuracy 89.62% / Best Accuracy: 90.46%


                                                      

    Epoch 40 Accuracy 89.42% / Best Accuracy: 90.46%


                                                      

    Epoch 41 Accuracy 89.84% / Best Accuracy: 90.46%


                                                      

    Epoch 42 Accuracy 89.27% / Best Accuracy: 90.46%


                                                      

    Epoch 43 Accuracy 89.39% / Best Accuracy: 90.46%


                                                      

    Epoch 44 Accuracy 89.27% / Best Accuracy: 90.46%


                                                      

    Epoch 45 Accuracy 89.61% / Best Accuracy: 90.46%


                                                      

    Epoch 46 Accuracy 90.03% / Best Accuracy: 90.46%


                                                      

    Epoch 47 Accuracy 89.19% / Best Accuracy: 90.46%


                                                      

    Epoch 48 Accuracy 90.08% / Best Accuracy: 90.46%


                                                      

    Epoch 49 Accuracy 89.77% / Best Accuracy: 90.46%


                                                      

    Epoch 50 Accuracy 89.33% / Best Accuracy: 90.46%




In [16]:
dense_model_accuracy = evaluate(finetuneModel, dataloader['test'])
dense_model_size = get_model_size(finetuneModel)
print(f"dense model has accuracy={dense_model_accuracy:.2f}%")
print(f"dense model has size={dense_model_size/MiB:.2f} MiB")

                                                     

dense model has accuracy=89.34%
dense model has size=527.83 MiB




In [17]:
#save model paremeters
torch.save(best_sparse_model_checkpoint, 'best_sparse_model_checkpoint.pth')
# lod sparse model parameters
finetuneModel.load_state_dict(best_sparse_model_checkpoint['state_dict'])

<All keys matched successfully>

In [24]:
for name, m in finetuneModel.named_children():
    print(name)
    if name == 'backbone':
        for name2, m2 in m.named_children():
            print(name2)
            if name2 == 'classifier':
                for name3, m3 in m2.named_children():
                    print(name3)
                    if name3 == '4' and isinstance(m3, nn.LeakyReLU):
                        print("got it")
                        m2._modules[name3] = nn.LeakyReLU(negative_slope=nn.Parameter(torch.tensor(0.5), requires_grad=False), inplace=True)
                        # setattr(m3, 'Relu', nn.LeakyReLU(negative_slope=nn.Parameter(torch.tensor(0.1), requires_grad=False), inplace=True))

backbone
features
avgpool
classifier
0
1
2
3
4
got it
5
6
classifier


In [25]:
print(finetuneModel)

Sequential(
  (backbone): VGG(
    (features): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (6): ReLU(inplace=True)
      (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): ReLU(inplace=True)
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): ReLU(inplace=True)
      (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (13): ReLU(inplace=True)
      (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (15): ReLU(inplace=True)
      

In [26]:
dense_model_accuracy = evaluate(finetuneModel, dataloader['test'])
dense_model_size = get_model_size(finetuneModel)
print(f"dense model has accuracy={dense_model_accuracy:.2f}%")
print(f"dense model has size={dense_model_size/MiB:.2f} MiB")

                                                     

dense model has accuracy=74.67%
dense model has size=527.83 MiB




In [27]:
num_finetune_epochs = 5
optimizer = torch.optim.SGD(finetuneModel.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, num_finetune_epochs)
criterion = nn.CrossEntropyLoss()

best_sparse_model_checkpoint = dict()
best_accuracy = 0
print(f'Finetuning Fine-grained Pruned Sparse Model')
for epoch in range(num_finetune_epochs):
    # At the end of each train iteration, we have to apply the pruning mask 
    #    to keep the model sparse during the training
    train(finetuneModel, dataloader['train'], criterion, optimizer, scheduler,
          callbacks=None)
    accuracy = evaluate(finetuneModel, dataloader['test'])
    is_best = accuracy > best_accuracy
    if is_best:
        best_sparse_model_checkpoint['state_dict'] = copy.deepcopy(finetuneModel.state_dict())
        best_accuracy = accuracy
    print(f'    Epoch {epoch+1} Accuracy {accuracy:.2f}% / Best Accuracy: {best_accuracy:.2f}%')

Finetuning Fine-grained Pruned Sparse Model


                                                      

    Epoch 1 Accuracy 87.72% / Best Accuracy: 87.72%


                                                      

    Epoch 2 Accuracy 88.06% / Best Accuracy: 88.06%


                                                      

    Epoch 3 Accuracy 85.49% / Best Accuracy: 88.06%


                                                      

    Epoch 4 Accuracy 86.87% / Best Accuracy: 88.06%


                                                      

    Epoch 5 Accuracy 88.74% / Best Accuracy: 88.74%




In [28]:
dense_model_accuracy = evaluate(finetuneModel, dataloader['test'])
dense_model_size = get_model_size(finetuneModel)
print(f"dense model has accuracy={dense_model_accuracy:.2f}%")
print(f"dense model has size={dense_model_size/MiB:.2f} MiB")

                                                     

dense model has accuracy=88.72%
dense model has size=527.83 MiB




In [41]:
for name, m in finetuneModel.named_children():
    print(name)
    if name == 'backbone':
        for name2, m2 in m.named_children():
            print(name2)
            if name2 == 'classifier':
                for name3, m3 in m2.named_children():
                    print(name3)
                    if name3 == '4' and isinstance(m3, nn.LeakyReLU):
                        print("got it")
                        m2._modules[name3] = nn.LeakyReLU(negative_slope=nn.Parameter(torch.tensor(1), requires_grad=False), inplace=True)

backbone
features
avgpool
classifier
0
1
2
3
4
got it
5
6
classifier


In [42]:
print(finetuneModel)

Sequential(
  (backbone): VGG(
    (features): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (6): ReLU(inplace=True)
      (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): ReLU(inplace=True)
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): ReLU(inplace=True)
      (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (13): ReLU(inplace=True)
      (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (15): ReLU(inplace=True)
      

In [43]:
num_finetune_epochs = 5
optimizer = torch.optim.SGD(finetuneModel.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, num_finetune_epochs)
criterion = nn.CrossEntropyLoss()

best_sparse_model_checkpoint = dict()
best_accuracy = 0
print(f'Finetuning Fine-grained Pruned Sparse Model')
for epoch in range(num_finetune_epochs):
    # At the end of each train iteration, we have to apply the pruning mask 
    #    to keep the model sparse during the training
    train(finetuneModel, dataloader['train'], criterion, optimizer, scheduler,
          callbacks=None)
    accuracy = evaluate(finetuneModel, dataloader['test'])
    is_best = accuracy > best_accuracy
    if is_best:
        best_sparse_model_checkpoint['state_dict'] = copy.deepcopy(finetuneModel.state_dict())
        best_accuracy = accuracy
    print(f'    Epoch {epoch+1} Accuracy {accuracy:.2f}% / Best Accuracy: {best_accuracy:.2f}%')

Finetuning Fine-grained Pruned Sparse Model


                                                      

    Epoch 1 Accuracy 90.19% / Best Accuracy: 90.19%


                                                      

    Epoch 2 Accuracy 86.05% / Best Accuracy: 90.19%


                                                      

    Epoch 3 Accuracy 89.85% / Best Accuracy: 90.19%


                                                      

    Epoch 4 Accuracy 89.82% / Best Accuracy: 90.19%


                                                      

    Epoch 5 Accuracy 89.38% / Best Accuracy: 90.19%




In [44]:
dense_model_accuracy = evaluate(finetuneModel, dataloader['test'])
dense_model_size = get_model_size(finetuneModel)
print(f"dense model has accuracy={dense_model_accuracy:.2f}%")
print(f"dense model has size={dense_model_size/MiB:.2f} MiB")

                                                     

dense model has accuracy=89.22%
dense model has size=527.83 MiB




In [45]:
w3 = 0
for name, m in finetuneModel.named_children():
    print(name)
    if name == 'backbone':
        for name2, m2 in m.named_children():
            print(name2)
            if name2 == 'classifier':
                for name3, m3 in m2.named_children():
                    print(name3)
                    if name3 == '3':
                        print("got it")
                        #get weights of mateix of liniear
                        w3 = m3.weight.data

backbone
features
avgpool
classifier
0
1
2
3
got it
4
5
6
classifier


In [49]:
w3.shape

torch.Size([4096, 4096])

In [50]:
w6 = 0
for name, m in finetuneModel.named_children():
    print(name)
    if name == 'backbone':
        for name2, m2 in m.named_children():
            print(name2)
            if name2 == 'classifier':
                for name3, m3 in m2.named_children():
                    print(name3)
                    if name3 == '6':
                        print("got it")
                        #get weights of mateix of liniear
                        w6 = m3.weight.data

backbone
features
avgpool
classifier
0
1
2
3
4
5
6
got it
classifier


In [51]:
w6.shape

torch.Size([1000, 4096])

In [53]:
new_w6 = torch.matmul(w6, w3)
new_w6.shape

torch.Size([1000, 4096])

In [54]:
for name, m in finetuneModel.named_children():
    print(name)
    if name == 'backbone':
        for name2, m2 in m.named_children():
            print(name2)
            if name2 == 'classifier':
                for name3, m3 in m2.named_children():
                    print(name3)
                    if name3 == '3' or name3 == '4' or name3 == '5':
                        print("got it")
                        # remover this layer
                        m2._modules[name3] = nn.Identity()


backbone
features
avgpool
classifier
0
1
2
3
got it
4
got it
5
got it
6
classifier


In [55]:
for name, m in finetuneModel.named_children():
    print(name)
    if name == 'backbone':
        for name2, m2 in m.named_children():
            print(name2)
            if name2 == 'classifier':
                for name3, m3 in m2.named_children():
                    print(name3)
                    if name3 == '6':
                        print("got it")
                        #get weights of mateix of liniear
                        m3.weight.data = new_w6

backbone
features
avgpool
classifier
0
1
2
3
4
5
6
got it
classifier


In [56]:
print(finetuneModel)

Sequential(
  (backbone): VGG(
    (features): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (6): ReLU(inplace=True)
      (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): ReLU(inplace=True)
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): ReLU(inplace=True)
      (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (13): ReLU(inplace=True)
      (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (15): ReLU(inplace=True)
      

In [57]:
dense_model_accuracy = evaluate(finetuneModel, dataloader['test'])
dense_model_size = get_model_size(finetuneModel)
print(f"dense model has accuracy={dense_model_accuracy:.2f}%")
print(f"dense model has size={dense_model_size/MiB:.2f} MiB")

                                                     

dense model has accuracy=88.89%
dense model has size=463.81 MiB




# Now remove the other relu

In [61]:
for name, m in finetuneModel.named_children():
    print(name)
    if name == 'backbone':
        for name2, m2 in m.named_children():
            print(name2)
            if name2 == 'classifier':
                for name3, m3 in m2.named_children():
                    print(name3)
                    if name3 == '1' and isinstance(m3, nn.LeakyReLU):
                        print("got it")
                        m2._modules[name3] = nn.LeakyReLU(negative_slope=nn.Parameter(torch.tensor(1), requires_grad=False), inplace=True)

backbone
features
avgpool
classifier
0
1
got it
2
3
4
5
6
classifier


In [62]:
print(finetuneModel)

Sequential(
  (backbone): VGG(
    (features): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (6): ReLU(inplace=True)
      (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): ReLU(inplace=True)
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): ReLU(inplace=True)
      (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (13): ReLU(inplace=True)
      (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (15): ReLU(inplace=True)
      

In [63]:
dense_model_accuracy = evaluate(finetuneModel, dataloader['test'])
dense_model_size = get_model_size(finetuneModel)
print(f"dense model has accuracy={dense_model_accuracy:.2f}%")
print(f"dense model has size={dense_model_size/MiB:.2f} MiB")

                                                     

dense model has accuracy=88.54%
dense model has size=463.81 MiB




In [64]:
num_finetune_epochs = 5
optimizer = torch.optim.SGD(finetuneModel.parameters(), lr=0.001, momentum=0.9, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, num_finetune_epochs)
criterion = nn.CrossEntropyLoss()

best_sparse_model_checkpoint = dict()
best_accuracy = 0
print(f'Finetuning Fine-grained Pruned Sparse Model')
for epoch in range(num_finetune_epochs):
    # At the end of each train iteration, we have to apply the pruning mask 
    #    to keep the model sparse during the training
    train(finetuneModel, dataloader['train'], criterion, optimizer, scheduler,
          callbacks=None)
    accuracy = evaluate(finetuneModel, dataloader['test'])
    is_best = accuracy > best_accuracy
    if is_best:
        best_sparse_model_checkpoint['state_dict'] = copy.deepcopy(finetuneModel.state_dict())
        best_accuracy = accuracy
    print(f'    Epoch {epoch+1} Accuracy {accuracy:.2f}% / Best Accuracy: {best_accuracy:.2f}%')

Finetuning Fine-grained Pruned Sparse Model


                                                      

    Epoch 1 Accuracy 88.77% / Best Accuracy: 88.77%


                                                      

    Epoch 2 Accuracy 90.53% / Best Accuracy: 90.53%


                                                      

    Epoch 3 Accuracy 90.61% / Best Accuracy: 90.61%


                                                      

    Epoch 4 Accuracy 90.64% / Best Accuracy: 90.64%


                                                      

    Epoch 5 Accuracy 90.75% / Best Accuracy: 90.75%




In [65]:
w6 = 0
for name, m in finetuneModel.named_children():
    print(name)
    if name == 'backbone':
        for name2, m2 in m.named_children():
            print(name2)
            if name2 == 'classifier':
                for name3, m3 in m2.named_children():
                    print(name3)
                    if name3 == '0':
                        print("got it")
                        #get weights of mateix of liniear
                        w0 = m3.weight.data

backbone
features
avgpool
classifier
0
got it
1
2
3
4
5
6
classifier


In [66]:
w0.shape

torch.Size([4096, 25088])

In [67]:
w6 = 0
for name, m in finetuneModel.named_children():
    print(name)
    if name == 'backbone':
        for name2, m2 in m.named_children():
            print(name2)
            if name2 == 'classifier':
                for name3, m3 in m2.named_children():
                    print(name3)
                    if name3 == '6':
                        print("got it")
                        #get weights of mateix of liniear
                        w6 = m3.weight.data

backbone
features
avgpool
classifier
0
1
2
3
4
5
6
got it
classifier


In [68]:
w6.shape

torch.Size([1000, 4096])

In [70]:
new_w6 = torch.matmul(w6, w0)

In [71]:
for name, m in finetuneModel.named_children():
    print(name)
    if name == 'backbone':
        for name2, m2 in m.named_children():
            print(name2)
            if name2 == 'classifier':
                for name3, m3 in m2.named_children():
                    print(name3)
                    if name3 == '0' or name3 == '1' or name3 == '2':
                        print("got it")
                        # remover this layer
                        m2._modules[name3] = nn.Identity()

backbone
features
avgpool
classifier
0
got it
1
got it
2
got it
3
4
5
6
classifier


In [72]:
for name, m in finetuneModel.named_children():
    print(name)
    if name == 'backbone':
        for name2, m2 in m.named_children():
            print(name2)
            if name2 == 'classifier':
                for name3, m3 in m2.named_children():
                    print(name3)
                    if name3 == '6':
                        print("got it")
                        #get weights of mateix of liniear
                        m3.weight.data = new_w6

backbone
features
avgpool
classifier
0
1
2
3
4
5
6
got it
classifier


In [73]:
dense_model_accuracy = evaluate(finetuneModel, dataloader['test'])
dense_model_size = get_model_size(finetuneModel)
print(f"dense model has accuracy={dense_model_accuracy:.2f}%")
print(f"dense model has size={dense_model_size/MiB:.2f} MiB")

                                                     

dense model has accuracy=90.81%
dense model has size=151.88 MiB




In [74]:
print(finetuneModel)

Sequential(
  (backbone): VGG(
    (features): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (6): ReLU(inplace=True)
      (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): ReLU(inplace=True)
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): ReLU(inplace=True)
      (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (13): ReLU(inplace=True)
      (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (15): ReLU(inplace=True)
      

In [75]:
w_classifier = 0
for name, m in finetuneModel.named_children():
    print(name)
    if name == 'classifier':
        w_classifier = m.weight.data

backbone
classifier


In [76]:
w_classifier.shape

torch.Size([10, 1000])

In [78]:
w6 = 0
for name, m in finetuneModel.named_children():
    print(name)
    if name == 'backbone':
        for name2, m2 in m.named_children():
            print(name2)
            if name2 == 'classifier':
                for name3, m3 in m2.named_children():
                    print(name3)
                    if name3 == '6':
                        print("got it")
                        #get weights of mateix of liniear
                        w6 = m3.weight.data

backbone
features
avgpool
classifier
0
1
2
3
4
5
6
got it
classifier


In [79]:
w6.shape

torch.Size([1000, 25088])

In [81]:
new_classifier = torch.matmul(w_classifier, w6)

In [82]:
for name, m in finetuneModel.named_children():
    print(name)
    if name == 'backbone':
        for name2, m2 in m.named_children():
            print(name2)
            if name2 == 'classifier':
                for name3, m3 in m2.named_children():
                    print(name3)
                    if name3 == '6':
                        print("got it")
                        # remover this layer
                        m2._modules[name3] = nn.Identity()

backbone
features
avgpool
classifier
0
1
2
3
4
5
6
got it
classifier


In [85]:
for name, m in finetuneModel.named_children():
    print(name)
    if name == 'classifier':
        m.weight.data = new_classifier

backbone
classifier


In [86]:
dense_model_accuracy = evaluate(finetuneModel, dataloader['test'])
dense_model_size = get_model_size(finetuneModel)
print(f"dense model has accuracy={dense_model_accuracy:.2f}%")
print(f"dense model has size={dense_model_size/MiB:.2f} MiB")

                                                     

dense model has accuracy=90.82%
dense model has size=57.09 MiB




In [89]:
dummy_input = torch.randn(1, 3, 32, 32).cuda()
get_model_macs(finetuneModel, dummy_input)

313472512

In [90]:
# save the model
torch.save(finetuneModel.state_dict(), 'finetuneModel.pth')

In [104]:
# test inference time
import time
finetuneModel.eval()
average_time = 0
with torch.no_grad():
    for i in range(1000):
        start = time.time()
        output = finetuneModel(dummy_input)
        end = time.time()
        average_time += (end - start)

print(f"average inference time is {average_time/1000:.4f} seconds")

average inference time is 0.0011 seconds


In [101]:
model_ = models.vgg16(pretrained=True)
old_finetuneModel = nn.Sequential(OrderedDict([('backbone', model_), ('classifier', nn.Linear(1000, 10))])).cuda()



In [102]:
dummy_input = torch.randn(1, 3, 32, 32).cuda()
get_model_macs(old_finetuneModel, dummy_input)

436865296

In [105]:
old_finetuneModel.eval()
average_time = 0
with torch.no_grad():
    for i in range(1000):
        start = time.time()
        output = old_finetuneModel(dummy_input)
        end = time.time()
        average_time += (end - start)

print(f"average inference time is {average_time/1000:.4f} seconds")

average inference time is 0.0029 seconds
