In [20]:
import os
import sys
from tqdm import tqdm
from ranger import Ranger
from tensorboardX import SummaryWriter

import torch
from torch import nn
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch.nn.utils.prune as prune
import torch.nn.init as init

import torchvision
from torchvision import datasets
from torchvision import transforms
from torchvision.transforms import Normalize
from torchmetrics import Accuracy
import torchvision.utils as vutils

import torch.optim as optim
from cleverhans.torch.attacks.projected_gradient_descent import (projected_gradient_descent)

import quantus
import captum
from captum.attr import Saliency, IntegratedGradients, NoiseTunnel

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import random
import copy
import gc

import warnings
warnings.filterwarnings('ignore')

In [21]:
%run utils.ipynb

In [22]:
writer = SummaryWriter()

In [23]:
# Plotting Style
sns.set_style('darkgrid')

In [24]:
print(torch.cuda.is_available())

True


In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [26]:
batch_size = 64
train_path = '../datasets/imagenette2/train'
val_path = '../datasets/imagenette2/val'

In [27]:
train_dataloader = DataLoader(datasets.ImageFolder(train_path, transform = transforms.Compose([
                                                                    transforms.RandomResizedCrop(224),
                                                                    transforms.RandomHorizontalFlip(),
                                                                    transforms.ToTensor(),
                                                                    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                                                        std=[0.229, 0.224, 0.225])
                                                            ])), batch_size = batch_size, shuffle=True)

test_dataloader = DataLoader(datasets.ImageFolder(val_path,
                                                               transform=transforms.Compose([
                                                                   transforms.ToTensor(),
                                                                   transforms.Resize([224, 224]),
                                                                   transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                                                        std=[0.229, 0.224, 0.225])
                                                               ])),batch_size=batch_size, shuffle=False)

In [28]:
classes = ('tench', 'springer', 'casette_player', 'chain_saw','church', 'French_horn', 'garbage_truck', 'gas_pump', 'golf_ball', 'parachute')


In [29]:
%run models.ipynb

In [32]:
model = resnet18_features(pretrained=False, filter='None', filter_layer=0).to(device)
learning_rate = 1e-4
start_iter = 0
end_iter = 50
print_freq = 1
valid_freq = 1
prune_type = 'lt'
prune_percent = 10
prune_iterations = 35


In [33]:
model.apply(weight_init)

ResNet_features(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (global_pool): AvgPool2d(kernel_size=7, stride=7, padding=0)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, af

In [34]:
initial_state_dict = copy.deepcopy(model.state_dict())
checkdir(f"{os.getcwd()}/saves/resnet/imagenette/")
torch.save(model, f"{os.getcwd()}/saves/resnet/imagenette/initial_state_dict_lt.pth.tar")

In [35]:
make_mask(model)

In [37]:
# optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-4)
# criterion = nn.CrossEntropyLoss() 
criterion = nn.CrossEntropyLoss(reduction="mean").cuda()
optimizer = Ranger(model.parameters(), weight_decay=1e-2, eps = 1e-06)

Ranger optimizer loaded. 
Gradient Centralization usage = True
GC applied to both conv and fc layers


In [38]:
for name, param in model.named_parameters():
        print(name, param.size())

conv1.weight torch.Size([64, 3, 7, 7])
bn1.weight torch.Size([64])
bn1.bias torch.Size([64])
layer1.0.conv1.weight torch.Size([64, 64, 3, 3])
layer1.0.bn1.weight torch.Size([64])
layer1.0.bn1.bias torch.Size([64])
layer1.0.conv2.weight torch.Size([64, 64, 3, 3])
layer1.0.bn2.weight torch.Size([64])
layer1.0.bn2.bias torch.Size([64])
layer1.1.conv1.weight torch.Size([64, 64, 3, 3])
layer1.1.bn1.weight torch.Size([64])
layer1.1.bn1.bias torch.Size([64])
layer1.1.conv2.weight torch.Size([64, 64, 3, 3])
layer1.1.bn2.weight torch.Size([64])
layer1.1.bn2.bias torch.Size([64])
layer2.0.conv1.weight torch.Size([128, 64, 3, 3])
layer2.0.bn1.weight torch.Size([128])
layer2.0.bn1.bias torch.Size([128])
layer2.0.conv2.weight torch.Size([128, 128, 3, 3])
layer2.0.bn2.weight torch.Size([128])
layer2.0.bn2.bias torch.Size([128])
layer2.0.downsample.0.weight torch.Size([128, 64, 1, 1])
layer2.0.downsample.1.weight torch.Size([128])
layer2.0.downsample.1.bias torch.Size([128])
layer2.1.conv1.weight tor

In [39]:
# Pruning
# NOTE First Pruning Iteration is of No Compression
bestacc = 0.0
best_accuracy = 0
ITERATION = prune_iterations
comp = np.zeros(ITERATION,float)
bestacc = np.zeros(ITERATION,float)
step = 0
all_loss = np.zeros(end_iter,float)
all_accuracy = np.zeros(end_iter,float)
ITE=1

In [40]:
for _ite in range(start_iter, ITERATION):
    if not _ite == 0:
        prune_by_percentile(prune_percent, resample=False, reinit=False)
        original_initialization(mask, initial_state_dict)
        optimizer = Ranger(model.parameters(), lr=learning_rate, weight_decay=1e-2, eps = 1e-06)
        
    print(f"\n--- Pruning Level [{ITE}:{_ite}/{ITERATION}]: ---")

    # Print the table of Nonzeros in each layer
    comp1 = print_nonzeros(model)
    comp[_ite] = comp1
    pbar = tqdm(range(end_iter))

    for iter_ in pbar:

        # Frequency for Testing
        if iter_ % valid_freq == 0:
            accuracy = test(model, test_dataloader, criterion)

            # Save Weights
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                checkdir(f"{os.getcwd()}/saves/resnet/imagenette/")
                torch.save(model,f"{os.getcwd()}/saves/resnet/imagenette/{_ite}_model_lt.pth.tar")

        # Training
        loss = train(model, train_dataloader, optimizer, criterion)
        all_loss[iter_] = loss
        all_accuracy[iter_] = accuracy
        
        # Frequency for Printing Accuracy and Loss
        if iter_ % print_freq == 0:
            pbar.set_description(
                f'Train Epoch: {iter_}/{end_iter} Loss: {loss:.6f} Accuracy: {accuracy:.2f}% Best Accuracy: {best_accuracy:.2f}%')       

    writer.add_scalar('Accuracy/test', best_accuracy, comp1)
    bestacc[_ite]=best_accuracy

    # Plotting Loss (Training), Accuracy (Testing), Iteration Curve
    #NOTE Loss is computed for every iteration while Accuracy is computed only for every {args.valid_freq} iterations. Therefore Accuracy saved is constant during the uncomputed iterations.
    #NOTE Normalized the accuracy to [0,100] for ease of plotting.
    plt.plot(np.arange(1,(end_iter)+1), 100*(all_loss - np.min(all_loss))/np.ptp(all_loss).astype(float), c="blue", label="Loss") 
    plt.plot(np.arange(1,(end_iter)+1), all_accuracy, c="red", label="Accuracy") 
    plt.title(f"Loss Vs Accuracy Vs Iterations (imagenette,resnet)") 
    plt.xlabel("Iterations") 
    plt.ylabel("Loss and Accuracy") 
    plt.legend() 
    plt.grid(color="gray") 
    checkdir(f"{os.getcwd()}/plots/lt/resnet/imagenette/")
    plt.savefig(f"{os.getcwd()}/plots/lt/resnet/imagenette/lt_LossVsAccuracy_{comp1}.png", dpi=1200) 
    plt.close()

    # Dump Plot values
    checkdir(f"{os.getcwd()}/dumps/lt/resnet/imagenette/")
    all_loss.dump(f"{os.getcwd()}/dumps/lt/resnet/imagenette/lt_all_loss_{comp1}.dat")
    all_accuracy.dump(f"{os.getcwd()}/dumps/lt/resnet/imagenette/lt_all_accuracy_{comp1}.dat")
    
    # Dumping mask
    checkdir(f"{os.getcwd()}/dumps/lt/resnet/imagenette/")
    with open(f"{os.getcwd()}/dumps/lt/resnet/imagenette/lt_mask_{comp1}.pkl", 'wb') as fp:
        pickle.dump(mask, fp)
    
    # Making variables into 0
    best_accuracy = 0
    all_loss = np.zeros(end_iter,float)
    all_accuracy = np.zeros(end_iter,float)

# Dumping Values for Plotting
checkdir(f"{os.getcwd()}/dumps/lt/resnet/imagenette/")
comp.dump(f"{os.getcwd()}/dumps/lt/resnet/imagenette/lt_compression.dat")
bestacc.dump(f"{os.getcwd()}/dumps/lt/resnet/imagenette/lt_bestaccuracy.dat")

# Plotting
a = np.arange(prune_iterations)
plt.plot(a, bestacc, c="blue", label="Winning tickets") 
plt.title(f"Test Accuracy vs Unpruned Weights Percentage (imagenette,resnet)") 
plt.xlabel("Unpruned Weights Percentage") 
plt.ylabel("test accuracy") 
plt.xticks(a, comp, rotation ="vertical") 
plt.ylim(0,100)
plt.legend() 
plt.grid(color="gray") 
checkdir(f"{os.getcwd()}/plots/lt/resnet/imagenette/")
plt.savefig(f"{os.getcwd()}/plots/lt/resnet/imagenette/lt_AccuracyVsWeights.png", dpi=1200) 
plt.close()      


--- Pruning Level [1:0/35]: ---
conv1.weight         | nonzeros =    9408 /    9408 (100.00%) | total_pruned =       0 | shape = (64, 3, 7, 7)
bn1.weight           | nonzeros =      64 /      64 (100.00%) | total_pruned =       0 | shape = (64,)
bn1.bias             | nonzeros =       0 /      64 (  0.00%) | total_pruned =      64 | shape = (64,)
layer1.0.conv1.weight | nonzeros =   36864 /   36864 (100.00%) | total_pruned =       0 | shape = (64, 64, 3, 3)
layer1.0.bn1.weight  | nonzeros =      64 /      64 (100.00%) | total_pruned =       0 | shape = (64,)
layer1.0.bn1.bias    | nonzeros =       0 /      64 (  0.00%) | total_pruned =      64 | shape = (64,)
layer1.0.conv2.weight | nonzeros =   36864 /   36864 (100.00%) | total_pruned =       0 | shape = (64, 64, 3, 3)
layer1.0.bn2.weight  | nonzeros =      64 /      64 (100.00%) | total_pruned =       0 | shape = (64,)
layer1.0.bn2.bias    | nonzeros =       0 /      64 (  0.00%) | total_pruned =      64 | shape = (64,)
layer1.1.con

Train Epoch: 49/50 Loss: 1.133616 Accuracy: 47.34% Best Accuracy: 62.19%: 100%|██████| 50/50 [1:07:42<00:00, 81.24s/it]


Ranger optimizer loaded. 
Gradient Centralization usage = True
GC applied to both conv and fc layers

--- Pruning Level [1:1/35]: ---
conv1.weight         | nonzeros =    8467 /    9408 ( 90.00%) | total_pruned =     941 | shape = (64, 3, 7, 7)
bn1.weight           | nonzeros =      57 /      64 ( 89.06%) | total_pruned =       7 | shape = (64,)
bn1.bias             | nonzeros =       0 /      64 (  0.00%) | total_pruned =      64 | shape = (64,)
layer1.0.conv1.weight | nonzeros =   33177 /   36864 ( 90.00%) | total_pruned =    3687 | shape = (64, 64, 3, 3)
layer1.0.bn1.weight  | nonzeros =      57 /      64 ( 89.06%) | total_pruned =       7 | shape = (64,)
layer1.0.bn1.bias    | nonzeros =       0 /      64 (  0.00%) | total_pruned =      64 | shape = (64,)
layer1.0.conv2.weight | nonzeros =   33177 /   36864 ( 90.00%) | total_pruned =    3687 | shape = (64, 64, 3, 3)
layer1.0.bn2.weight  | nonzeros =      57 /      64 ( 89.06%) | total_pruned =       7 | shape = (64,)
layer1.0.bn2.b

Train Epoch: 21/50 Loss: 1.444575 Accuracy: 53.20% Best Accuracy: 53.20%:  44%|███▌    | 22/50 [30:31<38:50, 83.24s/it]


KeyboardInterrupt: 