In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [70]:
import argparse
import os
import random
import shutil
import time
import warnings

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
import copy

from datetime import datetime

# Sorted names of the public, all-lowercase callables exposed by
# torchvision.models (the names usable as `models.__dict__[name]`).
model_names = sorted(
    attr_name
    for attr_name, attr_value in vars(models).items()
    if attr_name.islower()
    and not attr_name.startswith("__")
    and callable(attr_value)
)

In [3]:
!pwd

/private/home/riohib/explore/gsp-for-deeplearning/imagenet


In [4]:
import sys
sys.path.append('./models')


from main_model import *
import sys 
sys.path.append('/data/users2/rohib/github/testing')
import utils_gsp.sps_tools as sps_tools
import utils_gsp.gpu_projection as gsp_gpu

import sys
sys.path.append('./models')
import models.resnet_torch as ResNet


from models.finetuners import *
from apply_gsp import GSP_Model

In [5]:
# model = models.__dict__['resnet18'](pretrained=False)

In [35]:
class Args:
    """Hard-coded replacement for the training script's command-line arguments.

    All fields are plain class attributes. `get_model` reads `seed`, `gpu`,
    `dist_url`, `world_size` and `multiprocessing_distributed` and then passes
    the whole object on to `main_worker`; most of the remaining fields are
    presumably consumed there (not visible in this notebook).
    """
    # NOTE(review): machine-specific absolute path; presumably the ImageNet root — confirm.
    data = '/datasets01/imagenet_full_size/061417/'
    arch = 'resnet50'
    workers = 4
    epochs = 1
    start_epoch = 0
    batch_size = 16
    lr = 0.1
    momentum = 0.9
    weight_decay = 1e-4
    print_freq = 10
    # Same file that the strip_masks_from_model cell writes with torch.save.
    resume = "/private/home/riohib/explore/gsp-for-deeplearning/imagenet/results/gsp_S80_fts90/mask_stripped_best.pth.tar"
    evaluate = False
    pretrained = False
    # get_model only replaces -1 by $WORLD_SIZE when dist_url == "env://".
    world_size = -1
    dist_url = 'tcp://224.66.41.62:23456'
    dist_backend = 'nccl'
    seed = None  # None: get_model leaves random/torch unseeded
    gpu = None  # None: get_model emits no "specific GPU" warning
    multiprocessing_distributed = False  # False: get_model calls main_worker directly instead of mp.spawn

# The class object itself (not an instance) serves as the namespace, so later
# assignments such as `args.resume = ...` or `args.distributed = ...` modify Args.
args = Args

In [36]:
# Bare expression that is not the last line of the cell, so nothing is displayed.
args.multiprocessing_distributed
# Alias for the utils_gsp.gpu_projection module imported above as gsp_gpu.
gsp_func = gsp_gpu
# NOTE(review): presumably the target sparsity; the bare name `sps` is not read by any cell shown here — confirm.
sps = 0.8

In [87]:
def get_abs_sps(model):
    """Measure the absolute sparsity over all parameters of `model`.

    Every tensor yielded by `model.parameters()` is counted, not only
    convolution / linear weights.

    Args:
        model: a `torch.nn.Module` (anything exposing `parameters()`).

    Returns:
        A tuple `(abs_sps, total, zeros)`: the percentage (0-100) of entries
        that are exactly zero, the total number of parameter entries, and the
        number of exactly-zero entries. A model without any parameter entries
        yields `(0.0, 0, 0)` instead of dividing by zero.
    """
    nonzero = total = 0

    for param in model.parameters():
        # count_nonzero only reads the tensor, so the weights need not be cloned.
        tensor = param.detach()
        nonzero += torch.count_nonzero(tensor).item()
        total += tensor.numel()

    if total == 0:
        # Parameter-free module: report "no sparsity" rather than raising
        # ZeroDivisionError.
        return 0.0, 0, 0

    zeros = total - nonzero
    abs_sps = 100 * zeros / total
    return abs_sps, total, zeros

In [62]:
def get_model(args):
    """Prepare the run from `args` and obtain the models via `main_worker`.

    Seeds `random` and `torch` when `args.seed` is set, resolves
    `args.world_size` from the WORLD_SIZE environment variable for `env://`
    rendezvous, stores `args.distributed`, and then calls `main_worker`
    (defined outside this cell) in the current process.

    Args:
        args: namespace-like object providing `seed`, `gpu`, `dist_url`,
            `world_size` and `multiprocessing_distributed`. It is mutated
            (`world_size`, `distributed`) and forwarded to `main_worker`.

    Returns:
        The tuple `(sparse_model, dense_model, train_loader, optimizer,
        criterion)` produced by `main_worker`.

    Raises:
        RuntimeError: when `args.multiprocessing_distributed` is set. The
            worker processes are still spawned, but their results live in
            those processes and cannot be returned from here. This function
            used to fail at its `return` with an UnboundLocalError instead.
    """
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                        'This will turn on the CUDNN deterministic setting, '
                        'which can slow down your training considerably! '
                        'You may see unexpected behavior when restarting '
                        'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                        'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
        # mp.spawn does not hand the workers' return values back, so there is
        # nothing to return; fail with an explicit message.
        raise RuntimeError(
            "get_model cannot return models when "
            "args.multiprocessing_distributed is set: main_worker ran in "
            "spawned processes. Use multiprocessing_distributed=False to get "
            "the models in this process."
        )

    # Single-process path: main_worker runs here and returns the objects directly.
    sparse_model, dense_model, train_loader, optimizer, criterion = main_worker(
        args.gpu, ngpus_per_node, args)
    return sparse_model, dense_model, train_loader, optimizer, criterion

In [63]:
chkpt = torch.load("/private/home/riohib/explore/gsp-for-deeplearning/imagenet/results/gsp_S80_fts90/model_best.pth.tar")

In [64]:
def strip_masks_from_model(model_path = "/private/home/riohib/explore/gsp-for-deeplearning/imagenet/results/gsp_S80_fts90/model_best.pth.tar", map_location=None):
    """Load a checkpoint and drop every mask entry from its state dict.

    Args:
        model_path: path of a checkpoint saved with `torch.save` that contains
            a 'state_dict' entry.
        map_location: forwarded to `torch.load`. The default `None` keeps the
            previous behaviour (tensors are restored to the device they were
            saved from); pass e.g. 'cpu' to strip a GPU checkpoint on a machine
            without that GPU, or without spending GPU memory on it.

    Returns:
        The loaded checkpoint dict. Its 'state_dict' is modified in place so
        that no key containing the substring 'mask' remains; all other entries
        are untouched.
    """
    chkpt = torch.load(model_path, map_location=map_location)
    state_dict = chkpt['state_dict']

    # Collect the keys first: entries cannot be deleted while iterating the
    # mapping. Deleting in place keeps the original mapping object.
    mask_keys = [key for key in state_dict if 'mask' in key]
    for key in mask_keys:
        del state_dict[key]

    return chkpt

In [65]:
# Remove the mask entries from the best GSP checkpoint and store the result
# next to it; the output path is the one Args.resume points at.
best_ckpt_path = "/private/home/riohib/explore/gsp-for-deeplearning/imagenet/results/gsp_S80_fts90/model_best.pth.tar"
stripped_ckpt_path = "/private/home/riohib/explore/gsp-for-deeplearning/imagenet/results/gsp_S80_fts90/mask_stripped_best.pth.tar"

chkpt = strip_masks_from_model(model_path=best_ckpt_path)
torch.save(chkpt, stripped_ckpt_path)

In [66]:
sparse_model, dense_model, train_loader, optimizer, criterion = get_model(args)

=> creating model 'resnet50'
Created model from PyTorch Models! 

In final Clause! and no DDP returning
=> loading checkpoint '/private/home/riohib/explore/gsp-for-deeplearning/imagenet/results/gsp_S80_fts90/mask_stripped_best.pth.tar'
=> loaded checkpoint '/private/home/riohib/explore/gsp-for-deeplearning/imagenet/results/gsp_S80_fts90/mask_stripped_best.pth.tar' (epoch 241)


In [71]:
sps_cp_from_dense = copy.deepcopy(dense_model)

In [94]:
# Snapshot the sparse model's parameters: a name -> Parameter mapping and, in
# the same order, independent copies of the parameter values.
params_d = dict(sparse_model.named_parameters())
params_l = [param.data.clone() for param in params_d.values()]

In [95]:
# Overwrite each parameter of the dense copy with the values snapshotted from
# the sparse model. The pairing is purely positional (i-th parameter <-
# params_l[i]), so it relies on both models yielding their parameters in the
# same order; names and shapes are not checked.
name_l = list()
for i, (name, params) in enumerate(sps_cp_from_dense.named_parameters()):
    name_l.append(name)
    # params = params_d[name]
    # Rebinds .data to the snapshot tensor itself (no copy), so the parameter
    # and params_l[i] share storage afterwards.
    params.data = params_l[i]
    

In [85]:
# sps_cp_from_dense

In [105]:
get_abs_sps(sps_cp_from_dense)

(89.90632402072353, 25557032, 22977388)

In [97]:
get_abs_sps(sparse_model)

(89.90632402072353, 25557032, 22977388)

In [104]:
get_abs_sps(dense_model)

(0.0, 25557032, 0)

In [102]:
# dense_model

-----
##### CKA (Centered Kernel Alignment): sparse vs. dense ResNet-50 representations

In [20]:
resnet50 = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_resnet50', pretrained=True)

Using cache found in /private/home/riohib/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub


In [106]:
from torch_cka import CKA

dataloader = train_loader

# Model 1 is the dense-architecture copy that now carries the sparse weights;
# model 2 is the untouched dense model. The labels must follow that order
# (they were previously swapped, which mislabels the exported results).
#
# NOTE(review): all layers are compared by default and the whole training
# loader is iterated; the recorded run was interrupted after 85 of 80073
# batches at ~15 s each. Consider passing model1_layers / model2_layers and a
# smaller dataloader before running this for real.
cka = CKA(sps_cp_from_dense, dense_model,
          model1_name="sparse_resnet50",
          model2_name="dense_resnet50",
          device='cuda')

# No second dataloader is given, so the same one is used for both models.
cka.compare(dataloader)

# Dict that contains the model names, the layer names and the CKA matrix.
results = cka.export()
                        # and the CKA matrix

  warn("Model 1 seems to have a lot of layers. " \
  warn("Model 2 seems to have a lot of layers. " \
  warn("Dataloader for Model 2 is not given. Using the same dataloader for both models.")
| Comparing features |:   0%|          | 85/80073 [22:01<345:33:36, 15.55s/it]


KeyboardInterrupt: 

In [16]:
# NOTE(review): `model` is not assigned by any cell shown in this notebook; it
# must come from earlier kernel state or from one of the star imports — confirm
# before relying on Restart & Run All.
model_gsp = GSP_Model(model) # Wrap the network in a GSP_Model

# Prune the wrapped network with the library helper (sparsity argument 0.8),
# then derive conv/linear masks from the result. `.module` is accessed on the
# wrapped model, so it is presumably a DataParallel-style wrapper — confirm.
sps_tools.prune_with_sps(model_gsp.model.module, sparsity = 0.8)
masks_d, masks_l = sps_tools.get_conv_linear_mask(model_gsp.model.module)
model_gsp.register_pre_hook_mask(masks_d) # Register the masks via the model's forward-pre-hook helper

Pruning with threshold: 0.0


In [18]:
# Names of all parameters of `model`, in iteration order.
names = [param_name for param_name, _ in model.named_parameters()]
    

In [27]:
any(['mask' in x for x, _ in model.named_parameters()])

True

In [8]:
def save_model(model, filename, epoch, is_best=False):
    """Checkpoint the wrapped network together with the optimizer state.

    Relies on the notebook globals `args`, `best_acc1`, `optimizer` and
    `save_checkpoint`.

    Args:
        model: wrapper object whose `.model` attribute is the network to save.
        filename: forwarded to `save_checkpoint` as `filename`.
        epoch: epoch index just completed; stored as `epoch + 1`.
        is_best: forwarded to `save_checkpoint`.
    """
    checkpoint_state = {
        'epoch': epoch + 1,
        'arch': args.arch,
        'state_dict': model.model.state_dict(),
        'best_acc1': best_acc1,
        'optimizer' : optimizer.state_dict(),
    }
    save_checkpoint(checkpoint_state, is_best, args, filename=filename)

In [9]:
# Restore model/optimizer state from a checkpoint given as a relative path.
# `args` is the Args class itself, so this assignment replaces the resume path
# configured there.
# NOTE(review): `model` and `optimizer` are taken from kernel state here — confirm
# which objects they refer to when this cell runs.
args.resume = './results/gsp_S80/model_best.pth.tar'

if args.resume:
    if os.path.isfile(args.resume):
        print("=> loading checkpoint '{}'".format(args.resume))
        if args.gpu is None:
            # No map_location: tensors are restored to the device they were saved from.
            checkpoint = torch.load(args.resume)
        else:
            # Map model to be loaded to specified single gpu.
            loc = 'cuda:{}'.format(args.gpu)
            checkpoint = torch.load(args.resume, map_location=loc)
        args.start_epoch = checkpoint['epoch']
        # Creates/overwrites the notebook-level best_acc1 that save_model reads.
        best_acc1 = checkpoint['best_acc1']
        if args.gpu is not None:
            # best_acc1 may be from a checkpoint from a different GPU
            # (this call assumes best_acc1 is a tensor — TODO confirm)
            best_acc1 = best_acc1.to(args.gpu)
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])

        print(f"=> loaded checkpoint '{args.resume}' (epoch {checkpoint['epoch']})")
        print(f"Loaded State Dict: LR: {optimizer.param_groups[0]['lr']:.5f} |" \
                                f"Best @acc1: {checkpoint['best_acc1']}")
        
    else:
        # A missing file is only reported; nothing is loaded and no error is raised.
        print("=> no checkpoint found at '{}'".format(args.resume))

=> loading checkpoint './results/gsp_S80/model_best.pth.tar'
=> loaded checkpoint './results/gsp_S80/model_best.pth.tar' (epoch 125)
Loaded State Dict: LR: 0.00400 |Best @acc1: 78.52799987792969


In [None]:
torch.cuda.device_count()
print(args.gpu)
# print(ngpus_per_node)
print(args.multiprocessing_distributed)
print(optimizer.param_groups[0]['lr'])

In [None]:
# Wrap `model` in a GSP_Model and set its bookkeeping attributes by hand.
# NOTE(review): `model` comes from kernel state (no visible cell assigns it), and
# the attribute meanings below are inferred from their names — confirm against
# apply_gsp.GSP_Model.
gsp_model = GSP_Model(model)
gsp_model.curr_epoch = 1
gsp_model.curr_iter = 0
gsp_model.gsp_int = 2  # presumably the interval between GSP applications — TODO confirm
gsp_model.sps = 0.95  # presumably the target sparsity — TODO confirm
gsp_model.gsp_training_mode=True

In [None]:
print(f" ## Model SPS: {gsp_model.get_model_sps():.3f}")
print(f" ## Epoch: {gsp_model.curr_epoch} | Start Epoch: {gsp_model.start_gsp_epoch} | iter: {gsp_model.curr_iter} | TMode: {gsp_model.gsp_training_mode} | gsp_int: {gsp_model.gsp_int}")

In [None]:
gsp_model.apply_gsp_to_layers()

In [17]:
# for name, layer in model.named_modules():
#     if isinstance(layer, nn.Conv2d) or isinstance(layer, nn.Linear):
#         print(f"name: {name} | shape: {layer.weight.shape}")

In [None]:
# Scratch step: run a single training batch through the wrapped model and
# advance the iteration counter by hand.
print(f'Current Epoch: {gsp_model.curr_epoch}')
print(f'Current iter: {gsp_model.curr_iter}')

# iter(train_loader) builds a fresh iterator on every run of this cell, so only
# its first batch is ever consumed here.
images, target = next(iter(train_loader))
images = images.cuda(args.gpu, non_blocking=True)
target = target.cuda(args.gpu, non_blocking=True)

# Forward pass only; `output` is kept for the loss cell further down.
output = gsp_model.model(images)
# gsp_model.apply_gsp()


gsp_model.curr_iter += 1

print(f" ## Epoch: {gsp_model.curr_epoch} | Start Epoch: {gsp_model.start_gsp_epoch} | iter: {gsp_model.curr_iter} | TMode: {gsp_model.gsp_training_mode} | gsp_int: {gsp_model.gsp_int}")

In [None]:
print(sps_tools.get_abs_sps(model)[0])

In [None]:
next(model.parameters()).device

In [None]:
import math
def prune_with_sps(model, sparsity):
    """Globally magnitude-prune every parameter of `model`.

    The absolute values of all parameters (every tensor from
    `model.parameters()`, not only conv/linear weights) are pooled and sorted;
    the value at sorted position `ceil(N * sparsity)` (clamped to the last
    position) becomes the threshold, and every entry whose magnitude is
    strictly below it is set to zero.

    Args:
        model: a `torch.nn.Module` whose parameters live on a single device.
        sparsity: fraction of entries to prune, in `[0, 1]`.

    Returns:
        None. Each parameter's `.data` is rebound to a new, pruned tensor.

    Raises:
        ValueError: if `sparsity` lies outside `[0, 1]`.
    """
    if not 0.0 <= sparsity <= 1.0:
        raise ValueError(f"sparsity must be within [0, 1], got {sparsity}")

    params = list(model.parameters())
    if not params:
        # Nothing to prune (previously this raised StopIteration).
        return

    # One concatenation instead of re-copying a growing buffer per parameter.
    all_weights = torch.cat([p.data.detach().flatten() for p in params])
    num_weights = len(all_weights)
    if num_weights == 0:
        return

    sorted_weights, _ = torch.sort(all_weights.abs())
    # Same element the original slice `[:ceil(N * sparsity) + 1][-1]` selected.
    cutoff_idx = min(math.ceil(num_weights * sparsity), num_weights - 1)
    threshold = sorted_weights[cutoff_idx]

    for p in params:
        # Out-of-place fill: p.data is rebound, the old storage is left untouched.
        p.data = p.data.masked_fill(p.data.abs() < threshold, 0.0)

In [None]:
prune_with_sps(gsp_model.model, sparsity = 0.8)

In [None]:
def get_conv_linear_mask(model, threshold=1e-8, device=None):
    """Build 0/1 masks for all Conv2d / Linear weights and zero the tiny ones.

    For every `nn.Conv2d` or `nn.Linear` found by `model.modules()`, weights
    whose magnitude is strictly below `threshold` are set to zero and a mask
    marking the surviving entries is recorded.

    Args:
        model: the `torch.nn.Module` to scan (modified in place: each matching
            layer's `weight.data` is rebound to the thresholded tensor).
        threshold: magnitudes strictly below this value count as pruned.
        device: device on which the masks are created. `None` (default) places
            each mask on the device of its layer's weight. The previous default
            captured a notebook-level `device` variable at definition time,
            which fails when no such variable exists.

    Returns:
        dict mapping each matching layer (the module object itself) to its
        mask: 0 where the weight was below `threshold`, 1 elsewhere, in the
        weight's shape and dtype.
    """
    masks = dict()
    for layer in model.modules():
        if isinstance(layer, (nn.Conv2d, nn.Linear)):
            tensor = layer.weight.data
            mask_device = tensor.device if device is None else device

            # Evaluate the threshold test once and reuse it for mask and weights.
            below_threshold = tensor.abs() < threshold

            masks[layer] = (~below_threshold).to(device=mask_device, dtype=tensor.dtype)
            # Force values smaller than the threshold to exactly 0.
            layer.weight.data = tensor.masked_fill(below_threshold, 0.0)
    return masks

In [None]:
masks = get_conv_linear_mask(gsp_model.model)
# list(masks.keys())

In [None]:
def forward_pre_hook(module, x):
    """Forward pre-hook that re-applies a module's pruning mask to its weight.

    Args:
        module: module carrying a `mask` tensor and a `weight` parameter.
        x: the positional inputs of the forward call (unused).

    Returns:
        None, so the forward inputs are passed through unchanged. The weight is
        multiplied by the mask in place.
    """
    module.mask.requires_grad_(False)
    mask = module.mask
    # Use the weight's torch.device: get_device() yields -1 for CPU tensors,
    # which is not a valid target for .to().
    module.weight.data.mul_(mask.to(module.weight.device))

In [None]:
gsp_model.register_mask(masks)
# list(masks.keys())

In [None]:
# Same single-batch forward pass as the earlier scratch cell, placed after the
# pruning / mask-registration cells.
print(f'Current Epoch: {gsp_model.curr_epoch}')
print(f'Current iter: {gsp_model.curr_iter}')

# A fresh iterator is created on each run, so only its first batch is used.
images, target = next(iter(train_loader))
images = images.cuda(args.gpu, non_blocking=True)
target = target.cuda(args.gpu, non_blocking=True)

# `output` and `target` are consumed by the loss cell that follows.
output = gsp_model.model(images)

gsp_model.curr_iter += 1

# print(sps_tools.get_abs_sps(model)[0])

In [None]:
# compute the loss from the `output` / `target` left by the forward-pass cell
loss = criterion(output, target)


# compute gradients and take one optimizer step
# NOTE(review): `optimizer` comes from get_model / kernel state; verify that it
# holds the parameters of gsp_model.model, otherwise this step updates a
# different model than the one that produced `output`.
optimizer.zero_grad()
loss.backward()
optimizer.step()



In [None]:
print(f" SPS Model: {sps_tools.get_abs_sps(model)[0]}")