In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import argparse
import os
import random
import shutil
import time
import warnings

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models

from datetime import datetime

# Alphabetically sorted names of every lowercase, non-dunder attribute of the
# `models` module that is callable.
model_names = sorted(
    attr_name
    for attr_name, attr_value in vars(models).items()
    if attr_name.islower()
    and not attr_name.startswith("__")
    and callable(attr_value)
)

In [3]:
import sys
sys.path.append('./models')


from main_model import *
import sys 
sys.path.append('/data/users2/rohib/github/testing')
import utils_gsp.sps_tools as sps_tools
import utils_gsp.gpu_projection as gsp_gpu

import sys
sys.path.append('./models')
import models.resnet_torch as ResNet


from models.finetuners import *
from apply_gsp import GSP_Model

In [4]:
# model = models.__dict__['resnet18'](pretrained=False)

In [5]:
class Args:
    """Notebook stand-in for the option namespace passed to `get_model`.

    All values are class-level defaults. Code visible in this notebook reads
    `seed`, `gpu`, `dist_url`, `world_size` and `multiprocessing_distributed`;
    the remaining fields are presumably consumed by `main_worker`, which is
    defined outside this notebook (TODO confirm their exact meaning there).
    """
    # Dataset location (machine-specific absolute path).
    data = '/data/users2/rohib/github/imagenet-data'
    # Architecture name; the recorded output shows "creating model 'resnet18'".
    arch = 'resnet18'
    workers = 4
    epochs = 1
    start_epoch = 0
    batch_size = 16
    # The optimizer's first param group reports this value as its 'lr'.
    lr = 0.1
    momentum = 0.9
    weight_decay = 1e-4
    print_freq = 10
    resume = ''
    evaluate = False
    pretrained = False
    # -1 means "not set"; get_model replaces it from WORLD_SIZE when
    # dist_url is "env://".
    world_size = -1
    dist_url = 'tcp://224.66.41.62:23456'
    dist_backend = 'nccl'
    # None disables seeding in get_model.
    seed = None
    # None means no specific GPU was requested.
    gpu = None
    multiprocessing_distributed = False

# Use an instance rather than the class object itself: get_model writes
# `world_size` and `distributed` onto `args`, and those writes should not
# overwrite the class-level defaults above.
args = Args()

In [6]:
# Bare attribute access: it is not the last expression of the cell, so
# nothing is displayed and the statement has no effect.
args.multiprocessing_distributed
# Alias for the `utils_gsp.gpu_projection` module imported as `gsp_gpu`.
gsp_func = gsp_gpu
# NOTE(review): neither `gsp_func` nor `sps` is referenced by any later cell
# visible here; `gsp_model.sps` is set separately (to 0.9).
sps = 0.8

In [7]:
def get_model(args):
    """Prepare the run described by `args` and hand back the training objects.

    Args:
        args: option namespace. Reads `seed`, `gpu`, `dist_url`, `world_size`
            and `multiprocessing_distributed`; writes `distributed` and, in
            some cases, `world_size` back onto it.

    Returns:
        The 4-tuple `(model, train_loader, optimizer, criterion)` produced by
        `main_worker` when running in a single process. When
        `args.multiprocessing_distributed` is set, the workers run in spawned
        processes and their objects cannot be returned to this process, so
        `(None, None, None, None)` is returned instead.

    Raises:
        KeyError: if `dist_url` is "env://", `world_size` is -1 and the
            `WORLD_SIZE` environment variable is missing.
    """
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                        'This will turn on the CUDNN deterministic setting, '
                        'which can slow down your training considerably! '
                        'You may see unexpected behavior when restarting '
                        'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                        'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
        # The spawned workers' return values are not available here.
        # Previously this path fell through to the final `return` and raised
        # UnboundLocalError; return explicit placeholders instead.
        warnings.warn('multiprocessing_distributed is set: workers ran in '
                      'separate processes, so no model/loader/optimizer/'
                      'criterion can be returned to the caller.')
        return None, None, None, None

    # Simply call main_worker function in this process.
    model, train_loader, optimizer, criterion = main_worker(args.gpu, ngpus_per_node, args)
    return model, train_loader, optimizer, criterion

# Unpack the four objects returned by get_model; later cells reuse these
# notebook-level names (model, train_loader, optimizer, criterion).
model, train_loader, optimizer, criterion = get_model(args)

=> creating model 'resnet18'
Created model from PyTorch Models! 

In final Clause!


In [8]:
# Bare call: not the last expression of the cell, so its value is not shown.
torch.cuda.device_count()
print(args.gpu)
# print(ngpus_per_node)
print(args.multiprocessing_distributed)
# Learning rate stored on the optimizer's first parameter group.
print(optimizer.param_groups[0]['lr'])

None
False
0.1


In [9]:
# Wrap the model in GSP_Model (imported from apply_gsp) and set its
# bookkeeping attributes by hand. Their exact semantics live in GSP_Model,
# which is not visible in this notebook — TODO confirm against apply_gsp.
gsp_model = GSP_Model(model)
gsp_model.curr_epoch = 1
gsp_model.curr_iter = 0
# presumably the interval (in iterations) between projections — verify
gsp_model.gsp_int = 2
# presumably the target sparsity used by apply_gsp — verify
gsp_model.sps = 0.9
gsp_model.gsp_training_mode=True

In [10]:
# Report the first element of get_model_sps() (3 decimals) and the current
# GSP bookkeeping attributes before any projection has been applied.
print(f" ## Model SPS: {gsp_model.get_model_sps()[0]:.3f}")
print(f" ## Epoch: {gsp_model.curr_epoch} | Start Epoch: {gsp_model.start_gsp_epoch} | iter: {gsp_model.curr_iter} | TMode: {gsp_model.gsp_training_mode} | gsp_int: {gsp_model.gsp_int}")


 ## Model SPS: 0.041
 ## Epoch: 1 | Start Epoch: 0 | iter: 0 | TMode: True | gsp_int: 2


In [11]:
# One manual iteration: forward a batch, apply GSP, advance the iteration
# counter. Re-running this cell repeats the whole step (not idempotent).
print(f'Current Epoch: {gsp_model.curr_epoch}')
print(f'Current iter: {gsp_model.curr_iter}')

# iter(train_loader) builds a fresh iterator on every run of this cell, so
# only its first batch is ever consumed here.
images, target = next(iter(train_loader))
images = images.cuda(args.gpu, non_blocking=True)
target = target.cuda(args.gpu, non_blocking=True)

# `output` is not used again in this cell.
output = gsp_model.model(images)
# The recorded output shows per-layer sparsities printed by this call.
gsp_model.apply_gsp()


# The iteration counter is advanced by hand; apply_gsp() is not relied on
# to do it.
gsp_model.curr_iter += 1

print(f" ## Epoch: {gsp_model.curr_epoch} | Start Epoch: {gsp_model.start_gsp_epoch} | iter: {gsp_model.curr_iter} | TMode: {gsp_model.gsp_training_mode} | gsp_int: {gsp_model.gsp_int}")

Current Epoch: 1
Current iter: 0


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Applying GSP!!
Sparsity of layer: module.conv1: 0.8305147886276245
Sparsity of layer: module.layer1.0.conv1: 0.8629357218742371
Sparsity of layer: module.layer1.0.conv2: 0.8665708899497986
Sparsity of layer: module.layer1.1.conv1: 0.8702505826950073
Sparsity of layer: module.layer1.1.conv2: 0.8688033223152161
Sparsity of layer: module.layer2.0.conv1: 0.8350777626037598
Sparsity of layer: module.layer2.0.conv2: 0.8448184132575989
Sparsity of layer: module.layer2.0.downsample.0: 0.7633143663406372
Sparsity of layer: module.layer2.1.conv1: 0.8452965617179871
Sparsity of layer: module.layer2.1.conv2: 0.8443495035171509
Sparsity of layer: module.layer3.0.conv1: 0.8247154355049133
Sparsity of layer: module.layer3.0.conv2: 0.8320018649101257
Sparsity of layer: module.layer3.0.downsample.0: 0.7752252817153931
Sparsity of layer: module.layer3.1.conv1: 0.8316736221313477
Sparsity of layer: module.layer3.1.conv2: 0.8320711851119995
Sparsity of layer: module.layer4.0.conv1: 0.817760705947876
Spars

In [13]:
# First element of get_abs_sps(model); the recorded value (~91.7) suggests a
# percentage rather than a 0-1 fraction — TODO confirm in utils_gsp.sps_tools.
print(sps_tools.get_abs_sps(model)[0])

91.70716450780837


In [None]:
# gsp_in_d = dict()
# for name, layer in model.named_modules():
#     if isinstance(layer, nn.Conv2d) or isinstance(layer, nn.Linear):
#         w_shape = layer.weight.shape

#         gsp_in_d[name] = layer.weight.data.detach().reshape(layer.weight.shape[0], -1)
        
#         layer.weight.data = gsp_gpu.groupedsparseproj(gsp_in_d[name].T, sps=0.8).T.reshape(w_shape)
#         print(f"Sparsity of layer: {name}: {sps_tools.sparsity(layer.weight.data)}")

# print(f"MODEL SPS: {sps_tools.get_abs_sps(model)[0]}")

In [14]:
def get_conv_linear_mask(model, threshold=1e-8, device=None):
    """Zero out near-zero Conv2d/Linear weights and return their binary masks.

    For every `nn.Conv2d` and `nn.Linear` found by `model.named_modules()`,
    entries of `weight` whose absolute value is below `threshold` are set to
    exactly 0 (the layer's `weight.data` is replaced), and a mask with 0 at
    those positions and 1 elsewhere is recorded.

    Args:
        model: module whose sub-modules are scanned.
        threshold: magnitudes strictly below this value count as zero.
        device: device on which the 0/1 constants are created. Defaults to
            `None`, meaning each layer's own weight device. The old default
            was bound to a notebook-level `device` global at definition time.

    Returns:
        dict mapping each Conv2d/Linear module object (not its name) to its
        mask tensor, which has the same shape and dtype as the weight.
    """
    masks = dict()
    for name, layer in model.named_modules():
        if isinstance(layer, (nn.Conv2d, nn.Linear)):
            weight = layer.weight.data
            target_device = weight.device if device is None else device
            # 0-dim constants in the weight's dtype; they broadcast in where().
            zero = torch.tensor(0.0, dtype=weight.dtype, device=target_device)
            one = torch.tensor(1.0, dtype=weight.dtype, device=target_device)

            below_threshold = weight.abs() < threshold
            masks[layer] = torch.where(below_threshold, zero, one)
            # Force values smaller than threshold to exactly 0.
            layer.weight.data = torch.where(below_threshold, zero, weight)
    return masks

In [15]:
# Build masks keyed by layer object. Side effect: get_conv_linear_mask also
# replaces each Conv2d/Linear weight.data with its thresholded copy.
masks = get_conv_linear_mask(gsp_model.model)
# list(masks.keys())

In [16]:
# def forward_pre_hook(module, x):
#     module.mask.requires_grad_(False)
#     mask = module.mask
#     module.weight.data.mul_(mask.to(module.weight.get_device()))

In [20]:
# Hand the masks to GSP_Model; what register_mask does with them is defined
# in apply_gsp (not visible here) — presumably it attaches them to the
# layers, TODO confirm.
gsp_model.register_mask(masks)
# list(masks.keys())

In [None]:
def forward_pre_hook(module, x):
    """Multiply `module.weight` in place by `module.mask` before each forward.

    Args:
        module: layer carrying a `mask` attribute shaped like its weight.
        x: positional inputs of the forward call (unused).
    """
    module.mask.requires_grad_(False)
    module.weight.data.mul_(module.mask.to(module.weight.device))


# Manual alternative to gsp_model.register_mask(masks): attach each mask to
# its layer and re-apply it before every forward pass.
# NOTE(review): if register_mask already installs an equivalent hook, running
# this cell as well would apply the mask twice per forward — confirm.
for name, module in model.named_modules():
    if isinstance(module, (nn.Conv2d, nn.Linear)):
        # `.device` is valid for CPU and CUDA weights alike; get_device()
        # returns -1 for CPU tensors, which `.to()` rejects.
        module.mask = nn.Parameter(
            masks[module].to(module.weight.device), requires_grad=False
        )
        module.register_forward_pre_hook(forward_pre_hook)

In [27]:
# Rebind the notebook-level `model` to the module held by gsp_model; the
# recorded output names its type as DataParallel.
model = gsp_model.model

torch.nn.parallel.data_parallel.DataParallel

In [None]:
# for name, module in model.named_modules():
#     if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear):
#         print(module.mask)
#         break

In [25]:
# Same manual iteration as the earlier forward cell, but without calling
# apply_gsp(): only a forward pass and a counter bump.
print(f'Current Epoch: {gsp_model.curr_epoch}')
print(f'Current iter: {gsp_model.curr_iter}')

# A fresh iterator is created on every run, so only its first batch is used.
images, target = next(iter(train_loader))
images = images.cuda(args.gpu, non_blocking=True)
target = target.cuda(args.gpu, non_blocking=True)

# `output` and `target` are consumed by the loss/optimizer cell.
output = gsp_model.model(images)

gsp_model.curr_iter += 1

# print(sps_tools.get_abs_sps(model)[0])

Current Epoch: 1
Current iter: 2




In [23]:
# compute output
loss = criterion(output, target)


# compute gradient and do SGD step
optimizer.zero_grad()
loss.backward()
optimizer.step()



In [26]:
# Model sparsity after the optimizer step; compare with the value printed
# right after apply_gsp() to see whether the masked weights stayed at zero.
print(f" SPS Model: {sps_tools.get_abs_sps(model)[0]}")

 SPS Model: 91.70768212695901
