<a href="https://colab.research.google.com/github/asalcedo31/CSC2516_project/blob/master/initial_meta_pruning_framework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision import datasets, models, transforms
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
import time
import numpy as np
import torchvision
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import time
import os
import copy

In [2]:
transform = transforms.Compose(
    [transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=5,
                                         shuffle=False, num_workers=0)
# _,trainset = torch.utils.data.random_split(trainset,(49200,800))
# _,trainset = torch.utils.data.random_split(trainset,(49995,5))
# print(trainset.__len__())

# train_data, val_data = torch.utils.data.random_split(trainset,(int(0.8*len(trainset)),int(0.2*len(trainset))))
# print(train_data.__len__(),val_data.__len__() )

# trainloader = torch.utils.data.DataLoader(train_data, batch_size=5,
#                                           shuffle=True, num_workers=0)
# valloader = torch.utils.data.DataLoader(val_data, batch_size=5,
#                                           shuffle=True, num_workers=0)


Files already downloaded and verified
Files already downloaded and verified


In [0]:
# image_datasets= {'train': train_data,'val': val_data}
# dataloaders = {'train': trainloader, 'val': valloader}

# dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
# class_names = image_datasets['train'].classes


In [0]:
def freeze_layers(model_ft, exclude=[]):
#   children = list(model_ft.named_children())
  for name,param in model_ft.named_parameters():   
    if(name not in  exclude):
      param.requires_grad = False

In [0]:
def countNonZeroWeights(model):
    nonzeros = 0
    weights = 0
    for name,param in model.named_parameters():
        if param is not None:
            nonzeros += torch.sum((param != 0).int()).data[0]
            weights += torch.sum(param).data[0]
    
    return nonzeros, weights

In [0]:
def set_threshold(model,prop=0.05):
  for child in model.named_children():    
    for child in child[1].named_children():
#       print(child)
      if type(child[1]) == MaskedLinear or type(child[1]) == MaskedConv: 
        child[1].set_threshold(prop=prop)
        print("layer {}  new threshold {:.4f}".format(child[0], child[1].threshold))        

In [0]:
def train_model_prune(model, dloaders, dataset_sizes, criterion, optimizer, scheduler,prop=0.05, num_epochs=25, device='cuda',):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    print(len(dloaders['train']))
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                scheduler.step()
                model.train()  # Set model to training mode
                data_idx = 0
            else:
                model.eval()   # Set model to evaluate mode
                data_idx = 1

            running_loss = 0.0
            running_corrects = 0
            i=0
      
            # Iterate over data.
            for inputs, labels in dloaders[phase]:               
#                 print("batch {} phase {}".format(i, phase))
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                i+=1
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                           
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))
            if epoch % 5 == 0 and phase == 'train':
              set_threshold(model,prop=prop)   
            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [0]:
class Masked:
  def make_mask(self, threshold,mask=None):
    if mask is None:
      print("new mask",device)
      self.mask = torch.ones(self.weight.size(), requires_grad=False).to(device)
    else:
      self.mask = mask      
    self.zeros = torch.zeros(self.weight.size(), requires_grad=False).to(device)
    self.threshold = threshold
  def set_threshold(self,prop=0.05):
    unique_weights = torch.unique(self.weight*self.mask)
    mask_size = self.mask.reshape(-1).size()[0]
#     mask_size = mask_size[0]*mask_size[1]
    mask_nonzero = torch.sum(self.mask.view([mask_size]))
    mask_total = mask_size
    print('nonzero proportion: {:.4f}'.format(mask_nonzero/mask_total))
    self.threshold = torch.max(torch.topk(torch.abs(unique_weights),int(prop*unique_weights.size()[0]),largest=False)[0])    
  def make_threshold_mask(self):
    self.mask = torch.where(torch.abs(self.weight) >= self.threshold,self.mask,self.zeros).to(device)
#     self.mask.requires_grad_(requires_grad=False)
  def mask_weight(self):
    self.weight = torch.nn.Parameter(self.weight*self.mask).to(device)    

In [0]:
class MaskedLinear(torch.nn.Linear,Masked):
  def __init__(self, in_features, out_features, bias=True, threshold=0.001,mask=None):
    super(MaskedLinear, self).__init__(in_features,out_features)
    self.make_mask(threshold,mask)
  def forward(self, input):
    self.make_threshold_mask()
    self.mask_weight()
#     print(self.mask[125:135,125:135])
#     print(self.weight[125:135,125:135])
    return F.linear(input, self.weight, self.bias)

class MaskedConv(torch.nn.Conv2d,Masked):
  def __init__(self, in_channels, out_channels, kernel_size, stride,
                 padding, dilation, groups, bias=True,threshold=0.0001):
    super(MaskedConv,self).__init__(in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias)
    self.make_mask(threshold)    
  def forward(self, input):
    self.mask_weight()
    return F.conv2d(input, self.weight, self.bias, self.stride,
                    self.padding, self.dilation, self.groups)

In [0]:
def mask_network(network,layers_to_mask, threshold=0.002, random_init=False, bias=True,masks=None):
  """"
  replaces linear layers with masked linear layers
  network is the initial sequential container
  layers is a list of layers to mask
  random init is a logical indicating whether to preserve the initial weights or to modify them
  """
  for name,layer in network.named_children():   
    if int(name) in layers_to_mask:
      layer_mask = None
      if masks is not None:
        if name in masks:
          layer_mask = masks.get(name)      
      if type(layer)== torch.nn.Linear:
        masked_layer = MaskedLinear(layer.in_features, layer.out_features, bias=bias,threshold=threshold,mask=layer_mask)
      elif type(layer)== torch.nn.Conv2d:
        masked_layer = MaskedConv(layer.in_channels, layer.out_channels, layer.kernel_size, layer.stride, layer.padding, layer.dilation,layer.groups, bias=bias, threshold=threshold)
      if random_init != True:
        masked_layer.weight = copy.deepcopy(layer.weight)
        masked_layer.bias = copy.deepcopy(layer.bias)
      network[int(name)] = masked_layer

In [13]:
def train_meta_prune(model,trainset, outer_steps, num_samples=800, device='cuda'):
  mask_dict = {'0':torch.ones(model.classifier[0].weight.size()).to(device)}
  shuffled_train = torch.utils.data.RandomSampler(trainset)
  train_sample_list = list(torch.utils.data.BatchSampler(shuffled_train,num_samples,False))
  shuffled_train = [x for x in shuffled_train]
  for i in range(outer_steps):
#     train_sample = [trainset[j] for j in train_sample_list[i]] 
    
#     print(len(train_sample))
    _,train_sample = torch.utils.data.random_split(trainset,(49200,800))
    train_data, val_data = torch.utils.data.random_split(train_sample,(int(0.8*num_samples),int(0.2*num_samples)))

    trainloader = torch.utils.data.DataLoader(train_data, batch_size=5,
                                            shuffle=True, num_workers=0)
    valloader = torch.utils.data.DataLoader(val_data, batch_size=5,
                                            shuffle=True, num_workers=0)
    
    subdataloaders = {'train': trainloader, 'val': valloader}
    image_datasets= {'train': train_data,'val': val_data}
    dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
    
    model_ft = models.vgg16(pretrained=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_ft = model_ft.to(device)

    criterion = nn.CrossEntropyLoss()

    # Observe that all parameters are being optimized
    optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

    # freeze_layers(model_ft.features, exclude=['28.weight'])
    freeze_layers(model_ft.features)   
    mask_network(model_ft.classifier,[0],threshold=0.0001,masks=mask_dict)
    model_ft = train_model_prune(model_ft, subdataloaders, dataset_sizes, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=10, prop=0.1)
    mask_dict = {'0':model_ft.classifier[0].mask}
#     set_threshold(model_ft)

#     cost = meta_objective({'train':trainloader, 'val':valoader}, model, optimizer, inner_epochs)


model_ft = models.vgg16(pretrained=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_ft = model_ft.to(device)

train_meta_prune(model_ft,trainset,5)

128
Epoch 0/9
----------
train Loss: 3.1251 Acc: 0.2531
nonzero proportion: 0.9832
layer 0  new threshold 0.0005
val Loss: 1.6068 Acc: 0.4438

Epoch 1/9
----------
train Loss: 1.9109 Acc: 0.4062
val Loss: 1.7005 Acc: 0.4500

Epoch 2/9
----------
train Loss: 1.7182 Acc: 0.4547
val Loss: 2.1942 Acc: 0.3250

Epoch 3/9
----------
train Loss: 1.8547 Acc: 0.4031
val Loss: 1.6160 Acc: 0.4813

Epoch 4/9
----------
train Loss: 1.6726 Acc: 0.4766
val Loss: 1.3715 Acc: 0.5125

Epoch 5/9
----------
train Loss: 1.6214 Acc: 0.4906
nonzero proportion: 0.9213
layer 0  new threshold 0.0009
val Loss: 1.5461 Acc: 0.5000

Epoch 6/9
----------
train Loss: 1.5801 Acc: 0.4953
val Loss: 1.5283 Acc: 0.5062

Epoch 7/9
----------
train Loss: 1.3409 Acc: 0.5469
val Loss: 1.3989 Acc: 0.4625

Epoch 8/9
----------
train Loss: 1.2761 Acc: 0.5594
val Loss: 1.3815 Acc: 0.5000

Epoch 9/9
----------
train Loss: 1.2496 Acc: 0.5922
val Loss: 1.2949 Acc: 0.4938

Training complete in 1m 42s
Best val Acc: 0.512500
128
Epoch 0

In [0]:
def run_normal_training_with_pruning(this_trainset):
  model_ft = models.vgg16(pretrained=True)
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model_ft = model_ft.to(device)

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model_ft = model_ft.to(device)

  criterion = nn.CrossEntropyLoss()

  # Observe that all parameters are being optimized
  optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

  # Decay LR by a factor of 0.1 every 7 epochs
  exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

  freeze_layers(model_ft.features, exclude=[])
  mask_network(model_ft.classifier,[0],threshold=0.0001)
  set_threshold(model_ft)
  
  print(this_trainset.__len__())  
  _,mytrainset = torch.utils.data.random_split(this_trainset,(49200,800))
  # _,trainset = torch.utils.data.random_split(trainset,(49995,5))
  print(mytrainset.__len__())

  mytrain_data, myval_data = torch.utils.data.random_split(mytrainset,(int(0.8*len(mytrainset)),int(0.2*len(mytrainset))))
  print(mytrain_data.__len__(),myval_data.__len__() )

  mytrainloader = torch.utils.data.DataLoader(mytrain_data, batch_size=5,
                                            shuffle=True, num_workers=0)
  myvalloader = torch.utils.data.DataLoader(myval_data, batch_size=5,
                                            shuffle=True, num_workers=0)
  mydataloaders = {'train': mytrainloader, 'val': myvalloader}
  image_datasets= {'train': mytrain_data,'val': myval_data}
  dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
  
  
  model_ft = train_model_prune(model_ft, mydataloaders,dataset_sizes, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=2)
  
# run_normal_training_with_pruning(trainset)