<a href="https://colab.research.google.com/github/asalcedo31/CSC2516_project/blob/master/CSC_2516_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision import datasets, models, transforms
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
import time
import numpy as np
import torchvision
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import time
import os
import copy

In [2]:
transform = transforms.Compose(
    [transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=5,
                                         shuffle=False, num_workers=0)
_,trainset = torch.utils.data.random_split(trainset,(49500,500))
# _,trainset = torch.utils.data.random_split(trainset,(49995,5))
print(trainset.__len__())

train_data, val_data = torch.utils.data.random_split(trainset,(int(0.8*len(trainset)),int(0.2*len(trainset))))
print(train_data.__len__(),val_data.__len__() )

trainloader = torch.utils.data.DataLoader(train_data, batch_size=5,
                                          shuffle=True, num_workers=0)
valloader = torch.utils.data.DataLoader(val_data, batch_size=5,
                                          shuffle=True, num_workers=0)

# classes = ('plane', 'car', 'bird', 'cat',
#            'deer', 'dog', 'frog', 'horse', 'ship', 'truck')


Files already downloaded and verified
Files already downloaded and verified
500
400 100


In [0]:
image_datasets= {'train': train_data,'val': val_data}
dataloaders = {'train': trainloader, 'val': valloader}

dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
# class_names = image_datasets['train'].classes


In [0]:
def freeze_layers(model_ft, exclude=[]):
#   children = list(model_ft.named_children())
  for name,param in model_ft.named_parameters():   
    if(name not in  exclude):
      param.requires_grad = False

In [0]:
def countNonZeroWeights(model):
    nonzeros = 0
    weights = 0
    for name,param in model.named_parameters():
        if param is not None:
            nonzeros += torch.sum((param != 0).int()).data[0]
            weights += torch.sum(param).data[0]
    
    return nonzeros, weights

In [0]:
def set_threshold(model):
  for child in model.named_children():    
    for child in child[1].named_children():
      if type(child[1]) == MaskedLinear: 
        child[1].set_threshold()
        print("layer {}  new threshold {:.4f}".format(child[0], child[1].threshold))        

In [0]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                scheduler.step()
#                 dataloader = train_data_loader
                model.train()  # Set model to training mode
                data_idx = 0
            else:
                model.eval()   # Set model to evaluate mode
#                 dataloader = val_data_loader
                data_idx = 1

            running_loss = 0.0
            running_corrects = 0
            i=0
            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
#                 print('i ',i)
#                 i+=1
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [0]:
def train_model_prune(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                scheduler.step()
#                 dataloader = train_data_loader
                model.train()  # Set model to training mode
                data_idx = 0
            else:
                model.eval()   # Set model to evaluate mode
#                 dataloader = val_data_loader
                data_idx = 1

            running_loss = 0.0
            running_corrects = 0
            i=0
            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
#                 print('i ',i)
#                 i+=1
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            if epoch % 5 == 0 and phase == 'train':
#               countNonZeroWeights(model)
              set_threshold(model)                           
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [0]:
class MaskedLinear(torch.nn.Linear):
  def __init__(self, in_features, out_features, bias=True, threshold=0.1):
    super(MaskedLinear, self).__init__(in_features,out_features)
    self.make_mask(threshold)    
  def make_mask(self, threshold):
    self.mask = torch.ones(self.weight.size(), requires_grad=False).to(device)
    self.zeros = torch.zeros(self.weight.size(), requires_grad=False).to(device)
    self.threshold = threshold    
  def set_threshold(self):
    unique_weights = torch.unique(self.weight*self.mask)
    mask_nonzero = torch.sum(self.mask.view([self.in_features*self.out_features]))
    mask_total = self.in_features*self.out_features
    print('nonzero proportion: {:.4f}'.format(mask_nonzero/mask_total))
    self.threshold = torch.max(torch.topk(torch.abs(unique_weights),int(0.05*unique_weights.size()[0]),largest=False)[0])    
  def mask_weight(self):
    self.mask = torch.where(torch.abs(self.weight) >= self.threshold,self.mask,self.zeros)
    self.weight = torch.nn.Parameter(self.weight*self.mask).to(device)    
  def forward(self, input):
    self.mask_weight()
    return F.linear(input, self.weight, self.bias)

def mask_network(network,layers_to_mask, threshold=0.002, random_init=False):
  """"
  replaces linear layers with masked linear layers
  network is the initial sequential container
  layers is a list of layers to mask
  random init is a logical indicating whether to preserve the initial weights or to modify them
  """
  for name,layer in network.named_children():   
    if int(name) in layers_to_mask:
      if type(layer)== torch.nn.Linear:
        masked_layer = MaskedLinear(layer.in_features, layer.out_features, bias=True,threshold=threshold)
      elif type(layer)== torch.nn.Conv2d:
        masked_layer = MaskedConv(layer.in_channels, layer.out_channels, layer.kernel_size, layer.stride, layer.padding, layer.dilation, layer.groups, bias=True, threshold=threshold)
      if random_init != True:
        masked_layer.weight = copy.deepcopy(layer.weight)
        masked_layer.bias = copy.deepcopy(layer.bias)
      network[int(name)] = masked_layer

In [0]:
class MaskedConv(torch.nn.Conv2d, MaskedLinear):
  def __init__(self, in_channels, out_channels, kernel_size, stride,
                 padding, dilation, transposed, output_padding, groups, bias):
    super(MaskedConv,self).__init__(self, in_channels, out_channels, kernel_size, stride,
                 padding, dilation, transposed, output_padding, groups, bias)
    self.make_mask()    
  def forward(self, input):
    self.mask_weight()
    return F.conv2d(input, self.weight, self.bias, self.stride,
                    self.padding, self.dilation, self.groups)

In [62]:
model_ft = models.vgg16(pretrained=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_ft = model_ft.to(device)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

freeze_layers(model_ft.features, exclude=['28.weight'])
freeze_layers(model_ft.classifier)

# #baseline
# model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
#                        num_epochs=25)

mask_network(model_ft.features,[28],threshold=0.0001)
set_threshold(model_ft)

NameError: ignored

In [19]:
model_ft = train_model_prune(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=26)

Epoch 0/25
----------
nonzero proportion: 0.9701
layer 0  new threshold 0.0004
nonzero proportion: 0.9549
layer 3  new threshold 0.0011
train Loss: 4.4006 Acc: 0.1875
val Loss: 1.9756 Acc: 0.3100

Epoch 1/25
----------
train Loss: 1.8844 Acc: 0.3675
val Loss: 1.6459 Acc: 0.4800

Epoch 2/25
----------
train Loss: 1.8419 Acc: 0.4075
val Loss: 1.6798 Acc: 0.4000

Epoch 3/25
----------
train Loss: 1.8904 Acc: 0.4150
val Loss: 1.5283 Acc: 0.4500

Epoch 4/25
----------
train Loss: 1.6941 Acc: 0.4700
val Loss: 1.3843 Acc: 0.5200

Epoch 5/25
----------
nonzero proportion: 0.9397
layer 0  new threshold 0.0005
nonzero proportion: 0.9113
layer 3  new threshold 0.0016
train Loss: 1.6711 Acc: 0.4750
val Loss: 1.8568 Acc: 0.4200

Epoch 6/25
----------
train Loss: 1.6418 Acc: 0.4850
val Loss: 1.5815 Acc: 0.4600

Epoch 7/25
----------
train Loss: 1.3771 Acc: 0.5550
val Loss: 1.4078 Acc: 0.4700

Epoch 8/25
----------
train Loss: 1.4829 Acc: 0.5000
val Loss: 1.5295 Acc: 0.4700

Epoch 9/25
----------
tra

KeyboardInterrupt: ignored

In [18]:
model_ft = models.vgg16(pretrained=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_ft = model_ft.to(device)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

freeze_conv_layers(model_ft)
mask_network(model_ft.classifier,[0,3],threshold=0.0001)
set_threshold(model_ft)


nonzero proportion: 1.0000
layer 0  new threshold 0.0002
nonzero proportion: 1.0000
layer 3  new threshold 0.0005


In [0]:
#testing code ignore
model_ft = models.vgg16(pretrained=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_ft = model_ft.to(device)

mask_network(model_ft.classifier,[0,3,6])
# print(model_ft.classifier)
# print(model_ft.classifier[0].weight[120:130,120::130])
print(model_ft.classifier[0].threshold)

In [0]:
#testing code ignore
i=0
for inputs,labels in dataloaders['train']:
  inputs = inputs.to(device)
  model_ft.eval()  
  outputs = model_ft(inputs)
#   print(model_ft.classifier[3].threshold)
 

In [0]:
#baseline
model_ft = models.vgg16(pretrained=True)
# num_ftrs = model_ft.fc.in_features
# model_ft.fc = nn.Linear(num_ftrs, 2)

# model_ft = models.resnet18(pretrained=True)
# num_ftrs = model_ft.fc.in_features
# model_ft.fc = nn.Linear(num_ftrs, 2)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)


In [0]:
#baseline
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=25)

In [48]:
print((model_ft.features))

Sequential(
  (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU(inplace)
  (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): ReLU(inplace)
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (6): ReLU(inplace)
  (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (8): ReLU(inplace)
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (11): ReLU(inplace)
  (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (13): ReLU(inplace)
  (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (15): ReLU(inplace)
  (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (17): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(

In [0]:
children = list(model_ft.named_children())
# print(list(children[0][1].named_parameters())[0])
# print(list(children[2][1].named_parameters())[0])
inpurs
print(children[2][1][0].weight)
# for name,param in children[2][1].named_parameters():
#   print(name)
#   print(param)

In [0]:
children = list(model_ft.named_children())
print(list(children[0][1].named_parameters())[0])
print(list(children[2][1].named_parameters())[0])

('0.weight', Parameter containing:
tensor([[[[-5.5373e-01,  1.4270e-01,  5.2896e-01],
          [-5.8312e-01,  3.5655e-01,  7.6566e-01],
          [-6.9022e-01, -4.8019e-02,  4.8409e-01]],

         [[ 1.7548e-01,  9.8630e-03, -8.1413e-02],
          [ 4.4089e-02, -7.0323e-02, -2.6035e-01],
          [ 1.3239e-01, -1.7279e-01, -1.3226e-01]],

         [[ 3.1303e-01, -1.6591e-01, -4.2752e-01],
          [ 4.7519e-01, -8.2677e-02, -4.8700e-01],
          [ 6.3203e-01,  1.9308e-02, -2.7753e-01]]],


        [[[ 2.3254e-01,  1.2666e-01,  1.8605e-01],
          [-4.2805e-01, -2.4349e-01,  2.4628e-01],
          [-2.5066e-01,  1.4177e-01, -5.4864e-03]],

         [[-1.4076e-01, -2.1903e-01,  1.5041e-01],
          [-8.4127e-01, -3.5176e-01,  5.6398e-01],
          [-2.4194e-01,  5.1928e-01,  5.3915e-01]],

         [[-3.1432e-01, -3.7048e-01, -1.3094e-01],
          [-4.7144e-01, -1.5503e-01,  3.4589e-01],
          [ 5.4384e-02,  5.8683e-01,  4.9580e-01]]],


        [[[ 1.7715e-01,  5.2149

https://discuss.pytorch.org/t/find-non-zero-elements-in-a-tensor/4493/2