This notebook has custom AlexNet. It measures:


1.   Sparsity of weights(one-time)
2.   Layerwise CONV layer activation sparsities
3.   Accuracy of the model
4. Layerwise #MAC ops



# imports

In [21]:
import numpy as np
import torch
import torch.nn as nn
from torchvision import datasets
from torchvision import transforms
import torchvision
from torch.utils.data.sampler import SubsetRandomSampler
import time

# Device configuration
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

# Data

In [2]:
def get_train_valid_loader(data_dir,
                           batch_size,
                           augment,
                           random_seed,
                           valid_size=0.1,
                           shuffle=True):
    normalize = transforms.Normalize(
        mean=[0.4914, 0.4822, 0.4465],
        std=[0.2023, 0.1994, 0.2010],
    )

    # define transforms
    valid_transform = transforms.Compose([
            transforms.Resize((227,227)),
            transforms.ToTensor(),
            normalize,
    ])
    if augment:
        train_transform = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    else:
        train_transform = transforms.Compose([
            transforms.Resize((227,227)),
            transforms.ToTensor(),
            normalize,
        ])

    # load the dataset
    train_dataset = datasets.CIFAR10(
        root=data_dir, train=True,
        download=True, transform=train_transform,
    )

    valid_dataset = datasets.CIFAR10(
        root=data_dir, train=True,
        download=True, transform=valid_transform,
    )

    num_train = len(train_dataset)
    indices = list(range(num_train))
    split = int(np.floor(valid_size * num_train))

    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)

    train_idx, valid_idx = indices[split:], indices[:split]
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler)
 
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=batch_size, sampler=valid_sampler)

    return (train_loader, valid_loader)


def get_test_loader(data_dir,
                    batch_size,
                    shuffle=True):
    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    )

    # define transform
    transform = transforms.Compose([
        transforms.Resize((227,227)),
        transforms.ToTensor(),
        normalize,
    ])

    dataset = datasets.CIFAR10(
        root=data_dir, train=False,
        download=True, transform=transform,
    )

    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle
    )

    return data_loader


# CIFAR10 dataset 
train_loader, valid_loader = get_train_valid_loader(data_dir = './data', batch_size = 1, augment = False, random_seed = 1)
test_loader = get_test_loader(data_dir = './data', batch_size = 1)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


# Model

**Custom conv2d function**

In [3]:
def myconv2d(input, weight, bias=None, stride=(1,1), padding=(0,0), dilation=(1,1), groups=1):
    """
    Function to process an input with a standard convolution
    """
    mul_count = 0
    # print('input', input.shape)
    # print('wt', weight.shape)
    batch_size, in_channels, in_h, in_w = input.shape
    out_channels, in_channels, kh, kw = weight.shape
    out_h = int((in_h - kh + 2 * padding[0]) / stride[0] + 1)
    out_w = int((in_w - kw + 2 * padding[1]) / stride[1] + 1)
    unfold = torch.nn.Unfold(kernel_size=(kh, kw), dilation=dilation, padding=padding, stride=stride)
    inp_unf = unfold(input)
    w_ = weight.view(weight.size(0), -1).t()
    if bias is None:
        out_unf = inp_unf.transpose(1, 2).matmul(w_).transpose(1, 2)
        mul_count += 1
    else:
        out_unf = (inp_unf.transpose(1, 2).matmul(w_) + bias).transpose(1, 2)
        mul_count += 1
    out = out_unf.view(batch_size, out_channels, out_h, out_w)
    return (out.float(), mul_count)
    # return out.float()
    
###################################################################################################### 

class comp_vector():
  def __init__(self, arr):
    self.x = arr.size(dim=2)
    self.y = arr.size(dim=1)
    self.c = arr.size(dim=0)
    self.index_vector = []
    self.data_vector = []
    for i in range(self.c):
      # print(arr[i])
      self.index_vector.append(np.flatnonzero(arr[i].cpu()))
      self.data_vector.append(arr[i].ravel()[self.index_vector[-1]])

    # index_vector = np.flatnonzero(arr)
    # data_vector = arr.ravel()[index_vector]

  def get_index_vector(self):
    return self.index_vector

  def get_data_vector(self):
    return self.data_vector


def conv_compressed(comp_inp, comp_wt):
    acc_x, acc_y, acc_c = int((comp_inp.x - comp_wt.x)//stride  + 1) , int((comp_inp.y - comp_wt.y)//stride  +1), comp_wt.c
#     print(acc_x, acc_y, acc_c)
    mult_count = 0
    # print(acc_x, acc_y, acc_c)
    acc_buf = torch.FloatTensor(acc_x, acc_y).zero_()
    inp_index_vector = comp_inp.get_index_vector()
    inp_data_vector = comp_inp.get_data_vector()
    wt_index_vector = comp_wt.get_index_vector()
    wt_data_vector = comp_wt.get_data_vector()
    # print(inp_index_vector[0])
    # print(len(inp_index_vector[0]))
    for c in range(acc_c):
      for i in range(len(inp_index_vector[c])):
        for j in range(len(wt_index_vector[c])):
          inp_x = inp_index_vector[c][i]//comp_inp.x
          inp_y = inp_index_vector[c][i]%comp_inp.y
          wt_x = wt_index_vector[c][j]//comp_wt.x
          wt_y = wt_index_vector[c][j]%comp_wt.y

          out_x = (inp_x - wt_x)
          out_y = (inp_y- wt_y)
          if out_x%stride==0 and out_y%stride==0:
            out_x = out_x//stride
            out_y = out_y//stride
            # print(out_x, out_y,c,i,j,)
            if 0<=out_x<acc_x and 0<=out_y<acc_y:
              # print("yes")
              acc_buf[out_x][out_y]+=float(inp_data_vector[c][i] * wt_data_vector[c][j])
              mult_count +=1
    
    return (acc_buf,mult_count)

def myconv2d_sparse(input, weight, bias=None, stride=(1,1), padding=(0,0), dilation=(1,1), groups=1):
  input = torch.nn.functional.pad(input, (padding[1], padding[1], padding[0], padding[0]), "constant", 0)
  print(input.size())
  comp_in = comp_vector(input[0])
  in_x = input.size(dim=3)
  in_y = input.size(dim=2)
  wt_x = weight.size(dim=3)
  wt_y = weight.size(dim=2)
  c = weight.size(dim=1)
  k = weight.size(dim=0)
  out = torch.empty(size=(1,k, int((in_x-wt_x)/stride[0]+1), int((in_y-wt_y)/stride[1]+1)))

  mult_count = 0
  for i in range(k):
    comp_wt = comp_vector(weight[i])
    out[0][i], num =conv_compressed(comp_in, comp_wt, stride[0])
    mult_count+=num
  
  return (out,mult_count)


**Defining custom Conv2D Layer**

In [13]:
class Custom_Conv2d(torch.nn.modules.conv._ConvNd):
    """
    Implements a standard convolution layer that can be used as a regular module
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1,
                 bias=True, padding_mode='zeros'):
        kernel_size = (kernel_size, kernel_size)
        stride = (stride, stride)
        padding = (padding, padding)
        dilation = (dilation, dilation)
        super(Custom_Conv2d, self).__init__(
            in_channels, out_channels, kernel_size, stride, padding, dilation,
            False, (0, 0), groups, bias, padding_mode)

    def conv2d_forward(self, input, weight):
        return myconv2d_sparse(input, weight, self.bias, self.stride,
                        self.padding, self.dilation, self.groups)

    def forward(self, input):
        return self.conv2d_forward(input, self.weight)


**Defining custom AlexNet**

In [14]:
# empty arrays for storing activation sparsities
c1 = []
c2 = []
c3 = []
c4 = []
c5 = []

In [15]:
# empty arrays for storing #MACops per layer
m1 = []
m2 = []
m3 = []
m4 = []
m5 = []

In [16]:
class CustomAlexNet(nn.Module):
    def __init__(self, num_classes=10):
        super(CustomAlexNet, self).__init__()
        self.features = nn.Sequential(
            Custom_Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 3, stride = 2),
            Custom_Conv2d(64, 192, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 3, stride = 2),
            Custom_Conv2d(192, 384, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            Custom_Conv2d(384, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            Custom_Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 3, stride = 2))
        
        self.avgpool = nn.AdaptiveAvgPool2d(output_size=(6, 6))

        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(9216, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, 1024),
            nn.ReLU(),
            nn.Linear(1024, num_classes))
        
    def forward(self, x):
        # print('Sparsity of CONV1 activations: ', (1 - torch.count_nonzero(x)/torch.numel(x)).item())
        c1.append((1 - torch.count_nonzero(x)/torch.numel(x)).item())
        out, macops1 = self.features[0](x)             
        m1.append(macops1)
        out = self.features[1](out)
        out = self.features[2](out)
        
        # print('Sparsity of CONV2 activations: ', (1 - torch.count_nonzero(out)/torch.numel(out)).item())
        c2.append((1 - torch.count_nonzero(out)/torch.numel(out)).item())
        out, macops2 = self.features[3](out)
        m2.append(macops2)
        out = self.features[4](out)
        out = self.features[5](out)
        
        # print('Sparsity of CONV3 activations: ', (1 - torch.count_nonzero(out)/torch.numel(out)).item())
        c3.append((1 - torch.count_nonzero(out)/torch.numel(out)).item())
        out, macops3 = self.features[6](out)
        m3.append(macops3)
        out = self.features[7](out)
        
        # print('Sparsity of CONV4 activations: ', (1 - torch.count_nonzero(out)/torch.numel(out)).item())
        c4.append((1 - torch.count_nonzero(out)/torch.numel(out)).item())
        out, macops4 = self.features[8](out)
        m4.append(macops4)
        out = self.features[9](out)
        
        c5.append((1 - torch.count_nonzero(out)/torch.numel(out)).item())
        # print('Sparsity of CONV5 activations: ', (1 - torch.count_nonzero(out)/torch.numel(out)).item())
        out, macops5 = self.features[10](out)
        m5.append(macops5)
        out = self.features[11](out)
        out = self.features[12](out)

        out = self.avgpool(out)
        out = out.reshape(out.size(0), -1)
        out = self.classifier(out)
        return out



**Instantiating a custom AlexNet**

In [17]:
cust_alexnet = CustomAlexNet(10)

**Loading pretrained AlexNet weights into our custom AlexNet**

In [18]:
# alexnet = torch.load(r'alexnet.pth')
cust_alexnet.load_state_dict(torch.load(r'alexnet.pth'))
cust_alexnet.to(device)
cust_alexnet.eval()

CustomAlexNet(
  (features): Sequential(
    (0): Custom_Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Custom_Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Custom_Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): Custom_Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU()
    (10): Custom_Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU()
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
    (2): ReLU()
    

# Measurements

**Accuracy & Layer-wise Activation Sparsities(Dynamic)**

In [19]:
def test_model(cust_alexnet):
  cust_alexnet.to(device)
  cust_alexnet.eval()
  correct = 0
  total = 0
  m1.clear()
  m2.clear()
  m3.clear()
  m4.clear()
  m5.clear()
  c1.clear()
  c2.clear()
  c3.clear()
  c4.clear()
  c5.clear()
  with torch.no_grad():
      for data in test_loader:
          if(total>=1):
            break
          images, labels = data[0].to(device), data[1].to(device)
          outputs = cust_alexnet(images)
          _, predicted = torch.max(outputs.data, 1)
          total += labels.size(0)
          correct += (predicted == labels).sum().item()

  print('Accuracy of the network on the 10000 test images: %.2f %%' % (100 * correct / total))
  print("CONV1 #MACops(avg):", sum(m1)/len(m1))
  print("CONV2 #MACops(avg):", sum(m2)/len(m2))
  print("CONV3 #MACops(avg):", sum(m3)/len(m3))
  print("CONV4 #MACops(avg):", sum(m4)/len(m4))
  print("CONV5 #MACops(avg):", sum(m5)/len(m5))

  print("CONV1 activation sparsity(avg):", sum(c1)/len(c1))
  print("CONV2 activation sparsity(avg):", sum(c2)/len(c2))
  print("CONV3 activation sparsity(avg):", sum(c3)/len(c3))
  print("CONV4 activation sparsity(avg):", sum(c4)/len(c4))
  print("CONV5 activation sparsity(avg):", sum(c5)/len(c5))
  # sum(c2)/len(c2)
  print(len(c1))

In [22]:
start = time.time()
test_model(cust_alexnet)
print(time.time()-start)

KeyboardInterrupt: 

**Weight Sparsity (static)**

In [10]:
def get_alex_w_sparsities(cust_alexnet):
  conv_indices = [0, 3, 6, 8, 10]

  for i in range(5):
    layer_index = conv_indices[i]

    print(
        "Sparsity in conv{:}.weight: {:.2f}%".format(i+1, 
            100. * float(torch.sum(cust_alexnet.features[layer_index].weight == 0))
            / float(cust_alexnet.features[layer_index].weight.nelement())
        )
    )

In [14]:
get_alex_w_sparsities(cust_alexnet)

Sparsity in conv1.weight: 0.00%
Sparsity in conv2.weight: 0.00%
Sparsity in conv3.weight: 0.00%
Sparsity in conv4.weight: 0.00%
Sparsity in conv5.weight: 0.00%


# Pruning

In [11]:
import torch.nn.utils.prune as prune

In [12]:
# percs = [.25, .50, .75, ]
# new_model = LeNet()
# for name, module in new_model.named_modules():
#     # prune 20% of connections in all 2D-conv layers
#     if isinstance(module, torch.nn.Conv2d):
#         prune.l1_unstructured(module, name='weight', amount=0.2)
#     # prune 40% of connections in all linear layers
#     elif isinstance(module, torch.nn.Linear):
#         prune.l1_unstructured(module, name='weight', amount=0.4)

In [33]:
alex_90 = CustomAlexNet(10)
alex_90.load_state_dict(torch.load(r'alexnet.pth'))

for name, module in alex_90.named_modules():
    # prune 90% of connections in all 2D-conv layers
    if isinstance(module, Custom_Conv2d):
        prune.l1_unstructured(module, name='weight', amount=0.25)

CustomAlexNet(
  (features): Sequential(
    (0): Custom_Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Custom_Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Custom_Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): Custom_Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU()
    (10): Custom_Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU()
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
    (2): ReLU()
    

In [36]:
get_alex_w_sparsities(alex_90)

Sparsity in conv1.weight: 25.00%
Sparsity in conv2.weight: 25.00%
Sparsity in conv3.weight: 25.00%
Sparsity in conv4.weight: 25.00%
Sparsity in conv5.weight: 25.00%


In [37]:
test_model(alex_90)

Accuracy of the network on the 10000 test images: 83.35 %
CONV1 #MACops(avg): 1.0
CONV2 #MACops(avg): 1.0
CONV3 #MACops(avg): 1.0
CONV4 #MACops(avg): 1.0
CONV5 #MACops(avg): 1.0
CONV1 activation sparsity(avg): 0.0
CONV2 activation sparsity(avg): 0.4914952580094337
CONV3 activation sparsity(avg): 0.7596199704170227
CONV4 activation sparsity(avg): 0.8848602611422539
CONV5 activation sparsity(avg): 0.8694982788264751
10000


### Global Pruning

In [13]:
model = CustomAlexNet(10)
model.load_state_dict(torch.load(r'alexnet.pth'))
model.to(device)
model.eval()

parameters_to_prune = (
    (model.features[0], 'weight'),
    (model.features[3], 'weight'),
    (model.features[6], 'weight'),
    (model.features[8], 'weight'),
    (model.features[10], 'weight'),
)

prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.25,
)

In [14]:
get_alex_w_sparsities(model)

Sparsity in conv1.weight: 9.78%
Sparsity in conv2.weight: 24.24%
Sparsity in conv3.weight: 23.41%
Sparsity in conv4.weight: 26.49%
Sparsity in conv5.weight: 25.55%


In [15]:
test_model(model)

Accuracy of the network on the 10000 test images: 83.46 %
CONV1 #MACops(avg): 1.0
CONV2 #MACops(avg): 1.0
CONV3 #MACops(avg): 1.0
CONV4 #MACops(avg): 1.0
CONV5 #MACops(avg): 1.0
CONV1 activation sparsity(avg): 0.0
CONV2 activation sparsity(avg): 0.4925482948899269
CONV3 activation sparsity(avg): 0.7564612115383148
CONV4 activation sparsity(avg): 0.8869362964272499
CONV5 activation sparsity(avg): 0.8687332128226757
10000
