# Pruning

In [6]:
import os
import copy
import torch
from torch import nn
import torch.nn.utils.prune as prune
import torch.nn.functional as F

from indiv_utils import load_yaml, size_on_disk, get_layers, \
                        measure_inference_latency, param_count, \
                        FLOPs_count, save_model_weights, start_train, \
                        check_buffers, sparse_representation, Sparsity, \
                        global_unstructured_pruning, accuracy
from models import FFNN
from data_processing import MNISTDataProcessor

## Preliminaries & Setup

| hyperparameter  | MNIST |
| --------------- | ----- |
| learning rate   | 0.001 |
| batch size      | 64    |
| hidden size     | 1024  |
| # hidden layers | 2     |
| input size      | 20x20 |
| output size     | 10    |

In [7]:
"""Setup"""
# reproducibility: seed the RNG *before* the model is constructed so that the
# "initial (random) weights" saved below are identical on every
# Restart Kernel -> Run All
seed = 0
torch.manual_seed(seed)

# hyperparameters (values mirror the table in "Preliminaries & Setup")
lr = 0.001
batch_size = 64
num_hidden = 2
hidden_dim = 1024
out_dim = 10 # 10 MNIST classes
epochs = 2
input_dim = 20*20  # images are center-cropped to 20x20 and flattened

# config (project YAML; only the 'device' entry is read in this cell)
config = load_yaml('config')

# device
device = torch.device(config['device'])

# criterion
criterion = torch.nn.CrossEntropyLoss()

# model: freshly initialised feed-forward network, moved to the configured device
model = FFNN(input_dim=input_dim, hidden_dim=hidden_dim, out_dim=out_dim, num_hidden=num_hidden, bias=True).to(device)

In [8]:
"""Model initialization:
Before training, SAVE the model's initial (random) weights. 
You will use them later for iterative pruning."""

if os.path.exists("out/FFNN_weights_initial.pth"):
    # A checkpoint from an earlier run already exists. The freshly built `model`
    # holds a *different* random initialisation, so overwrite it with the saved
    # one: training (next cell) and the later rewind-to-initial step must both
    # refer to the same initial weights.
    model.load_state_dict(torch.load("out/FFNN_weights_initial.pth", map_location=device))
else:
    # First run: persist the random initialisation before any training happens.
    initial_random_weights = save_model_weights(model, fname="initial")

In [9]:
"""Train"""
# Training is skipped when a trained checkpoint is already cached on disk;
# otherwise train `model` and store the resulting weights under the "trained" name.
if not os.path.exists("out/FFNN_weights_trained.pth"):
    start_train(model, device, criterion, epochs, batch_size, lr)
    trained_weights = save_model_weights(model, fname="trained")

In [10]:
"""load initial model"""
# map_location=device keeps the checkpoints loadable even when they were written
# on a different device (e.g. saved on CUDA, notebook re-run on CPU).
model_initial = FFNN(input_dim=input_dim, hidden_dim=hidden_dim, out_dim=out_dim, num_hidden=num_hidden, bias=True).to(device)
model_initial.load_state_dict(torch.load("out/FFNN_weights_initial.pth", map_location=device))
print("Loaded initial model weights")

# load trained model: start from an independent copy of the initial model so
# that loading the trained weights leaves `model_initial` untouched
model_trained = copy.deepcopy(model_initial)
model_trained.load_state_dict(torch.load("out/FFNN_weights_trained.pth", map_location=device))
print("Loaded trained model weights")

# test dataset used by every latency / accuracy measurement below
test_dataset = MNISTDataProcessor().vision_test_dataset()

Loaded initial model weights
Loaded trained model weights
Center Cropping images from 28x28 to 20x20
new image size:  (400,)
Center Cropping images from 28x28 to 20x20
new image size:  (400,)
parsing test features...
The number of test labels: 10000


In [11]:
"""Inference Latency of Trained Model"""
# Inference Latency
# Baseline measurement on the unpruned trained model, with warmup_itr=100;
# the helper prints the mean inference latency in milliseconds.
measure_inference_latency(model=model_trained, test_dataset=test_dataset, device=device, warmup_itr=100)

Measuring inference latency of trained FFNN on cuda...


1it [00:00,  6.01it/s]

Warm-up begins...


10000it [00:03, 3264.81it/s]


Mean inference latency: 0.127ms


In [12]:
"""Parameter Count, FLOPs, and Disk Storage of Trained Model"""
# Parameter Count
param_count(model=model_trained)

# Disk Storage
size_on_disk(model=model_trained)

# Accuracy
accuracy(model=model_trained, test_dataset=test_dataset, device=device)

Total Parmeter Count in FFNN: 1470474
	model.2.bias:	10
	model.0.0.bias:	1024
	model.1.0.bias:	1024
	model.2.weight:	10240
	model.0.0.weight:	409600
	model.1.0.weight:	1048576
Model Size on Disk: 5.883903 MB
Accuracy: 97.430%


## Magnitude pruning on SST2/MNIST

### Trained Model

In [13]:
"""Trained Model's architecture"""
# Show the module tree of the trained network (layer types and shapes)
print(model_trained)

FFNN(
  (model): Sequential(
    (0): Sequential(
      (0): Linear(in_features=400, out_features=1024, bias=True)
      (1): ReLU()
    )
    (1): Sequential(
      (0): Linear(in_features=1024, out_features=1024, bias=True)
      (1): ReLU()
    )
    (2): Linear(in_features=1024, out_features=10, bias=True)
  )
)


In [14]:
"""Buffers of Trained Model"""
# List the registered buffers; expected to be empty while the model is unpruned
# (pruning later adds one `weight_mask` buffer per pruned layer)
print(list(model_trained.named_buffers()))

[]


In [15]:
"""Get all layers of Trained Model"""
# Collect the model's layers through the project helper, then print them
# under a heading that carries the model's class name.
model_layers = get_layers(model=model_trained)
print(f"{type(model_trained).__name__} layers:\n{model_layers}")

FFNN layers:
ModuleList(
  (0): Linear(in_features=400, out_features=1024, bias=True)
  (1): ReLU()
  (2): Linear(in_features=1024, out_features=1024, bias=True)
  (3): ReLU()
  (4): Linear(in_features=1024, out_features=10, bias=True)
)


In [16]:
"""Weight and Bias of the first layer of Trained Model"""
# model_layers[0] is the first entry returned by get_layers (the first Linear layer
# in the printed listing); dump its weight matrix and bias vector before pruning
print(f"Weight of the first layer:\n{model_layers[0].weight}")
print(f"Bias of the first layer:\n{model_layers[0].bias}")

Weight of the first layer:
Parameter containing:
tensor([[ 0.0661,  0.0115,  0.0095,  ...,  0.0792,  0.0295,  0.0438],
        [ 0.0417,  0.0243,  0.0168,  ...,  0.0155,  0.0438,  0.0155],
        [ 0.0864,  0.0559,  0.0999,  ...,  0.0503,  0.0688,  0.0363],
        ...,
        [-0.0159,  0.0330, -0.0122,  ...,  0.0150,  0.0357, -0.0115],
        [-0.0207,  0.0613, -0.0179,  ..., -0.0278,  0.0120, -0.0376],
        [ 0.0525,  0.0021,  0.0082,  ..., -0.0028,  0.0136,  0.0405]],
       device='cuda:0', requires_grad=True)
Bias of the first layer:
Parameter containing:
tensor([ 0.0228, -0.0110, -0.0333,  ..., -0.0355, -0.0043, -0.0440],
       device='cuda:0', requires_grad=True)


### Global Unstructured Magnitude (L1) Pruning 

In [17]:
"""Sparsity of Trained Model"""
# Baseline: prints the global sparsity of the still-unpruned model as a percentage
Sparsity(model=model_trained).global_level()

Global sparsity: 0.0%


In [18]:
"""Sparsity of each layer of Trained Model"""
# Baseline: prints one sparsity percentage per Linear layer of the unpruned model
Sparsity(model=model_trained).each_layer()

Sparsity of Linear: 0.0%
Sparsity of Linear: 0.0%
Sparsity of Linear: 0.0%


In [19]:
"""Global Unstructured Pruning"""
# Target fraction of weights to prune, chosen globally across layers
# (individual layers may therefore end up above or below this level)
sparsity_level = 0.33
# Mutates model_trained in place
global_unstructured_pruning(model=model_trained, sparsity_level=sparsity_level)

In [20]:
"""Sparsity of pruned Trained Model"""
# After pruning: the printed global sparsity should be close to sparsity_level * 100
Sparsity(model=model_trained).global_level()

Global sparsity: 32.99998093183403%


In [21]:
"""Sparsity of each layer of pruned Trained Model"""
# After pruning: per-layer sparsity; the layers differ because the pruning was global
Sparsity(model=model_trained).each_layer()

Sparsity of Linear: 26.169677734375%
Sparsity of Linear: 35.65034866333008%
Sparsity of Linear: 34.814453125%


In [22]:
"""Check disk size of pruned (before removing mask buffers) Trained Model.
You will notice that the model size on disk is doubled after pruning.
This is because mask buffers are stored in addition to the original parameters."""
# Size of the pruned model while the pruning masks are still attached
size_on_disk(model=model_trained)

"""Buffers in pruned (before removing mask buffers) Trained Model"""
# Prints the number of buffers and the buffers themselves (one weight_mask per pruned layer)
check_buffers(model=model_trained)

Model Size on Disk: 11.75825 MB
Number of buffers in FFNN: 3
Buffers in FFNN:
[('model.0.0.weight_mask', tensor([[1., 0., 0.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 0., 1., 0.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 0.,  ..., 0., 1., 0.],
        [1., 1., 1.,  ..., 1., 0., 1.],
        [1., 0., 0.,  ..., 0., 0., 1.]], device='cuda:0')), ('model.1.0.weight_mask', tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 0., 1.,  ..., 1., 1., 1.],
        [0., 0., 1.,  ..., 1., 0., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 0., 1.],
        [0., 1., 1.,  ..., 0., 1., 1.],
        [1., 1., 0.,  ..., 0., 1., 1.]], device='cuda:0')), ('model.2.weight_mask', tensor([[0., 0., 0.,  ..., 0., 1., 1.],
        [1., 0., 0.,  ..., 1., 1., 1.],
        [1., 0., 1.,  ..., 0., 0., 1.],
        ...,
        [1., 1., 0.,  ..., 0., 1., 0.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 0., 0.]], device='cuda:0'))]


In [23]:
"""State Dict of Spare Representation of pruned Trained Model"""
sd = sparse_representation(model=model_trained)

"""Buffers in pruned (after removing mask buffers) Trained Model"""
check_buffers(model=model_trained)

sparsifying model.0.0.weight
sparsifying model.1.0.weight
sparsifying model.2.weight
Number of buffers in FFNN: 0
Buffers in FFNN:
[]


In [24]:
"""Save the state dict of Sparse Representation of pruned Trained Model"""
# Write the sparse state dict to disk, then report the size of the resulting file.
pruned_weights_path = "out/FFNN_weights_pruned.pth"
torch.save(sd, pruned_weights_path)

# Notice that the size of this state dict (weights) of the pruned Trained Model is quite large.
# This is because the state dict contains not only tensors as dictionary values
# but also indices, layouts, etc.
pruned_size_mb = os.path.getsize(pruned_weights_path) / 1e6
print(f"{pruned_size_mb} MB")

19.705641 MB


In [25]:
# Rebuild a dense model from the sparse state dict saved on disk.
sd = torch.load("out/FFNN_weights_pruned.pth")
model_pruned = FFNN(input_dim=input_dim, hidden_dim=hidden_dim, out_dim=out_dim, num_hidden=num_hidden, bias=True).to(device)

# load_state_dict needs ordinary (strided) tensors: keep those as they are and
# convert every tensor stored in another layout back to dense.
dense_sd = {}
for key, tensor in sd.items():
    if tensor.layout == torch.strided:
        dense_sd[key] = tensor
    else:
        dense_sd[key] = tensor.to_dense()
model_pruned.load_state_dict(dense_sd)

# Per-layer sparsity of the reloaded model
Sparsity(model=model_pruned).each_layer()

Sparsity of Linear: 26.169677734375%
Sparsity of Linear: 35.65034866333008%
Sparsity of Linear: 34.814453125%


In [26]:
"""Latency of pruned Trained Model"""
# Same measurement as for the unpruned model, now on the densified pruned weights
measure_inference_latency(model=model_pruned, test_dataset=test_dataset, device=device, warmup_itr=100)

Measuring inference latency of trained FFNN on cuda...


0it [00:00, ?it/s]

Warm-up begins...


10000it [00:02, 3586.85it/s]


Mean inference latency: 0.123ms


In [27]:
"""Parameter Count and Disk Storage of Trained Model"""
# Parameter Count
param_count(model=model_pruned)

# Disk Storage
size_on_disk(sd, fname="FFNN_weights_pruned.pth")

# Accuracy
accuracy(model=model_pruned, test_dataset=test_dataset, device=device)

Total Parmeter Count in FFNN: 1470474
	model.2.bias:	10
	model.0.0.bias:	1024
	model.1.0.bias:	1024
	model.2.weight:	10240
	model.0.0.weight:	409600
	model.1.0.weight:	1048576
Model Size on Disk: 19.705641 MB
Accuracy: 97.450%


### Repeated Global Unstructured Magnitude (L1) Pruning 

In [23]:
"""load trained moel"""
model_trained = FFNN(input_dim=input_dim, hidden_dim=hidden_dim, out_dim=out_dim, num_hidden=num_hidden, bias=True).to(device)
model_trained.load_state_dict(torch.load("out/FFNN_weights_trained.pth"))

"""Repeated Global Unstructured Magnitude Pruning"""
for i in range(10):

    print(f"===================== Pruning {i+1} =========================")
    
    """Global Unstructured Pruning"""
    sparsity_level = 0.33
    global_unstructured_pruning(model=model_trained, sparsity_level=sparsity_level)

    """Sparsity of pruned Trained Model"""
    Sparsity(model_trained).each_layer()
    Sparsity(model_trained).global_level()
    
    # """State Dict of Spare Representation of pruned Trained Model"""
    # sd = sparse_representation(model=model_trained)

    # model_pruned = FFNN(input_dim=input_dim, hidden_dim=hidden_dim, out_dim=out_dim, num_hidden=num_hidden, bias=True).to(device)
    # model_pruned.load_state_dict({k:(v if v.layout == torch.strided else v.to_dense()) for k,v in sd.items()})

    model_pruned = model_trained
    
    """Latency of pruned Trained Model"""
    measure_inference_latency(model=model_pruned, test_dataset=test_dataset, device=device, warmup_itr=100)

    # Accuracy
    accuracy(model=model_pruned, test_dataset=test_dataset, device=device)

Sparsity of Linear: 26.169677734375%
Sparsity of Linear: 35.65034866333008%
Sparsity of Linear: 34.814453125%
Global sparsity: 32.99998093183403%
Measuring inference latency of trained FFNN on cuda...


169it [00:00, 1687.11it/s]

Warm-up begins...


10000it [00:03, 2556.59it/s]


Mean inference latency: 0.210ms
Accuracy: 97.450%
Sparsity of Linear: 44.97900390625%
Sparsity of Linear: 59.017372131347656%
Sparsity of Linear: 60.234375%
Global sparsity: 55.10999607740586%
Measuring inference latency of trained FFNN on cuda...


204it [00:00, 2038.87it/s]

Warm-up begins...


10000it [00:03, 2553.00it/s]


Mean inference latency: 0.211ms
Accuracy: 97.420%
Sparsity of Linear: 59.163818359375%
Sparsity of Linear: 74.05357360839844%
Sparsity of Linear: 77.421875%
Global sparsity: 69.92371371600419%
Measuring inference latency of trained FFNN on cuda...


186it [00:00, 1855.34it/s]

Warm-up begins...


10000it [00:03, 2510.31it/s]


Mean inference latency: 0.223ms
Accuracy: 96.950%
Sparsity of Linear: 70.3134765625%
Sparsity of Linear: 83.50811004638672%
Sparsity of Linear: 86.5625%
Global sparsity: 79.84889840481172%
Measuring inference latency of trained FFNN on cuda...


165it [00:00, 1646.16it/s]

Warm-up begins...


10000it [00:03, 2539.27it/s]


Mean inference latency: 0.215ms
Accuracy: 95.780%
Sparsity of Linear: 78.751220703125%
Sparsity of Linear: 89.47248458862305%
Sparsity of Linear: 91.89453125%
Global sparsity: 86.49878508542538%
Measuring inference latency of trained FFNN on cuda...


168it [00:00, 1675.16it/s]

Warm-up begins...


10000it [00:03, 2543.02it/s]


Mean inference latency: 0.212ms
Accuracy: 91.760%
Sparsity of Linear: 84.992919921875%
Sparsity of Linear: 93.24560165405273%
Sparsity of Linear: 94.765625%
Global sparsity: 90.9541982653417%
Measuring inference latency of trained FFNN on cuda...


184it [00:00, 1834.28it/s]

Warm-up begins...


10000it [00:03, 2539.57it/s]


Mean inference latency: 0.214ms
Accuracy: 88.840%
Sparsity of Linear: 89.51513671875%
Sparsity of Linear: 95.63865661621094%
Sparsity of Linear: 96.89453125%
Global sparsity: 93.93931964783822%
Measuring inference latency of trained FFNN on cuda...


198it [00:00, 1974.79it/s]

Warm-up begins...


10000it [00:03, 2624.78it/s]


Mean inference latency: 0.206ms
Accuracy: 66.400%
Sparsity of Linear: 92.722412109375%
Sparsity of Linear: 97.17588424682617%
Sparsity of Linear: 97.998046875%
Global sparsity: 95.93936595624129%
Measuring inference latency of trained FFNN on cuda...


195it [00:00, 1947.34it/s]

Warm-up begins...


10000it [00:03, 2751.84it/s]


Mean inference latency: 0.205ms
Accuracy: 25.620%
Sparsity of Linear: 94.9443359375%
Sparsity of Linear: 98.17686080932617%
Sparsity of Linear: 98.779296875%
Global sparsity: 97.27938131973501%
Measuring inference latency of trained FFNN on cuda...


166it [00:00, 1656.07it/s]

Warm-up begins...


10000it [00:04, 2466.46it/s]


Mean inference latency: 0.222ms
Accuracy: 23.110%
Sparsity of Linear: 96.50244140625%
Sparsity of Linear: 98.82183074951172%
Sparsity of Linear: 99.16015625%
Global sparsity: 98.17721953451883%
Measuring inference latency of trained FFNN on cuda...


159it [00:00, 1588.70it/s]

Warm-up begins...


10000it [00:04, 2464.32it/s]


Mean inference latency: 0.222ms
Accuracy: 17.040%


In [24]:
"""Mimicking pruned model size on disk over iterative pruning.
This is a workaround since copy.deepcopy() does not work for PyTorch's pruned model."""
# Fraction of the remaining weights removed per pruning round.
prune_rate = 0.33
# Cumulative sparsity target for the current round. Each round prunes a *fresh*
# copy of the trained model once, directly to the sparsity that repeated pruning
# would have reached after that many rounds.
sparsity_level = prune_rate

# Read the trained checkpoint once; load_state_dict copies the values into each
# fresh model, so the same dict can be reused every round.
trained_sd = torch.load("out/FFNN_weights_trained.pth", map_location=device)

for i in range(10):
    print(f"===================== Pruning {i+1} =========================")

    # load trained model (fresh, unpruned instance for this round)
    model_trained = FFNN(input_dim=input_dim, hidden_dim=hidden_dim, out_dim=out_dim, num_hidden=num_hidden, bias=True).to(device)
    model_trained.load_state_dict(trained_sd)

    # Global Unstructured Pruning at the cumulative target
    global_unstructured_pruning(model=model_trained, sparsity_level=sparsity_level)
    # Next target: prune `prune_rate` of whatever is still unpruned
    sparsity_level = (1 - sparsity_level) * prune_rate + sparsity_level

    # Sparsity of pruned Trained Model
    Sparsity(model_trained).each_layer()
    Sparsity(model_trained).global_level()

    # State Dict of Sparse Representation of pruned Trained Model
    sd = sparse_representation(model=model_trained)

    torch.save(sd, "out/FFNN_weights_pruned.pth")

    # Disk Storage
    size_on_disk(sd, fname="FFNN_weights_pruned.pth")

Sparsity of Linear: 26.169677734375%
Sparsity of Linear: 35.65034866333008%
Sparsity of Linear: 34.814453125%
Global sparsity: 32.99998093183403%
sparsifying model.0.0.weight
sparsifying model.1.0.weight
sparsifying model.2.weight
Model Size on Disk: 19.705641 MB
Sparsity of Linear: 44.97900390625%
Sparsity of Linear: 59.017372131347656%
Sparsity of Linear: 60.234375%
Global sparsity: 55.10999607740586%
sparsifying model.0.0.weight
sparsifying model.1.0.weight
sparsifying model.2.weight
Model Size on Disk: 13.212329 MB
Sparsity of Linear: 59.163818359375%
Sparsity of Linear: 74.05357360839844%
Sparsity of Linear: 77.421875%
Global sparsity: 69.92371371600419%
sparsifying model.0.0.weight
sparsifying model.1.0.weight
sparsifying model.2.weight
Model Size on Disk: 8.861865 MB
Sparsity of Linear: 70.3134765625%
Sparsity of Linear: 83.50811004638672%
Sparsity of Linear: 86.5625%
Global sparsity: 79.84889840481172%
sparsifying model.0.0.weight
sparsifying model.1.0.weight
sparsifying model.

### Iterative magnitude pruning (IMP)

In [28]:
"""load trained moel"""
model_trained = FFNN(input_dim=input_dim, hidden_dim=hidden_dim, out_dim=out_dim, num_hidden=num_hidden, bias=True).to(device)
model_trained.load_state_dict(torch.load("out/FFNN_weights_trained.pth"))

"""Repeated Global Unstructured Magnitude Pruning"""
for i in range(10):

    print(f"===================== Pruning {i+1} =========================")
    
    """Global Unstructured Pruning"""
    sparsity_level = 0.33
    global_unstructured_pruning(model=model_trained, sparsity_level=sparsity_level)

    """Sparsity of pruned Trained Model"""
    Sparsity(model_trained).each_layer()
    Sparsity(model_trained).global_level()

    init_weights = torch.load("out/FFNN_weights_initial.pth")
    prune_param_list = [(model_trained.model[0][0], 'weight'),
                        (model_trained.model[1][0], 'weight'),
                        (model_trained.model[2], 'weight')]
    init_updated = {k + ("_orig" if "weight" in k else ""):v for k,v in init_weights.items()}
    model_trained_sd = model_trained.state_dict()
    model_trained_sd.update(init_updated)
    model_trained.load_state_dict(model_trained_sd)

    model_pruned = model_trained
    
    """Latency of pruned Trained Model"""
    measure_inference_latency(model=model_pruned, test_dataset=test_dataset, device=device, warmup_itr=100)

    # Accuracy
    accuracy(model=model_pruned, test_dataset=test_dataset, device=device)

    # Model Size on Disk
    size_on_disk(model_pruned.state_dict(), fname="FFNN_weights_pruned.pth")

Sparsity of Linear: 26.169677734375%
Sparsity of Linear: 35.65034866333008%
Sparsity of Linear: 34.814453125%
Global sparsity: 32.99998093183403%
Measuring inference latency of trained FFNN on cuda...


179it [00:00, 1785.92it/s]

Warm-up begins...


10000it [00:03, 2801.96it/s]


Mean inference latency: 0.188ms
Accuracy: 83.570%
Model Size on Disk: 19.705641 MB
Sparsity of Linear: 43.27001953125%
Sparsity of Linear: 59.73625183105469%
Sparsity of Linear: 54.98046875%
Global sparsity: 55.10999607740586%
Measuring inference latency of trained FFNN on cuda...


192it [00:00, 1915.25it/s]

Warm-up begins...


10000it [00:03, 2727.04it/s]


Mean inference latency: 0.197ms
Accuracy: 83.950%
Model Size on Disk: 19.705641 MB
Sparsity of Linear: 53.95361328125%
Sparsity of Linear: 76.13353729248047%
Sparsity of Linear: 72.841796875%
Global sparsity: 69.92371371600419%
Measuring inference latency of trained FFNN on cuda...


191it [00:00, 1908.66it/s]

Warm-up begins...


10000it [00:03, 2675.23it/s]


Mean inference latency: 0.201ms
Accuracy: 81.190%
Model Size on Disk: 19.705641 MB
Sparsity of Linear: 60.810546875%
Sparsity of Linear: 87.2304916381836%
Sparsity of Linear: 85.5078125%
Global sparsity: 79.84889840481172%
Measuring inference latency of trained FFNN on cuda...


193it [00:00, 1928.40it/s]

Warm-up begins...


10000it [00:03, 2581.80it/s]


Mean inference latency: 0.210ms
Accuracy: 39.320%
Model Size on Disk: 19.705641 MB
Sparsity of Linear: 65.337646484375%
Sparsity of Linear: 94.69451904296875%
Sparsity of Linear: 93.701171875%
Global sparsity: 86.49878508542538%
Measuring inference latency of trained FFNN on cuda...


177it [00:00, 1769.91it/s]

Warm-up begins...


10000it [00:03, 2507.37it/s]


Mean inference latency: 0.220ms
Accuracy: 16.620%
Model Size on Disk: 19.705641 MB
Sparsity of Linear: 68.34375%
Sparsity of Linear: 99.7018814086914%
Sparsity of Linear: 99.609375%
Global sparsity: 90.9541982653417%
Measuring inference latency of trained FFNN on cuda...


194it [00:00, 1939.17it/s]

Warm-up begins...


10000it [00:03, 2601.04it/s]


Mean inference latency: 0.215ms
Accuracy: 9.750%
Model Size on Disk: 19.705641 MB
Sparsity of Linear: 78.2724609375%
Sparsity of Linear: 100.0%
Sparsity of Linear: 100.0%
Global sparsity: 93.93931964783822%
Measuring inference latency of trained FFNN on cuda...


166it [00:00, 1659.68it/s]

Warm-up begins...


10000it [00:04, 2469.85it/s]


Mean inference latency: 0.226ms
Accuracy: 9.580%
Model Size on Disk: 19.705641 MB
Sparsity of Linear: 85.442626953125%
Sparsity of Linear: 100.0%
Sparsity of Linear: 100.0%
Global sparsity: 95.93936595624129%
Measuring inference latency of trained FFNN on cuda...


165it [00:00, 1648.64it/s]

Warm-up begins...


10000it [00:03, 2704.87it/s]


Mean inference latency: 0.206ms
Accuracy: 9.580%
Model Size on Disk: 19.705641 MB
Sparsity of Linear: 90.24658203125%
Sparsity of Linear: 100.0%
Sparsity of Linear: 100.0%
Global sparsity: 97.27938131973501%
Measuring inference latency of trained FFNN on cuda...


198it [00:00, 1973.19it/s]

Warm-up begins...


10000it [00:03, 2729.18it/s]


Mean inference latency: 0.204ms
Accuracy: 9.580%
Model Size on Disk: 19.705641 MB
Sparsity of Linear: 93.46533203125%
Sparsity of Linear: 100.0%
Sparsity of Linear: 100.0%
Global sparsity: 98.17721953451883%
Measuring inference latency of trained FFNN on cuda...


186it [00:00, 1858.27it/s]

Warm-up begins...


10000it [00:03, 2650.15it/s]


Mean inference latency: 0.209ms
Accuracy: 9.580%
Model Size on Disk: 19.705641 MB
