In [1]:
import torch
from torch import nn

# Note: PyTorch 1.10.0 or later

torch.__version__

# Set up device-agnostic code: use the GPU when one is available, otherwise the CPU
device="cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [None]:
from google.colab import drive # Colab helper used to mount Google Drive
import os

# Mount Google Drive under /content/gdrive
drive.mount("/content/gdrive",force_remount=True)
# Change into the project folder. The path is absolute (built on the mount point above)
# so the cell can be re-run: a relative path stops resolving once the working
# directory has already been changed.
os.chdir("/content/gdrive/MyDrive/Pruning_Study")

Mounted at /content/gdrive


## Dataset download and preparation

Downloads the CIFAR-10 dataset. In this experiment, each image is resized to 256 pixels and cropped to 224x224 to match the input size of the MobileNet V1 implementation.

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms


import torch
import torchvision
import torchvision.transforms as transforms


# CIFAR-10 preprocessing: upscale to 256, crop to 224x224, convert to a tensor, normalize.
# The channel mean/std below match the commonly used ImageNet statistics.
_norm_mean = (0.485, 0.456, 0.406)
_norm_std = (0.229, 0.224, 0.225)

# Training split: random crop and horizontal flip act as augmentation.
transform_train = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(_norm_mean, _norm_std),
])

# Test split: deterministic center crop, no augmentation.
transform_test = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(_norm_mean, _norm_std),
])

# Both splits are stored under (and downloaded into) the same root folder.
_cifar_root = './CIFAR10'
trainset = torchvision.datasets.CIFAR10(
    root=_cifar_root, train=True, download=True, transform=transform_train
)
testset = torchvision.datasets.CIFAR10(
    root=_cifar_root, train=False, download=True, transform=transform_test
)

Files already downloaded and verified
Files already downloaded and verified


In [None]:
# Batch size and data loaders: the training loader is shuffled, the test loader keeps dataset order.
batch_size = 64
train_dataloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
)
test_dataloader = torch.utils.data.DataLoader(
    testset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
)

In [None]:
# Number of batches in the training and test loaders
len(train_dataloader),len(test_dataloader)

(782, 157)

In [None]:
# Pull one batch from the training loader; `img` is reused later as a sample input
img,label=next(iter(train_dataloader))

# Batch tensor shape and the label of the first sample
img.shape,label[0]

(torch.Size([64, 3, 224, 224]), tensor(9))

## Full Model

The MobileNet V1 implementation is based on this article: https://medium.com/@karuneshu21/implement-mobilenet-v1-in-pytorch-fd03a6618321

The implementation is modified to output only 10 classes, since only the CIFAR-10 dataset is used.

In [None]:
import torch
# all nn libraries nn.layer, convs and loss functions

import torch.nn as nn
# Display Image

from IPython.display import Image

# visualisation
!pip install torchview
import torchvision
from torchview import draw_graph

# !pip install transformers
#from transformers import MobileNetV1Config, MobileNetV1Model

# Select the GPU when available, otherwise fall back to the CPU, and show the choice
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

Collecting torchview
  Downloading torchview-0.2.6-py3-none-any.whl (25 kB)
Installing collected packages: torchview
Successfully installed torchview-0.2.6
cuda


In [None]:
#Depthwise example

import torch
import torch.nn as nn

# Define a simple depthwise convolutional layer
class DepthwiseConv2d(nn.Module):
    """Depthwise convolution: every input channel is filtered by its own kernel.

    Args:
        in_channels (int): number of input (and output) channels.
        kernel_size (int): spatial size of each per-channel kernel.
        stride (int): convolution stride. Defaults to 1.
        padding (int): zero padding added on each side. Defaults to 0.
    """

    def __init__(self, in_channels, kernel_size, stride=1, padding=0):
        super().__init__()
        # out_channels == in_channels and groups == in_channels: one filter per channel.
        self.depthwise = nn.Conv2d(
            in_channels,
            in_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            groups=in_channels,
        )

    def forward(self, x):
        """Apply the depthwise convolution to ``x`` and return the result."""
        return self.depthwise(x)

# Example usage: push a random input through a 3x3 depthwise convolution.
input_tensor = torch.randn(1, 3, 32, 32)  # (batch_size, channels, height, width)

# 3 channels in, 3 channels out; padding=1 with a 3x3 kernel and stride 1 keeps the 32x32 size
depthwise_conv = DepthwiseConv2d(in_channels=3, kernel_size=3, stride=1, padding=1)

# Apply the depthwise convolution to the input tensor
output_tensor = depthwise_conv(input_tensor)

# Print the shape of the output tensor
print("Output tensor shape:", output_tensor.shape)

Output tensor shape: torch.Size([1, 3, 32, 32])


In [None]:
import torch
import torch.nn as nn

# Define a simple pointwise convolutional layer
class PointwiseConv2d(nn.Module):
    """Pointwise convolution: mixes channels with a (by default) 1x1 kernel.

    Args:
        in_channels (int): number of input channels.
        out_channels (int): number of output channels.
        kernel_size (int): kernel size. Defaults to 1.
        stride (int): convolution stride. Defaults to 1.
        padding (int): zero padding added on each side. Defaults to 0.
    """

    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0):
        super().__init__()
        self.pointwise = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
        )

    def forward(self, x):
        """Apply the pointwise convolution to ``x`` and return the result."""
        return self.pointwise(x)

# Example usage: push a random input through a 1x1 pointwise convolution.
input_tensor = torch.randn(1, 3, 32, 32)  # (batch_size, channels, height, width)

# Expand 3 input channels to 64 output channels; the spatial size is untouched
pointwise_conv = PointwiseConv2d(in_channels=3, out_channels=64)

# Apply the pointwise convolution to the input tensor
output_tensor = pointwise_conv(input_tensor)

# Print the shape of the output tensor
print("Output tensor shape:", output_tensor.shape)

Output tensor shape: torch.Size([1, 64, 32, 32])


In [None]:
class DepthWiseSeperable(nn.Module):
    """Depthwise-separable convolution block of MobileNet.

    The block applies, in order:
      1. a 3x3 depthwise convolution (``groups = in_channels``, one filter per
         channel, same number of channels in and out), BatchNorm and ReLU;
      2. a 1x1 pointwise convolution that combines the filtered channels,
         BatchNorm and ReLU.

    Both convolutions use ``bias=False`` because each is followed by BatchNorm.
    Padding is chosen so that, at stride 1, the feature map keeps its height and
    width (p=1 for the 3x3 kernel, p=0 for the 1x1 kernel).

    Args:
        in_channels (int): number of input channels.
        out_channels (int): number of output channels.
        stride (int): stride used for the depthwise convolution.
    """

    def __init__(self, in_channels , out_channels , stride ):
        super().__init__()

        # Depthwise stage: per-channel 3x3 filtering (groups == in_channels).
        self.depthwise = nn.Conv2d(
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            groups=in_channels,
            bias=False,
        )
        self.bn1 = nn.BatchNorm2d(in_channels)

        # Pointwise stage: 1x1 convolution mixing channels.
        self.pointwise = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False,
        )
        self.bn2 = nn.BatchNorm2d(out_channels)

        # One ReLU module shared by both stages.
        self.relu = nn.ReLU()

    def forward(self,x):
        """Run depthwise -> BN -> ReLU -> pointwise -> BN -> ReLU on ``x``."""
        filtered = self.relu(self.bn1(self.depthwise(x)))
        return self.relu(self.bn2(self.pointwise(filtered)))


def test_DepthWiseSeperable():
    """Print the output shape of a 32->64, stride-2 block for a random (1, 32, 112, 112) input.

    Returns:
        The DepthWiseSeperable block that was built.
    """
    sample = torch.randn(1,32,112,112)
    block = DepthWiseSeperable(32,64,2)
    out_shape = block(sample).shape
    print(out_shape)

    return block


#model = test_DepthWiseSeperable()
#architecture = 'DepthWiseSeperable'
#model_graph = draw_graph(model, input_size=(1,32,112,112), graph_dir ='TB' , roll=True, expand_nested=True, graph_name=f'self_{architecture}',save_graph=True,filename=f'self_{architecture}')
#model_graph.visual_graph

# output
"""
torch.Size([1, 64, 56, 56])
"""

'\ntorch.Size([1, 64, 56, 56])\n'

In [None]:
# Reuse the sample training batch as a test input and check its shape
test=img
test.shape

torch.Size([64, 3, 224, 224])

In [None]:
class MobileNetV1(nn.Module):
    """MobileNet V1 classifier.

    Layout: a stride-2 3x3 stem convolution (``features1``), thirteen
    DepthWiseSeperable blocks (``features2`` .. ``features14``), global average
    pooling, and a single linear layer with ``num_classes`` outputs.

    Args:
        num_classes (int): number of output classes. Defaults to 10.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # Stem. NOTE(review): ReLU is applied before BatchNorm here. The positions
        # inside this Sequential are part of the state_dict key names, so
        # reordering the layers would change the keys of saved checkpoints.
        self.features1 = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1, bias = False),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(32),
        )

        # Depthwise separable blocks: (in_channels, out_channels, stride).
        self.features2 = DepthWiseSeperable(32, 64, 1)
        self.features3 = DepthWiseSeperable(64, 128, 2)
        self.features4 = DepthWiseSeperable(128, 128, 1)
        self.features5 = DepthWiseSeperable(128, 256, 2)
        self.features6 = DepthWiseSeperable(256, 256, 1)
        self.features7 = DepthWiseSeperable(256, 512, 2)
        # Five 512 -> 512 blocks at stride 1.
        self.features8 = DepthWiseSeperable(512, 512, 1)
        self.features9 = DepthWiseSeperable(512, 512, 1)
        self.features10 = DepthWiseSeperable(512, 512, 1)
        self.features11 = DepthWiseSeperable(512, 512, 1)
        self.features12 = DepthWiseSeperable(512, 512, 1)
        self.features13 = DepthWiseSeperable(512, 1024, 2)
        self.features14 = DepthWiseSeperable(1024, 1024, 1)

        # Global average pooling to 1x1, then the linear classifier.
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Sequential(
            nn.Linear(1024, num_classes),
        )

    def forward(self, x):
        """Return the class logits for the image batch ``x``."""
        stages = (
            self.features1, self.features2, self.features3, self.features4,
            self.features5, self.features6, self.features7, self.features8,
            self.features9, self.features10, self.features11, self.features12,
            self.features13, self.features14,
        )
        for stage in stages:
            x = stage(x)

        pooled = self.avgpool(x)
        # Flatten (batch, channels, 1, 1) to (batch, channels) for the linear layer.
        flat = pooled.view(pooled.size(0), -1)
        return self.classifier(flat)


# Create an instance of MobileNetV1
# NOTE(review): this untrained instance is bound to the notebook-level name `model`;
# it is a different object from `model_big`, which is created separately.
model = MobileNetV1()
# print(model)


def test_Mobilenet():
    """Print the output shape of a fresh MobileNetV1 for a random (1, 3, 224, 224) input.

    Returns:
        The MobileNetV1 instance that was built.
    """
    sample = torch.randn(1,3,224,224)
    net = MobileNetV1()
    out_shape = net(sample).shape
    print(out_shape)

    return net


#model = test_Mobilenet()
#architecture = 'mobilenetv1'
#model_graph = draw_graph(model, input_size=(1,3,64,64), graph_dir ='TB' , roll=True, expand_nested=True, graph_name=f'self_{architecture}',save_graph=True,filename=f'self_{architecture}')
# model_graph.visual_graph

In [None]:
# Smoke-test the full network on the sample batch and count its parameters.
test=img
test_input=test
model_big=MobileNetV1()
test_model=model_big(test_input)

# Count the parameters of model_big itself (the network built and run above),
# not of the separate notebook-level `model` instance.
total_params = sum(p.numel() for p in model_big.parameters())
print("Total number of parameters: ", total_params)

Total number of parameters:  3217226


In [None]:
# Import time it for training time checking and printing of total time
from timeit import default_timer as timer
def print_train_time(start:float,end:float,device:torch.device=None):
  """Print and return the elapsed training time.

  Args:
    start: timestamp taken before training, in seconds.
    end: timestamp taken after training, in seconds.
    device: device label included in the printed message.

  Returns:
    ``end - start``, in seconds.
  """
  elapsed=end-start
  print(f"Train time on {device}: {elapsed:.3f} seconds")
  return elapsed

def print_test_time(start:float,end:float,device:torch.device=None):
  """Print and return the elapsed evaluation time.

  Args:
    start: timestamp taken before testing, in seconds.
    end: timestamp taken after testing, in seconds.
    device: device label included in the printed message.

  Returns:
    ``end - start``, in seconds.
  """
  elapsed=end-start
  print(f"Test time on {device}: {elapsed:.3f} seconds")
  return elapsed

# Function made for accuracy checking
def accuracy_fn(y_target,y_pred):
    """Return the percentage (0-100) of positions where ``y_pred`` equals ``y_target``.

    Args:
        y_target: tensor of reference labels.
        y_pred: tensor of predicted labels, compared element-wise with ``y_target``.
    """
    matches = y_target == y_pred
    n_correct = matches.sum().item()
    return (n_correct / len(y_pred)) * 100


In [None]:
#Training from scratch
from torch.optim.lr_scheduler import CosineAnnealingLR

# Objects needed to train the full network from scratch.
device = "cuda" if torch.cuda.is_available() else "cpu"
loss_fn = nn.CrossEntropyLoss()

# Fresh, randomly initialized full network and its Adam optimizer.
model_big = MobileNetV1()
optimizer1 = torch.optim.Adam(
    params=model_big.parameters(),
    lr=0.0005,
    weight_decay=0.001,
)

# Cosine learning-rate schedule over T_max=500 scheduler steps.
lr_scheduler = CosineAnnealingLR(optimizer1, T_max=500)
sigmoid = nn.Sigmoid()

In [None]:
from torch.utils.data import DataLoader

In [None]:
# Training and Testing Function
def training(model:torch.nn.Module,data_loader:DataLoader,loss_fn:torch.nn.Module,optimizer:torch.optim.Optimizer, accuracy_fn,device:torch.device=device,scheduler=None):
  """Train ``model`` for one pass over ``data_loader`` and return the averaged metrics.

  Args:
    model: network to train; it is moved to ``device`` and put in train mode.
    data_loader: iterable of ``(X, y)`` batches.
    loss_fn: callable returning the loss for ``(predictions, targets)``.
    optimizer: optimizer that updates ``model``'s parameters.
    accuracy_fn: callable ``accuracy_fn(y_target=..., y_pred=...)`` returning a
      per-batch accuracy.
    device: device the model and the batches are moved to.
    scheduler: optional learning-rate scheduler, stepped once after the last
      batch. When omitted, the notebook-level ``lr_scheduler`` is used, which is
      what this function always stepped before the parameter existed.

  Returns:
    ``(train_loss, train_acc)``: the mean of the per-batch losses and the mean
    of the per-batch accuracies. Every batch counts equally, including a
    smaller final batch.
  """
  if scheduler is None:
    # Backward-compatible fallback to the scheduler defined at notebook level.
    scheduler=lr_scheduler

  model.to(device)
  model.train()

  train_loss,train_acc=0,0
  for X,y in data_loader:
    X,y=X.float().to(device),y.to(device)

    y_pred=model(X)
    loss=loss_fn(y_pred,y)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # detach(): accumulate only the value, so the running total does not
    # carry autograd history from every batch of the epoch.
    train_loss=train_loss+loss.detach()
    y_pred_class=torch.argmax(torch.softmax(y_pred,dim=1),dim=1)
    train_acc=train_acc+accuracy_fn(y_target=y,y_pred=y_pred_class)

  # One scheduler step per call, i.e. per pass over the data.
  scheduler.step()

  train_loss/=len(data_loader)
  train_acc/=len(data_loader)
  print(f"Train Loss:{train_loss:5f}|Train Acc:{train_acc:5f}%")
  return train_loss,train_acc


def testing(data_loader:DataLoader,model:torch.nn.Module,loss_fn:torch.nn.Module,accuracy_fn,device:torch.device=device):
  """Evaluate ``model`` on ``data_loader`` and return the averaged metrics.

  Args:
    data_loader: iterable of ``(X, y)`` batches.
    model: network to evaluate; it is moved to ``device`` and put in eval mode.
    loss_fn: callable returning the loss for ``(predictions, targets)``.
    accuracy_fn: callable ``accuracy_fn(y_target=..., y_pred=...)`` returning a
      per-batch accuracy.
    device: device the model and the batches are moved to.

  Returns:
    ``(test_loss, test_acc)``: the mean of the per-batch losses and the mean of
    the per-batch accuracies. Every batch counts equally, including a smaller
    final batch.
  """
  model.to(device)
  model.eval()

  n_batches=len(data_loader)
  running_loss,running_acc=0,0

  # Everything, including the averaging, stays inside inference mode.
  with torch.inference_mode():
    for inputs,targets in data_loader:
      inputs,targets=inputs.float().to(device),targets.to(device)

      logits=model(inputs)
      running_loss=running_loss+loss_fn(logits,targets)

      predicted=logits.softmax(dim=1).argmax(dim=1)
      running_acc=running_acc+accuracy_fn(y_target=targets,y_pred=predicted)

    test_loss=running_loss/n_batches
    test_acc=running_acc/n_batches

    print(f"Test loss: {test_loss:.5f} | Test accuracy: {test_acc:.2f}%\n")

    return test_loss,test_acc

In [None]:

## Training and testing loop for the full network (model_big)

import copy
from pathlib import Path

import torch
from tqdm.auto import tqdm

# Per-epoch metric history, stored as plain floats.
train_loss_list=[]
train_acc_list=[]
test_loss_list=[]
test_acc_list=[]

epochs=50 #epochs
best_acc=0
# NOTE: patience (70) is larger than epochs (50), so with these settings the
# revert branch below cannot trigger.
patience=70
max_patience_after_revert = 30
current_patience=0
patience_after_revert=0
reverted=False
best_model_state_dict=None

# Checkpoint location for the best model seen so far.
model_path=Path("models")
model_path.mkdir(parents=True,exist_ok=True)
model_name="best_model_big.pth" #modelname
model_save_path=model_path/model_name

start_timer=timer()
for epoch in tqdm(range(epochs)):
  print(f"Epoch:{epoch}")
  train_loss,train_acc=training(data_loader=train_dataloader,model=model_big,loss_fn=loss_fn,optimizer=optimizer1,accuracy_fn=accuracy_fn)
  test_loss,test_acc=testing(data_loader=test_dataloader,model=model_big,loss_fn=loss_fn,accuracy_fn=accuracy_fn)

  train_loss_list.append(float(train_loss))
  train_acc_list.append(float(train_acc))
  test_loss_list.append(float(test_loss))
  test_acc_list.append(float(test_acc))

  if test_acc>best_acc:
    best_acc=test_acc
    # Snapshot the model being trained. deepcopy is required because
    # state_dict() hands back the live tensors, which later training
    # steps would otherwise overwrite.
    best_model_state_dict=copy.deepcopy(model_big.state_dict())
    torch.save(best_model_state_dict,f=model_save_path)
    print(f"New best test acc is {test_acc}. Model Saved")
    current_patience=0 #reset patience
    patience_after_revert=0
    reverted=False
  else:
    current_patience+=1

    if current_patience>=patience and not reverted and best_model_state_dict is not None:
      print(f"No improvement for{patience} epochs. Reverting to the best model")
      model_big.load_state_dict(best_model_state_dict)
      patience_after_revert=0
      reverted=True

  if reverted:
    patience_after_revert+=1
    if patience_after_revert>=max_patience_after_revert:
      print(f"Continue for {max_patience_after_revert} epochs after revert")

      # Revert again only when accuracy is still below the best one.
      if test_acc<best_acc:
        print("Test loss still not improving reverting to best model")
        model_big.load_state_dict(best_model_state_dict)
      patience_after_revert=0


# Save the weights from the final epoch next to the best checkpoint.
model_name="last_model_big.pth" #modelname
model_save_path=model_path/model_name

torch.save(model_big.state_dict(),f=model_save_path)
print(f"Last model is saved with test acc:{test_acc}")

end_timer=timer()
print_train_time(start=start_timer,end=end_timer,device=device)


  0%|          | 0/50 [00:00<?, ?it/s]

Train Loss:1.382474|Train Acc:49.514466%
Test loss: 1.06938 | Test accuracy: 61.84%

Epoch:0
New best test acc is 61.843152866242036. Model Saved
Train Loss:0.958583|Train Acc:65.956682%
Test loss: 0.89568 | Test accuracy: 69.07%

Epoch:1
New best test acc is 69.06847133757962. Model Saved
Train Loss:0.748709|Train Acc:74.162804%
Test loss: 0.74090 | Test accuracy: 74.45%

Epoch:2
New best test acc is 74.45262738853503. Model Saved
Train Loss:0.640238|Train Acc:78.041081%
Test loss: 0.61733 | Test accuracy: 79.03%

Epoch:3
New best test acc is 79.03065286624204. Model Saved
Train Loss:0.568183|Train Acc:80.600623%
Test loss: 0.58644 | Test accuracy: 80.00%

Epoch:4
New best test acc is 79.99601910828025. Model Saved
Train Loss:0.514935|Train Acc:82.498801%
Test loss: 0.53230 | Test accuracy: 81.94%

Epoch:5
New best test acc is 81.93670382165605. Model Saved
Train Loss:0.475566|Train Acc:83.773577%
Test loss: 0.48668 | Test accuracy: 83.58%

Epoch:6
New best test acc is 83.578821656050

5028.270874894

In [None]:
#Loading trained model
def testing(data_loader:DataLoader,model:torch.nn.Module,loss_fn:torch.nn.Module,accuracy_fn,device:torch.device=device):
  """Evaluate ``model`` on ``data_loader`` and return the averaged metrics.

  NOTE(review): this cell defines ``testing`` again; a function with the same
  name is already defined in an earlier cell.

  Args:
    data_loader: iterable of ``(X, y)`` batches.
    model: network to evaluate; it is moved to ``device`` and put in eval mode.
    loss_fn: callable returning the loss for ``(predictions, targets)``.
    accuracy_fn: callable ``accuracy_fn(y_target=..., y_pred=...)`` returning a
      per-batch accuracy.
    device: device the model and the batches are moved to.

  Returns:
    ``(test_loss, test_acc)``: the mean of the per-batch losses and the mean of
    the per-batch accuracies. Every batch counts equally, including a smaller
    final batch.
  """
  model.to(device)
  model.eval()

  n_batches=len(data_loader)
  running_loss,running_acc=0,0

  # Everything, including the averaging, stays inside inference mode.
  with torch.inference_mode():
    for inputs,targets in data_loader:
      inputs,targets=inputs.float().to(device),targets.to(device)

      logits=model(inputs)
      running_loss=running_loss+loss_fn(logits,targets)

      predicted=logits.softmax(dim=1).argmax(dim=1)
      running_acc=running_acc+accuracy_fn(y_target=targets,y_pred=predicted)

    test_loss=running_loss/n_batches
    test_acc=running_acc/n_batches

    print(f"Test loss: {test_loss:.5f} | Test accuracy: {test_acc:.2f}%\n")

    return test_loss,test_acc

# Load a previously saved checkpoint of the full network and evaluate it.
# The optimizer and scheduler are re-created as well, for continuing training
# from these weights.
from pathlib import Path
model_path=Path("models")
model_path.mkdir(parents=True,exist_ok=True)
model_name="best_model_big_07052024.pth" #modelname
model_save_path=model_path/model_name

sigmoid=nn.Sigmoid()
loss_fn=nn.CrossEntropyLoss()#Loss Function

model_big=MobileNetV1()
# map_location=device keeps the load device-agnostic: a checkpoint written
# from a GPU run can also be loaded on a CPU-only runtime.
model_big.load_state_dict(torch.load(f=model_save_path,map_location=device))
optimizer1=torch.optim.Adam(params=model_big.parameters(),lr=0.0005,weight_decay=0.000) #Optimizer

lr_scheduler = CosineAnnealingLR(optimizer1, T_max=500)

# Time one full pass over the test set.
start_timer=timer()
test_loss,test_acc=testing(data_loader=test_dataloader,model=model_big,loss_fn=loss_fn,accuracy_fn=accuracy_fn)
end_timer=timer()

best_loss=test_loss
print_test_time(start=start_timer,end=end_timer,device=device)

Test loss: 0.30847 | Test accuracy: 89.73%

Test time on cuda: 9.679 seconds


9.67883318500003

In [None]:
import torch
import torch.nn as nn

def get_scaling_factors(network):
    """Collect the ``weight`` of every BatchNorm1d/BatchNorm2d layer in ``network``.

    Args:
        network: module whose sub-modules are scanned via ``network.modules()``.

    Returns:
        A list of the layers' ``weight`` parameters, in ``network.modules()`` order.
    """
    bn_types = (nn.BatchNorm2d, nn.BatchNorm1d)
    return [layer.weight for layer in network.modules() if isinstance(layer, bn_types)]
# BatchNorm weight (scale) tensors of the full network, in module order
scaling_factors=get_scaling_factors(model_big)
# Show the third BatchNorm layer's weights together with their largest and smallest entries
scaling_factors[2],max(scaling_factors[2]),min(scaling_factors[2])


## Pruned

The network is pruned: blocks `features10`–`features14` (5 blocks) are removed, and `features8` and `features9` are replaced with 2 smaller blocks (512→64 and 64→32 channels).

In [None]:
class MobileNetV1_prunedblock6(nn.Module):
    """Pruned MobileNet V1 variant.

    ``features1`` .. ``features7`` are declared exactly as in MobileNetV1. The
    tail is reduced to two narrow blocks, ``features8`` (512 -> 64) and
    ``features9`` (64 -> 32), followed by global average pooling and a
    32-input linear classifier. MobileNetV1's ``features10`` .. ``features14``
    have no counterpart here.

    Args:
        num_classes (int): number of output classes. Defaults to 10.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # Stem, identical to MobileNetV1 (conv -> ReLU -> BatchNorm).
        self.features1 = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1, bias = False),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(32),
        )

        # Depthwise separable blocks kept from the full network:
        # (in_channels, out_channels, stride).
        self.features2 = DepthWiseSeperable(32, 64, 1)
        self.features3 = DepthWiseSeperable(64, 128, 2)
        self.features4 = DepthWiseSeperable(128, 128, 1)
        self.features5 = DepthWiseSeperable(128, 256, 2)
        self.features6 = DepthWiseSeperable(256, 256, 1)
        self.features7 = DepthWiseSeperable(256, 512, 2)

        # Narrow replacement tail.
        self.features8 = DepthWiseSeperable(512, 64, 1)
        self.features9 = DepthWiseSeperable(64, 32, 1)

        # Global average pooling to 1x1, then the linear classifier.
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Sequential(
            nn.Linear(32, num_classes),
        )

    def forward(self, x):
        """Return the class logits for the image batch ``x``."""
        stages = (
            self.features1, self.features2, self.features3,
            self.features4, self.features5, self.features6,
            self.features7, self.features8, self.features9,
        )
        for stage in stages:
            x = stage(x)

        pooled = self.avgpool(x)
        # Flatten (batch, channels, 1, 1) to (batch, channels) for the linear layer.
        flat = pooled.view(pooled.size(0), -1)
        return self.classifier(flat)


# Create an instance of MobileNetV1
# NOTE(review): this rebinds the notebook-level name `model` to a fresh, untrained
# full MobileNetV1 (not the pruned class defined in this cell) — confirm it is still needed.
model = MobileNetV1()
# print(model)


def test_Mobilenet():
    """Print the output shape of a fresh MobileNetV1 for a random (1, 3, 224, 224) input.

    NOTE(review): this sits in the pruned-network cell but builds the full
    MobileNetV1, and it redefines a function of the same name from an earlier
    cell — confirm that is intended.

    Returns:
        The MobileNetV1 instance that was built.
    """
    sample = torch.randn(1,3,224,224)
    net = MobileNetV1()
    out_shape = net(sample).shape
    print(out_shape)

    return net


#model = test_Mobilenet()
#architecture = 'mobilenetv1'
#model_graph = draw_graph(model, input_size=(1,3,64,64), graph_dir ='TB' , roll=True, expand_nested=True, graph_name=f'self_{architecture}',save_graph=True,filename=f'self_{architecture}')
# model_graph.visual_graph

In [None]:
# Smoke-test the pruned network on the sample batch and count its parameters.
test=img
test_input=test
model_small=MobileNetV1_prunedblock6()
test_model=model_small(test_input)

total_params = sum(param.numel() for param in model_small.parameters())
print("Total number of parameters: ", total_params)

Total number of parameters:  310794


In [None]:
!pip install torchinfo
from torchinfo import summary


Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [None]:
# Layer-by-layer summary and parameter count of the pruned network for one 3x224x224 input
summary(model_small, input_size=(1, 3, 224, 224))

Layer (type:depth-idx)                   Output Shape              Param #
MobileNetV1_prunedblock6                 [1, 10]                   --
├─Sequential: 1-1                        [1, 32, 112, 112]         --
│    └─Conv2d: 2-1                       [1, 32, 112, 112]         864
│    └─ReLU: 2-2                         [1, 32, 112, 112]         --
│    └─BatchNorm2d: 2-3                  [1, 32, 112, 112]         64
├─DepthWiseSeperable: 1-2                [1, 64, 112, 112]         --
│    └─Conv2d: 2-4                       [1, 32, 112, 112]         288
│    └─BatchNorm2d: 2-5                  [1, 32, 112, 112]         64
│    └─ReLU: 2-6                         [1, 32, 112, 112]         --
│    └─Conv2d: 2-7                       [1, 64, 112, 112]         2,048
│    └─BatchNorm2d: 2-8                  [1, 64, 112, 112]         128
│    └─ReLU: 2-9                         [1, 64, 112, 112]         --
├─DepthWiseSeperable: 1-3                [1, 128, 56, 56]          --
│    └─Co

In [None]:
# Layer-by-layer summary and parameter count of the original (full) network for one 3x224x224 input
summary(model_big, input_size=(1, 3, 224, 224))

Layer (type:depth-idx)                   Output Shape              Param #
MobileNetV1                              [1, 10]                   --
├─Sequential: 1-1                        [1, 32, 112, 112]         --
│    └─Conv2d: 2-1                       [1, 32, 112, 112]         864
│    └─ReLU: 2-2                         [1, 32, 112, 112]         --
│    └─BatchNorm2d: 2-3                  [1, 32, 112, 112]         64
├─DepthWiseSeperable: 1-2                [1, 64, 112, 112]         --
│    └─Conv2d: 2-4                       [1, 32, 112, 112]         288
│    └─BatchNorm2d: 2-5                  [1, 32, 112, 112]         64
│    └─ReLU: 2-6                         [1, 32, 112, 112]         --
│    └─Conv2d: 2-7                       [1, 64, 112, 112]         2,048
│    └─BatchNorm2d: 2-8                  [1, 64, 112, 112]         128
│    └─ReLU: 2-9                         [1, 64, 112, 112]         --
├─DepthWiseSeperable: 1-3                [1, 128, 56, 56]          --
│    └─Co

The pruned network has only ~310k parameters, while the original network has ~3.2M parameters. The step below copies the weights of blocks `features1`–`features7` from the original network into the pruned network.

In [None]:

#Copying weights
model_small=MobileNetV1_prunedblock6()

# features1..features7 are declared identically in both networks, so the trained
# weights of the full network are loaded into the pruned one block by block.
shared_blocks=[f"features{idx}" for idx in range(1,8)]
for block_name in shared_blocks:
    source_block=getattr(model_big,block_name)
    target_block=getattr(model_small,block_name)
    target_block.load_state_dict(source_block.state_dict())

# Keep the first two copied blocks trainable. Set this to False to freeze them instead.
for block_name in ("features1","features2"):
    for param in getattr(model_small,block_name).parameters():
        param.requires_grad = True


loss_fn=nn.CrossEntropyLoss()#Loss Function

optimizer1=torch.optim.Adam(params=model_small.parameters(),lr=0.0005,weight_decay=0.001) #Optimizer

lr_scheduler = CosineAnnealingLR(optimizer1, T_max=500)

In [None]:
# Checking if the parameter copy is successful: features2 weights of the original network (source)
model_big.features2.state_dict()

OrderedDict([('depthwise.weight',
              tensor([[[[ 8.3048e-02, -2.4413e-01, -1.6181e-01],
                        [ 4.6391e-02, -5.8634e-03,  4.4460e-02],
                        [-1.7339e-02,  2.0203e-01,  2.4437e-01]]],
              
              
                      [[[ 9.2187e-02,  1.4106e-01, -1.6692e-02],
                        [ 1.3193e-01,  1.8768e-01, -9.0031e-02],
                        [-5.9208e-02, -5.4091e-02, -1.5911e-01]]],
              
              
                      [[[-5.5917e-02,  1.4010e-01,  1.7752e-01],
                        [ 4.4780e-02, -9.5942e-02, -1.0269e-01],
                        [-4.9953e-02, -1.4805e-01, -8.2106e-02]]],
              
              
                      [[[-6.2818e-02, -2.1594e-02,  8.0734e-02],
                        [-1.2458e-01,  6.2378e-02,  9.6277e-02],
                        [-1.1928e-01,  7.0790e-02,  5.8373e-02]]],
              
              
                      [[[ 8.7017e-02,  6.1055e-02,  1.2944

In [None]:
# Checking if the parameter copy is successful: features2 weights of the pruned network (destination)
model_small.features2.state_dict()

OrderedDict([('depthwise.weight',
              tensor([[[[ 8.3048e-02, -2.4413e-01, -1.6181e-01],
                        [ 4.6391e-02, -5.8634e-03,  4.4460e-02],
                        [-1.7339e-02,  2.0203e-01,  2.4437e-01]]],
              
              
                      [[[ 9.2187e-02,  1.4106e-01, -1.6692e-02],
                        [ 1.3193e-01,  1.8768e-01, -9.0031e-02],
                        [-5.9208e-02, -5.4091e-02, -1.5911e-01]]],
              
              
                      [[[-5.5917e-02,  1.4010e-01,  1.7752e-01],
                        [ 4.4780e-02, -9.5942e-02, -1.0269e-01],
                        [-4.9953e-02, -1.4805e-01, -8.2106e-02]]],
              
              
                      [[[-6.2818e-02, -2.1594e-02,  8.0734e-02],
                        [-1.2458e-01,  6.2378e-02,  9.6277e-02],
                        [-1.1928e-01,  7.0790e-02,  5.8373e-02]]],
              
              
                      [[[ 8.7017e-02,  6.1055e-02,  1.2944

In [None]:
## Training and testing loop for the pruned network (model_small)

import copy
from pathlib import Path

import torch
from tqdm.auto import tqdm

lr_scheduler = CosineAnnealingLR(optimizer1, T_max=500)

# Per-epoch metric history, stored as plain floats.
train_loss_list=[]
train_acc_list=[]
test_loss_list=[]
test_acc_list=[]

epochs=50 #epochs
best_acc=0
# NOTE: patience (70) is larger than epochs (50), so with these settings the
# revert branch below cannot trigger.
patience=70
max_patience_after_revert = 30
current_patience=0
patience_after_revert=0
reverted=False
best_model_state_dict=None

# Checkpoint location for the best model seen so far.
model_path=Path("models")
model_path.mkdir(parents=True,exist_ok=True)
model_name="best_model_small.pth" #modelname
model_save_path=model_path/model_name

start_timer=timer()
for epoch in tqdm(range(epochs)):
  train_loss,train_acc=training(data_loader=train_dataloader,model=model_small,loss_fn=loss_fn,optimizer=optimizer1,accuracy_fn=accuracy_fn)
  test_loss,test_acc=testing(data_loader=test_dataloader,model=model_small,loss_fn=loss_fn,accuracy_fn=accuracy_fn)
  print(f"Epoch:{epoch}")

  train_loss_list.append(float(train_loss))
  train_acc_list.append(float(train_acc))
  test_loss_list.append(float(test_loss))
  test_acc_list.append(float(test_acc))

  if test_acc>best_acc:
    best_acc=test_acc
    # Snapshot the model being trained. deepcopy is required because
    # state_dict() hands back the live tensors, which later training
    # steps would otherwise overwrite.
    best_model_state_dict=copy.deepcopy(model_small.state_dict())
    torch.save(best_model_state_dict,f=model_save_path)
    print(f"New best test acc is {test_acc}. Model Saved")
    current_patience=0 #reset patience
    patience_after_revert=0
    reverted=False
  else:
    current_patience+=1

    if current_patience>=patience and not reverted and best_model_state_dict is not None:
      print(f"No improvement for{patience} epochs. Reverting to the best model")
      model_small.load_state_dict(best_model_state_dict)
      patience_after_revert=0
      reverted=True

  if reverted:
    patience_after_revert+=1
    if patience_after_revert>=max_patience_after_revert:
      print(f"Continue for {max_patience_after_revert} epochs after revert")

      # Revert again only when accuracy is still below the best one.
      if test_acc<best_acc:
        print("Test loss still not improving reverting to best model")
        model_small.load_state_dict(best_model_state_dict)
      patience_after_revert=0


# Save the weights from the final epoch next to the best checkpoint.
model_name="last_model_small.pth" #modelname
model_save_path=model_path/model_name

torch.save(model_small.state_dict(),f=model_save_path)
print(f"Last model is saved with test acc:{test_acc}")

end_timer=timer()
print_train_time(start=start_timer,end=end_timer,device=device)


  0%|          | 0/50 [00:00<?, ?it/s]

Train Loss:1.283506|Train Acc:59.528852%
Test loss: 0.90480 | Test accuracy: 70.73%

Epoch:0
New best test loss is 70.73049363057325. Model Saved
Train Loss:0.772497|Train Acc:74.984015%
Test loss: 0.72587 | Test accuracy: 75.43%

Epoch:1
New best test loss is 75.42794585987261. Model Saved
Train Loss:0.617066|Train Acc:79.729460%
Test loss: 0.60499 | Test accuracy: 79.85%

Epoch:2
New best test loss is 79.8467356687898. Model Saved
Train Loss:0.535325|Train Acc:82.348945%
Test loss: 0.56171 | Test accuracy: 80.49%

Epoch:3
New best test loss is 80.49363057324841. Model Saved
Train Loss:0.482734|Train Acc:84.089274%
Test loss: 0.51002 | Test accuracy: 82.38%

Epoch:4
New best test loss is 82.38455414012739. Model Saved
Train Loss:0.455817|Train Acc:84.730659%
Test loss: 0.48121 | Test accuracy: 83.57%

Epoch:5
New best test loss is 83.56886942675159. Model Saved
Train Loss:0.425882|Train Acc:85.901535%
Test loss: 0.46729 | Test accuracy: 84.41%

Epoch:6
New best test loss is 84.4148089

4201.673362150001

In [None]:
def testing(data_loader:DataLoader,model:torch.nn.Module,loss_fn:torch.nn.Module,accuracy_fn,device:torch.device=device):
  """Evaluate ``model`` on ``data_loader`` and return the averaged metrics.

  NOTE(review): this cell defines ``testing`` again; a function with the same
  name is already defined in earlier cells.

  Args:
    data_loader: iterable of ``(X, y)`` batches.
    model: network to evaluate; it is moved to ``device`` and put in eval mode.
    loss_fn: callable returning the loss for ``(predictions, targets)``.
    accuracy_fn: callable ``accuracy_fn(y_target=..., y_pred=...)`` returning a
      per-batch accuracy.
    device: device the model and the batches are moved to.

  Returns:
    ``(test_loss, test_acc)``: the mean of the per-batch losses and the mean of
    the per-batch accuracies. Every batch counts equally, including a smaller
    final batch.
  """
  model.to(device)
  model.eval()

  n_batches=len(data_loader)
  running_loss,running_acc=0,0

  # Everything, including the averaging, stays inside inference mode.
  with torch.inference_mode():
    for inputs,targets in data_loader:
      inputs,targets=inputs.float().to(device),targets.to(device)

      logits=model(inputs)
      running_loss=running_loss+loss_fn(logits,targets)

      predicted=logits.softmax(dim=1).argmax(dim=1)
      running_acc=running_acc+accuracy_fn(y_target=targets,y_pred=predicted)

    test_loss=running_loss/n_batches
    test_acc=running_acc/n_batches

    print(f"Test loss: {test_loss:.5f} | Test accuracy: {test_acc:.2f}%\n")

    return test_loss,test_acc

# Load a previously saved checkpoint of the pruned network and evaluate it.
# The optimizer and scheduler are re-created as well, for continuing training
# from these weights.
from pathlib import Path
model_path=Path("models")
model_path.mkdir(parents=True,exist_ok=True)
model_name="best_model_small_07052024.pth" #modelname
model_save_path=model_path/model_name

sigmoid=nn.Sigmoid()
loss_fn=nn.CrossEntropyLoss()#Loss Function

model_small=MobileNetV1_prunedblock6()
# map_location=device keeps the load device-agnostic: a checkpoint written
# from a GPU run can also be loaded on a CPU-only runtime.
model_small.load_state_dict(torch.load(f=model_save_path,map_location=device))
optimizer1=torch.optim.Adam(params=model_small.parameters(),lr=0.001,weight_decay=0.000) #Optimizer

lr_scheduler = CosineAnnealingLR(optimizer1, T_max=500)

# Time one full pass over the test set.
start_timer=timer()
test_loss,test_acc=testing(data_loader=test_dataloader,model=model_small,loss_fn=loss_fn,accuracy_fn=accuracy_fn)
end_timer=timer()

best_loss=test_loss
print_test_time(start=start_timer,end=end_timer,device=device)

Test loss: 0.33259 | Test accuracy: 88.80%

Test time on cuda: 9.756 seconds


9.756414778000021

## Train small network from random initialization

Here the pruned network is trained without copying any weights from the original network; all of its parameters start from random initialization.

In [None]:
# Build a fresh, randomly initialized pruned network, smoke-test it on the
# sample batch and count its parameters.
test=img
test_input=test
model_small_random=MobileNetV1_prunedblock6()
test_model=model_small_random(test_input)

total_params = sum(param.numel() for param in model_small_random.parameters())
print("Total number of parameters: ", total_params)

# Loss, optimizer and learning-rate schedule for training this network.
loss_fn = nn.CrossEntropyLoss()
optimizer1 = torch.optim.Adam(
    params=model_small_random.parameters(),
    lr=0.0005,
    weight_decay=0.001,
)
lr_scheduler = CosineAnnealingLR(optimizer1, T_max=500)

Total number of parameters:  310794


In [None]:
# Layer-by-layer summary and parameter count of the randomly initialized pruned network
summary(model_small_random, input_size=(1, 3, 224, 224))

In [None]:
## Training and testing loop for the randomly initialized pruned network (model_small_random)

import copy
from pathlib import Path

import torch
from tqdm.auto import tqdm

lr_scheduler = CosineAnnealingLR(optimizer1, T_max=500)

# Per-epoch metric history, stored as plain floats.
train_loss_list=[]
train_acc_list=[]
test_loss_list=[]
test_acc_list=[]

epochs=50 #epochs
best_acc=0
# NOTE: patience (70) is larger than epochs (50), so with these settings the
# revert branch below cannot trigger.
patience=70
max_patience_after_revert = 30
current_patience=0
patience_after_revert=0
reverted=False
best_model_state_dict=None

# Checkpoint location for the best model seen so far.
model_path=Path("models")
model_path.mkdir(parents=True,exist_ok=True)
model_name="best_model_small_random.pth" #modelname
model_save_path=model_path/model_name

start_timer=timer()
for epoch in tqdm(range(epochs)):
  train_loss,train_acc=training(data_loader=train_dataloader,model=model_small_random,loss_fn=loss_fn,optimizer=optimizer1,accuracy_fn=accuracy_fn)
  test_loss,test_acc=testing(data_loader=test_dataloader,model=model_small_random,loss_fn=loss_fn,accuracy_fn=accuracy_fn)
  print(f"Epoch:{epoch}")

  train_loss_list.append(float(train_loss))
  train_acc_list.append(float(train_acc))
  test_loss_list.append(float(test_loss))
  test_acc_list.append(float(test_acc))

  if test_acc>best_acc:
    best_acc=test_acc
    # Snapshot and save the model that is actually being trained in this
    # cell (model_small_random). deepcopy is required because state_dict()
    # hands back the live tensors, which later training steps would
    # otherwise overwrite.
    best_model_state_dict=copy.deepcopy(model_small_random.state_dict())
    torch.save(best_model_state_dict,f=model_save_path)
    print(f"New best test acc is {test_acc}. Model Saved")
    current_patience=0 #reset patience
    patience_after_revert=0
    reverted=False
  else:
    current_patience+=1

    if current_patience>=patience and not reverted and best_model_state_dict is not None:
      print(f"No improvement for{patience} epochs. Reverting to the best model")
      model_small_random.load_state_dict(best_model_state_dict)
      patience_after_revert=0
      reverted=True

  if reverted:
    patience_after_revert+=1
    if patience_after_revert>=max_patience_after_revert:
      print(f"Continue for {max_patience_after_revert} epochs after revert")

      # Revert again only when accuracy is still below the best one.
      if test_acc<best_acc:
        print("Test loss still not improving reverting to best model")
        model_small_random.load_state_dict(best_model_state_dict)
      patience_after_revert=0


# Save the weights from the final epoch next to the best checkpoint.
model_name="last_model_small_random.pth" #modelname
model_save_path=model_path/model_name

torch.save(model_small_random.state_dict(),f=model_save_path)
print(f"Last model is saved with test acc:{test_acc}")

end_timer=timer()
print_train_time(start=start_timer,end=end_timer,device=device)


  0%|          | 0/50 [00:00<?, ?it/s]

Train Loss:1.559261|Train Acc:46.635230%
Test loss: 1.21557 | Test accuracy: 56.86%

Epoch:0
New best test loss is 56.857085987261144. Model Saved
Train Loss:1.027460|Train Acc:64.663923%
Test loss: 0.91895 | Test accuracy: 68.62%

Epoch:1
New best test loss is 68.62062101910828. Model Saved
Train Loss:0.828577|Train Acc:71.795077%
Test loss: 0.81014 | Test accuracy: 71.87%

Epoch:2
New best test loss is 71.86504777070064. Model Saved
Train Loss:0.702913|Train Acc:76.276774%
Test loss: 0.70300 | Test accuracy: 76.21%

Epoch:3
New best test loss is 76.21417197452229. Model Saved
Train Loss:0.622189|Train Acc:78.990169%
Test loss: 0.60140 | Test accuracy: 80.16%

Epoch:4
New best test loss is 80.15525477707007. Model Saved
Train Loss:0.562367|Train Acc:81.066176%
Test loss: 0.55870 | Test accuracy: 80.62%

Epoch:5
New best test loss is 80.62300955414013. Model Saved
Train Loss:0.516452|Train Acc:82.606698%
Test loss: 0.52745 | Test accuracy: 81.71%

Epoch:6
New best test loss is 81.70780

4225.827475759999