# Practice: Model compression in Deep Learning

## Typical process of the low-rank compression
#### Normal Training ⇒ Rank Selection *(excluded from this practice)* ⇒ Low-rank compression ⇒ Fine-tuning

## Question
1. 코드에서 빈 부분을 채우세요.
2. 3가지의 rank setting에 대해서 성능 비교를 수행하세요.
 - R=[20, 100, 200, 8]
 - R=[15, 50, 100, 6]
 - R=[10, 10, 50, 3]
3. 3개의 compressed model에 대해 Fine-tuning을 수행한 뒤 성능 비교를 수행하세요.

## Library import

In [1]:
import torch.nn as nn
import torch
from torch.nn.init import xavier_uniform_
from collections import OrderedDict
from torch.utils.data import TensorDataset, DataLoader
from torchvision import datasets
from torch import optim
import torchvision
import time
import torch
from torch import nn
from collections import OrderedDict
from scipy.linalg import svd
import numpy as np
import copy

## Define a model

In [2]:
def _weights_init(m):
    classname = m.__class__.__name__
    if isinstance(m, nn.Linear):
        xavier_uniform_(m.weight)
        m.bias.data.fill_(0.0)

class LambdaLayer(nn.Module):
    def __init__(self, lambd):
        super(LambdaLayer, self).__init__()
        self.lambd = lambd

    def forward(self, x):
        return self.lambd(x)


class LeNet5(nn.Module):
    def __init__(self, dropout, nonlinearity):
        super(LeNet5, self).__init__()
        self.special = True
        filters = [(20, 5), (50, 5)]
        layers = [(800, 500), (500, 10)]

        cfg = []
        cfg.append(['init_reshape', LambdaLayer(lambda x: x.view(x.size(0), 1,28,28))])
        for i, f in enumerate(filters):
            prev = 1 if i==0 else filters[i-1][0]
            cfg.append(('compressible_' + str(i), nn.Conv2d(prev, f[0], f[1])))
            cfg.append(('nonlineairy_'+str(i), nonlinearity()))
            cfg.append(('maxpool_'+str(i), nn.MaxPool2d(kernel_size=(2,2), stride=2)))


        cfg.append(['reshape', LambdaLayer(lambda x: x.view(x.size(0),-1))])
        for i, l in enumerate(layers):
            cfg.append(('compressible_' + str(i+len(filters)), nn.Linear(*l)))
            if i != len(layers)-1:
                # only non terminal layers have nonlinearity and (possible) dropouts
                cfg.append(('nonlinearity_' + str(i+len(filters)), nonlinearity()))
                if dropout:
                    cfg.append(('drop_'+str(i+len(filters)), nn.Dropout()))

        self.output = nn.Sequential(OrderedDict(cfg))
        self.apply(_weights_init)
    def forward(self, input):
        h = self.output(input)
        return h

def lenet5_classic():
    return LeNet5(dropout=False, nonlinearity=lambda: nn.ReLU(True))

## Define datasets

In [3]:
dataset = 'MNIST'
batch_size = 256
n_workers = 2

def mnist_data():
    mnist_train = torchvision.datasets.MNIST(root='./datasets/', train=True, download=True)
    mnist_test = torchvision.datasets.MNIST(root='./datasets/', train=False, download=True)

    train_data = mnist_train.data.to(torch.float) / 255.
    test_data = mnist_test.data.to(torch.float) / 255.
    mean_image = torch.mean(train_data, dim=0)

    train_data -= mean_image
    test_data -= mean_image

    train_labels = mnist_train.targets
    test_labels = mnist_test.targets

    our_mnist = {
        'train_data': train_data, 'test_data': test_data,
        'train_labels': train_labels, 'test_labels': test_labels
    }
    return our_mnist

data = mnist_data()
train_data = TensorDataset(data['train_data'], data['train_labels'])
test_data = TensorDataset(data['test_data'], data['test_labels'])

train_loader = DataLoader(train_data, num_workers=n_workers, batch_size=batch_size, shuffle=True, pin_memory=False)
test_loader = DataLoader(test_data, num_workers=n_workers, batch_size=batch_size, shuffle=False, pin_memory=False)


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./datasets/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 33967250.86it/s]


Extracting ./datasets/MNIST/raw/train-images-idx3-ubyte.gz to ./datasets/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./datasets/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 44650089.87it/s]


Extracting ./datasets/MNIST/raw/train-labels-idx1-ubyte.gz to ./datasets/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./datasets/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 31328782.51it/s]


Extracting ./datasets/MNIST/raw/t10k-images-idx3-ubyte.gz to ./datasets/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./datasets/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 7556734.93it/s]


Extracting ./datasets/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./datasets/MNIST/raw



## Normal Training

In [22]:
# Prepare a model
model = lenet5_classic()
model.cuda()
print(model)

# Hyper-parameters for training
lr = 0.1
lr_decay = 0.99
momentum = 0.9
epochs = 100
start_epoch = 0
print_freq = 20
checkpoint = 20

# Define an optimizer and a scheduler
optimizer = torch.optim.SGD(model.parameters(), lr, momentum=momentum, nesterov=True)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=lr_decay, last_epoch=start_epoch - 1)

# Define the funcions required to training
def my_eval(x, target, model):
    out_ = model.forward(x)
    return out_, torch.nn.functional.cross_entropy(out_, target)

def format_time(seconds):
    if seconds < 60:
        return '{:.1f}s.'.format(seconds)
    if seconds < 3600:
        return '{:d}m. {}'.format(int(seconds//60), format_time(seconds%60))
    if seconds < 3600*24:
        return '{:d}h. {}'.format(int(seconds//3600), format_time(seconds%3600))
    return '{:d}d. {}'.format(int(seconds//(3600*24)), format_time(seconds%(3600*24)))

def compute_acc_loss(forward_func, data_loader, model):
    correct_cnt, ave_loss = 0, 0
    for batch_idx, (x, target) in enumerate(data_loader):
        with torch.no_grad():
            target = target.cuda()
            score, loss = forward_func(x.cuda(), target, model)
            _, pred_label = torch.max(score.data, 1)
            correct_cnt += (pred_label == target.data).sum().item()
            ave_loss += loss.data.item() * len(x)
    accuracy = correct_cnt * 1.0 / len(data_loader.dataset)
    print(correct_cnt, len(data_loader.dataset))
    ave_loss /= len(data_loader.dataset)
    return accuracy, ave_loss

class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.avg = 0.0
        self.sum = 0.0
        self.count = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

# Training
training_time = 0
epoch = 0
all_start_time = time.time()
epoch_time = AverageMeter()

for epoch in range(start_epoch, epochs):
    start_time = time.time()
    model.train()
    for batch_idx, (x, target) in enumerate(train_loader):
        optimizer.zero_grad()
        x, target = x.cuda(), target.cuda()
        out = model.forward(x)
        loss = torch.nn.functional.cross_entropy(out, target)
        loss.backward()
        optimizer.step()
        break
    end_time = time.time()
    epoch_time.update(end_time - start_time)
    training_time = end_time - all_start_time
    model.eval()
    print('Epoch {0} finished in {et.val:.3f}s (avg.: {et.avg:.3f}s). Training for {1}'.format(epoch, format_time(training_time), et=epoch_time))
    print('\tLR: {:.4}'.format(scheduler.get_last_lr()[0]))
    if (epoch+1) % print_freq == 0:
        accuracy, ave_loss = compute_acc_loss(my_eval, train_loader, model)
        print('\ttrain loss: {:.6f}, accuracy: {:.4f}'.format(ave_loss, accuracy))
        accuracy, ave_loss = compute_acc_loss(my_eval, test_loader, model)
        print('\ttest  loss: {:.6f}, accuracy: {:.4f}'.format(ave_loss, accuracy))
    scheduler.step()

    if checkpoint and (epoch+1) % checkpoint == 0:
        # create and save checkpoint here
        to_save = {}
        to_save['model_state'] = model.state_dict()
        to_save['optimizer_state'] = optimizer.state_dict()
        to_save['lr'] = scheduler.get_last_lr()
        to_save['epoch'] = epoch + 1
        torch.save(to_save, './lenet5_checkpoint.pth.tar')

LeNet5(
  (output): Sequential(
    (init_reshape): LambdaLayer()
    (compressible_0): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
    (nonlineairy_0): ReLU(inplace=True)
    (maxpool_0): MaxPool2d(kernel_size=(2, 2), stride=2, padding=0, dilation=1, ceil_mode=False)
    (compressible_1): Conv2d(20, 50, kernel_size=(5, 5), stride=(1, 1))
    (nonlineairy_1): ReLU(inplace=True)
    (maxpool_1): MaxPool2d(kernel_size=(2, 2), stride=2, padding=0, dilation=1, ceil_mode=False)
    (reshape): LambdaLayer()
    (compressible_2): Linear(in_features=800, out_features=500, bias=True)
    (nonlinearity_2): ReLU(inplace=True)
    (compressible_3): Linear(in_features=500, out_features=10, bias=True)
  )
)
Epoch 0 finished in 0.085s (avg.: 0.085s). Training for 0.1s.
	LR: 0.1
Epoch 1 finished in 0.079s (avg.: 0.082s). Training for 0.2s.
	LR: 0.099
Epoch 2 finished in 0.088s (avg.: 0.084s). Training for 0.3s.
	LR: 0.09801
Epoch 3 finished in 0.086s (avg.: 0.085s). Training for 0.3s.
	LR: 0.0970

## Rank Selection

In [5]:
selected_rank1=[10, 10, 15, 7]
selected_rank2=[5, 5, 7, 5]
selected_rank3=[3, 3, 5, 3]

## Define the functions for low-rank compression

In [17]:
def linear_layer_reparametrizer(sub_module, conv_scheme='scheme_1'):
    W = sub_module.weight.data.cpu().numpy()

    init_shape = None
    n,m,d1,d2 = None, None, None, None
    if isinstance(sub_module, nn.Conv2d):
        if conv_scheme == 'scheme_1':
            ####################
            # implement here
            init_shape = W.shape
            reshaped = W.reshape([init_shape[0], -1])
            W = reshaped
            ####################
        elif conv_scheme == 'scheme_2':
            raise NotImplementedError("We did not implement scheme-2 in this pratice.")

    u, s, v = svd(W, full_matrices=False)
    from numpy.linalg import matrix_rank

    r = sub_module.rank_ if hasattr(sub_module, 'rank_') else sub_module.selected_rank_ if hasattr(sub_module, 'selected_rank_') else int(matrix_rank(W))

    if r < np.min(W.shape):
        diag = np.diag(s[:r] ** 0.5)
        U = u[:, :r] @ diag
        V = diag @ v[:r, :]
        new_W = U @ V


        from numpy.linalg import norm
        m,n = W.shape
        if r > np.floor(m*n/(m+n)):
            raise RankNotEfficientException("Selected rank doesn't contribute to any savings")
        bias = sub_module.bias is not None
        if isinstance(sub_module, nn.Linear): # (out_c, in_c) : matrix / (out_c, r) & (r, in_c)
            ####################
            # implement here
            #l1 = nn.Linear(in_features = sub_module.in_features, out_feature = r, bias = False) # (r, in_c)
            #l2 = nn.Linear(in_features = r, out_feature = sub_module.out_feature, bias = bias) # (out_c, r)

            l1 = nn.Linear(in_features=sub_module.in_features, out_features=r, bias=False)
            l2 = nn.Linear(in_features=r, out_features=sub_module.out_features, bias=bias)
            ####################
            l1.weight.data = torch.from_numpy(V)
            l2.weight.data = torch.from_numpy(U)
            if bias:
                l2.bias.data = sub_module.bias.data
            return l1, l2
        else:
            if conv_scheme == 'scheme_1':
                ####################
                # implement here # (out_c, in_c x k_size x k_size) --> (out_c, r) & (r, in_c x k_size x k_size)
                # l1 = nn.Conv2d(in_channels = sub_module.in_channels, # --> (out_c, r x 1 x 1)
                #                out_channels = 4,
                #                kernel_size = sub_module.kernel_size,
                #                stride = sub_module.stide,
                #                padding = sub_module.padding,
                #                dilation = sub_module.dilation,
                #                groups = sub_module.groups,
                #                bias = False) # (r, in_c x k_size x k_size)
                # l2 = nn.Conv2d(in_channels = r, out_channels = sub_module.out_channels, kernel_size = 1, bias = bias)

                l1 = nn.Conv2d(in_channels=sub_module.in_channels,
                               out_channels=r,
                               kernel_size=sub_module.kernel_size,
                               stride=sub_module.stride,
                               padding=sub_module.padding,
                               dilation=sub_module.dilation,
                               groups=sub_module.groups,
                               bias=False)

                l2 = nn.Conv2d(in_channels=r, out_channels=sub_module.out_channels,
                               kernel_size=1,
                               bias=bias)
                ####################
                l1.weight.data = torch.from_numpy(V.reshape([-1, *init_shape[1:]]))
                l2.weight.data = torch.from_numpy(U[:, :, None, None])

                if bias:
                    l2.bias.data = sub_module.bias.data

                return l1, l2
            elif conv_scheme == 'scheme_2':
                raise NotImplementedError("We did not implement scheme-2 in this pratice.")


def reparametrization_helper(list_of_modules, conv_scheme, old_weight_decay=True):
    new_sequence = []
    items = list_of_modules.items()
    decayed_values_repar = []
    decayed_valued_old = []
    for i, (name, sub_module) in enumerate(items):
        if isinstance(sub_module, nn.Sequential):
            dv_repar_sub, dv_old_sub, nseq_sub = reparametrization_helper(sub_module._modules, conv_scheme=conv_scheme,old_weight_decay=old_weight_decay)
            new_sequence.append((name, nn.Sequential(OrderedDict(nseq_sub))))
            decayed_values_repar.extend(dv_repar_sub)
            decayed_valued_old.extend(dv_old_sub)
        elif isinstance(sub_module, nn.Linear) or isinstance(sub_module, nn.Conv2d):
            try:
                l1, l2 = linear_layer_reparametrizer(sub_module, conv_scheme=conv_scheme)
                new_sequence.append((name + '_V', l1))
                new_sequence.append((name + '_U', l2))
                decayed_values_repar.append((l1, l2))

            except Exception as e:
                new_sequence.append((name, sub_module))
                decayed_valued_old.append(sub_module.weight)
        else:
            new_sequence.append((name, sub_module))
            if old_weight_decay and hasattr(sub_module, 'weight'):
                decayed_valued_old.append(sub_module.weight)
    return decayed_values_repar, decayed_valued_old, new_sequence


def reparametrize_low_rank(model, old_weight_decay=True):
    decayed_values_repar, decayed_valued_old, new_sequence = reparametrization_helper(model.output._modules, conv_scheme='scheme_1', old_weight_decay=old_weight_decay)
    model.output = nn.Sequential(OrderedDict(new_sequence))

    def weight_decay():
        sum_ = torch.autograd.Variable(torch.FloatTensor([0.0]).cuda())
        for x in decayed_valued_old:
            sum_ += torch.sum(x**2)
        for v,u in decayed_values_repar:
            v = v.weight
            u = u.weight
            u_ = u.view(u.size()[0], -1)
            v_ = v.view(u_.size()[1], -1)
            sum_ += torch.sum(torch.matmul(u_,v_)**2)
        return sum_
    model.weight_decay = weight_decay
    return nn.Sequential(OrderedDict(new_sequence))

## Compress the model(i.e., compressed_model1) using first ranks

In [18]:
compressed_model1 = copy.deepcopy(model)
for i, module in enumerate([x for x in compressed_model1.modules() if isinstance(x, nn.Conv2d) or isinstance(x, nn.Linear)]):
      module.selected_rank_ = selected_rank1[i]
      print(module.selected_rank_)
reparametrize_low_rank(compressed_model1)
compressed_model1.cuda()
compressed_model1.eval()
accuracy, ave_loss = compute_acc_loss(my_eval, test_loader, compressed_model1)
print('\ttest  loss: {:.6f}, accuracy: {:.4f}'.format(ave_loss, accuracy))

10
10
15
7
9204 10000
	test  loss: 0.261443, accuracy: 0.9204


## Compress the model(i.e., compressed_model1) using second ranks

In [19]:
compressed_model2 = copy.deepcopy(model)
for i, module in enumerate([x for x in compressed_model2.modules() if isinstance(x, nn.Conv2d) or isinstance(x, nn.Linear)]):
      module.selected_rank_ = selected_rank2[i]
      print(module.selected_rank_)
reparametrize_low_rank(compressed_model2)
compressed_model2.cuda()
compressed_model2.eval()
accuracy, ave_loss = compute_acc_loss(my_eval, test_loader, compressed_model2)
print('\ttest  loss: {:.6f}, accuracy: {:.4f}'.format(ave_loss, accuracy))

5
5
7
5
7363 10000
	test  loss: 0.765986, accuracy: 0.7363


## Compress the model(i.e., compressed_model1) using third ranks

In [20]:
compressed_model3 = copy.deepcopy(model)
for i, module in enumerate([x for x in compressed_model3.modules() if isinstance(x, nn.Conv2d) or isinstance(x, nn.Linear)]):
      module.selected_rank_ = selected_rank3[i]
      print(module.selected_rank_)
reparametrize_low_rank(compressed_model3)
compressed_model3.cuda()
compressed_model3.eval()
accuracy, ave_loss = compute_acc_loss(my_eval, test_loader, compressed_model3)
print('\ttest  loss: {:.6f}, accuracy: {:.4f}'.format(ave_loss, accuracy))

3
3
5
3
3167 10000
	test  loss: 1.675865, accuracy: 0.3167


## Fine-tune the compressed_model1

In [21]:
# fine-tuning

batch_size = 256
lr = 0.02
lr_decay = 0.99
momentum = 0.9
epochs = 100
dataset = 'MNIST'
n_workers = 2
start_epoch = 0
print_freq = 20
checkpoint = 20


training_time = 0
epoch = 0
all_start_time = time.time()
epoch_time = AverageMeter()

optimizer = torch.optim.SGD(compressed_model1.parameters(), lr, momentum=momentum, nesterov=True)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=lr_decay, last_epoch=start_epoch - 1)

for epoch in range(start_epoch, epochs):
    start_time = time.time()
    compressed_model1.train()
    for batch_idx, (x, target) in enumerate(train_loader):
        optimizer.zero_grad()
        x, target = x.cuda(), target.cuda()
        out = compressed_model1.forward(x)
        loss = torch.nn.functional.cross_entropy(out, target)
        loss.backward()
        optimizer.step()
        break
    end_time = time.time()
    epoch_time.update(end_time - start_time)
    training_time = end_time - all_start_time
    compressed_model1.eval()
    print('Epoch {0} finished in {et.val:.3f}s (avg.: {et.avg:.3f}s). Training for {1}'.format(epoch, format_time(training_time), et=epoch_time))
    print('\tLR: {:.4}'.format(scheduler.get_last_lr()[0]))
    if (epoch+1) % print_freq == 0:
        accuracy, ave_loss = compute_acc_loss(my_eval, train_loader, compressed_model1)
        print('\ttrain loss: {:.6f}, accuracy: {:.4f}'.format(ave_loss, accuracy))
        accuracy, ave_loss = compute_acc_loss(my_eval, test_loader, compressed_model1)
        print('\ttest  loss: {:.6f}, accuracy: {:.4f}'.format(ave_loss, accuracy))
    scheduler.step()

    if checkpoint and (epoch+1) % checkpoint == 0:
        # create and save checkpoint here
        to_save = {}
        to_save['model_state'] = compressed_model1.state_dict()
        to_save['optimizer_state'] = optimizer.state_dict()
        to_save['lr'] = scheduler.get_last_lr()
        to_save['epoch'] = epoch + 1
        torch.save(to_save, './compressed_lenet5_v1_checkpoint.pth.tar')

Epoch 0 finished in 0.105s (avg.: 0.105s). Training for 0.1s.
	LR: 0.02
Epoch 1 finished in 0.096s (avg.: 0.100s). Training for 0.2s.
	LR: 0.0198
Epoch 2 finished in 0.085s (avg.: 0.095s). Training for 0.3s.
	LR: 0.0196
Epoch 3 finished in 0.091s (avg.: 0.094s). Training for 0.4s.
	LR: 0.01941
Epoch 4 finished in 0.087s (avg.: 0.093s). Training for 0.5s.
	LR: 0.01921
Epoch 5 finished in 0.085s (avg.: 0.091s). Training for 0.6s.
	LR: 0.01902
Epoch 6 finished in 0.092s (avg.: 0.091s). Training for 0.6s.
	LR: 0.01883
Epoch 7 finished in 0.090s (avg.: 0.091s). Training for 0.7s.
	LR: 0.01864
Epoch 8 finished in 0.088s (avg.: 0.091s). Training for 0.8s.
	LR: 0.01845
Epoch 9 finished in 0.092s (avg.: 0.091s). Training for 0.9s.
	LR: 0.01827
Epoch 10 finished in 0.084s (avg.: 0.090s). Training for 1.0s.
	LR: 0.01809
Epoch 11 finished in 0.087s (avg.: 0.090s). Training for 1.1s.
	LR: 0.01791
Epoch 12 finished in 0.085s (avg.: 0.090s). Training for 1.2s.
	LR: 0.01773
Epoch 13 finished in 0.088s

## Fine-tune the compressed_model2

In [23]:
training_time = 0
epoch = 0
all_start_time = time.time()
epoch_time = AverageMeter()

optimizer = torch.optim.SGD(compressed_model2.parameters(), lr, momentum=momentum, nesterov=True)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=lr_decay, last_epoch=start_epoch - 1)

for epoch in range(start_epoch, epochs):
    start_time = time.time()
    compressed_model2.train()
    for batch_idx, (x, target) in enumerate(train_loader):
        optimizer.zero_grad()
        x, target = x.cuda(), target.cuda()
        out = compressed_model2.forward(x)
        loss = torch.nn.functional.cross_entropy(out, target)
        loss.backward()
        optimizer.step()
        break
    end_time = time.time()
    epoch_time.update(end_time - start_time)
    training_time = end_time - all_start_time
    compressed_model2.eval()
    print('Epoch {0} finished in {et.val:.3f}s (avg.: {et.avg:.3f}s). Training for {1}'.format(epoch, format_time(training_time), et=epoch_time))
    print('\tLR: {:.4}'.format(scheduler.get_last_lr()[0]))
    if (epoch+1) % print_freq == 0:
        accuracy, ave_loss = compute_acc_loss(my_eval, train_loader, compressed_model2)
        print('\ttrain loss: {:.6f}, accuracy: {:.4f}'.format(ave_loss, accuracy))
        accuracy, ave_loss = compute_acc_loss(my_eval, test_loader, compressed_model2)
        print('\ttest  loss: {:.6f}, accuracy: {:.4f}'.format(ave_loss, accuracy))
    scheduler.step()

    if checkpoint and (epoch+1) % checkpoint == 0:
        # create and save checkpoint here
        to_save = {}
        to_save['model_state'] = compressed_model2.state_dict()
        to_save['optimizer_state'] = optimizer.state_dict()
        to_save['lr'] = scheduler.get_last_lr()
        to_save['epoch'] = epoch + 1
        torch.save(to_save, './compressed_lenet5_v2_checkpoint.pth.tar')

Epoch 0 finished in 0.090s (avg.: 0.090s). Training for 0.1s.
	LR: 0.1
Epoch 1 finished in 0.087s (avg.: 0.089s). Training for 0.2s.
	LR: 0.099
Epoch 2 finished in 0.081s (avg.: 0.086s). Training for 0.3s.
	LR: 0.09801
Epoch 3 finished in 0.085s (avg.: 0.086s). Training for 0.3s.
	LR: 0.09703
Epoch 4 finished in 0.083s (avg.: 0.085s). Training for 0.4s.
	LR: 0.09606
Epoch 5 finished in 0.083s (avg.: 0.085s). Training for 0.5s.
	LR: 0.0951
Epoch 6 finished in 0.095s (avg.: 0.086s). Training for 0.6s.
	LR: 0.09415
Epoch 7 finished in 0.084s (avg.: 0.086s). Training for 0.7s.
	LR: 0.09321
Epoch 8 finished in 0.104s (avg.: 0.088s). Training for 0.8s.
	LR: 0.09227
Epoch 9 finished in 0.093s (avg.: 0.088s). Training for 0.9s.
	LR: 0.09135
Epoch 10 finished in 0.083s (avg.: 0.088s). Training for 1.0s.
	LR: 0.09044
Epoch 11 finished in 0.094s (avg.: 0.088s). Training for 1.1s.
	LR: 0.08953
Epoch 12 finished in 0.086s (avg.: 0.088s). Training for 1.2s.
	LR: 0.08864
Epoch 13 finished in 0.094s (

## Fine-tune the compressed_model3

In [24]:
training_time = 0
epoch = 0
all_start_time = time.time()
epoch_time = AverageMeter()

optimizer = torch.optim.SGD(compressed_model3.parameters(), lr, momentum=momentum, nesterov=True)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=lr_decay, last_epoch=start_epoch - 1)

for epoch in range(start_epoch, epochs):
    start_time = time.time()
    compressed_model3.train()
    for batch_idx, (x, target) in enumerate(train_loader):
        optimizer.zero_grad()
        x, target = x.cuda(), target.cuda()
        out = compressed_model3.forward(x)
        loss = torch.nn.functional.cross_entropy(out, target)
        loss.backward()
        optimizer.step()
        break
    end_time = time.time()
    epoch_time.update(end_time - start_time)
    training_time = end_time - all_start_time
    compressed_model3.eval()
    print('Epoch {0} finished in {et.val:.3f}s (avg.: {et.avg:.3f}s). Training for {1}'.format(epoch, format_time(training_time), et=epoch_time))
    print('\tLR: {:.4}'.format(scheduler.get_last_lr()[0]))
    if (epoch+1) % print_freq == 0:
        accuracy, ave_loss = compute_acc_loss(my_eval, train_loader, compressed_model3)
        print('\ttrain loss: {:.6f}, accuracy: {:.4f}'.format(ave_loss, accuracy))
        accuracy, ave_loss = compute_acc_loss(my_eval, test_loader, compressed_model3)
        print('\ttest  loss: {:.6f}, accuracy: {:.4f}'.format(ave_loss, accuracy))
    scheduler.step()

    if checkpoint and (epoch+1) % checkpoint == 0:
        # create and save checkpoint here
        to_save = {}
        to_save['model_state'] = compressed_model3.state_dict()
        to_save['optimizer_state'] = optimizer.state_dict()
        to_save['lr'] = scheduler.get_last_lr()
        to_save['epoch'] = epoch + 1
        torch.save(to_save, './compressed_lenet5_v3_checkpoint.pth.tar')

Epoch 0 finished in 0.164s (avg.: 0.164s). Training for 0.2s.
	LR: 0.1
Epoch 1 finished in 0.147s (avg.: 0.155s). Training for 0.3s.
	LR: 0.099
Epoch 2 finished in 0.180s (avg.: 0.164s). Training for 0.5s.
	LR: 0.09801
Epoch 3 finished in 0.201s (avg.: 0.173s). Training for 0.7s.
	LR: 0.09703
Epoch 4 finished in 0.148s (avg.: 0.168s). Training for 0.8s.
	LR: 0.09606
Epoch 5 finished in 0.133s (avg.: 0.162s). Training for 1.0s.
	LR: 0.0951
Epoch 6 finished in 0.162s (avg.: 0.162s). Training for 1.1s.
	LR: 0.09415
Epoch 7 finished in 0.174s (avg.: 0.164s). Training for 1.3s.
	LR: 0.09321
Epoch 8 finished in 0.183s (avg.: 0.166s). Training for 1.5s.
	LR: 0.09227
Epoch 9 finished in 0.188s (avg.: 0.168s). Training for 1.7s.
	LR: 0.09135
Epoch 10 finished in 0.160s (avg.: 0.167s). Training for 1.8s.
	LR: 0.09044
Epoch 11 finished in 0.164s (avg.: 0.167s). Training for 2.0s.
	LR: 0.08953
Epoch 12 finished in 0.110s (avg.: 0.163s). Training for 2.1s.
	LR: 0.08864
Epoch 13 finished in 0.087s (