# **FinalProject** - Group 75

### Generic Utilities



In [0]:
import os
import time
import shutil
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.nn.init as init
from PIL import Image
import numpy as np

In [0]:
PATH = '/gdrive/My Drive/COMP551/FinalProject-Group75'   # path to FinalProject folder

In [4]:
from google.colab import drive
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /gdrive


In [0]:
n_epoches = 75
batch_size = 64
log_freq = 25
learning_rate = 0.01
momentum = 0.9

cudnn.benchmark = True

In [0]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [0]:
def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k"""
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].view(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res

In [0]:
def adjust_learning_rate(optimizer, epoch):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
    lr = learning_rate * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

### Construct dataset

In [13]:
val_image_path = os.path.join(PATH, 'tiny-imagenet-200', 'val', 'images')

val_ids = []
val_images = {}
print("Loading validation images...")
for i in range(10000):
  id = "val_" + str(i) + ".JPEG"
  val_ids.append(id)
  img = Image.open(val_image_path+"/"+id)
  val_images[id] = img.convert('RGB')
  
  if (i+1)%100==0:
    print(str((i+1)/100) + '% ', end='')
print("")

Loading validation images...
1.0% 2.0% 3.0% 4.0% 5.0% 6.0% 7.0% 8.0% 9.0% 10.0% 11.0% 12.0% 13.0% 14.0% 15.0% 16.0% 17.0% 18.0% 19.0% 20.0% 21.0% 22.0% 23.0% 24.0% 25.0% 26.0% 27.0% 28.0% 29.0% 30.0% 31.0% 32.0% 33.0% 34.0% 35.0% 36.0% 37.0% 38.0% 39.0% 40.0% 41.0% 42.0% 43.0% 44.0% 45.0% 46.0% 47.0% 48.0% 49.0% 50.0% 51.0% 52.0% 53.0% 54.0% 55.0% 56.0% 57.0% 58.0% 59.0% 60.0% 61.0% 62.0% 63.0% 64.0% 65.0% 66.0% 67.0% 68.0% 69.0% 70.0% 71.0% 72.0% 73.0% 74.0% 75.0% 76.0% 77.0% 78.0% 79.0% 80.0% 81.0% 82.0% 83.0% 84.0% 85.0% 86.0% 87.0% 88.0% 89.0% 90.0% 91.0% 92.0% 93.0% 94.0% 95.0% 96.0% 97.0% 98.0% 99.0% 100.0% 


In [14]:
val_annot_path = os.path.join(PATH, 'tiny-imagenet-200', 'val', 'val_annotations.txt')

val_targets = {}
labels = []
print("Loading validation targets...")
with open(val_annot_path, 'r') as fo:
  for line in fo: 
    line = line.split('\t')
    val_targets[line[0]] = line[1]
    if not line[1] in labels:
      labels.append(line[1])

# encode labels
labels.sort()
for id, target in val_targets.items():
  val_targets[id] = labels.index(target)

Loading validation targets...


In [0]:
class Dataset(data.Dataset):
  def __init__(self, ids, inputs, targets, transform=None):
    self.ids = ids
    self.inputs = inputs
    self.targets = targets
    self.transform = transform
  
  def __len__(self):
    return len(self.ids)
  
  def __getitem__(self, index):
    id = self.ids[index]
    
    X = self.inputs[id]
    y = self.targets[id]
    
    if self.transform:
      X = self.transform(X)
      
    return X, y

In [0]:
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

train_set = Dataset(val_ids[:8000], val_images, val_targets, transforms.Compose([
    transforms.Resize(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize
]))
train_loader = data.DataLoader(train_set, batch_size=batch_size, shuffle=True)

val_set = Dataset(val_ids[8000:], val_images, val_targets, transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    normalize
]))
val_loader = data.DataLoader(val_set, batch_size=batch_size, shuffle=False)

### Define train/validate functions

In [0]:
def train(train_loader, model, criterion, optimizer, epoch):
  batch_time = AverageMeter()
  losses = AverageMeter()
  top1 = AverageMeter()
  top5 = AverageMeter()
  
  model.train()
  
  end = time.time()
  for i, (input, target) in enumerate(train_loader):
    target = target.cuda(async=True)
    input_var = torch.autograd.Variable(input)
    target_var = torch.autograd.Variable(target)
    
    # compute output
    output = model(input_var)
    loss = criterion(output, target_var)
    losses.update(loss.data.item(), input.size()[0])
    
    # measure accuracy 
    prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
    top1.update(prec1, input.size()[0])
    top5.update(prec5, input.size()[0])
    
    # compute gradient and do backpropagation 
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # measure time
    batch_time.update(time.time() - end)
    end = time.time()
    
    # print to screen
    if (i+1)%log_freq == 0:
      print('Epoch: {0}[{1}/{2}]\t'
            'Time used: {batch_time.val:.3f} (avg: {batch_time.avg:.3f})\t'
            'Loss: {loss.val:.4f} (avg: {loss.avg:.4f})\t'
            'Top1: {top1.val:.3f} (avg: {top1.avg:.3f})\t'
            'Top5: {top5.val:.3f} (avg: {top5.avg:.3f})\t'.format(epoch, i+1, len(train_loader), batch_time=batch_time, loss=losses, top1=top1, top5=top5))

In [0]:
def validate(valid_loader, model, criterion):
  batch_time = AverageMeter()
  losses = AverageMeter()
  top1 = AverageMeter()
  top5 = AverageMeter()
  
  model.eval()
  
  end = time.time()
  for i, (input, target) in enumerate(valid_loader):
    target = target.cuda(async=True)
    input_var = torch.autograd.Variable(input)
    target_var = torch.autograd.Variable(target)
    
    # compute output
    output = model(input_var)
    loss = criterion(output, target_var)
    losses.update(loss.data.item(), input.size()[0])
    
    # measure accuracy 
    prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
    top1.update(prec1, input.size()[0])
    top5.update(prec5, input.size()[0])
    
    # measure time
    batch_time.update(time.time() - end)
    end = time.time()
      
  print("Loss: {loss.val:.4f}\tTop 1 accuracy: {top1.avg:.3f}\tTop 5 accuracy: {top5.avg:.3f}".format(loss=losses, top1=top1, top5=top5))
  
  return batch_time.avg, top1.avg, top5.avg

In [0]:
def test_model(model):
  model_name = model.name()
  model = torch.nn.DataParallel(model).cuda()
  criterion = nn.CrossEntropyLoss().cuda()
  optimizer = optim.SGD(model.parameters(), learning_rate, momentum=momentum)
  
  best_batch_time = 0
  best_top1 = 0
  best_top5 = 0
  for epoch in range(n_epoches):
    adjust_learning_rate(optimizer, epoch)
    
    # train the model
    print("Training...")
    train(train_loader, model, criterion, optimizer, epoch)
    print("")
    
    # validate
    print("Validating...")
    batch_time, top1, top5 = validate(val_loader, model, criterion)
    print("")
    
    # save checkpoint 
    torch.save({'epoch': epoch, 
               'model_state_dict': model.state_dict(),
               'optimizer_state_dict': model.state_dict(),
               'top1_accuracy': top1,
               'top5_accuracy': top5},
              PATH+'/output/'+model_name+'_checkpoint.tar')
    
    # save the model with the best top5 accuracy
    if top5 > best_top5:
      best_batch_time = batch_time
      best_top1 = top1
      best_top5 = top5
      torch.save({'epoch': epoch, 
               'model_state_dict': model.state_dict(),
               'optimizer_state_dict': model.state_dict(),
               'top1_accuracy': top1,
               'top5_accuracy': top5},
              PATH+'/output/'+model_name+'_best.tar')
  
  return best_batch_time, best_top1, best_top5

## Models

### SqueezeNet

In [0]:
class Fire(nn.Module):

    def __init__(self, inplanes, squeeze_planes,
                 expand1x1_planes, expand3x3_planes):
        super(Fire, self).__init__()
        self.inplanes = inplanes
        self.squeeze = nn.Conv2d(inplanes, squeeze_planes, kernel_size=1)
        self.squeeze_activation = nn.ReLU(inplace=True)
        self.expand1x1 = nn.Conv2d(squeeze_planes, expand1x1_planes,
                                   kernel_size=1)
        self.expand1x1_activation = nn.ReLU(inplace=True)
        self.expand3x3 = nn.Conv2d(squeeze_planes, expand3x3_planes,
                                   kernel_size=3, padding=1)
        self.expand3x3_activation = nn.ReLU(inplace=True)

    def forward(self, x):
        x = self.squeeze_activation(self.squeeze(x))
        return torch.cat([
            self.expand1x1_activation(self.expand1x1(x)),
            self.expand3x3_activation(self.expand3x3(x))
        ], 1)

In [0]:
class SqueezeNet(nn.Module):

    def __init__(self, version=1.0, num_classes=200):
        super(SqueezeNet, self).__init__()
        if version not in [1.0, 1.1]:
            raise ValueError("Unsupported SqueezeNet version {version}:"
                             "1.0 or 1.1 expected".format(version=version))
        self.num_classes = num_classes
        if version == 1.0:
            self.features = nn.Sequential(
                nn.Conv2d(3, 96, kernel_size=7, stride=2),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(96, 16, 64, 64),
                Fire(128, 16, 64, 64),
                Fire(128, 32, 128, 128),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(256, 32, 128, 128),
                Fire(256, 48, 192, 192),
                Fire(384, 48, 192, 192),
                Fire(384, 64, 256, 256),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(512, 64, 256, 256),
            )
        else:
            self.features = nn.Sequential(
                nn.Conv2d(3, 64, kernel_size=3, stride=2),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(64, 16, 64, 64),
                Fire(128, 16, 64, 64),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(128, 32, 128, 128),
                Fire(256, 32, 128, 128),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(256, 48, 192, 192),
                Fire(384, 48, 192, 192),
                Fire(384, 64, 256, 256),
                Fire(512, 64, 256, 256),
            )
        # Final convolution is initialized differently form the rest
        final_conv = nn.Conv2d(512, self.num_classes, kernel_size=1)
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.5),
            final_conv,
            nn.ReLU(inplace=True),
            nn.AvgPool2d(13, stride=1)
        )

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                if m is final_conv:
                    init.normal(m.weight.data, mean=0.0, std=0.01)
                else:
                    init.kaiming_uniform(m.weight.data)
                if m.bias is not None:
                    m.bias.data.zero_()

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x.view(x.size(0), self.num_classes)
      
    def name(self):
      return "SqueezeNet"

In [0]:
class SqueezeNet_MetaParam(nn.Module):

    def __init__(self, version=1.0, num_classes=200, base=128, incr=128, pct=0.5, freq=2, sr=0.125):
        super(SqueezeNet_MetaParam, self).__init__()
        if version not in [1.0, 1.1]:
            raise ValueError("Unsupported SqueezeNet version {version}:"
                             "1.0 or 1.1 expected".format(version=version))
            
        self.num_classes = num_classes
        self.base = base
        self.incr = incr
        self.pct = pct
        self.freq = freq
        self.sr = sr
        
        if version == 1.0:
            self.features = nn.Sequential(
                nn.Conv2d(3, 96, kernel_size=7, stride=2),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(96, int(base*sr), int(base*(1-pct)), int(base*pct)),
                Fire(int(base), int((base+1//freq*incr)*sr), int((base+1//freq*incr)*(1-pct)), int((base+1//freq*incr)*pct)),
                Fire(int(base+1//freq*incr), int((base+2//freq*incr)*sr), int((base+2//freq*incr)*(1-pct)), int((base+2//freq*incr)*pct)),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(int(base+2//freq*incr), int((base+3//freq*incr)*sr), int((base+3//freq*incr)*(1-pct)), int((base+3//freq*incr)*pct)),
                Fire(int(base+3//freq*incr), int((base+4//freq*incr)*sr), int((base+4//freq*incr)*(1-pct)), int((base+4//freq*incr)*pct)),
                Fire(int(base+4//freq*incr), int((base+5//freq*incr)*sr), int((base+5//freq*incr)*(1-pct)), int((base+5//freq*incr)*pct)),
                Fire(int(base+5//freq*incr), int((base+6//freq*incr)*sr), int((base+6//freq*incr)*(1-pct)), int((base+6//freq*incr)*pct)),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(int(base+6//freq*incr), int((base+7//freq*incr)*sr), int((base+7//freq*incr)*(1-pct)), int((base+7//freq*incr)*pct)),
            )
        else:
            self.features = nn.Sequential(
                nn.Conv2d(3, 64, kernel_size=3, stride=2),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(64, 16, 64, 64),
                Fire(128, 16, 64, 64),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(128, 32, 128, 128),
                Fire(256, 32, 128, 128),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(256, 48, 192, 192),
                Fire(384, 48, 192, 192),
                Fire(384, 64, 256, 256),
                Fire(512, 64, 256, 256),
            )
        # Final convolution is initialized differently form the rest
        final_conv = nn.Conv2d(int(base+7//freq*incr), self.num_classes, kernel_size=1)
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.5),
            final_conv,
            nn.ReLU(inplace=True),
            nn.AvgPool2d(13, stride=1)
        )

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                if m is final_conv:
                    init.normal(m.weight.data, mean=0.0, std=0.01)
                else:
                    init.kaiming_uniform(m.weight.data)
                if m.bias is not None:
                    m.bias.data.zero_()

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x.view(x.size(0), self.num_classes)
      
    def name(self):
      return "SqueezeNet_MetaParam"

### Downsampling at different times 

In [0]:
class SqueezeNet_Late_Pooling(nn.Module):

    def __init__(self, version=1.0, num_classes=200):
        super(SqueezeNet_Late_Pooling, self).__init__()
        if version not in [1.0, 1.1]:
            raise ValueError("Unsupported SqueezeNet version {version}:"
                             "1.0 or 1.1 expected".format(version=version))
        self.num_classes = num_classes
        if version == 1.0:
            self.features = nn.Sequential(
                nn.Conv2d(3, 96, kernel_size=7, stride=2),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(96, 16, 64, 64),
                Fire(128, 16, 64, 64),
                Fire(128, 32, 128, 128),
                Fire(256, 32, 128, 128),
                Fire(256, 48, 192, 192),
                Fire(384, 48, 192, 192),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(384, 64, 256, 256),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(512, 64, 256, 256),
            )
        else:
            self.features = nn.Sequential(
                nn.Conv2d(3, 64, kernel_size=3, stride=2),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(64, 16, 64, 64),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(128, 16, 64, 64),
                Fire(128, 32, 128, 128),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(256, 32, 128, 128),
                Fire(256, 48, 192, 192),
                Fire(384, 48, 192, 192),
                Fire(384, 64, 256, 256),
                Fire(512, 64, 256, 256),
            )
        # Final convolution is initialized differently form the rest
        final_conv = nn.Conv2d(512, self.num_classes, kernel_size=1)
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.5),
            final_conv,
            nn.ReLU(inplace=True),
            nn.AvgPool2d(13, stride=1)
        )

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                if m is final_conv:
                    init.normal(m.weight.data, mean=0.0, std=0.01)
                else:
                    init.kaiming_uniform(m.weight.data)
                if m.bias is not None:
                    m.bias.data.zero_()

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x.view(x.size(0), self.num_classes)
      
    def name(self):
      return "SqueezeNet_Late_Pooling"

In [0]:
class SqueezeNet_Very_Early_Pooling(nn.Module):

    def __init__(self, version=1.0, num_classes=200):
        super(SqueezeNet_Very_Early_Pooling, self).__init__()
        if version not in [1.0, 1.1]:
            raise ValueError("Unsupported SqueezeNet version {version}:"
                             "1.0 or 1.1 expected".format(version=version))
        self.num_classes = num_classes
        if version == 1.0:
            self.features = nn.Sequential(
                nn.Conv2d(3, 96, kernel_size=7, stride=2),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(96, 16, 64, 64),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(128, 16, 64, 64),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(128, 32, 128, 128),
                Fire(256, 32, 128, 128),
                Fire(256, 48, 192, 192),
                Fire(384, 48, 192, 192),
                Fire(384, 64, 256, 256),
                Fire(512, 64, 256, 256),
            )
        else:
            self.features = nn.Sequential(
                nn.Conv2d(3, 64, kernel_size=3, stride=2),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(64, 16, 64, 64),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(128, 16, 64, 64),
                Fire(128, 32, 128, 128),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(256, 32, 128, 128),
                Fire(256, 48, 192, 192),
                Fire(384, 48, 192, 192),
                Fire(384, 64, 256, 256),
                Fire(512, 64, 256, 256),
            )
        # Final convolution is initialized differently form the rest
        final_conv = nn.Conv2d(512, self.num_classes, kernel_size=1)
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.5),
            final_conv,
            nn.ReLU(inplace=True),
            nn.AvgPool2d(13, stride=1)
        )

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                if m is final_conv:
                    init.normal(m.weight.data, mean=0.0, std=0.01)
                else:
                    init.kaiming_uniform(m.weight.data)
                if m.bias is not None:
                    m.bias.data.zero_()

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x.view(x.size(0), self.num_classes)
      
    def name(self):
      return "SqueezeNet_Very_Early_Pooling"

In [0]:
class SqueezeNet_Semi_Early_Pooling(nn.Module):

    def __init__(self, version=1.0, num_classes=200):
        super(SqueezeNet_Very_Early_Pooling, self).__init__()
        if version not in [1.0, 1.1]:
            raise ValueError("Unsupported SqueezeNet version {version}:"
                             "1.0 or 1.1 expected".format(version=version))
        self.num_classes = num_classes
        if version == 1.0:
            self.features = nn.Sequential(
                
                nn.Conv2d(3, 96, kernel_size=7, stride=2),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(96, 16, 64, 64),ker
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(128, 16, 64, 64),
                Fire(128, 32, 128, 128),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(256, 32, 128, 128),
                Fire(256, 48, 192, 192),
                Fire(384, 48, 192, 192),
                Fire(384, 64, 256, 256),
                Fire(512, 64, 256, 256),
            )
        else:
            self.features = nn.Sequential(
                nn.Conv2d(3, 64, kernel_size=3, stride=2),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(64, 16, 64, 64),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(128, 16, 64, 64),
                Fire(128, 32, 128, 128),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(256, 32, 128, 128),
                Fire(256, 48, 192, 192),
                Fire(384, 48, 192, 192),
                Fire(384, 64, 256, 256),
                Fire(512, 64, 256, 256),
            )
        # Final convolution is initialized differently form the rest
        final_conv = nn.Conv2d(512, self.num_classes, kernel_size=1)
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.5),
            final_conv,
            nn.ReLU(inplace=True),
            nn.AvgPool2d(13, stride=1)
        )

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                if m is final_conv:
                    init.normal(m.weight.data, mean=0.0, std=0.01)
                else:
                    init.kaiming_uniform(m.weight.data)
                if m.bias is not None:
                    m.bias.data.zero_()

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x.view(x.size(0), self.num_classes)
      
    def name(self):
      return "SqueezeNet_Very_Early_Pooling"

## Experiments

### Original Model

In [0]:
sq_model = SqueezeNet(version=1.0)
batch_time, top1, top5 = test_model(sq_model)



Training...
Epoch: 0[25/125]	Time used: 0.362 (avg: 0.496)	Loss: 5.2963 (avg: 5.3106)	Top1: 0.000 (avg: 0.125)	Top5: 0.000 (avg: 1.938)	
Epoch: 0[50/125]	Time used: 0.363 (avg: 0.432)	Loss: 5.2980 (avg: 5.3050)	Top1: 0.000 (avg: 0.188)	Top5: 3.125 (avg: 1.906)	
Epoch: 0[75/125]	Time used: 0.362 (avg: 0.411)	Loss: 5.3023 (avg: 5.3028)	Top1: 0.000 (avg: 0.354)	Top5: 0.000 (avg: 2.146)	
Epoch: 0[100/125]	Time used: 0.360 (avg: 0.401)	Loss: 5.2994 (avg: 5.3018)	Top1: 0.000 (avg: 0.344)	Top5: 3.125 (avg: 2.094)	
Epoch: 0[125/125]	Time used: 0.370 (avg: 0.395)	Loss: 5.2984 (avg: 5.3011)	Top1: 0.000 (avg: 0.375)	Top5: 1.562 (avg: 2.038)	

Validating...
Loss: 5.2947	Top 1 accuracy: 0.400	Top 5 accuracy: 2.200

Training...
Epoch: 1[25/125]	Time used: 0.365 (avg: 0.362)	Loss: 5.2983 (avg: 5.2971)	Top1: 0.000 (avg: 0.500)	Top5: 4.688 (avg: 2.938)	
Epoch: 1[50/125]	Time used: 0.362 (avg: 0.366)	Loss: 5.2948 (avg: 5.2973)	Top1: 0.000 (avg: 0.406)	Top5: 3.125 (avg: 2.688)	
Epoch: 1[75/125]	Time used

### Metaparameters

In [47]:
# Squeeze ratio
sr_list = [0.125, 0.25, 0.5, 0.75, 1.0]
sr_top1s = []
sr_top5s = []
sr_batch_times = []
for sr in sr_list:
  model = SqueezeNet_MetaParam(version=1.0, sr=sr)
  batch_time, top1, top5 = test_model(model)
  sr_top1s.append(top1)
  sr_top5s.append(top5)
  sr_batch_times.append(batch_time)
  print("SR = {0}: top1 = {1} \t top5 = {2} \t batch time = {3}\n".format(sr, top1, top5, batch_time))



Training...
Epoch: 0[25/125]	Time used: 0.352 (avg: 0.356)	Loss: 5.3022 (avg: 5.3025)	Top1: 0.000 (avg: 0.562)	Top5: 1.562 (avg: 2.500)	
Epoch: 0[50/125]	Time used: 0.363 (avg: 0.359)	Loss: 5.2944 (avg: 5.3000)	Top1: 1.562 (avg: 0.531)	Top5: 1.562 (avg: 2.469)	
Epoch: 0[75/125]	Time used: 0.350 (avg: 0.359)	Loss: 5.2947 (avg: 5.2975)	Top1: 0.000 (avg: 0.521)	Top5: 0.000 (avg: 2.458)	
Epoch: 0[100/125]	Time used: 0.352 (avg: 0.360)	Loss: 5.2583 (avg: 5.2950)	Top1: 0.000 (avg: 0.500)	Top5: 3.125 (avg: 2.391)	
Epoch: 0[125/125]	Time used: 0.351 (avg: 0.360)	Loss: 5.2488 (avg: 5.2903)	Top1: 0.000 (avg: 0.500)	Top5: 3.125 (avg: 2.463)	

Validating...
Loss: 5.2625	Top 1 accuracy: 1.050	Top 5 accuracy: 2.800

Training...
Epoch: 1[25/125]	Time used: 0.353 (avg: 0.352)	Loss: 5.2764 (avg: 5.2706)	Top1: 0.000 (avg: 0.562)	Top5: 1.562 (avg: 2.938)	
Epoch: 1[50/125]	Time used: 0.361 (avg: 0.356)	Loss: 5.2853 (avg: 5.2683)	Top1: 1.562 (avg: 0.562)	Top5: 3.125 (avg: 3.031)	
Epoch: 1[75/125]	Time used

In [49]:
print(sr_top1s)
print(sr_top5s)

[tensor(13.8000, device='cuda:0'), tensor(16.1500, device='cuda:0'), tensor(15.6000, device='cuda:0'), tensor(17.3000, device='cuda:0'), tensor(16.7500, device='cuda:0')]
[tensor(34.2000, device='cuda:0'), tensor(38.6500, device='cuda:0'), tensor(38.2000, device='cuda:0'), tensor(38.4000, device='cuda:0'), tensor(38.4000, device='cuda:0')]


In [25]:
# pct_3x3
pct_list = [0.0078125, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875]
pct_top1s = []
pct_top5s = []
pct_batch_times = []
for pct in pct_list:
  model = SqueezeNet_MetaParam(version=1.0, pct=pct)
  batch_time, top1, top5 = test_model(model)
  pct_top1s.append(top1)
  pct_top5s.append(top5)
  pct_batch_times.append(batch_time)
  print("pct_3x3 = {0}: top1 = {1} \t top5 = {2} \t batch time = {3}\n".format(pct, top1, top5, batch_time))



Training...
Epoch: 0[25/125]	Time used: 0.334 (avg: 0.458)	Loss: 5.2963 (avg: 5.3144)	Top1: 0.000 (avg: 0.250)	Top5: 0.000 (avg: 1.875)	
Epoch: 0[50/125]	Time used: 0.334 (avg: 0.399)	Loss: 5.2989 (avg: 5.3068)	Top1: 0.000 (avg: 0.312)	Top5: 0.000 (avg: 1.938)	
Epoch: 0[75/125]	Time used: 0.331 (avg: 0.379)	Loss: 5.2989 (avg: 5.3037)	Top1: 0.000 (avg: 0.438)	Top5: 3.125 (avg: 2.125)	
Epoch: 0[100/125]	Time used: 0.331 (avg: 0.369)	Loss: 5.2497 (avg: 5.3010)	Top1: 1.562 (avg: 0.469)	Top5: 4.688 (avg: 2.219)	
Epoch: 0[125/125]	Time used: 0.336 (avg: 0.363)	Loss: 5.2765 (avg: 5.2967)	Top1: 0.000 (avg: 0.488)	Top5: 3.125 (avg: 2.312)	

Validating...
Loss: 5.1562	Top 1 accuracy: 0.450	Top 5 accuracy: 2.350

Training...
Epoch: 1[25/125]	Time used: 0.331 (avg: 0.329)	Loss: 5.2758 (avg: 5.2724)	Top1: 0.000 (avg: 0.375)	Top5: 4.688 (avg: 3.312)	
Epoch: 1[50/125]	Time used: 0.330 (avg: 0.333)	Loss: 5.2763 (avg: 5.2759)	Top1: 0.000 (avg: 0.438)	Top5: 0.000 (avg: 3.156)	
Epoch: 1[75/125]	Time used

In [23]:
# base_e
base_list = [64, 128, 192, 256]
base_top1s = []
base_top5s = []
base_batch_times = []
for base in base_list:
  model = SqueezeNet_MetaParam(version=1.0, base=base)
  batch_time, top1, top5 = test_model(model)
  base_top1s.append(top1)
  base_top5s.append(top5)
  base_batch_times.append(batch_time)
  print("base_e = {0}: top1 = {1} \t top5 = {2} \t batch time = {3}\n".format(base, top1, top5, batch_time))
  
torch.cuda.empty_cache()



Training...
Epoch: 0[25/125]	Time used: 0.307 (avg: 0.408)	Loss: 5.2959 (avg: 5.3053)	Top1: 0.000 (avg: 0.312)	Top5: 3.125 (avg: 1.812)	
Epoch: 0[50/125]	Time used: 0.298 (avg: 0.359)	Loss: 5.2972 (avg: 5.3021)	Top1: 0.000 (avg: 0.281)	Top5: 3.125 (avg: 1.844)	
Epoch: 0[75/125]	Time used: 0.301 (avg: 0.341)	Loss: 5.2979 (avg: 5.3010)	Top1: 0.000 (avg: 0.375)	Top5: 3.125 (avg: 2.021)	
Epoch: 0[100/125]	Time used: 0.288 (avg: 0.331)	Loss: 5.3000 (avg: 5.3004)	Top1: 0.000 (avg: 0.406)	Top5: 0.000 (avg: 1.969)	
Epoch: 0[125/125]	Time used: 0.293 (avg: 0.325)	Loss: 5.2984 (avg: 5.3001)	Top1: 0.000 (avg: 0.375)	Top5: 3.125 (avg: 1.925)	

Validating...
Loss: 5.2989	Top 1 accuracy: 0.550	Top 5 accuracy: 2.250

Training...
Epoch: 1[25/125]	Time used: 0.292 (avg: 0.293)	Loss: 5.2978 (avg: 5.2983)	Top1: 1.562 (avg: 0.500)	Top5: 1.562 (avg: 2.875)	
Epoch: 1[50/125]	Time used: 0.291 (avg: 0.296)	Loss: 5.2956 (avg: 5.2980)	Top1: 1.562 (avg: 0.562)	Top5: 3.125 (avg: 2.781)	
Epoch: 1[75/125]	Time used

In [24]:
# incr_e
incr_list = [64, 128, 192, 256]
incr_top1s = []
incr_top5s = []
incr_batch_times = []
for incr in incr_list:
  model = SqueezeNet_MetaParam(version=1.0, incr=incr)
  batch_time, top1, top5 = test_model(model)
  incr_top1s.append(top1)
  incr_top5s.append(top5)
  incr_batch_times.append(batch_time)
  print("incr_e = {0}: top1 = {1} \t top5 = {2} \t batch time = {3}\n".format(incr, top1, top5, batch_time))
  
torch.cuda.empty_cache()



Training...
Epoch: 0[25/125]	Time used: 0.293 (avg: 0.323)	Loss: 5.3006 (avg: 5.3057)	Top1: 0.000 (avg: 0.125)	Top5: 1.562 (avg: 2.312)	
Epoch: 0[50/125]	Time used: 0.295 (avg: 0.312)	Loss: 5.3003 (avg: 5.3022)	Top1: 0.000 (avg: 0.250)	Top5: 1.562 (avg: 1.875)	
Epoch: 0[75/125]	Time used: 0.299 (avg: 0.309)	Loss: 5.2973 (avg: 5.3010)	Top1: 0.000 (avg: 0.229)	Top5: 6.250 (avg: 1.833)	
Epoch: 0[100/125]	Time used: 0.300 (avg: 0.307)	Loss: 5.2978 (avg: 5.3003)	Top1: 0.000 (avg: 0.266)	Top5: 1.562 (avg: 1.891)	
Epoch: 0[125/125]	Time used: 0.292 (avg: 0.306)	Loss: 5.2976 (avg: 5.3000)	Top1: 0.000 (avg: 0.325)	Top5: 0.000 (avg: 1.913)	

Validating...
Loss: 5.2991	Top 1 accuracy: 0.250	Top 5 accuracy: 2.050

Training...
Epoch: 1[25/125]	Time used: 0.293 (avg: 0.294)	Loss: 5.2966 (avg: 5.2975)	Top1: 0.000 (avg: 0.500)	Top5: 1.562 (avg: 2.188)	
Epoch: 1[50/125]	Time used: 0.289 (avg: 0.297)	Loss: 5.3236 (avg: 5.2965)	Top1: 0.000 (avg: 0.500)	Top5: 1.562 (avg: 2.344)	
Epoch: 1[75/125]	Time used

In [22]:
# freq
freq_list = [1, 3, 4]
freq_top1s = []
freq_top5s = []
freq_batch_times = []
for freq in freq_list:
  model = SqueezeNet_MetaParam(version=1.0, freq=freq)
  batch_time, top1, top5 = test_model(model)
  freq_top1s.append(top1)
  freq_top5s.append(top5)
  freq_batch_times.append(batch_time)
  print("freq = {0}: top1 = {1} \t top5 = {2} \t batch_time = {3}\n".format(freq, top1, top5, batch_time))
  
torch.cuda.empty_cache()



Training...
Epoch: 0[25/125]	Time used: 0.584 (avg: 0.855)	Loss: 5.2971 (avg: 5.3156)	Top1: 1.562 (avg: 0.500)	Top5: 1.562 (avg: 2.312)	
Epoch: 0[50/125]	Time used: 0.586 (avg: 0.721)	Loss: 5.2995 (avg: 5.3079)	Top1: 0.000 (avg: 0.438)	Top5: 1.562 (avg: 2.312)	
Epoch: 0[75/125]	Time used: 0.588 (avg: 0.677)	Loss: 5.2943 (avg: 5.3047)	Top1: 3.125 (avg: 0.417)	Top5: 6.250 (avg: 2.354)	
Epoch: 0[100/125]	Time used: 0.590 (avg: 0.656)	Loss: 5.2997 (avg: 5.3019)	Top1: 0.000 (avg: 0.469)	Top5: 3.125 (avg: 2.469)	
Epoch: 0[125/125]	Time used: 0.586 (avg: 0.644)	Loss: 5.2927 (avg: 5.2973)	Top1: 0.000 (avg: 0.463)	Top5: 1.562 (avg: 2.500)	

Validating...
Loss: 5.3023	Top 1 accuracy: 0.500	Top 5 accuracy: 2.500

Training...
Epoch: 1[25/125]	Time used: 0.594 (avg: 0.581)	Loss: 5.2991 (avg: 5.2702)	Top1: 0.000 (avg: 0.562)	Top5: 1.562 (avg: 2.938)	
Epoch: 1[50/125]	Time used: 0.601 (avg: 0.591)	Loss: 5.3519 (avg: 5.2691)	Top1: 0.000 (avg: 0.719)	Top5: 0.000 (avg: 3.031)	
Epoch: 1[75/125]	Time used

### Downsampling

In [0]:
late_model = SqueezeNet_Late_Pooling(version=1.0)
batch_time, top1, top5 = test_model(late_model)

In [0]:
very_early_model = SqueezeNet_Very_Early_Pooling(version=1.0)
batch_time, top1, top5 = test_model(very_early_model)

In [0]:
semi_early_model = SqueezeNet_Semi_Early_Pooling(version=1.0)
batch_time, top1, top5 = test_model(semi_early_model)