## Import Libraries

In [1]:
import os
import time
import random

import timm
import numpy as np
from PIL import Image
from tqdm.notebook import tqdm
from collections import OrderedDict

import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
from torchvision import models
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim.lr_scheduler import StepLR
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset

## Hyper parameters

In [2]:
# os.environ['CUDA_VISIBLE_DEVICES']='0'
device = "cpu" 
# device = "cuda" if torch.cuda.is_available() else "cpu"
num_epochs = 1
batch_size = 32
lr = 3e-4
gamma = 0.7
unfreeze_after=2
lr_decay=.8
lmbd = 8

## Load Data

In [3]:
transform_train_list = [
    transforms.Resize((224,224), interpolation=3),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]
transform_val_list = [
    transforms.Resize(size=(224,224),interpolation=3), #Image.BICUBIC
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]
data_transforms = {
'train': transforms.Compose( transform_train_list ),
'val': transforms.Compose(transform_val_list),
}

  "Argument interpolation should be of type InterpolationMode instead of int. "


In [4]:
image_datasets = {}
data_dir = "/home/shubham/CVP/data"

image_datasets['train'] = datasets.ImageFolder(os.path.join(data_dir, 'train'),
                                          data_transforms['train'])
# image_datasets['val'] = datasets.ImageFolder(os.path.join(data_dir, 'val'),
#                                           data_transforms['val'])
train_loader = DataLoader(dataset = image_datasets['train'], batch_size=batch_size, shuffle=True )
# valid_loader = DataLoader(dataset = image_datasets['val'], batch_size=batch_size, shuffle=True)
# dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=opt.batchsize,
#                                              shuffle=True, num_workers=8, pin_memory=True) # 8 workers may work faster
#               for x in ['train', 'val']}
# dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
class_names = image_datasets['train'].classes # '001','003', etc
print(len(class_names))

62


In [5]:
# x,y=next(iter(train_loader))
# print(x.shape, y.shape)

## Model

In [6]:
# weights initialization
def weights_init_kaiming(m):
    classname = m.__class__.__name__
    # print(classname)
    if classname.find('Conv') != -1:
        init.kaiming_normal_(m.weight.data, a=0, mode='fan_in') # For old pytorch, you may use kaiming_normal.
    elif classname.find('Linear') != -1:
        init.kaiming_normal_(m.weight.data, a=0, mode='fan_out')
        init.constant_(m.bias.data, 0.0)
    elif classname.find('BatchNorm1d') != -1:
        init.normal_(m.weight.data, 1.0, 0.02)
        init.constant_(m.bias.data, 0.0)
        
def weights_init_classifier(m):
    classname = m.__class__.__name__
    if classname.find('Linear') != -1:
        init.normal_(m.weight.data, std=0.001)
        init.constant_(m.bias.data, 0.0)

In [7]:
class FC_Classifier(nn.Module):
    def __init__(self, input_dim, num_classes, droprate=0.5, num_bottleneck=256, return_features=False):
        super(FC_Classifier, self).__init__()
        self.return_features = return_features
        add_block = []
        # if linear:
        add_block += [nn.Linear(input_dim, num_bottleneck)]
        # else:
        #    num_bottleneck = input_dim
        # if bnorm:
        add_block += [nn.BatchNorm1d(num_bottleneck)]
        # if relu:
        #    add_block += [nn.LeakyReLU(0.1)]
        # if droprate>0:
        add_block += [nn.Dropout(p=droprate)]
        add_block = nn.Sequential(*add_block)
        add_block.apply(weights_init_kaiming)

        classifier = []
        classifier += [nn.Linear(num_bottleneck, num_classes)]
        classifier = nn.Sequential(*classifier)
        classifier.apply(weights_init_classifier)

        self.add_block = add_block
        self.classifier = classifier

    def forward(self, x):
        x = self.add_block(x)
        if self.return_features:
            f = x
            x = self.classifier(x)
            return [x,f]
        else:
            x = self.classifier(x)
            return x

In [8]:
class LATransformer(nn.Module):
    def __init__(self, ViT, lmbd, num_classes=751):
        super(LATransformer, self).__init__()
        self.class_num = num_classes # output number of classes
        
        # ViT model
        self.model = ViT
        self.model.head.requires_grad_ = False 
        self.cls_token = self.model.cls_token # 1, 1, 768
        self.pos_embed = self.model.pos_embed # 1, 197, 768

        # these are ViT model internal hyper-parameters (FIXED) 
        # self.num_blocks = 12 # number of sequential blocks in ViT
        
        # there are 196 patches in each image; thus, we split them into 14 x 14 grid
        self.num_rows = 14 
        self.num_cols = 14

        # Locally aware network
        self.avgpool = nn.AdaptiveAvgPool2d((self.num_rows,768))
        self.lmbd = lmbd

        # ensemble of classifiers
        for i in range(self.num_rows):
            name = 'classifier'+str(i)
            setattr(self, name, FC_Classifier(input_dim=768, num_classes=self.class_num, droprate=0.5, num_bottleneck=256, return_features=False))

    def forward(self, x):
        # x shape = 32, 3, 224, 224
        
        # Divide input image into patch embeddings and add position embeddings
        x = self.model.patch_embed(x) # 32, 196, 768
        cls_token = self.cls_token.expand(x.shape[0], -1, -1)  # 32, 1, 768
        x = torch.cat((cls_token, x), dim=1) # 32, 197, 768
        trnsfrmr_inp = self.model.pos_drop(x + self.pos_embed) # dropout with p = 0; idk!
        
        # Feed forward the x = (patch_embeddings+position_embeddings) through transformer blocks
        # for i in range(self.num_blocks):
        # x = self.model.blocks[i](x)
        x = self.model.blocks(trnsfrmr_inp)
        x_trnsfrmr_encdd = self.model.norm(x) # layer normalization; shape = 32, 197, 768
        
        # extract the cls token
        cls_token_out = x_trnsfrmr_encdd[:, 0].unsqueeze(1)
        
        # Average pool
        Q = x_trnsfrmr_encdd[:, 1:]
        L = self.avgpool(Q) # 32, 14, 768
        
        # Add global cls token to each local token 
        for i in range(self.num_rows):
            out = torch.mul(L[:, i, :], self.lmbd)
            L[:,i,:] = torch.div(torch.add(cls_token_out.squeeze(),out), 1+self.lmbd)
        
        # Locally aware network
        part = {}
        predict = {}
        for i in range(self.num_rows):
            part[i] = L[:,i,:] # 32, 768
            name = 'classifier'+str(i)
            c = getattr(self, name)
            predict[i] = c(part[i]) # 32, 751
        return predict

## Load Model

In [9]:
# Load pre-trained ViT
vit_base = timm.create_model('vit_base_patch16_224', pretrained=True, num_classes=751)
vit_base = vit_base.to(device)
vit_base.eval()

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU()
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn

In [10]:
# print(torch.cuda.is_available())
# print(type(vit_base))

In [11]:
# Create LA Transformer
model = LATransformer(ViT=vit_base, lmbd=lmbd, num_classes=len(class_names)).to(device)
print(model.eval())

LATransformer(
  (model): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      (norm): Identity()
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): Sequential(
      (0): Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (act): GELU()
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
      (1): Block(
      

### Utilities

In [13]:
# utilities
class AverageMeter:
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
        
def freeze_all_blocks(model):
    frozen_blocks = 12
    for block in model.model.blocks[:frozen_blocks]:
        for param in block.parameters():
            param.requires_grad=False

def unfreeze_blocks(model, amount= 1):
    for block in model.model.blocks[11-amount:]:
        for param in block.parameters():
            param.requires_grad=True
    return model

def update_summary(epoch, train_metrics, eval_metrics, filename, write_header=False):
    rowd = OrderedDict(epoch=epoch)
    rowd.update([('train_' + k, v) for k, v in train_metrics.items()])
    rowd.update([('eval_' + k, v) for k, v in eval_metrics.items()])
    with open(filename, mode='a') as cf:
        dw = csv.DictWriter(cf, fieldnames=rowd.keys())
        if write_header:  # first iteration (epoch == 1 can't be used)
            dw.writeheader()
        dw.writerow(rowd)
        
def save_network(network, name):
    save_filename = "_best.pth"
    save_path = os.path.join('/home/shubham/CVP/model/', name + save_filename)
    torch.save(network.cpu().state_dict(), save_path)
    
    if torch.cuda.is_available():
        network.cuda()

##  Train

In [16]:
def train_one_epoch(
        epoch, model, loader, optimizer, loss_fn,
        lr_scheduler=None, saver=None, output_dir='', 
        loss_scaler=None, model_ema=None, mixup_fn=None):
    
    batch_time_m = AverageMeter()
    data_time_m = AverageMeter()
    losses_m = AverageMeter()

    model.train()
    epoch_accuracy = 0
    epoch_loss = 0
    end = time.time()
    last_idx = len(loader) - 1
    num_updates = epoch * len(loader)
    running_loss = 0.0
    running_corrects = 0.0

    for data, target in tqdm(loader):
        data, target = data.to(device), target.to(device)

            
        data_time_m.update(time.time() - end)

        optimizer.zero_grad()
        output = model(data)
        score = 0.0
        sm = nn.Softmax(dim=1)
        for k, v in output.items():
            score += sm(output[k])
        _, preds = torch.max(score.data, 1)
        
        loss = 0.0
        for k,v in output.items():
            loss += loss_fn(output[k], target)
        loss.backward()

        optimizer.step()

        batch_time_m.update(time.time() - end)
        
        # print(preds, target.data)
        acc = (preds == target.data).float().mean()
        
        # print(acc)
        epoch_loss += loss/len(loader)
        epoch_accuracy += acc / len(loader)
        # if acc:
        #     print(acc, epreds, target.data)
        print(
    f"Epoch : {epoch+1} - loss : {epoch_loss:.4f} - acc: {epoch_accuracy:.4f}", end="\r")

    print()

    return OrderedDict([('train_loss', epoch_loss.data.item()), ("train_accuracy", epoch_accuracy.data.item())])


In [14]:
# loss function
criterion = nn.CrossEntropyLoss()

# optimizer
optimizer = optim.Adam(model.parameters(),weight_decay=5e-4, lr=lr)

# scheduler
scheduler = StepLR(optimizer, step_size=1, gamma=gamma)
freeze_all_blocks(model)

NameError: name 'model' is not defined

In [None]:
best_acc = 0.0
y_loss = {} # loss history
y_loss['train'] = []
y_loss['val'] = []
y_err = {}
y_err['train'] = []
y_err['val'] = []
print("training...")
output_dir = ""
best_acc = 0
name = "la-tranformer"

try:
    os.mkdir("/home/shubham/CVP/LA-Transformer/model/" + name)

except:
    pass
output_dir = "/home/shubham/CVP/LA-Transformer/model/" + name
unfrozen_blocks = 0

for epoch in range(10):

    if epoch%unfreeze_after==0:
        unfrozen_blocks += 1
        model = unfreeze_blocks(model, unfrozen_blocks)
        optimizer.param_groups[0]['lr'] *= lr_decay 
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print("Unfrozen Blocks: {}, Current lr: {}, Trainable Params: {}".format(unfrozen_blocks, 
                                                                             optimizer.param_groups[0]['lr'], 
                                                                             trainable_params))

    train_metrics = train_one_epoch(
        epoch, model, train_loader, optimizer, criterion,
        lr_scheduler=None, saver=None)

#     eval_metrics = validate(model, valid_loader, criterion)


    # update summary
#     update_summary(epoch, train_metrics, eval_metrics, os.path.join(output_dir, 'summary.csv'),
#                    write_header=True)

    # deep copy the model
#     last_model_wts = model.state_dict()
#     if eval_metrics['val_accuracy'] > best_acc:
#         best_acc = eval_metrics['val_accuracy']
#         save_network(model, epoch,name)
#         print("SAVED!")

training...
Unfrozen Blocks: 1, Current lr: 0.000192, Trainable Params: 20962817


  0%|          | 0/31 [00:00<?, ?it/s]

Epoch : 1 - loss : 84.7825 - acc: 0.5665


  0%|          | 0/31 [00:00<?, ?it/s]

Epoch : 2 - loss : 77.3638 - acc: 0.6119
Unfrozen Blocks: 2, Current lr: 0.00015360000000000002, Trainable Params: 28050689


  0%|          | 0/31 [00:00<?, ?it/s]

Epoch : 3 - loss : 59.2743 - acc: 0.5907

In [14]:
save_network(model, name)

### Appendix

In [None]:
def validate(model, loader, loss_fn):
    batch_time_m = AverageMeter()
    losses_m = AverageMeter()
    top1_m = AverageMeter()
    top5_m = AverageMeter()

    model.eval()
    epoch_accuracy = 0
    epoch_loss = 0
    end = time.time()
    last_idx = len(loader) - 1
    
    running_loss = 0.0
    running_corrects = 0.0

    with torch.no_grad():
        for input, target in tqdm(loader):

            input, target = input.to(device), target.to(device)
            
            output = model(input)
            
            score = 0.0
            sm = nn.Softmax(dim=1)
            for k, v in output.items():
                score += sm(output[k])
            _, preds = torch.max(score.data, 1)

            loss = 0.0
            for k,v in output.items():
                loss += loss_fn(output[k], target)


            batch_time_m.update(time.time() - end)
            acc = (preds == target.data).float().mean()
            epoch_loss += loss/len(loader)
            epoch_accuracy += acc / len(loader)
            
            print(f"Epoch : {epoch+1} - val_loss : {epoch_loss:.4f} - val_acc: {epoch_accuracy:.4f}", end="\r")
    print()    
    metrics = OrderedDict([('val_loss', epoch_loss.data.item()), ("val_accuracy", epoch_accuracy.data.item())])


    return metrics

In [4]:
# vit_base

In [5]:
# x,y = next(iter(train_loader))
# print(x.shape, y.shape)

In [6]:
# print(x.shape)
# x = vit_base.patch_embed(x)
# print(x.shape)
# print()

# print(vit_base.cls_token.shape, vit_base.pos_embed.shape)
# cls_token = vit_base.cls_token.expand(x.shape[0], -1, -1) 
# print(cls_token.shape)
# x = torch.cat((cls_token, x), dim=1)
# print(x.shape)
# x = vit_base.pos_drop(x + vit_base.pos_embed)
# print(x.shape)
# print()

# # Feed forward the x = (patch_embeddings+position_embeddings) through transformer blocks
# # for i in range(12):
# x = vit_base.blocks(x)
# x = vit_base.norm(x) # layer normalization
# print(x.shape)

In [7]:
# # extract the cls token
# cls_token_out = x[:, 0].unsqueeze(1)
# print(cls_token_out.shape)

# # Average pool
# avgpool = nn.AdaptiveAvgPool2d(output_size = (14, 768))
# print(x.shape)
# x = avgpool(x[:, 1:]) # input is 32,196,768
# print(x.shape)