<a href="https://colab.research.google.com/github/sheikmohdimran/Deep_Learning_with_Pytorch/blob/main/Vision/09_Finetune_Compact_Conv_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the notebook's third-party dependencies quietly: fastai (pinned to 2.4.1), timm and vit_pytorch.
# NOTE(review): timm and vit_pytorch are not pinned, so a fresh run may resolve different versions — confirm and pin.
!pip install -qqq fastai==2.4.1 timm vit_pytorch

In [2]:
from vit_pytorch.cct import cct_14

In [3]:
# Build the CCT-14 backbone with a 1000-class head so that its parameter
# names and shapes line up with the ImageNet checkpoint loaded further down.
model = cct_14(
    # input resolution
    img_size=224,
    # convolutional tokenizer
    n_conv_layers=2,
    kernel_size=7,
    stride=2,
    padding=3,
    # pooling inside the tokenizer
    pooling_kernel_size=3,
    pooling_stride=2,
    pooling_padding=1,
    # classification head size
    num_classes=1000,
    # accepted values: 'sine', 'learnable', 'none'
    positional_embedding='learnable',
)

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


In [4]:
from torch.hub import load_state_dict_from_url

# Download the pretrained 224px ImageNet weights and copy them into `model`.
#
# map_location='cpu' makes deserialisation independent of the device the
# checkpoint was saved from, so the cell also works on CPU-only runtimes;
# the model is moved to its final device later in the notebook.
#
# NOTE(review): the checkpoint is fetched over plain HTTP and deserialised
# through torch's pickle-based loader — only use it if the host is trusted.
#
# Alternative checkpoint, kept for reference (disabled):
#state_dict = load_state_dict_from_url('http://ix.cs.uoregon.edu/~alih/compact-transformers/checkpoints/cct14t-7x2_imagenet384_finetune_82.71.pth')
state_dict = load_state_dict_from_url(
    'http://ix.cs.uoregon.edu/~alih/compact-transformers/checkpoints/pretrained/cct_14_7x2_224_imagenet.pth',
    map_location='cpu',
)
# Last expression of the cell: displays the key-matching report.
model.load_state_dict(state_dict)

<All keys matched successfully>

In [5]:
# Fetch the Oxford 102 Flowers files into the working directory:
#   102flowers.tgz  - image archive
#   imagelabels.mat - labels (read later via loadmat, key 'labels')
#   setid.mat       - split ids (read later via loadmat, keys 'trnid', 'valid', 'tstid')
!wget -q https://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz
!wget -q https://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat
!wget -q https://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat
# Unpack the images; later cells reference them as 'jpg/image_XXXXX.jpg'
# (presumably the archive extracts into a 'jpg/' folder — confirm).
!tar -xf 102flowers.tgz

In [6]:
import os
import pandas as pd
from torchvision.io import read_image
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
import torch
from torch import nn
from collections import OrderedDict
from tqdm import tqdm
from scipy.io import loadmat
from PIL import Image
from torch import optim

In [7]:
# Read the two MATLAB files downloaded above into dict-like objects.
split = loadmat('setid.mat')  # indexed below with 'trnid', 'tstid' and 'valid'
label = loadmat('imagelabels.mat')  # indexed below with 'labels'

In [8]:
# One frame per split: the image ids listed in setid.mat, tagged with the
# split name they belong to.
trnid_df = pd.DataFrame(split['trnid'][0], columns=['id']).assign(split='train')
tstid_df = pd.DataFrame(split['tstid'][0], columns=['id']).assign(split='test')
valid_df = pd.DataFrame(split['valid'][0], columns=['id']).assign(split='valid')

# Stack train, valid, test (same order as before) under a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
split_df = pd.concat([trnid_df, valid_df, tstid_df], ignore_index=True)

In [9]:
# One row per image, in file order. Column order matters downstream:
# CustomImageDataset reads the label from column 0 and the path from column 1.
df = pd.DataFrame(label['labels'][0], columns=['label'])
# Image numbers are 1-based and zero-padded to five digits in the file name.
df['file'] = [f'jpg/image_{image_no:0>5}.jpg' for image_no in df.index + 1]
# The same 1-based number is the key used to join against the split table.
df['id'] = df.index + 1


In [10]:
# Attach each image's split by joining on the image id; the id is only a
# join key, so it is dropped straight afterwards.
df = df.merge(split_df, on="id").drop(columns=["id"])
df.head()


Unnamed: 0,label,file,split
0,77,jpg/image_00001.jpg,test
1,77,jpg/image_00002.jpg,test
2,77,jpg/image_00003.jpg,test
3,77,jpg/image_00004.jpg,test
4,77,jpg/image_00005.jpg,test


In [11]:
class CustomImageDataset(Dataset):
    """Dataset over the rows of a DataFrame that belong to one split.

    Rows are selected through the ``split`` column. Within the selected rows,
    columns are read by position: column 0 is the 1-based class label and
    column 1 is the image file path.
    """

    def __init__(self, data_frame, split, transform=None):
        """Keep only the rows of ``data_frame`` whose ``split`` equals ``split``.

        Args:
            data_frame: frame with a ``split`` column, the label in column 0
                and the image path in column 1.
            split: value of the ``split`` column to keep (e.g. ``'train'``).
            transform: optional callable applied to each loaded image.
        """
        self.img_labels = data_frame[data_frame['split'] == split]
        self.transform = transform

    def __len__(self):
        """Return the number of images in the selected split."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``.

        The image is loaded with ``read_image`` and passed through
        ``transform`` when one was given. The label is returned as a plain
        Python ``int`` shifted to be 0-based.
        """
        image = read_image(self.img_labels.iloc[idx, 1])
        # Builtin int() replaces `.astype(np.long)`: `np` is not imported by
        # the notebook's import cell and `np.long` was removed in NumPy 1.24.
        # Converting before subtracting also avoids arithmetic in a narrow
        # unsigned dtype.
        label = int(self.img_labels.iloc[idx, 0]) - 1
        if self.transform:
            image = self.transform(image)
        return image, label


In [12]:
# Training pipeline: random crop and flip for augmentation, then tensor
# conversion and per-channel normalisation.
train_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomResizedCrop(size=224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Validation pipeline: deterministic resize and centre crop, with the same
# normalisation as training.
val_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(size=256),
    transforms.CenterCrop(size=224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


In [13]:
# One dataset per split, each paired with its own transform pipeline.
train_data = CustomImageDataset(df, 'train', transform=train_transform)
valid_data = CustomImageDataset(df, 'valid', transform=val_transform)


In [14]:
# Training batches are reshuffled every epoch; validation keeps a fixed order
# and uses a larger batch size.
trainloader = DataLoader(
    dataset=train_data,
    batch_size=32,
    shuffle=True,
)
validloader = DataLoader(
    dataset=valid_data,
    batch_size=256,
    shuffle=False,
)


In [15]:
# List the names of the model's top-level sub-modules.
for child_name, _child in model.named_children():
    print(child_name)

tokenizer
classifier


In [16]:
# List the names of the sub-modules inside the tokenizer.
for child_name, _child in model.tokenizer.named_children():
    print(child_name)

conv_layers
flattener


In [17]:
# List the names of the sub-modules inside the classifier.
for child_name, _child in model.classifier.named_children():
    print(child_name)

attention_pool
dropout
blocks
norm
fc


In [18]:
# Display the current classification head to read off its input and output sizes.
model.classifier.fc

Linear(in_features=384, out_features=1000, bias=True)

In [19]:
# Freeze every parameter that lives inside a child module of the classifier
# and of the tokenizer, in that order.
# NOTE(review): parameters registered directly on model.classifier or
# model.tokenizer (i.e. not inside one of their child modules) are not touched
# by this loop — confirm whether any such parameter should stay trainable.
for container in (model.classifier, model.tokenizer):
    for child in container.children():
        for weight in child.parameters():
            weight.requires_grad = False


In [20]:
# Swap the 1000-class head for a small two-layer MLP ending in 102 outputs.
# The input width of 384 matches the in_features of the head being replaced.
model.classifier.fc = nn.Sequential(
    OrderedDict(
        lin1=nn.Linear(in_features=384, out_features=256),
        relu1=nn.ReLU(),
        lin2=nn.Linear(in_features=256, out_features=102),
    )
)

In [21]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the model's parameters and buffers to the chosen device.
model = model.to(device)

In [22]:
# NOTE(review): the wildcard import is kept deliberately. Besides DataLoaders,
# Learner, Adam and accuracy, it appears to be what brings `np` into scope for
# code that runs later — confirm before narrowing it to explicit names.
from fastai.vision.all import *

# Loss for multi-class classification.
criterion = nn.CrossEntropyLoss()

# Bundle the training and validation loaders, in that order.
data = DataLoaders(trainloader, validloader)

# Learner ties together the data, model, loss, optimiser and reported metric.
learn = Learner(
    dls=data,
    model=model,
    loss_func=criterion,
    opt_func=Adam,
    metrics=accuracy,
)

In [23]:
# Train for 40 epochs; no learning rate is passed, so the Learner's default is used.
learn.fit(40)

epoch,train_loss,valid_loss,accuracy,time
0,4.362496,3.731829,0.320588,00:20
1,3.587435,2.486192,0.464706,00:19
2,2.760263,1.747185,0.568627,00:18
3,2.126798,1.366907,0.691176,00:18
4,1.681853,1.208445,0.693137,00:18
5,1.38007,1.01733,0.742157,00:18
6,1.153383,0.946245,0.769608,00:18
7,0.949313,0.871244,0.770588,00:19
8,0.819531,0.792425,0.796078,00:18
9,0.701835,0.743024,0.8,00:18
