<a href="https://colab.research.google.com/github/saivarshittha/fine-grained-image-classification/blob/main/CUBB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive
!ls

Mounted at /gdrive
/gdrive
'My Drive'


In [2]:
%cd /gdrive/My\ Drive
# !ls

/gdrive/My Drive


In [3]:
#!tar -zxvf CUB_200_2011.tgz

In [4]:
batch_size = 28

In [5]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import os
import pandas as pd
from torchvision.datasets.folder import default_loader
from torchvision.datasets.utils import download_url
from torch.utils.data import Dataset,DataLoader


### CPU vs GPU

In [6]:
# Find the device available to use using torch library
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move model to the device specified above
# model.to(device)

# Dataset Class

In [7]:
import os
import pandas as pd
from torchvision.datasets.folder import default_loader
from torchvision.datasets.utils import download_url
from torch.utils.data import Dataset


class Cub2011(Dataset):
    base_folder = 'CUB_200_2011/images'
    url = 'http://www.vision.caltech.edu/visipedia-data/CUB-200-2011/CUB_200_2011.tgz'
    filename = 'CUB_200_2011.tgz'
    tgz_md5 = '97eceeb196236b17998738112f37df78'

    def __init__(self, root, train=True, loader=default_loader, download=False):
        self.root = os.path.expanduser(root)
        self.transform = transforms.Compose([transforms.Resize(255), 
                                       transforms.CenterCrop(224),  
                                       transforms.RandomHorizontalFlip(),
                                       transforms.ToTensor(), 
                                       transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]) 
        self.loader = default_loader
        self.train = train
        self._load_metadata()

    def _load_metadata(self):
        images = pd.read_csv(os.path.join(self.root, 'CUB_200_2011', 'images.txt'), sep=' ',
                             names=['img_id', 'filepath'])
        image_class_labels = pd.read_csv(os.path.join(self.root, 'CUB_200_2011', 'image_class_labels.txt'),
                                         sep=' ', names=['img_id', 'target'])
        train_test_split = pd.read_csv(os.path.join(self.root, 'CUB_200_2011', 'train_test_split.txt'),
                                       sep=' ', names=['img_id', 'is_training_img']) 

        data = images.merge(image_class_labels, on='img_id')

        self.data = data.merge(train_test_split, on='img_id')

        if self.train:
            self.data = self.data[self.data.is_training_img == 1]
        else:
            self.data = self.data[self.data.is_training_img == 0]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data.iloc[idx]
        path = os.path.join(self.root, self.base_folder, sample.filepath)
        target = sample.target - 1  # Targets start at 1 by default, so shift to 0
        img = self.loader(path)

        if self.transform is not None:
            img = self.transform(img)

        return img, target

In [8]:
train_dataset = Cub2011('/gdrive/My Drive')

In [9]:
len(train_dataset)

5994

In [10]:
train_loader = torch.utils.data.DataLoader(train_dataset,batch_size=28)
batch = next(iter(train_loader))

In [11]:
batch[0].size()

torch.Size([28, 3, 224, 224])

## Net

In [12]:
class Encoder(nn.Module):
    """
    Encoder.
    """

    def __init__(self, encoded_image_size=14):
        super(Encoder, self).__init__()
        self.enc_image_size = encoded_image_size

        resnet = torchvision.models.resnet101(pretrained=True)  # pretrained ImageNet ResNet-101

        # Remove linear and pool layers (since we're not doing classification)
        modules = list(resnet.children())[:-2]
        self.resnet = nn.Sequential(*modules)
        modules1 = list(resnet.children())[:-4]
        self.resnet1 = nn.Sequential(*modules1)

        #self.drop1 = nn.Dropout2d(p=0.5)
        #self.conv1d = nn.Conv2d(2048,1024, (1,1), stride=1)
        self.conv1d = nn.Conv2d(2048,256, 1, stride=1,dilation = 1,padding =0)
        self.conv1d_1 = nn.Conv2d(2048,256, 1, stride=1,dilation = 1,padding = 0)
        self.conv1d_2 = nn.Conv2d(2048,256, 1, stride=1,dilation = 1,padding = 0)  # all above 3 are same?

        self.conv2d_1 = nn.Conv2d(256,256, 3, stride=1,dilation = 1,padding = 1)
        self.conv2d_2 = nn.Conv2d(256,256, 5, stride=1,dilation = 1,padding = 2)
        self.conv2d_3 = nn.Conv2d(256,256, 1, stride=1,dilation = 1,padding = 0)

        self.global_avg_pool = nn.Sequential(nn.AdaptiveAvgPool2d((7,7)),nn.Conv2d(2048, 256,1,stride=1,dilation = 1,padding =0),nn.BatchNorm2d(256),nn.ReLU())
        ## converted (8,8)->(7,7)

        self.drop2 = nn.Dropout2d(p=0.3)

        self.conv1_2d = nn.Conv2d(1024,512,1 , stride=1,dilation = 1,padding =0)
        self.bn1 = nn.BatchNorm2d(1024)
        self.relu = nn.ReLU()

        self.drop3 = nn.Dropout2d(p=0.5)

        self.conv2_1d = nn.Conv2d(512,512, 1, stride=1,dilation = 1,padding =0)
        self.conv2_2d = nn.Conv2d(1024,512, 3, stride=1,dilation = 1,padding =1)

        # Resize image to fixed size to allow input images of variable size
        #self.adaptive_pool = nn.AdaptiveAvgPool2d((encoded_image_size, encoded_image_size))

        self.fine_tune()

    def forward(self, images):
        """
        Forward propagation.
        :param images: images, a tensor of dimensions (batch_size, 3, image_size, image_size)
        :return: encoded images

        """
        out = self.resnet(images)  # (batch_size, 2048, image_size/32, image_size/32)
        out1 = self.resnet1(images)
        print("out size = ",out.shape)  # (batch_size, 2048, image_size/32, image_size/32)  
        # print(out1.shape)
        #out = F.interpolate(out, size=32, mode='bilinear', align_corners=True)
        #x = torch.cat((out, out1), dim=-3)

        #out = self.adaptive_pool(out)  # (batch_size, 2048, encoded_image_size, encoded_image_size)
        x_1 = self.conv1d(out)
        x_2 = self.conv1d_1(out)
        x_3 = self.conv1d_2(out)

        x_2_1 = self.conv2d_1(x_2)
        x_3_1 = self.conv2d_2(x_3)
        print(x_1.shape, x_2.shape, x_3.shape)
        #x_4 = self.conv1d_3(out)

        x_5 = self.global_avg_pool(out)
        print("shape of x_5 = ",x_5.shape)
        x_5_1 = self.conv2d_3(x_5)
        #x_6 = F.interpolate(x_5, size=8, mode='bilinear', align_corners=True)

        print(x_1.shape,x_2_1.shape,x_3_1.shape,x_5_1.shape)
        x_6 = torch.cat((x_1, x_2_1, x_3_1, x_5_1), dim=-3)
        print("shape of x_6 = ",x_6.shape)
        x_7 = self.conv1_2d(x_6)
        print("shape of x_7 = ",x_7.shape)
        x_8 = F.interpolate(x_7, size=28, mode='bilinear', align_corners=True) ## changed from 32 to 28
        #print(x_7.shape)   
        print("shape of x_8 = ",x_8.shape)
        x_9 = self.conv2_1d(out1)
        print("shape of x_9 = ",x_9.shape)
        y = torch.cat((x_8, x_9), dim=-3)   
        #print(out.shape, x_1.shape, x_2.shape, x_3.shape,x_5.shape, x_5_1.shape, x_6.shape,x_8.shape, x_9.shape,y.shape)  
        #y = self.drop2(z)
        #y = self.conv1_2d(y)
        y = self.bn1(y)
        y = self.relu(y)
        y = self.drop3(y)
        y = self.conv2_2d(y)
        #print(out.shape, out1.shape, x_6.shape,x_8.shape, x_9.shape,y.shape)
        print("y shape before = ",y.shape)
        y = y.permute(0, 2, 3, 1)  # (batch_size, encoded_image_size, encoded_image_size, 2048)
        print("shape of y after =",y.shape)
        #print (y.shape)CIFAR10(ro
        return y
    def fine_tune(self, fine_tune=True):
        """
        Allow or prevent the computation of gradients for convolutional blocks 2 through 4 of the encoder.

        :param fine_tune: Allow?
        """
        for p in self.resnet.parameters():
            p.requires_grad = False
        # If fine-tuning, only fine-tune convolutional blocks 2 through 4
        for c in list(self.resnet.children())[5:]:
            for p in c.parameters():
                p.requires_grad = fine_tune



In [13]:
net = Encoder()

Downloading: "https://download.pytorch.org/models/resnet101-5d3b4d8f.pth" to /root/.cache/torch/hub/checkpoints/resnet101-5d3b4d8f.pth


HBox(children=(FloatProgress(value=0.0, max=178728960.0), HTML(value='')))




# Training

In [14]:
@torch.no_grad()
def get_all_preds(model, loader):
    all_preds = torch.tensor([])
    for batch in loader:
        images, labels = batch

        preds = model(images)
        all_preds = torch.cat(
            (all_preds, preds)
            ,dim=0
        )
    return all_preds

In [15]:
# with torch.no_grad():
#   train_preds = get_all_preds(net,train_loader)

In [16]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [17]:
for epoch in range(2):
  running_loss = 0.0
  for i,data in enumerate(train_loader,0):
    images,labels = data
    
    preds = net(images)
    # labels=labels.unsqueeze(dim=0)
    optimizer.zero_grad()
    print("shape of preds",preds.size())
    print("labels size",labels.size())
    # print(labels)
    loss = criterion(preds,labels)
    loss.backward()
    optimizer.step()

    running_loss += loss.item()
    if(i%2000 ==1999):
      print("xyz")
print("done")



out size =  torch.Size([28, 2048, 7, 7])
torch.Size([28, 256, 7, 7]) torch.Size([28, 256, 7, 7]) torch.Size([28, 256, 7, 7])
shape of x_5 =  torch.Size([28, 256, 7, 7])
torch.Size([28, 256, 7, 7]) torch.Size([28, 256, 7, 7]) torch.Size([28, 256, 7, 7]) torch.Size([28, 256, 7, 7])
shape of x_6 =  torch.Size([28, 1024, 7, 7])
shape of x_7 =  torch.Size([28, 512, 7, 7])
shape of x_8 =  torch.Size([28, 512, 28, 28])
shape of x_9 =  torch.Size([28, 512, 28, 28])
y shape before =  torch.Size([28, 512, 28, 28])
shape of y after = torch.Size([28, 28, 28, 512])
shape of preds torch.Size([28, 28, 28, 512])
labels size torch.Size([28])


RuntimeError: ignored

In [None]:
# for epoch in range(2):
#   running_loss = 0.0
#   for i,data in enumerate(train_loader,0):
#     images,labels = data
#     optimizer.zero_grad()
#     preds = net(images)
#     print("shape of preds",preds.size())
#     print(labels.size())
#     # print(labels)
#     loss = criterion(preds,labels.view(-1))
#     loss.backward()
#     optimizer.step()

#     running_loss += loss.item()
#     if(i%2000 ==1999):
#       print("xyz")
# print("done")


In [None]:
preds

In [None]:
# for epoch in range(2):
#   for batch in train_loader:
#     images,labels = batch
#     preds = net(images)
#     # loss = F.cross_entropy(preds,labels)
#     optimiser.zero_grad()
#     loss.backward()
#     optimizer.step()
#     all_preds = torch.cat(all_preds,preds,dim = 0)
#   print("epoch")
  