In [None]:
import math
import gc
import shutil
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
import torchvision
import torchvision.transforms as ttf



import os
import os.path as osp

from tqdm import tqdm
from PIL import Image
from sklearn.metrics import roc_auc_score
import numpy as np



In [None]:
!pip install timm

Collecting timm
  Downloading timm-0.5.4-py3-none-any.whl (431 kB)
[?25l[K     |▊                               | 10 kB 25.0 MB/s eta 0:00:01[K     |█▌                              | 20 kB 28.6 MB/s eta 0:00:01[K     |██▎                             | 30 kB 30.9 MB/s eta 0:00:01[K     |███                             | 40 kB 34.3 MB/s eta 0:00:01[K     |███▉                            | 51 kB 7.1 MB/s eta 0:00:01[K     |████▋                           | 61 kB 8.4 MB/s eta 0:00:01[K     |█████▎                          | 71 kB 9.5 MB/s eta 0:00:01[K     |██████                          | 81 kB 7.5 MB/s eta 0:00:01[K     |██████▉                         | 92 kB 8.3 MB/s eta 0:00:01[K     |███████▋                        | 102 kB 9.1 MB/s eta 0:00:01[K     |████████▍                       | 112 kB 9.1 MB/s eta 0:00:01[K     |█████████▏                      | 122 kB 9.1 MB/s eta 0:00:01[K     |█████████▉                      | 133 kB 9.1 MB/s eta 0:00:01[K     

In [None]:
from timm.models.layers import trunc_normal_, DropPath
from timm.models.registry import register_model

In [None]:
# Record the installed PyTorch version alongside the results for reproducibility.
print(torch.__version__)

1.10.0+cu111


## Downloading data

In [None]:
!pip install --upgrade --force-reinstall --no-deps kaggle==1.5.8
!mkdir /root/.kaggle

# writing kaggle username & key to the json file
with open("/root/.kaggle/kaggle.json", "w+") as f:
    f.write('{"username":"shrikiransrinivasan","key":"8d8c6f44ec36fd64dd50f328913ac2a7"}')

# chmod command controls who can do what to a file or directory in linux
!chmod 600 /root/.kaggle/kaggle.json

Collecting kaggle==1.5.8
  Downloading kaggle-1.5.8.tar.gz (59 kB)
[?25l[K     |█████▌                          | 10 kB 32.7 MB/s eta 0:00:01[K     |███████████                     | 20 kB 40.0 MB/s eta 0:00:01[K     |████████████████▋               | 30 kB 13.1 MB/s eta 0:00:01[K     |██████████████████████▏         | 40 kB 6.0 MB/s eta 0:00:01[K     |███████████████████████████▊    | 51 kB 7.1 MB/s eta 0:00:01[K     |████████████████████████████████| 59 kB 4.4 MB/s 
[?25hBuilding wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
  Created wheel for kaggle: filename=kaggle-1.5.8-py3-none-any.whl size=73275 sha256=a493a606ddb3de1678b8084d229d0bdfe9925852911874e0315b286dc46dd097
  Stored in directory: /root/.cache/pip/wheels/de/f7/d8/c3902cacb7e62cb611b1ad343d7cc07f42f7eb76ae3a52f3d1
Successfully built kaggle
Installing collected packages: kaggle
  Attempting uninstall: kaggle
    Found existing installation: kaggle 1.5.12
  

In [None]:
# Show the current working directory; the downloads below land here.
!pwd


/content


In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive (Colab only), passing force_remount=True.
drive.mount("/content/drive", force_remount=True)
# Disabled alternative kept by the author (private API):
# drive._mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
# Download both competition archives (classification + verification) into the
# current working directory using the kaggle CLI.
!kaggle competitions download -c 11-785-s22-hw2p2-classification
!kaggle competitions download -c 11-785-s22-hw2p2-verification

# Extract the archives quietly (-q) next to the zip files.
!unzip -q 11-785-s22-hw2p2-classification.zip
!unzip -q 11-785-s22-hw2p2-verification.zip

# List the working directory to confirm the extracted folders are present.
!ls

Downloading 11-785-s22-hw2p2-classification.zip to /content
100% 2.35G/2.35G [00:29<00:00, 85.0MB/s]
100% 2.35G/2.35G [00:29<00:00, 86.6MB/s]
Downloading 11-785-s22-hw2p2-verification.zip to /content
100% 263M/263M [00:03<00:00, 94.9MB/s]
100% 263M/263M [00:03<00:00, 84.7MB/s]
11-785-s22-hw2p2-classification.zip   sample_data
11-785-s22-hw2p2-verification.zip     train_subset
classification			      verification
classification_sample_submission.csv  verification_sample_submission.csv
drive


## Setting Hyperparameters

In [None]:
"""
The well-accepted SGD batch_size & lr combination for CNN classification is 256 batch size for 0.1 learning rate.
When changing batch size for SGD, follow the linear scaling rule - halving batch size -> halve learning rate, etc.
This is less theoretically supported for Adam, but in my experience, it's a decent ballpark estimate.
"""
# Mini-batch size shared by the training and validation DataLoaders.
batch_size = 200
# Initial SGD learning rate.
# NOTE(review): by the linear scaling rule quoted above (256 -> 0.1), a batch
# size of 200 would correspond to roughly 0.078 — confirm 0.1 is intended.
lr = 0.1
# Number of training epochs; also used to size the cosine schedule (T_max).
n_epochs = 70

In [None]:
# Print the GPU model, driver/CUDA version and current memory usage.
!nvidia-smi

Thu Mar 10 02:57:37 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P0    37W / 250W |  16151MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## The Network

Normal Convnet class

In [None]:
class Network(nn.Module):
    """Small four-group convolutional classifier.

    The backbone stacks four Conv2d -> BatchNorm2d -> ReLU groups with
    64, 128, 256 and 512 output channels, average-pools the result down to
    1x1 and flattens it, giving one 512-dimensional vector per image.
    ``cls_layer`` maps that vector to ``num_classes`` scores.
    """

    def __init__(self, num_classes=7000):
        super().__init__()

        # (out_channels, kernel_size, stride) for each conv group, in order.
        conv_groups = ((64, 7, 4), (128, 3, 2), (256, 3, 2), (512, 3, 2))

        modules = []
        prev_channels = 3  # the first conv consumes 3-channel images
        for width, kernel, step in conv_groups:
            modules.append(nn.Conv2d(in_channels=prev_channels,
                                     out_channels=width,
                                     kernel_size=(kernel, kernel),
                                     stride=(step, step)))
            modules.append(nn.BatchNorm2d(width))
            modules.append(nn.ReLU())
            prev_channels = width

        # Adaptive pooling reduces whatever spatial size is left to (1, 1),
        # so no manual size computation is needed; Flatten then removes the
        # trivial spatial dimensions.
        modules.append(nn.AdaptiveAvgPool2d([1, 1]))
        modules.append(nn.Flatten())

        self.backbone = nn.Sequential(*modules)
        self.cls_layer = nn.Linear(512, num_classes)

    def forward(self, x, return_feats=False):
        """Run the network on a batch of images.

        Args:
            x: input batch fed to the backbone.
            return_feats: when True, return the backbone output (the
                second-to-last-layer "feature encoding", usable for the
                verification task) instead of the classification scores.

        Returns:
            The backbone features if ``return_feats`` is True, otherwise the
            output of the final classification layer.
        """
        feats = self.backbone(x)
        out = self.cls_layer(feats)
        return feats if return_feats else out

Residual block

In [None]:
class InvertedResidualBlock(nn.Module):
    """Inverted residual block: expand (1x1) -> depthwise (3x3) -> project (1x1).

    Args:
        in_channels: channels of the incoming feature map.
        out_channels: channels produced by the block.
        stride: stride of the depthwise convolution.
        expand_ratio: multiplier applied to ``in_channels`` to get the hidden
            width used between the two pointwise convolutions.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 expand_ratio):
        super().__init__()

        # The skip connection is only valid when the block changes neither
        # the spatial size (stride 1) nor the channel count.
        self.do_identity = stride == 1 and in_channels == out_channels

        # With an expand ratio such as 6 the hidden width is much larger
        # than the input width.
        hidden_dim = in_channels * expand_ratio

        # Pointwise expansion to the hidden width.
        expand_conv = nn.Conv2d(in_channels, hidden_dim, kernel_size=1,
                                stride=1, padding=0, bias=False)
        self.feature_mixing = nn.Sequential(
            expand_conv,
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(),
        )

        # Depthwise 3x3 convolution (one filter per channel); this is the
        # only layer that applies the stride.
        depthwise_conv = nn.Conv2d(hidden_dim, hidden_dim, kernel_size=3,
                                   padding=1, stride=stride,
                                   groups=hidden_dim, bias=False)
        self.spatial_mixing = nn.Sequential(
            depthwise_conv,
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(),
        )

        # Pointwise projection down to the output width, no activation.
        project_conv = nn.Conv2d(hidden_dim, out_channels, kernel_size=1,
                                 stride=1, padding=0, bias=False)
        self.bottleneck_channels = nn.Sequential(
            project_conv,
            nn.BatchNorm2d(out_channels),
        )

    def forward(self, x):
        """Apply the three sub-modules; add the input back when shapes allow."""
        expanded = self.feature_mixing(x)        # channel count jumps to hidden_dim
        mixed = self.spatial_mixing(expanded)    # channel count unchanged
        projected = self.bottleneck_channels(mixed)

        if not self.do_identity:
            return projected
        return x + projected

MobileNet class

In [None]:
class MobileNetV2(nn.Module):
    """MobileNetV2-style classifier built from ``InvertedResidualBlock`` stages.

    Pipeline: ``stem`` -> ``layers`` (the stages listed in ``stage_cfgs``) ->
    ``final_block`` (1x1 conv to 1280 channels) -> ``cls_layer`` (global
    average pool + flatten) -> ``lin`` (linear projection to ``num_classes``).

    Args:
        num_classes: size of the classification output.
    """

    def __init__(self, num_classes= 7000):
        super().__init__()

        self.num_classes = num_classes

        self.stem = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU6(),
            # Depthwise spatial mixing. This must be a 3x3 kernel to match
            # padding=1: the previous kernel_size=1 with padding=1 did no
            # spatial mixing at all and grew every feature map by a 2-pixel
            # border (112x112 -> 114x114 for a 224x224 input).
            nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1, groups=32, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU6(),
            # Pointwise projection to the 16 channels the first stage expects.
            nn.Conv2d(32, 16, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(16),
        )

        """
        The four numbers in each row (a stage) are shown below.
        - Expand ratio: We talked about this in InvertedResidualBlock
        - Channels: This specifies the channel size before expansion
        - # blocks: Each stage has many blocks, how many?
        - Stride of first block: For some stages, we want to downsample. In a
          downsampling stage, we set the first block in that stage to have
          stride = 2, and the rest just have stride = 1.

        Again, note that almost every stage here is downsampling! By the time
        we get to the last stage, what is the image resolution? Can it still
        be called an image for our dataset? Think about this, and make changes
        as you want.
        """
        self.stage_cfgs = [
            # expand_ratio, channels, # blocks, stride of first block
            [6,  24, 2, 2],
            [6,  32, 3, 2],
            [6,  64, 4, 2],
            [6,  96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]

        # The stem leaves us at 16 channels; in_channels is updated as blocks
        # are appended so each block consumes what the previous one produced.
        in_channels = 16

        layers = []
        for curr_stage in self.stage_cfgs:
            expand_ratio, num_channels, num_blocks, stride = curr_stage

            for block_idx in range(num_blocks):
                out_channels = num_channels
                layers.append(InvertedResidualBlock(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    # only the first block of a stage uses the stage stride
                    stride=stride if block_idx == 0 else 1,
                    expand_ratio=expand_ratio
                ))
                # In channels of the next block is the out_channels of the current one
                in_channels = out_channels

        self.layers = nn.Sequential(*layers)

        # Some final feature mixing
        self.final_block = nn.Sequential(
            nn.Conv2d(in_channels, 1280, kernel_size=1, padding=0, stride=1, bias=False),
            nn.BatchNorm2d(1280),
            nn.ReLU6()
        )

        # Pool over the spatial dimensions down to (1, 1) and flatten them
        # away. The projection to the class scores is deliberately NOT part
        # of this module (see self.lin below).
        self.cls_layer = nn.Sequential(
            nn.AdaptiveAvgPool2d((1,1)),
            nn.Flatten()
        )

        # Separating linear layer so features can be extracted for verification
        self.lin = nn.Linear(1280, num_classes)

        self._initialize_weights()

    def _initialize_weights(self):
        """Apply the custom initialisation to every submodule.

        - Conv2d: weights ~ N(0, sqrt(2 / (kh * kw * out_channels))), bias 0.
        - BatchNorm2d: weight 1, bias 0.
        - Linear: weights ~ N(0, 0.01), bias 0.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()

    def forward(self, x, return_feats=False):
        """Run the network on a batch of images.

        Args:
            x: input batch fed to the stem.
            return_feats: when True, return the pooled, flattened output of
                ``cls_layer`` instead of the class scores from ``lin``.

        Returns:
            The pooled features if ``return_feats`` is True, otherwise the
            output of the final linear layer.
        """
        out = self.stem(x)
        out = self.layers(out)
        out = self.final_block(out)
        feats = self.cls_layer(out)
        out = self.lin(feats)

        if return_feats:
            return feats
        else:
            return out

In [None]:
# Build a MobileNetV2 with its default number of classes, move it to the GPU
# and print a per-layer summary for a 3x224x224 input.
# Note: this rebinds the notebook-level name `model`.
model = MobileNetV2() # instantiate the MobileNetV2 class
model.cuda() 
summary(model, (3, 224, 224))

in: 16
out: 24


in: 24
out: 24


in: 24
out: 32


in: 32
out: 32


in: 32
out: 32


in: 32
out: 64


in: 64
out: 64


in: 64
out: 64


in: 64
out: 64


in: 64
out: 96


in: 96
out: 96


in: 96
out: 96


in: 96
out: 160


in: 160
out: 160


in: 160
out: 160


in: 160
out: 320


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 32, 112, 112]             864
       BatchNorm2d-2         [-1, 32, 112, 112]              64
             ReLU6-3         [-1, 32, 112, 112]               0
            Conv2d-4         [-1, 32, 114, 114]              32
       BatchNorm2d-5         [-1, 32, 114, 114]              64
             ReLU6-6         [-1, 32, 114, 114]               0
            Conv2d-7         [-1, 16, 114, 114]             512
       BatchNorm2d-8         [-1, 16, 114, 114]              32
            Conv2d-9         [-1, 96, 114, 114]           1,536
      BatchNorm2

ConvNext block

In [None]:
class Block(nn.Module):
    """ConvNeXt-style residual block.

    Applies a 7x7 depthwise convolution, batch normalisation, a 1x1 expansion
    to four times the channel count, GELU, and a 1x1 projection back to
    ``in_channels``; the result (optionally passed through ``DropPath``) is
    added to the block input.

    Args:
        in_channels: channel count of both the input and the output.
        drop_path: rate handed to ``DropPath``; a value that is not greater
            than 0 replaces it with ``nn.Identity``.
    """

    def __init__(self, in_channels, drop_path):
        super().__init__()
        expanded_channels = 4 * in_channels

        self.dwconv = nn.Conv2d(in_channels, in_channels, kernel_size=7,
                                padding=3, groups=in_channels)
        self.bn = nn.BatchNorm2d(in_channels)
        self.pwconv1 = nn.Conv2d(in_channels, expanded_channels,
                                 kernel_size=1, stride=1)
        self.act = nn.GELU()
        self.pwconv2 = nn.Conv2d(expanded_channels, in_channels,
                                 kernel_size=1, stride=1)
        self.drop_path = DropPath(drop_path) if drop_path > 0 else nn.Identity()

    def forward(self, x):
        """Return ``x`` plus the (possibly dropped) output of the conv branch."""
        residual = x
        branch = self.bn(self.dwconv(x))
        branch = self.pwconv2(self.act(self.pwconv1(branch)))
        return residual + self.drop_path(branch)

In [None]:
class ConvNext(nn.Module):
    """ConvNeXt-style classifier assembled from ``Block`` stages.

    Each stage is a down-sampling layer followed by a sequence of ``Block``
    modules. The first down-sampling layer is the stem (4x4 conv, stride 4,
    then BatchNorm); the remaining ones are BatchNorm followed by a 2x2 conv
    with stride 2. After the last stage the features are batch-normalised,
    globally average-pooled, flattened and passed to a linear classifier.

    Args:
        in_channels: channel count of the input images.
        num_classes: size of the classifier output.
        depths: number of ``Block`` modules in each stage.
        dims: channel width of each stage.
            NOTE(review): the default last width is 758, while the ConvNextT
            configuration elsewhere in this notebook uses 768. It is left
            unchanged here so the default architecture (and checkpoints
            trained with it) stays the same — confirm whether it is a typo.
        drop_path_rate: first value of the per-block drop-path schedule; the
            schedule runs linearly from this value (first block) to 0.0
            (last block).
            NOTE(review): stochastic-depth schedules usually grow with depth;
            the existing direction is preserved — confirm it is intended.
    """

    def __init__(self, in_channels, num_classes=7000, depths=[3,3,9,3],
                 dims=[96, 192, 384, 758], drop_path_rate=0.0):
        super().__init__()
        num_stages = len(dims)

        # Stem + one down-sampling layer in front of every later stage.
        self.down_sample_layers = nn.ModuleList()
        stem = nn.Sequential(nn.Conv2d(in_channels, dims[0], kernel_size=4, stride=4),
                             nn.BatchNorm2d(dims[0]))
        self.down_sample_layers.append(stem)
        for stage in range(num_stages - 1):
            self.down_sample_layers.append(nn.Sequential(
                nn.BatchNorm2d(dims[stage]),
                nn.Conv2d(dims[stage], dims[stage + 1], kernel_size=2, stride=2),
            ))

        # One drop-path rate per block across all stages. Built once, outside
        # the loop above (it used to be rebuilt on every down-sample
        # iteration, together with all block stages).
        dp_rates = [rate.item() for rate in np.linspace(drop_path_rate, 0.0, sum(depths))]

        self.block_layers = nn.ModuleList()
        first_block = 0  # index into dp_rates of the current stage's first block
        for stage in range(num_stages):
            blocks = [Block(dims[stage], dp_rates[first_block + j])
                      for j in range(depths[stage])]
            self.block_layers.append(nn.Sequential(*blocks))
            first_block += depths[stage]

        self.norm = nn.BatchNorm2d(dims[-1])
        self.classifier = nn.Linear(dims[-1], num_classes)

    def forward(self, x, return_features=False, return_feats=False):
        """Run the network on a batch of images.

        Args:
            x: input batch fed to the stem.
            return_features: when True, return the feature output described
                below instead of the classifier scores.
            return_feats: alias of ``return_features`` so this model can be
                called the same way as the other models in this notebook.

        Returns:
            The classifier output by default. When either flag is True, the
            classifier output passed through GELU four times.
            NOTE(review): the feature path is derived from the classifier
            output, not from the pooled pre-classifier vector ``feats``;
            behaviour is preserved as found — confirm this is intended.
        """
        for down_sample, blocks in zip(self.down_sample_layers, self.block_layers):
            x = blocks(down_sample(x))

        x = self.norm(x)
        x = F.adaptive_avg_pool2d(x, 1)
        feats = x.view(x.size(0), -1)
        logits = self.classifier(feats)

        if not (return_features or return_feats):
            return logits

        # Only pay for the extra activations when they are actually returned.
        out = logits
        for _ in range(4):
            out = F.gelu(out)
        return out
    

In [None]:
# Build a ConvNext for 3-channel images with its default configuration, move
# it to the GPU and print a per-layer summary for a 3x224x224 input.
# Note: this rebinds the notebook-level name `model`.
model = ConvNext(in_channels=3) # instantiate the ConvNext class
model.cuda() 
summary(model, (3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 96, 56, 56]           4,704
       BatchNorm2d-2           [-1, 96, 56, 56]             192
            Conv2d-3           [-1, 96, 56, 56]           4,800
       BatchNorm2d-4           [-1, 96, 56, 56]             192
            Conv2d-5          [-1, 384, 56, 56]          37,248
              GELU-6          [-1, 384, 56, 56]               0
            Conv2d-7           [-1, 96, 56, 56]          36,960
          Identity-8           [-1, 96, 56, 56]               0
             Block-9           [-1, 96, 56, 56]               0
           Conv2d-10           [-1, 96, 56, 56]           4,800
      BatchNorm2d-11           [-1, 96, 56, 56]             192
           Conv2d-12          [-1, 384, 56, 56]          37,248
             GELU-13          [-1, 384, 56, 56]               0
           Conv2d-14           [-1, 96,

In [None]:
class ConvNextBlock(nn.Module):
    """ConvNeXt-style block: depthwise 7x7 -> pointwise expand -> pointwise project.

    Args:
        in_channels: channels of the incoming feature map.
        out_channels: channels produced by the block.
        stride: stride of the depthwise convolution.
        expand_ratio: multiplier applied to ``in_channels`` to get the hidden
            width between the two pointwise convolutions.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 expand_ratio):
        super().__init__()

        # The identity shortcut is only valid when the block preserves both
        # the spatial size (stride 1) and the channel count.
        self.do_identity = stride == 1 and in_channels == out_channels

        hidden_dim = in_channels * expand_ratio

        # Depthwise 7x7 convolution (one filter per channel) + BatchNorm;
        # this is the only layer that applies the stride.
        self.spatial_mixing = nn.Sequential(
            nn.Conv2d(in_channels, in_channels, kernel_size=7, padding=3,
                      stride=stride, groups=in_channels, bias=False),
            nn.BatchNorm2d(in_channels),
        )

        # Pointwise expansion to the hidden width, followed by GELU.
        self.feature_mixing = nn.Sequential(
            nn.Conv2d(in_channels, hidden_dim, kernel_size=1, stride=1, padding=0, bias=False),
            nn.GELU(),
        )

        # Pointwise projection to the block's output width. This must target
        # out_channels: projecting back to in_channels made every
        # channel-changing block emit the wrong width, so the following block
        # received fewer channels than it was built for.
        self.bottleneck_channels = nn.Sequential(
            nn.Conv2d(hidden_dim, out_channels, kernel_size=1, stride=1, padding=0, bias=False),
        )

    def forward(self, x):
        """Apply the three sub-modules; add the input back when shapes allow."""
        out = self.spatial_mixing(x)       # depthwise, channel count unchanged
        out = self.feature_mixing(out)     # pointwise, channel count -> hidden_dim
        out = self.bottleneck_channels(out)  # pointwise, channel count -> out_channels

        if self.do_identity:
            return x + out
        return out

ConvNextT class

In [None]:
class ConvNextT(nn.Module):
    """Classifier built from ``ConvNextBlock`` stages.

    Pipeline: ``stem`` (4x4 conv with stride 4 + BatchNorm) -> ``layers``
    (the stages listed in ``stage_cfgs``) -> ``cls_layer`` (global average
    pool + flatten) -> ``lin`` (linear projection to ``num_classes``).

    Args:
        num_classes: size of the classification output.
    """

    def __init__(self, num_classes= 7000):
        super().__init__()

        self.num_classes = num_classes

        self.stem = nn.Sequential(
            nn.Conv2d(3, 96, kernel_size=4, stride=4, padding=0, bias=False),
            nn.BatchNorm2d(96),
        )

        # NOTE(review): every stage uses stride 1, so nothing after the stem
        # reduces the spatial resolution — confirm this is intended.
        self.stage_cfgs = [
            # expand_ratio, channels, # blocks, stride of first block
            [4,  96, 3, 1],
            [4,  192, 3, 1],
            [4,  384, 9, 1],
            [4,  768, 3, 1],
        ]

        # The stem leaves us at 96 channels; in_channels is updated as blocks
        # are appended so each block consumes what the previous one produced.
        in_channels = 96

        layers = []
        for curr_stage in self.stage_cfgs:
            expand_ratio, num_channels, num_blocks, stride = curr_stage

            for block_idx in range(num_blocks):
                out_channels = num_channels
                layers.append(ConvNextBlock(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    # only the first block of a stage uses the stage stride
                    stride=stride if block_idx == 0 else 1,
                    expand_ratio=expand_ratio
                ))
                # In channels of the next block is the out_channels of the current one
                in_channels = out_channels

        self.layers = nn.Sequential(*layers)

        # Pool over the spatial dimensions down to (1, 1) and flatten them away.
        self.cls_layer = nn.Sequential(
            nn.AdaptiveAvgPool2d((1,1)),
            nn.Flatten()
        )

        # Separating linear layer so features can be extracted for verification.
        # Its input width is the channel count of the last stage.
        self.lin = nn.Linear(in_channels, num_classes)
        self._initialize_weights()

    def _initialize_weights(self):
        """Apply the custom initialisation to every submodule.

        - Conv2d: weights ~ N(0, sqrt(2 / (kh * kw * out_channels))), bias 0.
        - BatchNorm2d: weight 1, bias 0.
        - Linear: weights ~ N(0, 0.01), bias 0.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()

    def forward(self, x, return_feats=False):
        """Run the network on a batch of images.

        Args:
            x: input batch fed to the stem.
            return_feats: when True, return the pooled, flattened output of
                ``cls_layer`` instead of the class scores from ``lin``.

        Returns:
            The pooled features if ``return_feats`` is True, otherwise the
            output of the final linear layer.
        """
        out = self.stem(x)
        out = self.layers(out)
        feats = self.cls_layer(out)
        out = self.lin(feats)

        if return_feats:
            return feats
        else:
            return out

## Loading data and data augmentation

In [None]:
# Build a ConvNextT with its default number of classes, move it to the GPU
# and print a per-layer summary for a 3x224x224 input.
# Note: this rebinds the notebook-level name `model`.
model = ConvNextT() # instantiate the ConvNextT class
model.cuda() 
summary(model, (3, 224, 224))

in: 96
out: 96


in: 96
out: 96


in: 96
out: 96


in: 96
out: 192


in: 192
out: 192


in: 192
out: 192


in: 192
out: 384


in: 384
out: 384


in: 384
out: 384


in: 384
out: 384


in: 384
out: 384


in: 384
out: 384


in: 384
out: 384


in: 384
out: 384


in: 384
out: 384


in: 384
out: 768


in: 768
out: 768


in: 768
out: 768




RuntimeError: ignored

In [None]:

"""
Transforms (data augmentation) is quite important for this task.
Go explore https://pytorch.org/vision/stable/transforms.html for more details
"""
DATA_DIR = "/content"
# Full training set. A smaller subset lives in "train_subset/train_subset"
# and can be swapped in here for quick experiments.
TRAIN_DIR = osp.join(DATA_DIR, "classification/classification/train")
VAL_DIR = osp.join(DATA_DIR, "classification/classification/dev")
TEST_DIR = osp.join(DATA_DIR, "classification/classification/test")

# Training-time augmentation: small random brightness/contrast/saturation
# changes and a random horizontal flip, applied after conversion to a tensor.
train_transforms = [ttf.ToTensor(), ttf.ColorJitter(brightness=(0.8,1.2), contrast=(0.8,1.2), saturation=(0.8,1.2)),
                     ttf.RandomHorizontalFlip(p=0.5)]
# Validation images are only converted to tensors (no augmentation).
val_transforms = [ttf.ToTensor()]

train_dataset = torchvision.datasets.ImageFolder(TRAIN_DIR,
                                                 transform=ttf.Compose(train_transforms))
val_dataset = torchvision.datasets.ImageFolder(VAL_DIR,
                                               transform=ttf.Compose(val_transforms))


train_loader = DataLoader(train_dataset, batch_size=batch_size,
                          shuffle=True, drop_last=True, num_workers=4, pin_memory=True)
# drop_last must be False for validation: with drop_last=True the final
# partial batch is silently discarded whenever len(val_dataset) is not a
# multiple of batch_size, so those samples would never be evaluated.
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                        drop_last=False, num_workers=2, pin_memory=True)

# Sanity check: size of the first training image tensor.
print(train_dataset[0][0].size())

torch.Size([3, 224, 224])


# Setup everything for training

In [None]:
# Sanity check: pull the first two batches from the training loader and print
# how many labels each contains, then stop.
for i, (x, y) in enumerate(train_loader):
  print(len(y))
  if i>0:  # i == 1 after the second batch has been printed
    break

200
200


In [None]:
# Model that will actually be trained: a ConvNext on 3-channel images,
# with its parameters moved to the GPU.
model = ConvNext(in_channels=3)
model.cuda()

# Total number of elements over all model parameters.
num_trainable_parameters = sum(param.numel() for param in model.parameters())
print("Number of Params: {}".format(num_trainable_parameters))

# Cross-entropy loss with label smoothing.
criterion = torch.nn.CrossEntropyLoss(label_smoothing=0.3)

# SGD with momentum and weight decay.
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4)

# T_max is the number of scheduler.step() calls over which the cosine curve
# is annealed; here that is one step per training batch for every epoch.
total_scheduler_steps = len(train_loader) * n_epochs
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_scheduler_steps)

# Gradient scaler for FP16 mixed-precision training, which speeds up training
# (more so for larger models). Compare the "Single precision training" and
# "Mixed precision training" sections of
# https://effectivemachinelearning.com/PyTorch/8._Faster_training_with_mixed_precision
scaler = torch.cuda.amp.GradScaler()

Number of Params: 32743164



Model checkpoint saving function



In [None]:
def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """Write a checkpoint to disk and optionally mirror it as the best model.

    Args:
        state: the checkpoint object to serialise with ``torch.save``.
        is_best: whether this checkpoint should also be stored as the best one.
        checkpoint_path: destination file for the checkpoint.
        best_model_path: destination file for the copy made when ``is_best``
            is true.
    """
    # Always persist the latest checkpoint.
    torch.save(state, checkpoint_path)

    if not is_best:
        return

    # The best model is a plain file copy of the checkpoint just written.
    shutil.copyfile(checkpoint_path, best_model_path)

Checkpoint loading

In [None]:
def load_ckp(checkpoint_fpath, model, optimizer, scheduler):
    """Restore training state from a checkpoint file.

    The checkpoint is expected to hold the keys ``'state_dict'``,
    ``'optimizer'``, ``'scheduler'``, ``'valid_acc_max'`` and ``'epoch'``.

    Args:
        checkpoint_fpath: path of the checkpoint file to read.
        model: model whose parameters are overwritten from ``'state_dict'``.
        optimizer: optimizer whose state is overwritten from ``'optimizer'``.
        scheduler: scheduler whose state is overwritten from ``'scheduler'``.

    Returns:
        Tuple ``(model, optimizer, epoch, scheduler, valid_acc_max)`` where
        ``epoch`` and ``valid_acc_max`` are the values stored in the
        checkpoint and the other three are the objects passed in, updated
        in place.
    """
    ckpt = torch.load(checkpoint_fpath)

    # Push each stored state dict into the matching live object.
    restore_targets = (
        (model, 'state_dict'),
        (optimizer, 'optimizer'),
        (scheduler, 'scheduler'),
    )
    for target, key in restore_targets:
        target.load_state_dict(ckpt[key])

    # Best validation accuracy recorded when the checkpoint was written.
    best_valid_acc = ckpt['valid_acc_max']
    return model, optimizer, ckpt['epoch'], scheduler, best_valid_acc

## Training

In [None]:
def train(start_epochs, n_epochs, valid_acc_max_input, train_loader, val_loader, val_dataset, model, optimizer, criterion, checkpoint_path, best_model_path):
    """
    Train `model` with mixed precision, validate after every epoch and checkpoint the result.

    Besides its arguments, this function uses the module-level `scheduler`
    (stepped once per training batch), `scaler` (scaled backward pass and
    optimizer step) and `save_ckp`.

    start_epochs: first epoch number to run (inclusive)
    n_epochs: last epoch number to run (inclusive)
    valid_acc_max_input: best validation accuracy (in percent) seen so far
    train_loader: iterable of (x, y) training batches
    val_loader: iterable of (x, y) validation batches
    val_dataset: validation dataset; its length is the validation-accuracy denominator
    model: network to train; moved data is fed to it on the GPU
    optimizer: optimizer stepped through `scaler`
    criterion: loss function called as criterion(outputs, y)
    checkpoint_path: file that receives the checkpoint of every epoch
    best_model_path: file that receives a copy of the checkpoint of each best epoch

    Returns the trained model.
    """
    # running best validation accuracy (in percent)
    valid_acc_max = valid_acc_max_input

    for epoch in range(start_epochs, n_epochs + 1):
        # Quality of life tip: leave=False and position=0 are needed to make tqdm usable in jupyter
        batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train')

        num_correct = 0
        num_seen = 0  # samples actually processed, so partial batches are counted correctly
        total_loss = 0.0

        # ---- training ----
        model.train()
        for i, (x, y) in enumerate(train_loader):
            optimizer.zero_grad()

            x = x.cuda()  # moving data to GPU
            y = y.cuda()

            # forward pass and loss run under autocast for FP16
            with torch.cuda.amp.autocast():
                outputs = model(x)
                loss = criterion(outputs, y)

            # Update # correct & loss as we go
            num_correct += int((torch.argmax(outputs, axis=1) == y).sum())
            num_seen += len(y)
            total_loss += float(loss)

            # tqdm lets you add some details so you can monitor training as you train.
            batch_bar.set_postfix(
                acc="{:.04f}%".format(100 * num_correct / max(num_seen, 1)),
                loss="{:.04f}".format(float(total_loss / (i + 1))),
                num_correct=num_correct,
                lr="{:.04f}".format(float(optimizer.param_groups[0]['lr'])))

            scaler.scale(loss).backward()  # replacement for loss.backward()
            scaler.step(optimizer)  # replacement for optimizer.step()
            scaler.update()  # needed only for FP16

            # the scheduler is stepped once per batch, not once per epoch
            scheduler.step()

            batch_bar.update()

            # to avoid cuda out of memory error
            del x
            del y
            del loss

        batch_bar.close()

        # ---- validation ----
        model.eval()
        batch_bar = tqdm(total=len(val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')
        num_correct_val = 0
        num_seen_val = 0
        for x, y in val_loader:
            x = x.cuda()
            y = y.cuda()

            with torch.no_grad():
                outputs = model(x)

            num_correct_val += int((torch.argmax(outputs, axis=1) == y).sum())
            num_seen_val += len(y)
            batch_bar.set_postfix(acc="{:.04f}%".format(100 * num_correct_val / max(num_seen_val, 1)))

            batch_bar.update()

            # to avoid cuda out of memory error
            del x
            del y
        batch_bar.close()

        val_acc = 100 * num_correct_val / len(val_dataset)

        print("Val acc: {:.04f}%".format(val_acc))

        print("Epoch {}/{}: Train Acc {:.04f}%, Val acc: {:.04f}%, Train Loss {:.04f}, Learning Rate {:.04f}".format(
            epoch,
            n_epochs,
            100 * num_correct / max(num_seen, 1),
            val_acc,
            float(total_loss / len(train_loader)),
            float(optimizer.param_groups[0]['lr'])))

        # Decide whether this epoch is the best one BEFORE building the checkpoint,
        # so the stored 'valid_acc_max' is always the running best and a resume from
        # the latest checkpoint keeps the correct threshold.
        is_best = val_acc >= valid_acc_max
        if is_best:
            print('Validation acc increased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_acc_max, val_acc))
            valid_acc_max = val_acc

        # NOTE: the GradScaler state is not part of this checkpoint.
        checkpoint = {
            'epoch': epoch,
            'valid_acc_max': valid_acc_max,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict()
        }

        # one save per epoch; save_ckp also copies the file to best_model_path when is_best
        save_ckp(checkpoint, is_best, checkpoint_path, best_model_path)

    return model


# torch.save(model.state_dict(), "/content/drive/MyDrive/MobileNetV2.pth")



In [None]:
# Release cached GPU memory and collect Python garbage before training starts,
# to reduce the chance of a CUDA out-of-memory error.
torch.cuda.empty_cache()
gc.collect()

# Start the best-accuracy tracker at negative infinity so the first epoch always counts as the best.
# `-np.inf` is used instead of `np.NINF` because that alias was removed in NumPy 2.0.
trained_model = train(
    1,
    n_epochs,
    -np.inf,
    train_loader,
    val_loader,
    val_dataset,
    model,
    optimizer,
    criterion,
    "/content/drive/MyDrive/hw2p2_saved_models/current_checkpoint.pt",
    "/content/drive/MyDrive/hw2p2_saved_models/best_model.pt",
)



Val acc: 0.5714%
Epoch 1/70: Train Acc 0.1429%, Val acc: 0.5714%, Train Loss 8.6469, Learning Rate 0.0999
Validation acc increased (-inf --> 0.571429).  Saving model ...




Val acc: 4.0000%
Epoch 2/70: Train Acc 1.8614%, Val acc: 4.0000%, Train Loss 8.0181, Learning Rate 0.0998
Validation acc increased (0.571429 --> 4.000000).  Saving model ...




Val acc: 15.2400%
Epoch 3/70: Train Acc 9.5671%, Val acc: 15.2400%, Train Loss 7.3111, Learning Rate 0.0995
Validation acc increased (4.000000 --> 15.240000).  Saving model ...




Val acc: 29.2514%
Epoch 4/70: Train Acc 26.1886%, Val acc: 29.2514%, Train Loss 6.5815, Learning Rate 0.0992
Validation acc increased (15.240000 --> 29.251429).  Saving model ...




Val acc: 46.1914%
Epoch 5/70: Train Acc 46.3079%, Val acc: 46.1914%, Train Loss 5.9040, Learning Rate 0.0987
Validation acc increased (29.251429 --> 46.191429).  Saving model ...




Val acc: 56.9143%
Epoch 6/70: Train Acc 61.9636%, Val acc: 56.9143%, Train Loss 5.3617, Learning Rate 0.0982
Validation acc increased (46.191429 --> 56.914286).  Saving model ...




Val acc: 62.8743%
Epoch 7/70: Train Acc 72.8536%, Val acc: 62.8743%, Train Loss 4.9593, Learning Rate 0.0976
Validation acc increased (56.914286 --> 62.874286).  Saving model ...




Val acc: 68.9800%
Epoch 8/70: Train Acc 80.0829%, Val acc: 68.9800%, Train Loss 4.6655, Learning Rate 0.0968
Validation acc increased (62.874286 --> 68.980000).  Saving model ...




Val acc: 69.9086%
Epoch 9/70: Train Acc 85.1057%, Val acc: 69.9086%, Train Loss 4.4441, Learning Rate 0.0960
Validation acc increased (68.980000 --> 69.908571).  Saving model ...




Val acc: 71.5057%
Epoch 10/70: Train Acc 88.5857%, Val acc: 71.5057%, Train Loss 4.2740, Learning Rate 0.0950
Validation acc increased (69.908571 --> 71.505714).  Saving model ...




Val acc: 73.4229%
Epoch 11/70: Train Acc 91.3943%, Val acc: 73.4229%, Train Loss 4.1339, Learning Rate 0.0940
Validation acc increased (71.505714 --> 73.422857).  Saving model ...




Val acc: 73.9143%
Epoch 12/70: Train Acc 93.6543%, Val acc: 73.9143%, Train Loss 4.0192, Learning Rate 0.0929
Validation acc increased (73.422857 --> 73.914286).  Saving model ...




Val acc: 70.2200%
Epoch 13/70: Train Acc 95.6586%, Val acc: 70.2200%, Train Loss 3.9206, Learning Rate 0.0917




Val acc: 73.8629%
Epoch 14/70: Train Acc 97.3543%, Val acc: 73.8629%, Train Loss 3.8387, Learning Rate 0.0905




Val acc: 74.8371%
Epoch 15/70: Train Acc 98.4893%, Val acc: 74.8371%, Train Loss 3.7732, Learning Rate 0.0891
Validation acc increased (73.914286 --> 74.837143).  Saving model ...




Val acc: 72.0229%
Epoch 16/70: Train Acc 99.0686%, Val acc: 72.0229%, Train Loss 3.7293, Learning Rate 0.0877




Val acc: 75.4914%
Epoch 17/70: Train Acc 99.3707%, Val acc: 75.4914%, Train Loss 3.6961, Learning Rate 0.0861
Validation acc increased (74.837143 --> 75.491429).  Saving model ...




Val acc: 70.2314%
Epoch 18/70: Train Acc 99.5300%, Val acc: 70.2314%, Train Loss 3.6699, Learning Rate 0.0846




Val acc: 74.2171%
Epoch 19/70: Train Acc 99.6193%, Val acc: 74.2171%, Train Loss 3.6535, Learning Rate 0.0829




Val acc: 75.8400%
Epoch 20/70: Train Acc 99.6936%, Val acc: 75.8400%, Train Loss 3.6366, Learning Rate 0.0812
Validation acc increased (75.491429 --> 75.840000).  Saving model ...




Val acc: 73.1657%
Epoch 21/70: Train Acc 99.7621%, Val acc: 73.1657%, Train Loss 3.6231, Learning Rate 0.0794




Val acc: 72.6514%
Epoch 22/70: Train Acc 99.7993%, Val acc: 72.6514%, Train Loss 3.6105, Learning Rate 0.0775




Val acc: 58.5686%
Epoch 23/70: Train Acc 99.8179%, Val acc: 58.5686%, Train Loss 3.6006, Learning Rate 0.0756


Train:  13%|█▎        | 89/700 [02:39<17:57,  1.76s/it, acc=99.8000%, loss=3.5952, lr=0.0754, num_correct=17964]

In [None]:
# Fresh SGD optimizer over the model's parameters
optimizer = optim.SGD(
    model.parameters(),
    lr=lr,
    momentum=0.9,
    weight_decay=1e-4,
)
# Cosine schedule whose T_max is (batches per epoch) x (number of epochs)
scheduler = optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=len(train_loader) * n_epochs,
)
# Checkpoint file to resume from
ckp_path = "/content/drive/MyDrive/hw2p2_saved_models/best_model.pt"

In [None]:
# Restore model / optimizer / scheduler state, plus the stored epoch and accuracy, from ckp_path
model, optimizer, epoch, scheduler, valid_acc_max = load_ckp(
    ckp_path,
    model,
    optimizer,
    scheduler,
)
# Training resumes at the epoch after the one recorded in the checkpoint
start_epoch = epoch + 1

In [None]:

# Sanity check: show the restored objects and the epoch training will resume from.
print("model = ", model)
print("optimizer = ", optimizer)
print("scheduler= ", scheduler)
print("start_epoch = ", start_epoch)
# The accuracy restored from the checkpoint is printed with a trailing percent sign.
print("valid_acc_max = {}%".format(valid_acc_max))

model =  ConvNext(
  (down_sample_layers): ModuleList(
    (0): Sequential(
      (0): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
      (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): Sequential(
      (0): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Conv2d(96, 192, kernel_size=(2, 2), stride=(2, 2))
    )
    (2): Sequential(
      (0): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Conv2d(192, 384, kernel_size=(2, 2), stride=(2, 2))
    )
    (3): Sequential(
      (0): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Conv2d(384, 758, kernel_size=(2, 2), stride=(2, 2))
    )
  )
  (block_layers): ModuleList(
    (0): Sequential(
      (0): Block(
        (dwconv): Conv2d(96, 96, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=96)
        (bn): BatchNorm2d(96, eps=1e-05, momentum=0.1, 

In [None]:
# Resume training at start_epoch and run through epoch 50, seeding the
# best-accuracy tracker with the value restored from the checkpoint.
trained_model = train(
    start_epoch,
    50,
    valid_acc_max,
    train_loader,
    val_loader,
    val_dataset,
    model,
    optimizer,
    criterion,
    "/content/drive/MyDrive/hw2p2_saved_models/current_checkpoint.pt",
    "/content/drive/MyDrive/hw2p2_saved_models/best_model.pt",
)




Val acc: 59.4800%
Epoch 21/50: Train Acc 96.6825%, Val acc: 59.4800%, Train Loss 0.3169, Learning Rate 0.0006




Val acc: 59.4400%
Epoch 22/50: Train Acc 96.5874%, Val acc: 59.4400%, Train Loss 0.3217, Learning Rate 0.0024




Val acc: 59.0943%
Epoch 23/50: Train Acc 96.4965%, Val acc: 59.0943%, Train Loss 0.3292, Learning Rate 0.0054




Val acc: 58.5914%
Epoch 24/50: Train Acc 96.3928%, Val acc: 58.5914%, Train Loss 0.3379, Learning Rate 0.0095




Val acc: 57.8200%
Epoch 25/50: Train Acc 96.1088%, Val acc: 57.8200%, Train Loss 0.3499, Learning Rate 0.0146




Val acc: 54.4286%
Epoch 26/50: Train Acc 95.4449%, Val acc: 54.4286%, Train Loss 0.3765, Learning Rate 0.0206


Train:  59%|█████▉    | 323/546 [03:07<02:09,  1.72it/s, acc=95.0171%, loss=0.3826, lr=0.0245, num_correct=78811]

KeyboardInterrupt: ignored

## Testing classification and generating submission file

In [None]:
class ClassificationTestSet(Dataset):
    """Unlabeled image dataset built from every entry of `data_dir`, in sorted file-name order."""
    # It's possible to load test set data using ImageFolder without making a custom class.
    # See if you can think it through!

    def __init__(self, data_dir, transforms):
        self.data_dir = data_dir
        self.transforms = transforms

        # Full path of each directory entry, ordered by sorted file name
        file_names = sorted(os.listdir(self.data_dir))
        self.img_paths = [osp.join(self.data_dir, file_name) for file_name in file_names]

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, idx):
        # open the idx-th image and hand it to the transform pipeline
        image = Image.open(self.img_paths[idx])
        return self.transforms(image)

In [None]:
# Test images go through the validation transforms; order is preserved and no sample is dropped.
test_dataset = ClassificationTestSet(
    TEST_DIR,
    ttf.Compose(val_transforms),
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=1,
)

In [None]:
model.eval()
batch_bar = tqdm(total=len(test_loader), dynamic_ncols=True, position=0, leave=False, desc='Test')

# Predicted class index for every test image, in loader order
res = []
for x in test_loader:
    # inference only: no gradients are tracked for the forward pass
    with torch.no_grad():
        outputs = model(x.cuda())

    # the arg-max over dim 1 is the predicted class of each sample
    batch_predictions = torch.argmax(outputs, axis=1).tolist()
    res.extend(batch_predictions)

    batch_bar.update()

batch_bar.close()

Test:  34%|███▎      | 46/137 [00:27<00:52,  1.73it/s]

KeyboardInterrupt: ignored

In [None]:
# Every test image needs exactly one prediction. A partial `res` (for example after an
# interrupted inference cell) would otherwise fail midway and leave a truncated CSV.
if len(res) != len(test_dataset):
    raise ValueError(
        "Expected {} predictions but found {}; re-run the test inference cell to completion.".format(
            len(test_dataset), len(res)))

with open("classification_early_submission.csv", "w") as f:
    f.write("id,label\n")
    # each id is the zero-padded sample index followed by ".jpg"
    for i, label in enumerate(res):
        f.write("{},{}\n".format(str(i).zfill(6) + ".jpg", label))

## Submission to Kaggle

In [None]:
# Upload the classification predictions CSV to the Kaggle competition.
# NOTE(review): the submission message says "MobileNetV2_fulldata" while the model printed earlier in this notebook is a ConvNext; confirm the message is intended.
!kaggle competitions submit -c 11-785-s22-hw2p2-classification -f classification_early_submission.csv -m "MobileNetV2_fulldata"

100% 541k/541k [00:00<00:00, 2.57MB/s]
Successfully submitted to Face Recognition

## Verification

Inspecting the verification data: number of entries in the dev folder and number of lines in the dev pairs CSV

In [None]:
# Count the entries in the verification dev folder.
!ls verification/verification/dev | wc -l
# Count the lines of the dev pairs CSV (every line of the file is counted).
!cat verification/verification/verification_dev.csv | wc -l

6000
166801


In [None]:
class VerificationDataset(Dataset):
    """Image dataset over every entry of `data_dir` that yields (transformed image, relative path)."""

    def __init__(self, data_dir, transforms):
        self.data_dir = data_dir
        self.transforms = transforms

        # Full path of each directory entry, ordered by sorted file name
        file_names = sorted(os.listdir(self.data_dir))
        self.img_paths = [osp.join(self.data_dir, file_name) for file_name in file_names]

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, idx):
        full_path = self.img_paths[idx]
        image = self.transforms(Image.open(full_path))
        # the path is reported relative to data_dir
        return image, osp.relpath(full_path, self.data_dir)

Verification: Validation

In [None]:
# Verification dev images, preprocessed with the validation transforms and read in order.
val_veri_dataset = VerificationDataset(
    osp.join(DATA_DIR, "verification/verification/dev"),
    ttf.Compose(val_transforms),
)
val_ver_loader = torch.utils.data.DataLoader(
    val_veri_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=1,
)

In [None]:
model.eval()

# Maps each image's relative path to the feature the model produced for it
feats_dict = dict()
for imgs, path_names in tqdm(val_ver_loader, total=len(val_ver_loader), position=0, leave=False):
    with torch.no_grad():
        # Note that we ask for the feats here, not the final outputs
        # Feel free to try the final outputs too!
        feats = model(imgs.cuda(), return_features=True)

    # pair every path in the batch with the feature at the same position
    for path_name, feat in zip(path_names, feats):
        feats_dict[path_name] = feat



In [None]:
# We use cosine similarity between feature embeddings.
# Each stored feature is compared along dim 0.
similarity_metric = torch.nn.CosineSimilarity(dim=0, eps=1e-6)

val_veri_csv = osp.join(DATA_DIR, "verification/verification/verification_dev.csv")

# Read the pairs file inside a context manager so the file handle is closed right away.
with open(val_veri_csv) as csv_file:
    pair_lines = csv_file.read().splitlines()[1:]  # skip header

# Loop through the pairs and compare each one, collecting predicted and ground-truth values
pred_similarities = []
gt_similarities = []
for line in tqdm(pair_lines, position=0, leave=False):
    img_path1, img_path2, gt = line.split(",")

    # keep only the second "/"-separated component; that is the key used to index feats_dict
    img_path1 = img_path1.split("/")[1]
    img_path2 = img_path2.split("/")[1]
    similarity = float(similarity_metric(feats_dict[img_path1], feats_dict[img_path2]))

    gt_similarities.append(int(gt))
    pred_similarities.append(similarity)


pred_similarities = np.array(pred_similarities)
gt_similarities = np.array(gt_similarities)

print("AUC:", roc_auc_score(gt_similarities, pred_similarities))



AUC: 0.9666614044995622


Verification: Testing and submission

In [None]:
# Verification test images, preprocessed with the validation transforms and read in order.
test_veri_dataset = VerificationDataset(
    osp.join(DATA_DIR, "verification/verification/test"),
    ttf.Compose(val_transforms),
)
test_ver_loader = torch.utils.data.DataLoader(
    test_veri_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=1,
)

In [None]:
model.eval()

# Maps each image's relative path to the feature the model produced for it
feats_dict = dict()
for imgs, path_names in tqdm(test_ver_loader, total=len(test_ver_loader), position=0, leave=False):
    with torch.no_grad():
        # Note that we ask for the feats here, not the final outputs
        # Feel free to try the final outputs too!
        feats = model(imgs.cuda(), return_features=True)

    # pair every path in the batch with the feature at the same position
    for path_name, feat in zip(path_names, feats):
        feats_dict[path_name] = feat



In [None]:
# We use cosine similarity between feature embeddings.
# Each stored feature is compared along dim 0.
similarity_metric = torch.nn.CosineSimilarity(dim=0, eps=1e-6)
# NOTE: despite its name, this variable holds the path of the TEST pairs CSV in this cell.
val_veri_csv = osp.join(DATA_DIR, "verification/verification/verification_test.csv")

# Read the pairs file inside a context manager so the file handle is closed right away.
with open(val_veri_csv) as csv_file:
    pair_lines = csv_file.read().splitlines()[1:]  # skip header

# Loop through the pairs and compute the similarity of each one (the test CSV has no labels)
pred_similarities = []
for line in tqdm(pair_lines, position=0, leave=False):
    img_path1, img_path2 = line.split(",")

    # keep only the second "/"-separated component; that is the key used to index feats_dict
    img_path1 = img_path1.split("/")[1]
    img_path2 = img_path2.split("/")[1]
    similarity = float(similarity_metric(feats_dict[img_path1], feats_dict[img_path2]))

    pred_similarities.append(similarity)



In [None]:
# One row per compared pair: the row index as id, the predicted similarity as match
with open("verification_early_submission.csv", "w+") as f:
    f.write("id,match\n")
    for i, score in enumerate(pred_similarities):
        row = "{},{}\n".format(i, score)
        f.write(row)

In [None]:
# Upload the verification similarity CSV to the Kaggle verification competition.
!kaggle competitions submit -c 11-785-s22-hw2p2-verification -f verification_early_submission.csv -m "Convnext 70 epochs return final layer cat with out through many gelus"

100% 16.9M/16.9M [00:05<00:00, 2.95MB/s]
Successfully submitted to Face Verification