In [2]:
!pip install lightning -qU
!pip install wandb -qU

In [3]:
import os

# torch related dependencies
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
from torchvision import transforms, datasets
from torch.utils.data import DataLoader, random_split, Subset
from torchmetrics.functional import accuracy

#wandb
import wandb

#Lightning
import lightning as L
from lightning.pytorch.loggers import WandbLogger
from lightning.pytorch  import Trainer
from lightning.pytorch.callbacks import Callback
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping


In [4]:
import uuid

In [5]:
# Raise TorchDynamo's `cache_size_limit` setting to 64.
# NOTE(review): presumably needed because the models built with torch.compile
# in this notebook trigger many recompilations — confirm the limit is still required.
import torch._dynamo
torch._dynamo.config.cache_size_limit = 64

In [6]:
# Fetch the W&B API key from the Kaggle secret named 'wandb_api' (so the key is
# not hard-coded in the notebook) and use it to log in to W&B.
from kaggle_secrets import UserSecretsClient
api_key = UserSecretsClient().get_secret('wandb_api')

wandb.login(key=api_key)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrohitrk06[0m ([33mrohitrk06-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [7]:
# Root directories handed to `datasets.ImageFolder`.
# Note: the dataset's `val` directory is used as the *test* set here; the
# validation set is split off from the `train` directory instead.
train_dataset_dir = "/kaggle/input/nature-12k/inaturalist_12K/train"
test_dataset_dir = "/kaggle/input/nature-12k/inaturalist_12K/val"

In [20]:
# Preprocessing shared by the train and test datasets: resize every image to
# 256x256 and convert it to a tensor.
image_transform = transforms.Compose([
    transforms.Resize((256,256)),
    transforms.ToTensor(),
])

train_dataset = datasets.ImageFolder(root = train_dataset_dir, transform = image_transform)
test_dataset = datasets.ImageFolder(root = test_dataset_dir, transform = image_transform)

# Hold out 20% of the training images for validation.
train_set_size = int(len(train_dataset)*0.8)
valid_set_size = len(train_dataset) - train_set_size

# Seed the split so the train/validation partition is the same every time this
# cell is executed (e.g. after a kernel restart); an unseeded split makes
# validation metrics from different sessions incomparable.
training_set, validation_set = random_split(
    train_dataset,
    [train_set_size, valid_set_size],
    generator = torch.Generator().manual_seed(42),
)

# Debug aid: uncomment to work with a tiny subset of the data.
# train_subset = Subset(training_set,range(100))
# val_subset = Subset(validation_set,range(20))

# Only the training loader shuffles; validation and test keep dataset order.
training_loader = DataLoader(training_set,batch_size = 32, shuffle = True,num_workers = 3)
validation_loader = DataLoader(validation_set,batch_size = 32,num_workers = 3)
test_loader = DataLoader(test_dataset,batch_size = 32)

In [10]:
class CNN(nn.Module):
    '''
    Configurable convolutional classifier.

    The feature extractor stacks `num_layers` blocks of
    "conv -> (batch norm) -> activation -> max-pool", with optional dropout in
    front of every block except the first. The flattened features are passed
    through "(dropout) -> linear -> (batch norm) -> activation -> linear".
    '''

    # Supported activation names and the torch.nn classes they map to
    _ACTIVATIONS = {
        'ReLU': nn.ReLU,
        'GELU': nn.GELU,
        'SiLU': nn.SiLU,
        'Mish': nn.Mish,
    }

    def __init__(self,
                 input_dim,
                 output_num_classes,
                 activation_fn = 'ReLU',
                 num_layers = 5,
                 num_filters = 64,
                 filter_sizes = 3,
                 conv_padding = [0,0,0,0,0],
                 conv_strides = 1,
                 pooling_filter_sizes = 3,
                 pooling_strides = 1,
                 pooling_padding = [0,0,0,0,0],
                 num_dense_neurons = 128,
                 add_batchNorm = True,
                 add_dropout = True,
                 dl_dropout_prob = 0.5,
                 ap_dropout_prob=0.2):
        '''
        Params:
            input_dim: (channels, height, width) of the input image
            output_num_classes: Number of classes in multiclass classification
            activation_fn: Name of the activation; one of 'ReLU', 'GELU', 'SiLU', 'Mish'
            num_layers: Total number of "convolution - activation - pooling" layers
            num_filters: (int/list): Total number of filters in each conv layer
            filter_sizes: (int/list): Size of filters in each conv layer
            conv_padding: (int/list): Padding in each conv layer
            conv_strides: (int/list): Stride of each conv layer
            pooling_filter_sizes: (int/list): Kernel size of each max-pool layer
            pooling_strides: (int/list): Stride of each max-pool layer
            pooling_padding: (int/list): Padding of each max-pool layer
            num_dense_neurons: Width of the hidden dense layer
            add_batchNorm: Add Batch Normalisation in the architecture
            add_dropout: Add Dropout layers in the architecture
            dl_dropout_prob: Dropout probability in the dense layers in CNN architecture
            ap_dropout_prob: Dropout probability after the pooling layer in CNN architecture

        An int given for an (int/list) parameter is repeated for every layer; a
        list supplies one value per layer (entries beyond `num_layers` are ignored).

        Raises:
            ValueError: if an (int/list) parameter has another type or is a list
                with fewer than `num_layers` entries, if `activation_fn` is not
                supported, or if the configuration shrinks the feature map to a
                non-positive size.
        '''
        super().__init__()

        self.num_filters = self._per_layer(num_filters, num_layers, "num_filters")
        self.filter_sizes = self._per_layer(filter_sizes, num_layers, "filter_sizes")
        self.conv_padding = self._per_layer(conv_padding, num_layers, "conv_padding")
        self.conv_strides = self._per_layer(conv_strides, num_layers, "conv_strides")
        # A list used to be written to `self.conv_strides` here by mistake,
        # overwriting the conv strides and leaving this attribute undefined.
        self.pooling_filter_sizes = self._per_layer(pooling_filter_sizes, num_layers, "pooling_filter_sizes")
        self.pooling_strides = self._per_layer(pooling_strides, num_layers, "pooling_strides")
        self.pooling_padding = self._per_layer(pooling_padding, num_layers, "pooling_padding")

        activation_cls = self._ACTIVATIONS.get(activation_fn) if isinstance(activation_fn, str) else None
        if activation_cls is None:
            raise ValueError(f"{activation_fn} is not supported")
        # Stored as a class (not an instance) so every layer gets its own module
        self.activation_fn = activation_cls

        layers = []
        # (channels, height, width) of the tensor entering the next block
        dimensions = input_dim
        for i in range(num_layers):
            if i!=0 and add_dropout:
                layers.append(nn.Dropout(p=ap_dropout_prob))

            layers.append(nn.Conv2d(dimensions[0],self.num_filters[i],self.filter_sizes[i],self.conv_strides[i],self.conv_padding[i]))
            if add_batchNorm:
                layers.append(nn.BatchNorm2d(self.num_filters[i]))
            layers.append(self.activation_fn())
            layers.append(nn.MaxPool2d(self.pooling_filter_sizes[i],self.pooling_strides[i], self.pooling_padding[i]))

            # Track the spatial size through the conv and then the pooling layer
            # so the first dense layer can be sized without a dry-run forward pass.
            height, width = dimensions[1], dimensions[2]
            stages = (
                (self.filter_sizes[i], self.conv_strides[i], self.conv_padding[i]),
                (self.pooling_filter_sizes[i], self.pooling_strides[i], self.pooling_padding[i]),
            )
            for kernel, stride, padding in stages:
                height = self._out_size(height, kernel, stride, padding)
                width = self._out_size(width, kernel, stride, padding)
                if height <= 0 or width <= 0:
                    raise ValueError(
                        f"layer {i} reduces the feature map to {height}x{width}; "
                        "use fewer layers, smaller kernels/strides or more padding"
                    )
            dimensions = (self.num_filters[i],height,width)

        self.features = nn.Sequential(
            *layers,
            nn.Flatten()
        )

        classifier_layers = []
        if add_dropout:
            classifier_layers.append(nn.Dropout(p = dl_dropout_prob))
        classifier_layers.append(nn.Linear(dimensions[0]*dimensions[1]*dimensions[2],num_dense_neurons))
        if add_batchNorm:
            classifier_layers.append(nn.BatchNorm1d(num_dense_neurons))
        classifier_layers.append(self.activation_fn())
        classifier_layers.append(nn.Linear(num_dense_neurons,output_num_classes))
        self.classifier = nn.Sequential(
            *classifier_layers
        )

    @staticmethod
    def _per_layer(value, num_layers, name):
        '''Return one setting per layer: repeat an int, or validate and copy a list.'''
        if isinstance(value,int):
            return [value] * num_layers
        if isinstance(value,list):
            if len(value) < num_layers:
                raise ValueError(f"{name} has {len(value)} entries but num_layers is {num_layers}")
            # Copy so the caller's list (or a shared default argument) is never aliased
            return list(value)
        raise ValueError(f"{name} should be either of type int or list")

    @staticmethod
    def _out_size(size, kernel, stride, padding):
        '''Output length of a conv/pool layer along one spatial axis (floor mode, no dilation).'''
        return (size + 2 * padding - kernel)//stride + 1

    def forward(self,X):
        '''Return the class logits for a batch of images `X`.'''
        return self.classifier(self.features(X))

In [11]:
class LitCNN(L.LightningModule):
    '''
    Lightning wrapper around `CNN`: trains with cross-entropy loss and Adam, and
    logs loss/accuracy for the train, validation and test stages.
    '''
    def __init__(self,
                 input_dim = (3,256,256),
                 output_num_classes = 10,
                 activation_fn = 'ReLU',
                 num_layers = 5,
                 num_filters = 64,
                 filter_sizes = 3,
                 conv_padding = [0,0,0,0,0],
                 conv_strides = 1,
                 pooling_filter_sizes = 3,
                 pooling_strides = 1,
                 pooling_padding = [0,0,0,0,0],
                 num_dense_neurons = 128,
                 add_batchNorm = True,
                 add_dropout = True,
                 dl_dropout_prob = 0.5,
                 ap_dropout_prob=0.2,
                 lr=1e-4):
        '''
        Params:
            lr: Learning rate of the Adam optimizer
            all other arguments are forwarded unchanged to `CNN`
        '''
        super().__init__()
        self.cnn = CNN(
            input_dim,
            output_num_classes,
            activation_fn,
            num_layers,
            num_filters,
            filter_sizes,
            conv_padding,
            conv_strides,
            pooling_filter_sizes,
            pooling_strides,
            pooling_padding,
            num_dense_neurons,
            add_batchNorm,
            add_dropout,
            dl_dropout_prob,
            ap_dropout_prob,
        )
        self.loss = CrossEntropyLoss()

        self.lr = lr
        # Kept so the accuracy metric uses the same class count as the model head
        self.output_num_classes = output_num_classes

        self.save_hyperparameters()

    def forward(self,X):
        '''Return the class logits for a batch of images `X`.'''
        return self.cnn(X)

    def training_step(self,batch,batch_idx):
        '''Log the training loss/accuracy of `batch` and return the loss.'''
        _,loss,acc = self._get_preds_loss_accuracy(batch)

        #Log loss and metric
        self.log('train_loss',loss,sync_dist=True)
        self.log('train_accuracy',acc,sync_dist=True)

        return loss

    def test_step(self,batch,batch_idx):
        '''Log the test loss/accuracy of `batch`.'''
        _, loss,acc = self._get_preds_loss_accuracy(batch)

        #Log loss and Metric
        self.log('test_loss',loss,sync_dist=True)
        self.log('test_accuracy',acc,sync_dist=True)

    def validation_step(self,batch,batch_idx):
        '''Log the validation loss/accuracy of `batch` and return the predicted class indices.'''
        preds,loss,acc = self._get_preds_loss_accuracy(batch)

        # Log loss and metric
        self.log('val_loss', loss,sync_dist=True)
        self.log('val_accuracy',acc,sync_dist=True)

        return preds

    def configure_optimizers(self):
        '''Return an Adam optimizer over the CNN parameters with learning rate `self.lr`.'''
        optimizer = torch.optim.Adam(self.cnn.parameters(),lr = self.lr)
        return optimizer

    def _get_preds_loss_accuracy(self,batch):
        '''
        Run the CNN on an `(images, labels)` batch.

        Returns:
            (preds, loss, acc): arg-max class indices, cross-entropy loss and
            multiclass accuracy of the batch
        '''
        images,labels = batch
        logits = self.cnn(images)
        preds = torch.argmax(logits,dim=1)
        loss = self.loss(logits, labels)
        # Use the configured class count instead of a hard-coded 10
        acc = accuracy(preds,labels,task = 'multiclass', num_classes = self.output_num_classes)
        return preds, loss, acc

In [13]:
# Build one LitCNN with a hand-picked configuration and wrap it with torch.compile.
# Note: `main()` constructs its own model, so this instance is not the one
# trained by the sweep.
model = LitCNN(
    input_dim = (3,256,256),
    output_num_classes = 10,
    activation_fn = 'ReLU',
    num_layers = 5,
    num_filters = [96, 64, 64, 32, 16],
    filter_sizes = 3,
    conv_padding = [0,0,0,0,0],
    conv_strides = [2,2,1,1,1],
    pooling_filter_sizes = 3,
    pooling_strides = 1,
    pooling_padding = [0,0,0,0,0],
    num_dense_neurons = 128,
    add_batchNorm = True,
    add_dropout = True,
    dl_dropout_prob = 0.5,
    ap_dropout_prob=0.1,
    lr = 1e-4
)
model = torch.compile(model)

In [14]:
class LogPredictionsCallback(Callback):
    '''
    Logs a table of validation images with their ground-truth labels and the
    model's predictions, taken from the first batch of every validation run.
    '''
    def __init__(self, num_samples = 20):
        '''
        Params:
            num_samples: Maximum number of images of the first batch to log
        '''
        super().__init__()
        self.num_samples = num_samples

    def on_validation_batch_end(
        self, trainer,pl_module,outputs,batch,batch_idx,dataloader_idx = 0
    ):
        '''
        Params:
            outputs: Return value of `validation_step` for this batch; expected
                to hold one predicted class index per image
            batch: The `(images, labels)` batch that was validated
        '''
        # Only the first batch of each validation run is logged
        if batch_idx != 0:
            return

        # Use the logger attached to the trainer: a global `wandb_logger` does
        # not exist at notebook scope (it is a local variable of `main`).
        log_table = getattr(trainer.logger, "log_table", None)
        if log_table is None or outputs is None:
            return

        images,labels = batch
        limit = self.num_samples

        columns = ['Image', 'Ground Truth', 'prediction']
        # Convert the 0-d tensors to plain ints so the table cells are numbers
        data = [
            [wandb.Image(x_i), int(y_i), int(y_pred)]
            for x_i,y_i,y_pred in zip(images[:limit], labels[:limit], outputs[:limit])
        ]
        log_table(key = 'Prediction on Validation Set', columns = columns, data = data)



In [15]:
def create_cnn_sweep_config_name(config):
    '''
    Build a compact identifier that encodes the hyperparameters of `config`.

    Each hyperparameter contributes a "<tag><value>" token; the tokens are joined
    with underscores. Booleans are written as 0/1 and the learning rate is
    written in scientific notation without decimals (e.g. 1e-04).

    Params:
        config: Object exposing the sweep hyperparameters as attributes
    Returns:
        The identifier string
    '''
    tokens = [
        f"nl{config.num_layers}",
        f"nf{config.num_filters}",
        f"act{config.activation_fn}",
        f"fs{config.filter_sizes}",
        f"cp{config.conv_padding}",
        f"cs{config.conv_strides}",
        f"pfs{config.pooling_filter_sizes}",
        f"ps{config.pooling_strides}",
        f"pp{config.pooling_padding}",
        f"dense{config.num_dense_neurons}",
        f"do{int(config.add_dropout)}",
        f"dl_do{config.dl_dropout_prob}",
        f"ap_do{config.ap_dropout_prob}",
        f"bn{int(config.add_batchNorm)}",
        f"ep{config.max_epochs}",
        f"lr{config.lr:.0e}",
    ]
    return "_".join(tokens)


In [16]:
def main(config = None):
    '''
    Train one `LitCNN` for a single W&B run (used as the sweep agent's function).

    The run's hyperparameters are read from the W&B config, the run is named
    after them, and the model is trained on the notebook-level
    `training_loader` / `validation_loader` with early stopping on
    `val_accuracy`.

    Params:
        config: Optional default hyperparameters passed to `wandb.init`
    '''
    # The run is used as a context manager so it is always closed, and is
    # marked as failed instead of being left open when training raises.
    with wandb.init(project = "da6401_assignment2",
                    config = config) as run:
        config = run.config
        config_group = create_cnn_sweep_config_name(config)
        run.config.update({"config_group": config_group}, allow_val_change=True)
        # Short random suffix keeps names of runs sharing a configuration distinct
        run.name = f"{config_group}_run_{uuid.uuid4().hex[:4]}"

        # Created after `wandb.init`, so the logger attaches to the active run
        wandb_logger = WandbLogger(project = "da6401_assignment2")

        model = LitCNN(
            input_dim = (3,256,256),
            output_num_classes = 10,
            activation_fn = config.activation_fn,
            num_layers = config.num_layers,
            num_filters = config.num_filters,
            filter_sizes = config.filter_sizes,
            conv_padding = config.conv_padding,
            conv_strides = config.conv_strides,
            pooling_filter_sizes = config.pooling_filter_sizes,
            pooling_strides = config.pooling_strides,
            pooling_padding = config.pooling_padding,
            num_dense_neurons = config.num_dense_neurons,
            add_batchNorm = config.add_batchNorm,
            add_dropout = config.add_dropout,
            dl_dropout_prob = config.dl_dropout_prob,
            ap_dropout_prob=config.ap_dropout_prob,
            lr = config.lr
        )
        model = torch.compile(model)

        # Optional callbacks, currently disabled:
        # log_predictions_callback = LogPredictionsCallback()
        # checkpoint_callback = ModelCheckpoint(monitor='val_accuracy', mode='max')

        trainer = Trainer(
            logger = wandb_logger,
            # Stop when val_accuracy has not improved for 4 validation checks
            callbacks = [EarlyStopping(monitor="val_accuracy", mode = "max",patience=4)],
            max_epochs = config.max_epochs,
            precision="16-mixed",
            devices = 2,
        )

        trainer.fit(model,training_loader,validation_loader)

    

In [None]:
# W&B sweep definition: Bayesian search that maximises `val_accuracy`, with
# hyperband early termination. Each entry under `parameters` is either a fixed
# `value`, a list of candidate `values`, or a continuous `distribution`.
sweep_config = dict(
    name="Hyperparameter Sweep for same filter sizes",
    method="bayes",
    metric=dict(name="val_accuracy", goal="maximize"),
    parameters=dict(
        num_layers=dict(value=5),
        activation_fn=dict(values=["ReLU", "GELU", "SiLU", "Mish"]),
        # NOTE(review): `main()` never reads this key, so it currently has no
        # effect on training.
        add_data_augmentation=dict(values=[True, False]),
        # An int means the same filter count in every layer; a list gives one
        # count per layer. Increasing layouts ([16, 32, 64, 128, 256] and
        # [32, 64, 128, 256, 512]) are currently left out of the search.
        num_filters=dict(
            values=[
                16,
                32,
                64,
                [256, 128, 64, 32, 16],
                [512, 256, 128, 64, 32],
            ]
        ),
        filter_sizes=dict(values=[3, 5]),
        conv_padding=dict(values=[0, 1, 2]),
        conv_strides=dict(values=[1]),
        pooling_filter_sizes=dict(values=[3, 5]),
        pooling_strides=dict(values=[1, 2]),
        pooling_padding=dict(values=[0, 1]),
        num_dense_neurons=dict(values=[64, 128, 256, 512]),
        add_dropout=dict(values=[True, False]),
        dl_dropout_prob=dict(distribution="uniform", min=0.3, max=0.7),
        ap_dropout_prob=dict(distribution="uniform", min=0.0, max=0.3),
        add_batchNorm=dict(values=[True, False]),
        max_epochs=dict(values=[10, 15]),
        lr=dict(distribution="log_uniform_values", min=1e-5, max=1e-3),
    ),
    early_terminate=dict(type="hyperband", min_iter=3, eta=2),
)

In [25]:
# Register the sweep described by `sweep_config` in the "da6401_assignment2"
# project; the returned id is passed to `wandb.agent` to run it.
sweep_id = wandb.sweep(sweep_config,project="da6401_assignment2")

Create sweep with ID: bwsgamay
Sweep URL: https://wandb.ai/rohitrk06-indian-institute-of-technology-madras/da6401_assignment2/sweeps/bwsgamay


In [26]:
# Start a sweep agent that calls `main` for the sweep's suggested
# configurations; `count=1` limits it to a single run.
wandb.agent(sweep_id,main,count=1)

[34m[1mwandb[0m: Agent Starting Run: muhdxj9e with config:
[34m[1mwandb[0m: 	activation_fn: GELU
[34m[1mwandb[0m: 	add_batchNorm: True
[34m[1mwandb[0m: 	add_dropout: True
[34m[1mwandb[0m: 	ap_dropout_prob: 0.2750528465012813
[34m[1mwandb[0m: 	conv_padding: 2
[34m[1mwandb[0m: 	conv_strides: 1
[34m[1mwandb[0m: 	dl_dropout_prob: 0.6853927157157438
[34m[1mwandb[0m: 	filter_sizes: 3
[34m[1mwandb[0m: 	lr: 0.00030405777641535464
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	num_dense_neurons: 256
[34m[1mwandb[0m: 	num_filters: [256, 128, 64, 32, 16]
[34m[1mwandb[0m: 	num_layers: 5
[34m[1mwandb[0m: 	pooling_filter_sizes: 3
[34m[1mwandb[0m: 	pooling_padding: 1
[34m[1mwandb[0m: 	pooling_strides: 2


INFO: Using 16bit Automatic Mixed Precision (AMP)
INFO: You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO: Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
INFO: Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
INFO: ----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 2 processes
----------------------------------------------------------------------------------------------------

/usr/local/lib/python3.11/dist-packages/lightning/pytorch/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `Wand

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=10` reached.


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇████
train_accuracy,▁▃▃▃▄▄▅▃█▅▇▆▆▆▇▆▅▇▅▅█▆▆▆▇
train_loss,█▆█▆▆▇▄▇▃▄▃▄▂▄▅▄▃▁▅▃▂▂▃▂▂
trainer/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇███
val_accuracy,▁▂▂▃▅▃▇▄█▄
val_loss,██▇▆▅▅▃▇▁▆

0,1
epoch,9.0
train_accuracy,0.42188
train_loss,1.70184
trainer/global_step,1249.0
val_accuracy,0.26
val_loss,2.07203
