In [1]:
!pip install lightning -qU
!pip install wandb -qU

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m818.9/818.9 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [23]:
import torchvision.models as models
import os


# torch related dependencies
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
from torchvision import transforms, datasets
from torch.utils.data import DataLoader, random_split, Subset, Dataset
from torchmetrics.functional import accuracy

#wandb
import wandb

#Lightning
import lightning as L
from lightning.pytorch.loggers import WandbLogger
from lightning.pytorch  import Trainer
from lightning.pytorch.callbacks import Callback
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping

In [30]:
import torch._dynamo

# Size limit applied to TorchDynamo's compilation cache, kept as a named
# constant so the value is easy to find and tune.
# NOTE(review): presumably raised so torch.compile does not give up on
# recompiling during training — confirm against the default value.
DYNAMO_CACHE_SIZE_LIMIT = 64
torch._dynamo.config.cache_size_limit = DYNAMO_CACHE_SIZE_LIMIT

In [20]:
from kaggle_secrets import UserSecretsClient
# Read the Weights & Biases API key from Kaggle's secret store (secret name
# 'wandb_api') so the credential never appears in the notebook itself.
secrets_client = UserSecretsClient()
api_key = secrets_client.get_secret('wandb_api')

# Authenticate this session with W&B; the call's result is the cell output.
wandb.login(key=api_key)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrohitrk06[0m ([33mrohitrk06-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [19]:
# Locations of the image folders on Kaggle.
# The dataset's `train` folder feeds training/validation, while its `val`
# folder is the one bound to `test_dataset_dir`.
train_dataset_dir, test_dataset_dir = (
    "/kaggle/input/nature-12k/inaturalist_12K/train",
    "/kaggle/input/nature-12k/inaturalist_12K/val",
)

In [35]:
# Tunables for the data pipeline, gathered in one place.
BATCH_SIZE = 32
NUM_WORKERS = 4
TRAIN_FRACTION = 0.8   # share of the `train` folder used for training
SPLIT_SEED = 42        # fixes the train/validation split across re-runs

# Deterministic preprocessing: resize to 232x232, centre-crop to 224x224,
# convert to a tensor and normalise each channel.
# NOTE(review): the mean/std values look like the usual ImageNet statistics
# expected by torchvision's pretrained weights — confirm.
transform = transforms.Compose([
    transforms.Resize((232, 232)),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# One sub-folder per class; labels are derived from the folder names.
train_dataset = datasets.ImageFolder(root=train_dataset_dir, transform=transform)
test_dataset = datasets.ImageFolder(root=test_dataset_dir, transform=transform)

# Hold out part of the training folder for validation. The split is seeded so
# that the same images land in each subset every time the cell is executed.
train_set_size = int(len(train_dataset) * TRAIN_FRACTION)
valid_set_size = len(train_dataset) - train_set_size

training_set, validation_set = random_split(
    train_dataset,
    [train_set_size, valid_set_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Training batches are reshuffled every epoch; without `shuffle=True` the
# model would see the samples in the same fixed order in each epoch.
train_loader = DataLoader(
    training_set, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=True
)
# Validation order does not affect the metrics, so it stays sequential.
validation_loader = DataLoader(
    validation_set, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS
)
# NOTE(review): shuffling is kept on the test loader, presumably so that the
# first batch (the one logged as a prediction table) mixes several classes
# instead of containing only the first class folder — confirm this is intended.
test_loader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=True
)

In [25]:
# Load ResNet-50 with torchvision's default pretrained weights.
pretrained_model = models.resnet50(weights='DEFAULT')

# Freeze every existing parameter so that none of them receives gradients.
pretrained_model.requires_grad_(False)

# Swap the final fully connected layer for a fresh 10-way classifier. The new
# layer is created after the freeze, so its parameters remain trainable.
num_features = pretrained_model.fc.in_features
pretrained_model.fc = nn.Linear(in_features=num_features, out_features=10)

In [26]:
class pretrainedResNet(L.LightningModule):
    """LightningModule wrapping a classification network for fine-tuning.

    Each step computes cross-entropy loss and multiclass accuracy on a batch
    and logs both under a stage prefix (``train_``, ``val_`` or ``test_``).

    Args:
        model: Network that maps a batch of images to per-class logits.
        lr: Learning rate passed to the Adam optimizer.
        num_classes: Number of target classes used by the accuracy metric.
            Defaults to 10, the value that was previously hard-coded.
    """

    def __init__(self, model, lr, num_classes=10):
        super().__init__()
        self.cnn = model
        self.loss = CrossEntropyLoss()
        self.lr = lr
        self.num_classes = num_classes

    def forward(self, images):
        """Return the raw class logits for a batch of images."""
        return self.cnn(images)

    def training_step(self, batch, batch_idx):
        """Run one training batch, log its metrics and return the loss."""
        _, loss, acc = self._get_preds_loss_accuracy(batch)
        self._log_metrics('train', loss, acc)
        return loss

    def test_step(self, batch, batch_idx):
        """Run one test batch, log its metrics and return the predictions."""
        preds, loss, acc = self._get_preds_loss_accuracy(batch)
        self._log_metrics('test', loss, acc)
        return preds

    def validation_step(self, batch, batch_idx):
        """Run one validation batch, log its metrics and return the predictions."""
        preds, loss, acc = self._get_preds_loss_accuracy(batch)
        self._log_metrics('val', loss, acc)
        return preds

    def configure_optimizers(self):
        """Build an Adam optimizer over the parameters that require gradients."""
        # Frozen parameters never receive gradients, so only the trainable
        # ones are handed to the optimizer.
        trainable_params = [p for p in self.cnn.parameters() if p.requires_grad]
        optimizer = torch.optim.Adam(trainable_params, lr=self.lr)
        return optimizer

    def _log_metrics(self, stage, loss, acc):
        """Log loss and accuracy as ``<stage>_loss`` / ``<stage>_accuracy``."""
        # sync_dist=True asks Lightning to reduce the value across devices.
        self.log(f'{stage}_loss', loss, sync_dist=True)
        self.log(f'{stage}_accuracy', acc, sync_dist=True)

    def _get_preds_loss_accuracy(self, batch):
        """Compute predictions, loss and accuracy for an ``(images, labels)`` batch.

        Returns:
            Tuple ``(preds, loss, acc)`` where ``preds`` holds the arg-max class
            index per sample, ``loss`` is the cross-entropy on the logits and
            ``acc`` is the multiclass accuracy of ``preds`` against ``labels``.
        """
        images, labels = batch
        logits = self.cnn(images)
        preds = torch.argmax(logits, dim=1)
        loss = self.loss(logits, labels)
        acc = accuracy(preds, labels, 'multiclass', num_classes=self.num_classes)
        return preds, loss, acc

In [32]:
# Wrap the prepared network in the LightningModule (learning rate 1e-3) and
# pass it straight to torch.compile; `model` is bound to the compiled module.
model = torch.compile(pretrainedResNet(pretrained_model, 1e-03))

In [33]:
class LogPredictionsCallback(Callback):
    """Log a table of sample predictions from the first test batch to W&B.

    Args:
        num_samples: Maximum number of samples from the first batch to log.
        table_key: Key under which the table is logged.
    """

    def __init__(self, num_samples=20, table_key='Prediction on Validation Set'):
        super().__init__()
        self.num_samples = num_samples
        self.table_key = table_key

    def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx=0):
        """After the first test batch, log images with their labels and predictions.

        ``outputs`` is expected to be what ``test_step`` returned, i.e. one
        predicted class index per sample in ``batch``.
        """
        # Only the first batch is logged; nothing to do without predictions.
        if batch_idx != 0 or outputs is None:
            return

        # Use the W&B logger attached to the running trainer rather than a
        # notebook-level global, so the callback works with any Trainer.
        wandb_loggers = [lg for lg in trainer.loggers if isinstance(lg, WandbLogger)]
        if not wandb_loggers:
            return

        images, labels = batch
        limit = self.num_samples

        columns = ['Image', 'Ground Truth', 'Prediction']
        # int(...) turns the 0-dim tensors into plain Python integers so the
        # table stores numbers instead of tensor objects.
        data = [
            [wandb.Image(image), int(label), int(pred)]
            for image, label, pred in zip(images[:limit], labels[:limit], outputs[:limit])
        ]
        wandb_loggers[0].log_table(key=self.table_key, columns=columns, data=data)



In [34]:
# Experiment tracking: all metrics and tables go to this W&B project.
wandb_logger = WandbLogger(project = "da6401_assignment2",)
log_predictions_callback = LogPredictionsCallback()

# Train for at most 10 epochs on 2 devices with 16-bit mixed precision,
# stopping early once `val_accuracy` stops improving.
trainer = Trainer(
    logger = wandb_logger,
    callbacks = [EarlyStopping(monitor="val_accuracy", mode = "max"),
                 log_predictions_callback],
    max_epochs = 10,
    precision="16-mixed",
    devices = 2,
    )

try:
    trainer.fit(model,train_loader,validation_loader)
    # NOTE(review): testing on a multi-device trainer makes Lightning use a
    # DistributedSampler, which may replicate samples when the test set does
    # not divide evenly across devices; Lightning recommends
    # `Trainer(devices=1, num_nodes=1)` for exact test metrics.
    trainer.test(model,test_loader)
finally:
    # Always close the W&B run, even if training or testing raised, so the
    # run is not left open for the rest of the kernel session.
    wandb.finish()

INFO: Using 16bit Automatic Mixed Precision (AMP)
INFO: You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO: Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
INFO: Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
INFO: ----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 2 processes
----------------------------------------------------------------------------------------------------



INFO: LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1]
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
INFO: 
  | Name | Type             | Params | Mode 
--------------------------------------------------
0 | cnn  | ResNet           | 23.5 M | train
1 | loss | CrossEntropyLoss | 0      | train
--------------------------------------------------
20.5 K    Trainable params
23.5 M    Non-trainable params
23.5 M    Total params
94.114    Total estimated model params size (MB)
152       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
INFO: Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
INFO: ----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 2 processes
----------------------------------------------------------------------------------------------------



INFO: LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1]
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/usr/local/lib/python3.11/dist-packages/lightning/pytorch/trainer/connectors/data_connector.py:216: Using `DistributedSampler` with the dataloaders. During `trainer.test()`, it is recommended to use `Trainer(devices=1, num_nodes=1)` to ensure each sample/batch gets evaluated exactly once. Otherwise, multi-device settings use `DistributedSampler` that replicates some samples to make sure all devices have same batch size in case of uneven inputs.


Testing: |          | 0/? [00:00<?, ?it/s]