# Required package imports

In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from einops import rearrange
import torch.nn.functional as F
from torchsummary import summary
from torch.utils.data import DataLoader
from torchvision import datasets, models, transforms
from timm.models.efficientnet import efficientnetv2_s
from nn_architecture import *

  from .autonotebook import tqdm as notebook_tqdm


## Check the model summary (torchsummary)

In [2]:
import gc

import torch

# Free GPU memory left over from earlier runs. Garbage is collected first so
# that tensors kept alive only by reference cycles are handed back to PyTorch's
# caching allocator; empty_cache() then releases the allocator's unused blocks.
gc.collect()
torch.cuda.empty_cache()

# Build the model to inspect. The commented-out line is the plain timm
# EfficientNetV2-S constructor, kept for comparison.
# model = efficientnetv2_s(pretrained=False)
model = EfficientNetV2S_WithInvolution(num_classes=1486)  # class count is hard-coded for this summary
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)  # GPU when available, otherwise CPU

# Print the per-layer table (layer type, output shape, parameter count)
# for a single 3x224x224 input
summary(model, input_size=(3, 224, 224))

  WeightNorm.apply(module, name, dim)


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
          Identity-1          [-1, 3, 224, 224]               0
          Identity-2          [-1, 3, 224, 224]               0
          Identity-3          [-1, 3, 224, 224]               0
              SiLU-4                    [-1, 9]               0
              SiLU-5                    [-1, 9]               0
              SiLU-6                    [-1, 9]               0
              SiLU-7                    [-1, 9]               0
              SiLU-8                    [-1, 9]               0
              SiLU-9                    [-1, 9]               0
             SiLU-10                    [-1, 9]               0
             SiLU-11                    [-1, 9]               0
             SiLU-12                    [-1, 9]               0
KAN_Convolutional_Layer-13          [-1, 3, 112, 112]               0
      BatchNorm2d-14          [-1

# Model training

In [2]:
import gc

import torch

# Collect garbage first: tensors that are only kept alive by reference cycles
# are returned to PyTorch's caching allocator here. Objects that are still
# referenced (for example a `model` bound by an earlier cell) are not freed.
unreachable_count = gc.collect()

# Now release the allocator's unused cached blocks.
torch.cuda.empty_cache()

# Keep the collector's count as the cell output, as before.
unreachable_count

29

In [None]:
import torch
import os
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
from torch.amp import GradScaler, autocast
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

training_run = 1

# Set device to GPU if available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on: {device}")

# FP16 autocast and gradient scaling only apply on CUDA; on CPU both are
# switched off so the same loop runs in full precision instead of failing.
use_amp = device.type == "cuda"

# Hyperparameters
batch_size = 128  # Batch size
epochs = 100  # Set the number of epochs
grad_clip = 1.0  # Maximum gradient norm
log_every = 500  # Print the current batch loss every N steps
checkpoint_every = 1000  # Write a checkpoint every N steps

# Dataset paths
train_dir = "/home/sashankhravi/Datasets/inatbirds100k/train_transformed"
val_dir = "/home/sashankhravi/Datasets/inatbirds100k/val_transformed"

# Checkpoint directory, created once up front. exist_ok=True matters: the
# previous try/except around os.makedirs skipped torch.save whenever the
# directory already existed, so only the very first checkpoint was written.
checkpoint_dir = f"model_checkpoints_training_run_{training_run}"
os.makedirs(checkpoint_dir, exist_ok=True)

# Image transformation: resize to 224x224 and convert to a tensor
# (no normalization or augmentation is applied)
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),  # Convert image to tensor
])

# Dataset loading: one sub-folder per class
train_dataset = datasets.ImageFolder(train_dir, transform=transform)
val_dataset = datasets.ImageFolder(val_dir, transform=transform)

# Take the class count from the dataset itself rather than os.listdir(), which
# would also count stray files sitting next to the class folders.
num_classes = len(train_dataset.classes)

# Data loaders: shuffle the training set only; validation order does not matter
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

# Initialize the model (EfficientNetV2S_WithInvolution is defined outside this cell)
model = EfficientNetV2S_WithInvolution(num_classes=num_classes)

# Move model to the selected device
model = model.to(device)

# Define the loss function
criterion = nn.CrossEntropyLoss()

# NOTE(review): in the logged run the batch loss stays at ~7.30 for the whole
# logged part of epoch 1. That is about ln(1486), i.e. chance level if the
# dataset has the 1486 classes used in the summary cell - the model does not
# appear to be learning. Inputs are not normalized and lr is untuned;
# TODO investigate before spending 100 epochs on this configuration.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Gradient scaler for FP16 training (a no-op when mixed precision is disabled)
scaler = GradScaler(enabled=use_amp)

# Mean loss per epoch, for the final plot
train_losses = []
val_losses = []

# Training loop
for epoch in range(epochs):
    model.train()  # Set model to training mode
    running_train_loss = 0.0
    correct_preds = 0
    total_preds = 0

    for step, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass and loss under autocast (FP16 on CUDA)
        with autocast(device_type=device.type, dtype=torch.float16, enabled=use_amp):
            outputs = model(images)
            loss = criterion(outputs, labels)

        # Backward pass on the scaled loss
        scaler.scale(loss).backward()

        # Gradients must be unscaled before clipping so the threshold applies
        # to their true magnitude
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

        # Optimizer step (skipped by the scaler on inf/NaN gradients), then
        # adjust the scale factor
        scaler.step(optimizer)
        scaler.update()

        # Track loss and accuracy as plain Python numbers
        running_train_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        correct_preds += (preds == labels).sum().item()
        total_preds += labels.size(0)

        # Print the current batch loss every `log_every` steps
        if step % log_every == 0:
            print(f"Epoch [{epoch + 1}/{epochs}], Step [{step}/{len(train_loader)}], Train Loss: {loss.item():.4f}")

        # Save a checkpoint every `checkpoint_every` steps; the file name only
        # carries the epoch, so later saves in an epoch overwrite earlier ones
        if step % checkpoint_every == 0:
            torch.save(model.state_dict(), os.path.join(checkpoint_dir, f"saved_model_epoch_{epoch}.pth"))

    avg_train_loss = running_train_loss / len(train_loader)
    train_accuracy = 100 * correct_preds / total_preds
    train_losses.append(avg_train_loss)

    print(f"Epoch [{epoch + 1}/{epochs}], Loss: {avg_train_loss:.4f}, Accuracy: {train_accuracy:.2f}%")

    # Validation loop
    model.eval()  # Set model to evaluation mode
    running_val_loss = 0.0
    correct_preds = 0
    total_preds = 0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)

            # Forward pass and loss inside autocast, exactly as in training, so
            # that training and validation losses are computed the same way
            with autocast(device_type=device.type, dtype=torch.float16, enabled=use_amp):
                outputs = model(images)
                val_loss = criterion(outputs, labels)
            running_val_loss += val_loss.item()

            # Track accuracy
            _, preds = torch.max(outputs, 1)
            correct_preds += (preds == labels).sum().item()
            total_preds += labels.size(0)

    avg_val_loss = running_val_loss / len(val_loader)
    val_accuracy = 100 * correct_preds / total_preds
    val_losses.append(avg_val_loss)

    print(f"Validation Accuracy: {val_accuracy:.2f}%")

# Plot the mean training and validation loss of each epoch. The x-axis is
# derived from the recorded losses, so it always matches their length.
# (Axis label and title strings are left unchanged.)
epoch_axis = range(1, len(train_losses) + 1)
plt.figure(figsize=(10, 5))
plt.plot(epoch_axis, train_losses, label="Training Loss", color='blue', marker='o')
plt.plot(epoch_axis, val_losses, label="Validation Loss", color='red', marker='o')
plt.xlabel('Epochs')
plt.ylabel('Cumulative Loss')
plt.title('Cumulative Training and Validation Loss per Epoch')
plt.legend()
plt.grid(True)
plt.show()


Training on: cuda


  WeightNorm.apply(module, name, dim)


Epoch [1/100], Step [0/13003], Train Loss: 7.3071
Epoch [1/100], Step [500/13003], Train Loss: 7.3005
Epoch [1/100], Step [1000/13003], Train Loss: 7.3114
Epoch [1/100], Step [1500/13003], Train Loss: 7.3058
Epoch [1/100], Step [2000/13003], Train Loss: 7.3137
Epoch [1/100], Step [2500/13003], Train Loss: 7.3140
Epoch [1/100], Step [3000/13003], Train Loss: 7.3192
Epoch [1/100], Step [3500/13003], Train Loss: 7.3005
Epoch [1/100], Step [4000/13003], Train Loss: 7.3082
Epoch [1/100], Step [4500/13003], Train Loss: 7.3082
Epoch [1/100], Step [5000/13003], Train Loss: 7.2998
Epoch [1/100], Step [5500/13003], Train Loss: 7.3116
Epoch [1/100], Step [6000/13003], Train Loss: 7.3170
Epoch [1/100], Step [6500/13003], Train Loss: 7.3039
Epoch [1/100], Step [7000/13003], Train Loss: 7.3092
Epoch [1/100], Step [7500/13003], Train Loss: 7.2975
Epoch [1/100], Step [8000/13003], Train Loss: 7.3053
Epoch [1/100], Step [8500/13003], Train Loss: 7.3092
Epoch [1/100], Step [9000/13003], Train Loss: 7.31