In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, SubsetRandomSampler, Subset
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.patches as patches
import matplotlib.patheffects as path_effects
import matplotlib.colors as mcolors
import numpy as np
from tqdm.notebook import tqdm
import wandb
import gc
import os
import io
import PIL
import PIL.Image
import copy
import random
import math
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report

gc.collect()
torch.cuda.empty_cache()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

CLASSES = [
    'Amphibia',
    'Animalia',
    'Arachnida',
    'Aves',
    'Fungi',
    'Insecta',
    'Mammalia',
    'Mollusca',
    'Plantae',
    'Reptilia'
    ]

if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory Usage:")
    print(f"Allocated: {round(torch.cuda.memory_allocated(0)/1024**3, 1)} GB")
    print(f"Cached: {round(torch.cuda.memory_reserved(0)/1024**3, 1)} GB")


# Custom transform to handle image orientation
class RotateIfNeeded:
    def __call__(self, img):
        width, height = img.size
        if height > width:
            return img.transpose(2)  # Rotate 90 degrees if portrait
        return img


def create_prediction_grid(model, test_loader, class_names):
    model.eval()
    all_images = []
    all_preds = []
    all_targets = []
    all_probs = []
    
    # Collect images and predictions
    with torch.no_grad():
        for inputs, targets in test_loader:
            if len(all_images) >= 30:  # We need 30 images for 10x3 grid
                break
                
            inputs = inputs.to(device)
            outputs = model(inputs)
            probs = torch.nn.functional.softmax(outputs, dim=1)
            confidence, preds = torch.max(probs, 1)
            
            # Convert to CPU and save
            batch_images = inputs.cpu().numpy()
            batch_preds = preds.cpu().numpy()
            batch_confidence = confidence.cpu().numpy()
            
            # Add samples to our collections
            for i in range(min(len(batch_images), 30 - len(all_images))):
                all_images.append(batch_images[i])
                all_preds.append(batch_preds[i])
                all_targets.append(targets[i].item())
                all_probs.append(batch_confidence[i])
    
    # Set up the figure with a custom layout
    fig = plt.figure(figsize=(18, 30))
    gs = gridspec.GridSpec(10, 3, figure=fig, wspace=0.2, hspace=0.4)
    
    # Color mapping for correct/incorrect predictions
    correct_color = '#2ecc71'  # Green
    incorrect_color = '#e74c3c'  # Red
    
    # Create custom title for the grid
    fig.suptitle('Model Predictions on Test Data', fontsize=24, y=0.92)
    
    # For each image in our grid
    for i in range(10):
        for j in range(3):
            idx = i * 3 + j
            if idx < len(all_images):
                # Get image and prediction info
                img = all_images[idx].transpose(1, 2, 0)
                # Denormalize the image
                img = img * np.array([0.229, 0.224, 0.225]) + np.array([0.485, 0.456, 0.406])
                img = np.clip(img, 0, 1)
                
                pred_class = all_preds[idx]
                true_class = all_targets[idx]
                confidence = all_probs[idx]
                is_correct = pred_class == true_class
                color = correct_color if is_correct else incorrect_color
                
                # Create subplot
                ax = fig.add_subplot(gs[i, j])
                
                # Display image with custom border
                ax.imshow(img)
                ax.set_xticks([])
                ax.set_yticks([])
                
                # Add a colored border based on correctness
                for spine in ax.spines.values():
                    spine.set_linewidth(6)
                    spine.set_color(color)
                
                # Create a top banner with the class name
                pred_name = class_names[pred_class]
                true_name = class_names[true_class]
                
                # Add prediction text
                ax.set_title(f"Prediction: {pred_name}", 
                           fontsize=12, color='white', 
                           bbox=dict(facecolor=color, alpha=0.9, pad=5))
                
                # Add ground truth text below image
                ax.annotate(f"Ground Truth: {true_name}", 
                          xy=(0.5, -0.03), xycoords='axes fraction', 
                          fontsize=10, ha='center', va='top',
                          bbox=dict(facecolor='gray', alpha=0.8, pad=3))
                
                # Add confidence score
                ax.annotate(f"Confidence: {confidence:.1%}", 
                          xy=(0.5, 1.02), xycoords='axes fraction', 
                          fontsize=10, ha='center',
                          bbox=dict(facecolor='#3498db', alpha=0.8, pad=1))
                
                # Add an icon to indicate correctness
                if is_correct:
                    ax.annotate('✓', xy=(0.95, 0.95), xycoords='axes fraction', 
                              fontsize=18, ha='right', va='top', color='white',
                              bbox=dict(facecolor=color, alpha=0.8, boxstyle='circle'))
                else:
                    ax.annotate('✗', xy=(0.95, 0.95), xycoords='axes fraction', 
                              fontsize=18, ha='right', va='top', color='white',
                              bbox=dict(facecolor=color, alpha=0.8, boxstyle='circle'))
                
                # Add a small confidence bar
                bar_width = confidence
                bar_height = 0.04
                bar_y = 0.01
                ax.add_patch(patches.Rectangle(
                    (0.1, bar_y), 0.8 * bar_width, bar_height,
                    transform=ax.transAxes, facecolor='#f39c12', alpha=0.8
                ))
                # Add background for full confidence bar
                ax.add_patch(patches.Rectangle(
                    (0.1, bar_y), 0.8, bar_height,
                    transform=ax.transAxes, facecolor='#bdc3c7', alpha=0.3
                ))
    
    # Add a legend
    legend_elements = [
        patches.Patch(facecolor=correct_color, label='Correct Prediction'),
        patches.Patch(facecolor=incorrect_color, label='Incorrect Prediction')
    ]
    fig.legend(handles=legend_elements, loc='upper center', ncol=2, fontsize=12)
    
    plt.tight_layout(rect=[0, 0, 1, 0.96])  # Adjust for title and legend
    
    # Save the figure
    plt.savefig('test_predictions.png', dpi=300, bbox_inches='tight')
    
    # Log to wandb if desired
    try:
       
        wandb.log({"test_predictions_grid": wandb.Image(fig)})
    except:
        print("Couldn't log to wandb, continuing...")
    
    return fig

# Function to create data loaders with stratified sampling
def create_data_loaders(train_dir, test_dir, input_size=(224, 224), batch_size=32,
                        data_augmentation=False, val_ratio=0.2, num_workers=2):
    """
    Create data loaders for training, validation, and testing.
    Uses stratified sampling to ensure class balance in validation set.
    """
    # Base transform
    base_transform = [
        RotateIfNeeded(),
        transforms.Resize(input_size),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]

    # Training transform with optional augmentation
    if data_augmentation:
        train_transform = transforms.Compose([
            base_transform[0],  # RotateIfNeeded
            base_transform[1],  # Resize
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(10),
            base_transform[2],  # ToTensor
            base_transform[3]   # Normalize
        ])
    else:
        train_transform = transforms.Compose(base_transform)

    # Test transform (no augmentation)
    test_transform = transforms.Compose(base_transform)

    # Load datasets
    train_dataset = datasets.ImageFolder(root=train_dir, transform=train_transform)
    test_dataset = datasets.ImageFolder(root=test_dir, transform=test_transform)

    # Create stratified train/validation split
    targets = np.array([sample[1] for sample in train_dataset.samples])
    sss = StratifiedShuffleSplit(n_splits=1, test_size=val_ratio, random_state=42)
    train_idx, val_idx = next(sss.split(np.zeros(len(targets)), targets))

    # Create samplers
    train_sampler = SubsetRandomSampler(train_idx)
    val_sampler = SubsetRandomSampler(val_idx)

    # Create data loaders
    train_loader = DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler,
        num_workers=num_workers, pin_memory=True
    )

    val_loader = DataLoader(
        train_dataset, batch_size=batch_size, sampler=val_sampler,
        num_workers=num_workers, pin_memory=True
    )

    test_loader = DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False,
        num_workers=num_workers, pin_memory=True
    )

    print(f"Training set: {len(train_idx)} images")
    print(f"Validation set: {len(val_idx)} images")
    print(f"Test set: {len(test_dataset)} images")
    print(f"Number of classes: {len(train_dataset.classes)}")

    return train_loader, val_loader , test_loader, len(train_dataset.classes)

# Enhanced FlexibleCNN model with support for different filter organizations
class FlexibleCNN(nn.Module):
    def __init__(self, in_channels=3, num_filters=32, filter_size=3,
                 activation_fn=nn.ReLU, filter_organization='same',
                 use_batchnorm=False, dropout_rate=0.0,
                 dense_neurons=128, num_classes=10, input_size=(224, 224)):
        super(FlexibleCNN, self).__init__()

        # Determine filter counts based on organization strategy
        if filter_organization == 'same':
            filters = [num_filters] * 5
        elif filter_organization == 'double':
            filters = [num_filters * (2**i) for i in range(5)]
        elif filter_organization == 'half':
            filters = [num_filters // (2**(i)) for i in range(5)]
            filters = [max(16, f) for f in filters]  # Ensure minimum filter count

        # Build layers list
        layers = []

        # First conv block
        layers.append(nn.Conv2d(in_channels, filters[0], kernel_size=filter_size, padding='same'))
        if use_batchnorm:
            layers.append(nn.BatchNorm2d(filters[0]))
        layers.append(activation_fn())
        if dropout_rate > 0:
            layers.append(nn.Dropout2d(dropout_rate))
        layers.append(nn.MaxPool2d(kernel_size=2, stride=2))

        # Remaining conv blocks
        for i in range(1, 5):
            layers.append(nn.Conv2d(filters[i-1], filters[i], kernel_size=filter_size, padding='same'))
            if use_batchnorm:
                layers.append(nn.BatchNorm2d(filters[i]))
            layers.append(activation_fn())
            if dropout_rate > 0:
                layers.append(nn.Dropout2d(dropout_rate))
            layers.append(nn.MaxPool2d(kernel_size=2, stride=2))

        self.features = nn.Sequential(*layers)

        # Calculate output dimensions dynamically
        with torch.no_grad():
            x = torch.randn(1, in_channels, *input_size)
            x = self.features(x)
            self.flattened_size = x.numel() // x.size(0)

        # Classifier
        classifier_layers = [
            nn.Flatten(),
            nn.Linear(self.flattened_size, dense_neurons),
            activation_fn()
        ]

        if dropout_rate > 0:
            classifier_layers.append(nn.Dropout(dropout_rate))

        classifier_layers.append(nn.Linear(dense_neurons, num_classes))

        self.classifier = nn.Sequential(*classifier_layers)

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x

# Dictionary of activation functions
activation_functions = {
    'relu': nn.ReLU,
    'gelu': nn.GELU,
    'silu': nn.SiLU,
    'mish': nn.Mish
}

# Function to train model with wandb logging
def train_with_wandb(config=None):
    with wandb.init(config=config):
        config = wandb.config

        # Create data loaders
        train_loader, val_loader, test_loader, num_classes = create_data_loaders(
            train_dir=config.train_dir,
            test_dir=config.test_dir,
            input_size=(config.input_size,config.input_size),
            batch_size=config.batch_size,
            data_augmentation=config.data_augmentation
        )

        # Create model
        model = FlexibleCNN(
            num_filters=config.num_filters,
            filter_size=config.filter_size,
            activation_fn=activation_functions[config.activation],
            filter_organization=config.filter_organization,
            use_batchnorm=config.use_batchnorm,
            dropout_rate=config.dropout_rate,
            dense_neurons=config.dense_neurons,
            num_classes=num_classes,
            input_size=(config.input_size,config.input_size)
        )

        # Move model to device
        model = model.to(device)

        # Print model info
        print(f"Model parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

        # Define loss and optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

        # Training loop
        for epoch in range(config.epochs):
            # Training phase
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                gc.collect()
            model.train()
            train_loss = 0.0
            correct = 0
            total = 0

            with tqdm(train_loader, desc=f"Epoch {epoch+1}/{config.epochs} [Train]") as pbar:
                for inputs, targets in pbar:
                    inputs, targets = inputs.to(device), targets.to(device)

                    # Forward pass
                    optimizer.zero_grad()
                    outputs = model(inputs)
                    loss = criterion(outputs, targets)

                    # Backward pass
                    loss.backward()
                    optimizer.step()

                    # Track metrics
                    train_loss += loss.item()
                    _, predicted = outputs.max(1)
                    total += targets.size(0)
                    correct += predicted.eq(targets).sum().item()

                    # Update progress bar
                    pbar.set_postfix({
                        'loss': train_loss / (pbar.n + 1),
                        'acc': 100. * correct / total
                    })
                    del inputs, targets, outputs, loss
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()

            train_loss = train_loss / len(train_loader)
            train_accuracy = 100. * correct / total

            # Validation phase
            model.eval()
            val_loss = 0.0
            correct = 0
            total = 0

            with torch.no_grad():
                with tqdm(val_loader, desc=f"Epoch {epoch+1}/{config.epochs} [Val]") as pbar:
                    for inputs, targets in pbar:
                        inputs, targets = inputs.to(device), targets.to(device)

                        # Forward pass
                        outputs = model(inputs)
                        loss = criterion(outputs, targets)

                        # Track metrics
                        val_loss += loss.item()
                        _, predicted = outputs.max(1)
                        total += targets.size(0)
                        correct += predicted.eq(targets).sum().item()

                        # Update progress bar
                        pbar.set_postfix({
                            'loss': val_loss / (pbar.n + 1),
                            'acc': 100. * correct / total
                        })
                        # Free memory
                        del inputs, targets, outputs, loss
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()

            val_loss = val_loss / len(val_loader)
            val_accuracy = 100. * correct / total

            # Test phase
            model.eval()
            test_loss = 0.0
            correct = 0
            total = 0

            with torch.no_grad():
                with tqdm(test_loader, desc=f"Epoch {epoch+1}/{config.epochs} [Test]") as pbar:
                    for inputs, targets in pbar:
                        inputs, targets = inputs.to(device), targets.to(device)

                        # Forward pass
                        outputs = model(inputs)
                        loss = criterion(outputs, targets)

                        # Track metrics
                        test_loss += loss.item()
                        _, predicted = outputs.max(1)
                        total += targets.size(0)
                        correct += predicted.eq(targets).sum().item()

                        # Update progress bar
                        pbar.set_postfix({
                            'loss': test_loss / (pbar.n + 1),
                            'acc': 100. * correct / total
                        })
                        # Free memory
                        del inputs, targets, outputs, loss
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()

            test_loss = test_loss / len(test_loader)
            test_accuracy = 100. * correct / total

            # Log metrics to wandb
            wandb.log({
                'epoch': epoch + 1,
                'train_loss': train_loss,
                'train_accuracy': train_accuracy,
                'val_loss': val_loss,
                'val_accuracy': val_accuracy,
                'test_loss':test_loss,
                'test_accuracy':test_accuracy
            })
            create_prediction_grid(model, test_loader, CLASSES)

            print(f"Epoch {epoch+1}/{config.epochs} - "
                  f"Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.2f}%, "
                  f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.2f}%"
                  f"Test Loss: {test_loss:.4f}, test Acc: {test_accuracy:.2f}%")

        return model, val_accuracy

# Define sweep configuration
sweep_config = {
    'method': 'grid',  # Bayesian optimization for efficient search
    'metric': {
        'name': 'val_accuracy',
        'goal': 'maximize'
    },
    'parameters': {
        'train_dir': {
            'value': '/kaggle/working/nature_12k/inaturalist_12K/train'
        },
        'test_dir': {
            'value': '/kaggle/working/nature_12k/inaturalist_12K/val'
        },
        'batch_size': {
            'values': [32]  # Small batch sizes to avoid OOM
        },
        'input_size':{
            'values':[600]
        },
        'num_filters': {
            'values': [32]  # Filter counts
        },
        'filter_size': {
            'values': [5]  # Filter sizes
        },
        'activation': {
            'values': ['gelu']  # Activation functions
        },
        'filter_organization': {
            'values': ['double']  # Filter organization strategies
        },
        'data_augmentation': {
            'values': [True]  # Whether to use data augmentation
        },
        'use_batchnorm': {
            'values': [False]  # Whether to use batch normalization
        },
        'dropout_rate': {
            'values': [0.2,0.0]  # Dropout rates
        },
        'dense_neurons': {
            'values': [128]  # Number of neurons in dense layer
        },
        'learning_rate': {
            'values': [0.0001]  # Learning rates
        },
        'epochs': {
            'value': 10  # Fixed number of epochs for all runs
        }
    }
}

# Initialize wandb
wandb.login(key="2b8654ea1d7143307fd59d1ea1bda5bc9f6fef77")

# Create the sweep
sweep_id = wandb.sweep(sweep_config, project="da6401_assignment2")

# Run the sweep (limit to 20 runs for efficiency)
wandb.agent(sweep_id, entity="cs24m048-iit-madras", project="da6401_assignment2" , function=train_with_wandb, count=2)


In [4]:
!gdown --id 15ko_kLn_UQO6tZzrG1IWnNbZzOC-uAdw
!unzip -q nature_12K.zip -d nature_12k

Downloading...
From (original): https://drive.google.com/uc?id=15ko_kLn_UQO6tZzrG1IWnNbZzOC-uAdw
From (redirected): https://drive.google.com/uc?id=15ko_kLn_UQO6tZzrG1IWnNbZzOC-uAdw&confirm=t&uuid=fc3bcaf7-a259-4889-8502-b00416eee889
To: /kaggle/working/nature_12K.zip
100%|███████████████████████████████████████| 3.82G/3.82G [00:34<00:00, 111MB/s]
replace nature_12k/inaturalist_12K/.DS_Store? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [14]:
import gc
import torch
gc.collect()
torch.cuda.empty_cache()

In [39]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
from torch.cuda import amp
from tqdm import tqdm
import numpy as np
import math

# Free up GPU memory
torch.cuda.empty_cache()

# Set PyTorch memory allocation config to avoid fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

# Configuration
BATCH_SIZE = 16  # Reduced batch size to avoid OOM
EPOCHS = 20
LEARNING_RATE = 0.0001
IMAGE_SIZE = 400  # Reduced image size to save memory
NUM_WORKERS = 4
ACCUMULATION_STEPS = 4  # Gradient accumulation steps
PATIENCE = 5  # Early stopping patience
DATASET_PATH = "/kaggle/working/nature_12k/inaturalist_12K"  # Update with your dataset path

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory allocated: {torch.cuda.memory_allocated(0) / 1024**2:.1f} MB")
    print(f"Memory cached: {torch.cuda.memory_reserved(0) / 1024**2:.1f} MB")

# Custom transform to handle image orientation
class RotateIfNeeded:
    def __call__(self, img):
        width, height = img.size
        if height > width:
            return img.transpose(2)  # Rotate 90 degrees if portrait
        return img

def center_crop_image(img, target_size=(600, 600)):
    """Crop the center of the image to the target size."""
    width, height = img.size
    target_width, target_height = target_size
    left = max(0, (width - target_width) // 2)
    top = max(0, (height - target_height) // 2)
    right = left + target_width
    bottom = top + target_height
    return img.crop((left, top, right, bottom))
# Define data transforms
train_transform = transforms.Compose([
    RotateIfNeeded(),
    transforms.Lambda(lambda img: center_crop_image(img, target_size=(IMAGE_SIZE, IMAGE_SIZE))),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

val_transform = transforms.Compose([
    RotateIfNeeded(),
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Load datasets
print(f"Loading datasets from {DATASET_PATH}")
train_dataset = datasets.ImageFolder(DATASET_PATH + "/train", transform=train_transform)
val_dataset = datasets.ImageFolder(DATASET_PATH + "/val", transform=val_transform)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

# Get number of classes
num_classes = len(train_dataset.classes)
print(f"Number of classes: {num_classes}")
print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of validation samples: {len(val_dataset)}")
print(f"Class names: {train_dataset.classes}")

# Load EfficientNet V2 Small model with pre-trained weights (smaller than Large to save memory)
model = models.efficientnet_v2_s(weights=models.EfficientNet_V2_S_Weights.IMAGENET1K_V1)

# Replace classifier with one matching iNaturalist classes
in_features = model.classifier[1].in_features
model.classifier[1] = nn.Linear(in_features, num_classes)

# Move model to GPU
model = model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Initialize mixed precision scaler
scaler = amp.GradScaler()

# Create directory for saving models
os.makedirs("models", exist_ok=True)

# Training loop
best_val_accuracy = 0.0
early_stopping_counter = 0

for epoch in range(EPOCHS):
    # Training phase
    model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0
    optimizer.zero_grad()
    
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS} Training")
    
    for i, (inputs, labels) in enumerate(progress_bar):
        inputs, labels = inputs.to(device), labels.to(device)
        
        # Use mixed precision
        with amp.autocast():
            outputs = model(inputs)
            loss = criterion(outputs, labels) / ACCUMULATION_STEPS
        
        # Scale gradients and backpropagate
        scaler.scale(loss).backward()
        
        # Update weights after accumulating gradients
        if (i + 1) % ACCUMULATION_STEPS == 0 or (i + 1) == len(train_loader):
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
        
        # Track statistics (multiply by accumulation steps to get true loss)
        train_loss += loss.item() * inputs.size(0) * ACCUMULATION_STEPS
        _, predicted = torch.max(outputs, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()
        
        # Update progress bar
        progress_bar.set_postfix({
            "loss": f"{loss.item() * ACCUMULATION_STEPS:.4f}",
            "accuracy": f"{100.0 * train_correct / train_total:.2f}%"
        })
    
    train_loss = train_loss / len(train_dataset)
    train_accuracy = 100.0 * train_correct / train_total
    
    # Validation phase
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    
    with torch.no_grad():
        progress_bar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{EPOCHS} Validation")
        for inputs, labels in progress_bar:
            inputs, labels = inputs.to(device), labels.to(device)
            
            # For validation, we don't need mixed precision
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            # Track statistics
            val_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()
            
            # Update progress bar
            progress_bar.set_postfix({
                "loss": f"{loss.item():.4f}",
                "accuracy": f"{100.0 * val_correct / val_total:.2f}%"
            })
    
    val_loss = val_loss / len(val_dataset)
    val_accuracy = 100.0 * val_correct / val_total
    
    # Print epoch summary
    print(f"Epoch {epoch+1}/{EPOCHS} - "
          f"Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.2f}%, "
          f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.2f}%")
    
    # Save the best model
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), os.path.join("models", "best_model.pth"))
        print(f"New best model saved with validation accuracy: {best_val_accuracy:.2f}%")
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1
        print(f"EarlyStopping: {early_stopping_counter}/{PATIENCE}")
        
        # Implement early stopping
        if early_stopping_counter >= PATIENCE:
            print("Early stopping triggered")
            break
    
    # Save checkpoint
    checkpoint_path = os.path.join("models", f"checkpoint_epoch_{epoch+1}.pth")
    torch.save({
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "train_loss": train_loss,
        "val_loss": val_loss,
        "val_accuracy": val_accuracy
    }, checkpoint_path)
    print(f"Checkpoint saved to {checkpoint_path}")

# Load the best model for final evaluation
print("Loading best model for final evaluation...")
model.load_state_dict(torch.load(os.path.join("models", "best_model.pth")))
model.eval()

# Perform final evaluation
val_correct = 0
val_total = 0

print("Final Evaluation")
progress_bar = tqdm(val_loader, desc="Final evaluation")

with torch.no_grad():
    for inputs, labels in progress_bar:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        val_total += labels.size(0)
        val_correct += (predicted == labels).sum().item()
        
        # Update progress bar
        progress_bar.set_postfix({"accuracy": f"{100.0 * val_correct / val_total:.2f}%"})

final_accuracy = 100.0 * val_correct / val_total
print(f"Final validation accuracy: {final_accuracy:.2f}%")
print(f"Best validation accuracy: {best_val_accuracy:.2f}%")

# Save class mapping
class_mapping = {i: cls for i, cls in enumerate(train_dataset.classes)}
print(f"Class mapping: {class_mapping}")

print("Training completed successfully!")


Using device: cuda
GPU: Tesla T4
Memory allocated: 351.7 MB
Memory cached: 742.0 MB
Loading datasets from /kaggle/working/nature_12k/inaturalist_12K
Number of classes: 10
Number of training samples: 9999
Number of validation samples: 2000
Class names: ['Amphibia', 'Animalia', 'Arachnida', 'Aves', 'Fungi', 'Insecta', 'Mammalia', 'Mollusca', 'Plantae', 'Reptilia']


  scaler = amp.GradScaler()
  with amp.autocast():
Epoch 1/20 Training: 100%|██████████| 625/625 [02:19<00:00,  4.48it/s, loss=0.5315, accuracy=75.03%]
Epoch 1/20 Validation: 100%|██████████| 125/125 [00:17<00:00,  7.25it/s, loss=0.1300, accuracy=89.10%]


Epoch 1/20 - Train Loss: 0.8898, Train Acc: 75.03%, Val Loss: 0.3625, Val Acc: 89.10%
New best model saved with validation accuracy: 89.10%
Checkpoint saved to models/checkpoint_epoch_1.pth


Epoch 2/20 Training: 100%|██████████| 625/625 [02:17<00:00,  4.53it/s, loss=0.1385, accuracy=89.43%]
Epoch 2/20 Validation: 100%|██████████| 125/125 [00:17<00:00,  7.33it/s, loss=0.3115, accuracy=89.90%]


Epoch 2/20 - Train Loss: 0.3486, Train Acc: 89.43%, Val Loss: 0.3521, Val Acc: 89.90%
New best model saved with validation accuracy: 89.90%
Checkpoint saved to models/checkpoint_epoch_2.pth


Epoch 3/20 Training: 100%|██████████| 625/625 [02:18<00:00,  4.53it/s, loss=0.3926, accuracy=92.58%]
Epoch 3/20 Validation: 100%|██████████| 125/125 [00:17<00:00,  7.30it/s, loss=0.2655, accuracy=89.20%]


Epoch 3/20 - Train Loss: 0.2434, Train Acc: 92.58%, Val Loss: 0.3413, Val Acc: 89.20%
EarlyStopping: 1/5
Checkpoint saved to models/checkpoint_epoch_3.pth


Epoch 4/20 Training:  83%|████████▎ | 520/625 [01:55<00:23,  4.50it/s, loss=0.5478, accuracy=94.98%]


KeyboardInterrupt: 

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms, models
from torch.cuda import amp
from tqdm import tqdm
import numpy as np
import wandb
from sklearn.model_selection import StratifiedShuffleSplit

# Free up GPU memory
torch.cuda.empty_cache()

# Set PyTorch memory allocation config
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory allocated: {torch.cuda.memory_allocated(0) / 1024**2:.1f} MB")
    print(f"Memory cached: {torch.cuda.memory_reserved(0) / 1024**2:.1f} MB")

# Custom transform to handle image orientation
class RotateIfNeeded:
    def __call__(self, img):
        width, height = img.size
        if height > width:
            return img.transpose(2)  # Rotate 90 degrees if portrait
        return img

def center_crop_image(img, target_size=(400, 400)):
    """Crop the center of the image to the target size."""
    width, height = img.size
    target_width, target_height = target_size
    left = max(0, (width - target_width) // 2)
    top = max(0, (height - target_height) // 2)
    right = left + target_width
    bottom = top + target_height
    return img.crop((left, top, right, bottom))

def count_parameters(model):
    """Count the number of trainable and non-trainable parameters"""
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    non_trainable = sum(p.numel() for p in model.parameters() if not p.requires_grad)
    return trainable, non_trainable

def freeze_all_layers_except_last(model):
    """Freeze all layers except the last fully connected layer"""
    # Freeze all parameters
    for param in model.parameters():
        param.requires_grad = False
    
    # Unfreeze the last layer (classifier)
    for param in model.classifier[1].parameters():
        param.requires_grad = True
    
    return model

def freeze_first_k_layers(model, k):
    """Freeze first K layers, train the rest"""
    # First, unfreeze all parameters
    for param in model.parameters():
        param.requires_grad = True
    
    # Get list of all layers in features
    features = list(model.features)
    
    # Freeze the first k layers
    for i in range(min(k, len(features))):
        for param in features[i].parameters():
            param.requires_grad = False
    
    return model

def no_freezing(model):
    """Train all layers (no freezing)"""
    for param in model.parameters():
        param.requires_grad = True
    return model

# Function that will be called by wandb sweep
def train_model_sweep(config=None):
    with wandb.init(config=config):
        # Access all hyperparameters through wandb.config
        config = wandb.config
        
        # Data transformations
        train_transform = transforms.Compose([
            RotateIfNeeded(),
            transforms.Lambda(lambda img: center_crop_image(img, target_size=(config.image_size, config.image_size))),
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(15),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

        val_transform = transforms.Compose([
            RotateIfNeeded(),
            transforms.Lambda(lambda img: center_crop_image(img, target_size=(config.image_size, config.image_size))),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
        
        test_transform = transforms.Compose([
            RotateIfNeeded(),
            transforms.Lambda(lambda img: center_crop_image(img, target_size=(config.image_size, config.image_size))),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
        
        # Load full training dataset
        print(f"Loading datasets from {config.dataset_path}")
        full_train_dataset = datasets.ImageFolder(config.dataset_path + "/train", transform=train_transform)
        
        # Get targets for stratified split
        targets = np.array([label for _, label in full_train_dataset.samples])
        
        # Create stratified split - 80% train, 20% validation
        sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
        train_idx, val_idx = next(sss.split(np.zeros(len(targets)), targets))
        
        # Verify the class distribution
        train_labels = targets[train_idx]
        val_labels = targets[val_idx]
        unique_labels = np.unique(targets)
        
        print("\nClass distribution:")
        for label in unique_labels:
            train_count = np.sum(train_labels == label)
            val_count = np.sum(val_labels == label)
            total_count = np.sum(targets == label)
            print(f"Class {full_train_dataset.classes[label]}: "
                  f"Train {train_count}/{total_count} ({train_count/total_count:.2f}), "
                  f"Val {val_count}/{total_count} ({val_count/total_count:.2f})")
        
        # Create Subset objects for train and validation
        train_subset = Subset(full_train_dataset, train_idx)
        val_subset = Subset(full_train_dataset, val_idx)
        
        # Load the test dataset (using the val folder)
        test_dataset = datasets.ImageFolder(config.dataset_path + "/val", transform=test_transform)
        
        # Create data loaders
        train_loader = DataLoader(
            train_subset, 
            batch_size=config.batch_size, 
            shuffle=True, 
            num_workers=config.num_workers, 
            pin_memory=True
        )
        
        val_loader = DataLoader(
            val_subset, 
            batch_size=config.batch_size, 
            shuffle=False, 
            num_workers=config.num_workers, 
            pin_memory=True
        )
        
        test_loader = DataLoader(
            test_dataset, 
            batch_size=config.batch_size, 
            shuffle=False, 
            num_workers=config.num_workers, 
            pin_memory=True
        )
        
        # Get number of classes
        num_classes = len(full_train_dataset.classes)
        print(f"\nNumber of classes: {num_classes}")
        print(f"Full training dataset: {len(full_train_dataset)} images")
        print(f"Training subset: {len(train_subset)} images ({len(train_subset)/len(full_train_dataset):.1%})")
        print(f"Validation subset: {len(val_subset)} images ({len(val_subset)/len(full_train_dataset):.1%})")
        print(f"Test dataset: {len(test_dataset)} images")

        # Load model
        model = models.efficientnet_v2_s(weights=models.EfficientNet_V2_S_Weights.IMAGENET1K_V1)
        
        # Replace classifier
        in_features = model.classifier[1].in_features
        model.classifier[1] = nn.Linear(in_features, num_classes)
        
        # Apply freezing strategy based on config
        if config.freezing_strategy == "freeze_all_except_last":
            model = freeze_all_layers_except_last(model)
        elif config.freezing_strategy == "freeze_first_k_layers":
            model = freeze_first_k_layers(model, config.freeze_k_layers)
        elif config.freezing_strategy == "no_freezing":
            model = no_freezing(model)
        else:
            raise ValueError(f"Unknown freezing strategy: {config.freezing_strategy}")
        
        # Count parameters
        trainable_params, non_trainable_params = count_parameters(model)
        total_params = trainable_params + non_trainable_params
        percent_trainable = 100 * trainable_params / total_params
        
        print(f"\nTrainable parameters: {trainable_params:,} ({percent_trainable:.2f}%)")
        print(f"Non-trainable parameters: {non_trainable_params:,} ({100-percent_trainable:.2f}%)")
        print(f"Total parameters: {total_params:,}")
        
        # Log parameter counts to wandb
        wandb.log({
            "trainable_params": trainable_params,
            "non_trainable_params": non_trainable_params,
            "percent_trainable": percent_trainable
        })
        
        # Move model to device
        model = model.to(device)
        
        # Define loss function and optimizer (only train parameters that require gradients)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()), 
            lr=config.learning_rate
        )
        
        # Initialize mixed precision scaler
        scaler = amp.GradScaler()
        
        # Create directory for saving models
        os.makedirs(f"models/{wandb.run.id}", exist_ok=True)
        
        # Training loop - fixed at 10 epochs, no early stopping
        best_val_accuracy = 0.0
        
        for epoch in range(config.epochs):
            # Training phase
            model.train()
            train_loss = 0.0
            train_correct = 0
            train_total = 0
            optimizer.zero_grad()
            
            progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{config.epochs} Training")
            
            for i, (inputs, labels) in enumerate(progress_bar):
                inputs, labels = inputs.to(device), labels.to(device)
                
                # Use mixed precision
                with amp.autocast():
                    outputs = model(inputs)
                    loss = criterion(outputs, labels) / config.accumulation_steps
                
                # Scale gradients and backpropagate
                scaler.scale(loss).backward()
                
                # Update weights after accumulating gradients
                if (i + 1) % config.accumulation_steps == 0 or (i + 1) == len(train_loader):
                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad()
                
                # Track statistics
                train_loss += loss.item() * inputs.size(0) * config.accumulation_steps
                _, predicted = torch.max(outputs, 1)
                train_total += labels.size(0)
                train_correct += (predicted == labels).sum().item()
                
                # Update progress bar
                progress_bar.set_postfix({
                    "loss": f"{loss.item() * config.accumulation_steps:.4f}",
                    "accuracy": f"{100.0 * train_correct / train_total:.2f}%"
                })
            
            train_loss = train_loss / len(train_subset)
            train_accuracy = 100.0 * train_correct / train_total
            
            # Validation phase
            model.eval()
            val_loss = 0.0
            val_correct = 0
            val_total = 0
            
            with torch.no_grad():
                progress_bar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{config.epochs} Validation")
                for inputs, labels in progress_bar:
                    inputs, labels = inputs.to(device), labels.to(device)
                    
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    
                    # Track statistics
                    val_loss += loss.item() * inputs.size(0)
                    _, predicted = torch.max(outputs, 1)
                    val_total += labels.size(0)
                    val_correct += (predicted == labels).sum().item()
                    
                    # Update progress bar
                    progress_bar.set_postfix({
                        "loss": f"{loss.item():.4f}",
                        "accuracy": f"{100.0 * val_correct / val_total:.2f}%"
                    })
            
            val_loss = val_loss / len(val_subset)
            val_accuracy = 100.0 * val_correct / val_total
            
            # Print epoch summary
            print(f"Epoch {epoch+1}/{config.epochs} - "
                  f"Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.2f}%, "
                  f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.2f}%")
            
            # Log to wandb
            wandb.log({
                "epoch": epoch + 1,
                "train_loss": train_loss,
                "train_accuracy": train_accuracy,
                "val_loss": val_loss,
                "val_accuracy": val_accuracy
            })
            
            # Save the best model
            if val_accuracy > best_val_accuracy:
                best_val_accuracy = val_accuracy
                torch.save(model.state_dict(), os.path.join(f"models/{wandb.run.id}", "best_model.pth"))
                print(f"New best model saved with validation accuracy: {best_val_accuracy:.2f}%")
        
        # Load the best model for final evaluation on test set
        print("\nLoading best model for final test evaluation...")
        model.load_state_dict(torch.load(os.path.join(f"models/{wandb.run.id}", "best_model.pth")))
        model.eval()
        
        # Perform test evaluation
        test_loss = 0.0
        test_correct = 0
        test_total = 0
        
        print("Test Evaluation")
        progress_bar = tqdm(test_loader, desc="Testing on val folder (test set)")
        
        with torch.no_grad():
            for inputs, labels in progress_bar:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                
                test_loss += loss.item() * inputs.size(0)
                _, predicted = torch.max(outputs, 1)
                test_total += labels.size(0)
                test_correct += (predicted == labels).sum().item()
                
                # Update progress bar
                progress_bar.set_postfix({
                    "loss": f"{loss.item():.4f}",
                    "accuracy": f"{100.0 * test_correct / test_total:.2f}%"
                })
        
        test_loss = test_loss / len(test_dataset)
        test_accuracy = 100.0 * test_correct / test_total
        
        print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")
        print(f"Best validation accuracy: {best_val_accuracy:.2f}%")
        
        # Log final metrics to wandb
        wandb.log({
            "test_loss": test_loss,
            "test_accuracy": test_accuracy,
            "best_val_accuracy": best_val_accuracy
        })

# Define sweep configuration
sweep_config = {
    'method': 'grid',  # Try all combinations since we have a focused set of strategies
    'metric': {
        'name': 'test_accuracy',
        'goal': 'maximize'
    },
    'parameters': {
        'freezing_strategy': {
            'values': ['freeze_all_except_last', 'freeze_first_k_layers', 'no_freezing']
        },
        'freeze_k_layers': {
            'value': 3  # Only used when freezing_strategy is 'freeze_first_k_layers'
        },
        'learning_rate': {
            'value': 0.0001
        },
        'batch_size': {
            'value': 16
        },
        'epochs': {
            'value': 10  # Fixed at 10 epochs as requested
        },
        'image_size': {
            'value': 400  # Sweet spot as per your findings
        },
        'num_workers': {
            'value': 4
        },
        'accumulation_steps': {
            'value': 4
        },
        'dataset_path': {
            'value': '/kaggle/working/nature_12k/inaturalist_12K'  # Update with your dataset path
        }
    }
}

wandb.login(key="2b8654ea1d7143307fd59d1ea1bda5bc9f6fef77")
# Initialize wandb sweep
sweep_id = wandb.sweep(sweep_config, project="da6401_assignment2")

# Run the sweep agent
wandb.agent(sweep_id, train_model_sweep)


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Using device: cuda
GPU: Tesla T4
Memory allocated: 16.2 MB
Memory cached: 50.0 MB
Create sweep with ID: mgg0mjal
Sweep URL: https://wandb.ai/cs24m048-iit-madras/da6401_assignment2/sweeps/mgg0mjal


[34m[1mwandb[0m: Agent Starting Run: s7ic27w0 with config:
[34m[1mwandb[0m: 	accumulation_steps: 4
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	dataset_path: /kaggle/working/nature_12k/inaturalist_12K
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	freeze_k_layers: 3
[34m[1mwandb[0m: 	freezing_strategy: freeze_all_except_last
[34m[1mwandb[0m: 	image_size: 400
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	num_workers: 4


Loading datasets from /kaggle/working/nature_12k/inaturalist_12K

Class distribution:
Class Amphibia: Train 800/1000 (0.80), Val 200/1000 (0.20)
Class Animalia: Train 800/1000 (0.80), Val 200/1000 (0.20)
Class Arachnida: Train 800/1000 (0.80), Val 200/1000 (0.20)
Class Aves: Train 800/1000 (0.80), Val 200/1000 (0.20)
Class Fungi: Train 799/999 (0.80), Val 200/999 (0.20)
Class Insecta: Train 800/1000 (0.80), Val 200/1000 (0.20)
Class Mammalia: Train 800/1000 (0.80), Val 200/1000 (0.20)
Class Mollusca: Train 800/1000 (0.80), Val 200/1000 (0.20)
Class Plantae: Train 800/1000 (0.80), Val 200/1000 (0.20)
Class Reptilia: Train 800/1000 (0.80), Val 200/1000 (0.20)

Number of classes: 10
Full training dataset: 9999 images
Training subset: 7999 images (80.0%)
Validation subset: 2000 images (20.0%)
Test dataset: 2000 images


  scaler = amp.GradScaler()



Trainable parameters: 12,810 (0.06%)
Non-trainable parameters: 20,177,488 (99.94%)
Total parameters: 20,190,298


  with amp.autocast():
Epoch 1/10 Training: 100%|██████████| 500/500 [00:57<00:00,  8.65it/s, loss=1.8327, accuracy=44.63%]
Epoch 1/10 Validation: 100%|██████████| 125/125 [00:23<00:00,  5.39it/s, loss=1.8260, accuracy=72.60%]


Epoch 1/10 - Train Loss: 2.0474, Train Acc: 44.63%, Val Loss: 1.8371, Val Acc: 72.60%
New best model saved with validation accuracy: 72.60%


Epoch 2/10 Training: 100%|██████████| 500/500 [00:56<00:00,  8.87it/s, loss=1.4532, accuracy=69.97%]
Epoch 2/10 Validation: 100%|██████████| 125/125 [00:23<00:00,  5.33it/s, loss=1.5742, accuracy=80.60%]


Epoch 2/10 - Train Loss: 1.6387, Train Acc: 69.97%, Val Loss: 1.5044, Val Acc: 80.60%
New best model saved with validation accuracy: 80.60%


Epoch 3/10 Training: 100%|██████████| 500/500 [00:56<00:00,  8.91it/s, loss=1.2314, accuracy=74.78%]
Epoch 3/10 Validation: 100%|██████████| 125/125 [00:23<00:00,  5.30it/s, loss=1.3288, accuracy=81.95%]


Epoch 3/10 - Train Loss: 1.3547, Train Acc: 74.78%, Val Loss: 1.2514, Val Acc: 81.95%
New best model saved with validation accuracy: 81.95%


Epoch 4/10 Training: 100%|██████████| 500/500 [00:55<00:00,  9.00it/s, loss=0.7072, accuracy=76.30%]
Epoch 4/10 Validation:   3%|▎         | 4/125 [00:01<00:30,  4.01it/s, loss=1.1168, accuracy=81.25%]