In [1]:
import wandb
import glob
import torch.optim as optim

import numpy as np
import torch
import torch.nn as nn
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import ConcatDataset
from torchsummary import summary
import gc
import pickle
import csv
from datetime import datetime

In [2]:
# key: ed118af0347d47519a2b9d68527b0032bcf73907
wandb.login(key='ed118af0347d47519a2b9d68527b0032bcf73907')

import wandb

# Define parameters
name = f"resnet_experiment_{datetime.now().strftime("%Y%m%d_%H%M%S")}"
num_epochs = 300
warmup_epochs = 10          # number of epochs to warm up the learning rate
batch_size = 64
has_checkpoint = False
learning_rate = 0.02  # initial learning rate
augmentation_ratio = 0.6  # percentage of data to be augmented (60%)
train_val_ratio = 0.8  # percentage of data to be considered for training
block_size = [3, 3, 4, 3]
dropout_prob = 0.3
use_se = True
se_reduction = 8 # dictates how much the channel dimension is compressed. The lower it is, the lesser the compression and more nuances learnt
MIXUP_ALPHA = 0.3         # Mixup Beta distribution parameter
CUTMIX_ALPHA = 1.0        # CutMix Beta distribution parameter
ADVANCED_MIXUP_PROB = 0.5   # Probability of applying CutMix vs. Mixup
weight_decay = 0.0005
momentum = 0.9


# Dynamically populate the architecture description string
architecture_description = (
    f"ResNet with layers {block_size}, "
    f"warmup {warmup_epochs} epochs, "
    f"total {num_epochs} epochs, "
    f"{int(augmentation_ratio * 100)}%-{int((1 - augmentation_ratio) * 100)}% augmentation split, "
    f"{int(train_val_ratio * 100)}-{int((1 - train_val_ratio) * 100)} train-val split, "
    f"batch size {batch_size}, "
    f"dropout {dropout_prob}, "
    f"Squeeze-and-Excite {'enabled' if use_se else 'disabled'} (reduction={se_reduction}), "
    f"momentum {momentum}, "
    f"weight decay {weight_decay}"
    f"MIXUP_ALPHA {MIXUP_ALPHA}"
    f"CUTMIX_ALPHA {CUTMIX_ALPHA}"
    "implemented cutmix and normal mixup, squeeze and excite"
)

# Initialize wandb with the dynamic config
wandb.init(
    project="DL_project1",
    name=name,
    config={
        "num_epochs": num_epochs,
        "warmup_epochs": warmup_epochs,
        "batch_size": batch_size,
        "learning_rate": learning_rate,
        "augmentation_ratio": augmentation_ratio,
        "train_val_ratio": train_val_ratio,
        "block_size": block_size,
        "dropout_prob": dropout_prob,
        "use_se": use_se,
        "se_reduction": se_reduction,
        "weight_decay": weight_decay,
        "momentum": momentum,
        "architecture": architecture_description,
        "dataset": "CIFAR10"
    }
)


config = wandb.config

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/pb3073/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpb3073[0m ([33mpb3073-new-york-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# Load the dataset
# Defining Transformers for train and test set differently
# Load the dataset with training transforms
# train_transform = transforms.Compose([
#     transforms.RandomRotation(5),
#     transforms.RandomHorizontalFlip(0.5),
#     transforms.RandomCrop(32, padding=2),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
#                          std=[0.2023, 0.1994, 0.2010])
# ])
train_transform = transforms.Compose([
    transforms.RandomRotation(5),
    transforms.RandomHorizontalFlip(0.5),
    transforms.RandomCrop(32, padding=4),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.05),
    transforms.AutoAugment(transforms.AutoAugmentPolicy.CIFAR10),
    #transforms.RandomGrayscale(p=0.1),
    transforms.RandomAdjustSharpness(sharpness_factor=2, p=0.3),
    transforms.ToTensor(),
    transforms.RandomErasing(p=0.3, scale=(0.1, 0.2), ratio=(0.5, 2.0), value="random"),
    transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
                         std=[0.2023, 0.1994, 0.2010])
])
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
                         std=[0.2023, 0.1994, 0.2010])
])

# Download the full training set (will be split into train and validation)
full_train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transform)

# Determine sizes for training and validation splits
validation_split = 0.15
val_size = int(len(full_train_dataset) * validation_split)
train_size = len(full_train_dataset) - val_size

# Split the dataset
train_dataset, val_dataset = torch.utils.data.random_split(full_train_dataset, [train_size, val_size])

# Define test dataset separately
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transform)

# Create data loaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
valid_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

In [4]:
print(len(train_loader))
print(len(valid_loader))

665
118


In [5]:
print(train_loader)

<torch.utils.data.dataloader.DataLoader object at 0x151902399e20>


In [6]:
class SqueezeExcite(nn.Module):
    def __init__(self, num_channels, reduction=8):
        """
        Squeeze-and-Excite block.

        Args:
            num_channels (int): Number of input channels.
            reduction (int): Reduction ratio for the bottleneck. Default: 8.
        """
        super(SqueezeExcite, self).__init__()
        self.global_avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc1 = nn.Conv2d(num_channels, num_channels // reduction, kernel_size=1, bias=True)
        self.relu = nn.ReLU(inplace=True)
        self.fc2 = nn.Conv2d(num_channels // reduction, num_channels, kernel_size=1, bias=True)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Squeeze: Global average pooling
        se = self.global_avg_pool(x)
        # Excitation: two FC layers with ReLU and Sigmoid activations
        se = self.fc1(se)
        se = self.relu(se)
        se = self.fc2(se)
        se = self.sigmoid(se)
        # Scale: multiply the original input with the learned channel weights
        return x * se

class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1, downsample=None,
                 use_se=False, se_reduction=8):
        """
        Residual block with an optional Squeeze-and-Excite module.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            stride (int): Stride for the first convolution. Default: 1.
            downsample (nn.Module or None): Downsampling layer if needed.
            use_se (bool): Whether to include the SE block. Default: False.
            se_reduction (int): Reduction ratio for the SE block. Default: 8.
        """
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

        self.use_se = use_se
        if self.use_se:
            self.se = SqueezeExcite(out_channels, reduction=se_reduction)

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)  # First activation

        out = self.conv2(out)
        out = self.bn2(out)

        # Optionally apply Squeeze-and-Excite
        if self.use_se:
            out = self.se(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)  # Second activation

        return out

In [7]:
import torch.nn.functional as F

class ResNetWithDropout(nn.Module):
    def __init__(self, block, layers, num_classes=10, dropout_prob=0.5, use_se=False, se_reduction=8):
        """
        ResNet with dropout and optional SE blocks.

        Args:
            block: Residual block type (e.g., ResidualBlock).
            layers (list): Number of blocks in each layer.
            num_classes (int): Number of output classes.
            dropout_prob (float): Dropout probability before the final FC layer.
            use_se (bool): Whether to include SE blocks in each residual block.
            se_reduction (int): Reduction ratio for SE blocks.
        """
        super(ResNetWithDropout, self).__init__()
        self.use_se = use_se
        self.se_reduction = se_reduction
        self.inplanes = 32
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True)
        )
        self.layer1 = self._make_layer(block, 32, layers[0], stride=1)
        self.layer2 = self._make_layer(block, 64, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 128, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 256, layers[3], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout(dropout_prob)
        self.fc = nn.Linear(256, num_classes)

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes),
            )
        layers = []
        # Pass the external SE configuration to the block
        layers.append(block(self.inplanes, planes, stride, downsample,
                            use_se=self.use_se, se_reduction=self.se_reduction))
        self.inplanes = planes
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, use_se=self.use_se, se_reduction=self.se_reduction))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.dropout(x)  # Apply dropout before the final FC layer
        x = self.fc(x)
        return x

# Example usage with label smoothing in the loss function:
class LabelSmoothingCrossEntropy(nn.Module):
    def __init__(self, smoothing=0.1):
        super(LabelSmoothingCrossEntropy, self).__init__()
        self.smoothing = smoothing

    def forward(self, pred, target):
        confidence = 1.0 - self.smoothing
        logprobs = F.log_softmax(pred, dim=-1)
        nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1)).squeeze(1)
        smooth_loss = -logprobs.mean(dim=-1)
        loss = confidence * nll_loss + self.smoothing * smooth_loss
        return loss.mean()

import torch.nn.init as init

def initialize_weights(m):
    if isinstance(m, nn.Conv2d):
        # Kaiming normal initialization for Conv2d layers
        init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
        if m.bias is not None:
            init.constant_(m.bias, 0)
    elif isinstance(m, nn.Linear):
        # Kaiming normal initialization for Linear layers
        init.kaiming_normal_(m.weight, nonlinearity='relu')
        if m.bias is not None:
            init.constant_(m.bias, 0)
    elif isinstance(m, nn.BatchNorm2d):
        # Initialize BatchNorm weights and biases
        init.constant_(m.weight, 1)
        init.constant_(m.bias, 0)


# Instantiate the improved model
model = ResNetWithDropout(ResidualBlock, block_size, num_classes=10, dropout_prob=dropout_prob, use_se=use_se, se_reduction=se_reduction).to(device)
# He Initialization
model.apply(initialize_weights)

criterion = LabelSmoothingCrossEntropy(smoothing=0.1)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay, momentum=momentum)

In [8]:
import math

# Instantiate the model
# 3, 3, 6, 3
# model = ResNet(ResidualBlock, [2, 2, 2, 2]).to(device)
# criterion = nn.CrossEntropyLoss()

# # [CHANGED] Original optimizer using SGD with momentum
# optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=0.001, momentum=0.9)

# -------------------------------
# [CHANGED] Warmup and Cosine Annealing Scheduler Setup
# -------------------------------
total_epochs = num_epochs  # total number of epochs in training
start_epoch = 0

def lr_lambda(epoch):
    # Warmup phase: linearly increase LR during the first 'warmup_epochs'
    if epoch < warmup_epochs:
        return float(epoch + 1) / warmup_epochs
    # Cosine annealing after warmup
    else:
        # Adjust epoch number for cosine annealing starting at 0 after warmup
        cosine_epoch = epoch - warmup_epochs
        cosine_total = total_epochs - warmup_epochs
        return 0.5 * (1 + math.cos(math.pi * cosine_epoch / cosine_total))

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)

In [9]:
summary(model, (3, 32, 32))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 32, 32, 32]             864
       BatchNorm2d-2           [-1, 32, 32, 32]              64
              ReLU-3           [-1, 32, 32, 32]               0
            Conv2d-4           [-1, 32, 32, 32]           9,216
       BatchNorm2d-5           [-1, 32, 32, 32]              64
              ReLU-6           [-1, 32, 32, 32]               0
            Conv2d-7           [-1, 32, 32, 32]           9,216
       BatchNorm2d-8           [-1, 32, 32, 32]              64
 AdaptiveAvgPool2d-9             [-1, 32, 1, 1]               0
           Conv2d-10              [-1, 4, 1, 1]             132
             ReLU-11              [-1, 4, 1, 1]               0
           Conv2d-12             [-1, 32, 1, 1]             160
          Sigmoid-13             [-1, 32, 1, 1]               0
    SqueezeExcite-14           [-1, 32,

In [10]:
import os

# Define checkpoint saving interval
save_interval = 5  # Save every 5 epochs
checkpoint_dir = "checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)  # Create directory if it doesn't exist

In [11]:
has_checkpoint = False
if has_checkpoint == True:
  checkpoint = torch.load("checkpoints/model_epoch_110.pth")  # Adjust to the last saved epoch
  model.load_state_dict(checkpoint['model_state_dict'])
  optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
  start_epoch = checkpoint['epoch']
  print(f"Resuming from epoch {start_epoch}")


In [12]:
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))

True
1
NVIDIA A100-SXM4-40GB


In [13]:
def rand_bbox(size, lam):
    """
    Generate random bounding box coordinates for CutMix.

    Args:
        size (tuple): Size of the input tensor (batch, channels, width, height).
        lam (float): Mixing ratio (lambda).

    Returns:
        Tuple of bounding box coordinates (bbx1, bby1, bbx2, bby2).
    """
    W = size[2]
    H = size[3]
    cut_rat = np.sqrt(1. - lam)  # Compute cut ratio
    cut_w = np.int64(W * cut_rat)
    cut_h = np.int64(H * cut_rat)
    
    # Random center point for the bounding box
    cx = np.random.randint(W)
    cy = np.random.randint(H)
    
    # Calculate bounding box coordinates and clip them to the image boundaries
    bbx1 = np.clip(cx - cut_w // 2, 0, W)
    bby1 = np.clip(cy - cut_h // 2, 0, H)
    bbx2 = np.clip(cx + cut_w // 2, 0, W)
    bby2 = np.clip(cy + cut_h // 2, 0, H)
    
    return bbx1, bby1, bbx2, bby2

# ============================================
# CutMix Augmentation Function
# ============================================
def cutmix_data(x, y, alpha=CUTMIX_ALPHA):
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1.0

    batch_size = x.size(0)
    index = torch.randperm(batch_size).to(x.device)
    y_a, y_b = y, y[index]

    bbx1, bby1, bbx2, bby2 = rand_bbox(x.size(), lam)

    # Create a boolean mask for the region to replace.
    mask = torch.ones(x.size(), dtype=torch.bool, device=x.device)
    mask[:, :, bbx1:bbx2, bby1:bby2] = False

    # Use torch.where to combine images
    x = torch.where(mask, x, x[index])
    
    # Adjust lambda to reflect the actual area replaced
    lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / (x.size(-1) * x.size(-2)))
    return x, y_a, y_b, lam



# ============================================
# Mixup Augmentation Function
# ============================================
def mixup_data(x, y, alpha=MIXUP_ALPHA):
    """
    Applies Mixup augmentation by linear interpolation between pairs of images.

    Args:
        x (Tensor): Input images.
        y (Tensor): Labels.
        alpha (float): Beta distribution parameter.

    Returns:
        mixed_x (Tensor): Mixed images.
        y_a (Tensor): Original labels.
        y_b (Tensor): Shuffled labels.
        lam (float): Mixup coefficient.
    """
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1.0

    batch_size = x.size()[0]
    index = torch.randperm(batch_size).to(x.device)
    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

# ============================================
# Mixup Criterion Function
# ============================================
def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """
    Computes the mixup loss as a linear combination of the losses on the mixed pairs.

    Args:
        criterion: Loss function.
        pred (Tensor): Model predictions.
        y_a (Tensor): Original labels.
        y_b (Tensor): Shuffled labels.
        lam (float): Mixup coefficient.

    Returns:
        A scalar loss value.
    """
    return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)

# ============================================
# Advanced Mixup Function
# ============================================
def advanced_mixup_data(x, y):
    """
    Applies either Mixup or CutMix augmentation based on a predefined probability.

    Returns:
        Mixed inputs, paired labels, and mixup coefficient.
    """
    if np.random.rand() < ADVANCED_MIXUP_PROB:
        return cutmix_data(x, y, alpha=CUTMIX_ALPHA)
    else:
        return mixup_data(x, y, alpha=MIXUP_ALPHA)

In [14]:
# Assuming mixup_data and mixup_criterion functions are defined as shown earlier.

total_step = len(train_loader)
last = 0
for epoch in range(start_epoch, num_epochs):
    model.train()
    running_loss = 0.0
    running_correct = 0       # Accumulate weighted correct predictions for training accuracy.
    running_total = 0         # Accumulate total examples.

    for i, (images, labels) in enumerate(train_loader):
        # Move tensors to the configured device using non_blocking transfer
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        
        # Apply mixup on the batch
        mixed_images, targets_a, targets_b, lam = advanced_mixup_data(images, labels)
        # Forward pass with the mixed images
        outputs = model(mixed_images)
        # Compute mixup loss
        loss = mixup_criterion(criterion, outputs, targets_a, targets_b, lam)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # For mixup, computing "accuracy" is not straightforward.
        # One approximation is to compute a weighted accuracy:
        _, predicted = torch.max(outputs, 1)
        correct_a = (predicted == targets_a).sum().item()
        correct_b = (predicted == targets_b).sum().item()
        # Weighted sum of correct predictions using mixup lambda.
        running_correct += lam * correct_a + (1 - lam) * correct_b
        running_total += labels.size(0)
        running_loss += loss.item()

        # Log training loss for the batch (logging remains unchanged)
        wandb.log({"train/loss": loss.item(), "epoch": epoch, "batch": i})

    # Calculate average training loss and approximate training accuracy for the epoch.
    avg_loss = running_loss / total_step
    train_accuracy = 100 * running_correct / running_total
    print('Epoch [{}/{}], Loss: {:.4f}, Train Accuracy: {:.2f}%'.format(
        epoch+1, num_epochs, avg_loss, train_accuracy))

    # Validation after each epoch (use original images, no mixup during validation)
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in valid_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            loss_val = criterion(outputs, labels)
            val_loss += loss_val.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs

    avg_val_loss = val_loss / len(valid_loader)
    val_accuracy = 100 * correct / total
    print('Validation Accuracy: {:.2f} %'.format(val_accuracy))

    if (epoch + 1) % save_interval == 0:
        checkpoint_path = f"{checkpoint_dir}/model_epoch_{epoch+1}.pth"
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': avg_loss,
            'val_accuracy': val_accuracy
        }, checkpoint_path)
        print(f"Checkpoint saved at {checkpoint_path}")
    last = epoch + 1

    # Step the scheduler at the end of each epoch to update the learning rate
    scheduler.step()
    current_lr = scheduler.get_last_lr()[0]  # Capture current learning rate.
    print(f"Epoch {epoch+1} completed. Current LR: {current_lr}")

    # Log aggregated metrics for the epoch to wandb.
    wandb.log({
        "train/avg_loss": avg_loss,
        "train/accuracy": train_accuracy,
        "val/avg_loss": avg_val_loss,
        "val/accuracy": val_accuracy,
        "lr": current_lr,
        "epoch": epoch
    })


Epoch [1/300], Loss: 2.3524, Train Accuracy: 13.72%
Validation Accuracy: 20.91 %
Epoch 1 completed. Current LR: 0.004
Epoch [2/300], Loss: 2.2005, Train Accuracy: 19.03%
Validation Accuracy: 26.16 %
Epoch 2 completed. Current LR: 0.006
Epoch [3/300], Loss: 2.1270, Train Accuracy: 22.84%
Validation Accuracy: 30.40 %
Epoch 3 completed. Current LR: 0.008
Epoch [4/300], Loss: 2.0705, Train Accuracy: 26.77%
Validation Accuracy: 35.87 %
Epoch 4 completed. Current LR: 0.01
Epoch [5/300], Loss: 2.0327, Train Accuracy: 29.23%
Validation Accuracy: 38.88 %
Checkpoint saved at checkpoints/model_epoch_5.pth
Epoch 5 completed. Current LR: 0.012
Epoch [6/300], Loss: 1.9630, Train Accuracy: 33.35%
Validation Accuracy: 38.45 %
Epoch 6 completed. Current LR: 0.013999999999999999
Epoch [7/300], Loss: 1.9124, Train Accuracy: 36.35%
Validation Accuracy: 48.43 %
Epoch 7 completed. Current LR: 0.016
Epoch [8/300], Loss: 1.8742, Train Accuracy: 38.58%
Validation Accuracy: 50.93 %
Epoch 8 completed. Current LR

In [15]:
# save the model checkpoints to wandb

# Find all checkpoint files that match the naming pattern
checkpoint_files = glob.glob(f"{name}_epoch_{last}.pth")

# Create a new artifact with the same name as the experiment
artifact = wandb.Artifact(name, type="model")

# Add each checkpoint file to the artifact
for cp in checkpoint_files:
    artifact.add_file(cp)
    print(f"Added {cp} to the artifact.")

# Log the artifact to wandb
wandb.log_artifact(artifact)
print(f"Artifact '{name}' logged to wandb!")

Artifact 'resnet_experiment_20250313_180158' logged to wandb!


In [16]:
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        del images, labels, outputs
    test_accuracy = 100 * correct / total
    wandb.log({"test/accuracy": test_accuracy})
    print('Accuracy of the network on the {} test images: {} %'.format(10000, test_accuracy))

Accuracy of the network on the 10000 test images: 95.94 %


In [17]:
torch.save(model.state_dict(), "resnet_model.pth")
wandb.save("resnet_model.pth")
wandb.finish()

0,1
batch,▅▆█▅▂█▅▆▁▅▂▇▂▆▂▃▆▄▃▆▇▃▁▇█▁█▃▅▂▄█▅█▇▇▅▇▄▂
epoch,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇██
lr,▆█████████▇▇▇▇▇▇▇▆▆▆▆▅▅▄▄▄▃▃▃▂▂▂▂▂▁▁▁▁▁▁
test/accuracy,▁
train/accuracy,▁▄▄▅▅▅▅▅▆▅▅▆▅▆▆▆▆▆▆▆▆▆▆▇▆▆▇▇▇▇▇▇▇▇▇█████
train/avg_loss,█▆▆▆▅▄▄▄▃▄▃▃▃▃▃▃▃▃▃▃▃▃▂▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁
train/loss,▆▆▆▄█▇▅▇▄▆█▅▂█▄▄▂▅▅▄▆▂▆▅▆▆▁▁▂▂▅▆▄▆▃▂▁▆▄▆
val/accuracy,▁▃▄▄▅▆▅▆▆▆▆▆▇▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████
val/avg_loss,█▄▄▄▃▃▃▃▃▃▃▃▃▃▃▃▃▂▂▃▂▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁

0,1
batch,664.0
epoch,299.0
lr,0.0
test/accuracy,95.94
train/accuracy,73.4075
train/avg_loss,1.15436
train/loss,1.51346
val/accuracy,90.69333
val/avg_loss,0.78749


# Process Test Data

In [18]:
def unpickle(file):
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

In [24]:
# Load the unpickled data
data_unpickle = unpickle('./data/cifar_test_nolabel.pkl')
data = data_unpickle[b'data']
print("Original data shape:", data.shape)  # Expect (N, 3072)

# Reshape the data: (N, 3072) -> (N, 32, 32, 3)
data = data.reshape(-1, 32, 32, 3).astype('float32') / 255.0

# Convert numpy array to a torch tensor
data = torch.tensor(data, dtype=torch.float32)

# Permute dimensions: from (N, H, W, C) to (N, C, H, W)
data = data.permute(0, 3, 1, 2)
print("Data shape after permutation:", data.shape)  # Expect (N, 3, 32, 32)

# Calculate the mean and standard deviation from the test data
# mean = data.mean(dim=(0, 2, 3), keepdim=True)
# std = data.std(dim=(0, 2, 3), keepdim=True)
mean = torch.tensor([0.4914, 0.4822, 0.4465], dtype=torch.float32).view(3,1,1)
std = torch.tensor([0.2023, 0.1994, 0.2010], dtype=torch.float32).view(3,1,1)

# Apply normalization: (x - mean) / std for each channel
data = (data - mean) / std

# Move data to device (ensure that 'device' is defined)
data = data.to(device)

# Load label names
label_names = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

print("Test data shape:", data.shape)

Original data shape: (10000, 32, 32, 3)
Data shape after permutation: torch.Size([10000, 3, 32, 32])
Test data shape: torch.Size([10000, 3, 32, 32])


In [25]:
# Run inference on custom test data
predicted_labels = []
model.eval()
with torch.no_grad():
    for i in range(len(data)):
        image = data[i].unsqueeze(0)  # Add batch dimension
        outputs = model(image)
        _, predicted = torch.max(outputs, 1)
        predicted_labels.append(predicted.item())



In [26]:
# Save predicted labels to a CSV file
with open(f'./prediction_csv/{name}_orig_norm.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['ID', 'Labels'])
    for idx, label in enumerate(predicted_labels):
        writer.writerow([idx, label])

print(f"Predicted labels saved to {name}.csv")

Predicted labels saved to resnet_experiment_20250313_180158.csv


In [22]:
from collections import Counter

label_counts = Counter(predicted_labels)

# Print label counts
for label, count in label_counts.items():
    print(f"{label_names[label]}({label}): {count}")

frog(6): 918
automobile(1): 1034
ship(8): 1046
truck(9): 1014
cat(3): 1041
airplane(0): 876
bird(2): 979
dog(5): 1110
horse(7): 1010
deer(4): 972


In [23]:
with open('reference_labels.csv', 'r') as file:
    csv_reader = csv.reader(file)
    header = next(csv_reader)
    print("Header:", header)
    same=0
    different=0
    for idx, label in enumerate(predicted_labels):
        r = next(csv_reader)
        if int(r[1])==label:
            same+=1
        else:
            different+=1
    print(f"match percent: {same/(same+different)}")

Header: ['ID', 'Labels']
match percent: 0.7755
