In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# --- 1. Data Preparation ---
# Per-channel mean / std handed to Normalize by both pipelines
# (presumably the CIFAR-10 training-set statistics -- confirm before reusing elsewhere).
NORM_MEAN = (0.4914, 0.4822, 0.4465)
NORM_STD = (0.2470, 0.2435, 0.2616)

# Training pipeline: random crop (with 4px padding) and horizontal flip as augmentation,
# then tensor conversion and normalization.
transform_train = transforms.Compose(
    [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(NORM_MEAN, NORM_STD),
    ]
)

# Test pipeline: no augmentation, only tensor conversion and the same normalization.
transform_test = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(NORM_MEAN, NORM_STD),
    ]
)

# CIFAR-10 train / test splits, downloaded into ./data when not already present
trainset = datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform_train
)
testset = datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform_test
)

# Loaders: only the training loader shuffles
batch_size = 128
loader_kwargs = dict(batch_size=batch_size, num_workers=2)
trainloader = DataLoader(trainset, shuffle=True, **loader_kwargs)
testloader = DataLoader(testset, shuffle=False, **loader_kwargs)

# Human-readable label for each class index
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

# --- 2. Define Models with Attention Mechanism ---

class SelfAttention(nn.Module):
    """Spatial self-attention over a (B, C, H, W) feature map with a learned residual gate.

    Every spatial position attends to every other position. The attended
    features are scaled by ``gamma`` (initialised to zero) and added back to
    the input, so a freshly constructed layer is an identity mapping.

    Args:
        in_channels: Number of channels C of the incoming feature map.
    """

    def __init__(self, in_channels):
        super(SelfAttention, self).__init__()
        # Queries/keys live in a reduced space of C // 8 channels. Clamp to 1 so
        # that inputs with fewer than 8 channels do not produce a zero-width
        # projection (which would make the reshape in forward() fail).
        reduced_channels = max(1, in_channels // 8)
        self.query_conv = nn.Conv2d(in_channels, reduced_channels, kernel_size=1)
        self.key_conv = nn.Conv2d(in_channels, reduced_channels, kernel_size=1)
        self.value_conv = nn.Conv2d(in_channels, in_channels, kernel_size=1)
        # Residual gate; starts at 0 so the block initially passes x through unchanged.
        self.gamma = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        """Return ``gamma * attention(x) + x`` with the same shape as ``x``."""
        batch_size, C, H, W = x.size()
        num_positions = H * W

        # Flatten the spatial grid; queries are transposed so bmm yields position-by-position scores.
        proj_query = self.query_conv(x).view(batch_size, -1, num_positions).permute(0, 2, 1)  # B, HW, C'
        proj_key = self.key_conv(x).view(batch_size, -1, num_positions)  # B, C', HW

        energy = torch.bmm(proj_query, proj_key)  # B, HW(query), HW(key)
        # Normalise over the key axis: each query position's weights sum to 1.
        attention = torch.softmax(energy, dim=-1)

        proj_value = self.value_conv(x).view(batch_size, -1, num_positions)  # B, C, HW

        # Weighted sum of values for every query position.
        out = torch.bmm(proj_value, attention.permute(0, 2, 1))  # B, C, HW
        out = out.view(batch_size, C, H, W)

        return self.gamma * out + x


class BasicBlock(nn.Module):
    """Residual block of two 3x3 conv + batch-norm stages, optionally followed by self-attention.

    Args:
        in_planes: Channels of the incoming feature map.
        planes: Channels produced by both convolutions.
        stride: Stride of the first convolution (and of the projection shortcut).
        use_attention: When true, a SelfAttention layer is applied to the block output.
    """

    expansion = 1

    def __init__(self, in_planes, planes, stride=1, use_attention=False):
        super().__init__()
        out_planes = self.expansion * planes

        # Main path
        self.conv1 = nn.Conv2d(
            in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False
        )
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(
            planes, planes, kernel_size=3, stride=1, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(planes)

        # Skip path: an empty Sequential (identity) unless the shape changes,
        # in which case a strided 1x1 conv + batch-norm projects the input.
        shape_changes = stride != 1 or in_planes != out_planes
        if shape_changes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        else:
            self.shortcut = nn.Sequential()

        self.use_attention = use_attention
        if use_attention:
            self.attention = SelfAttention(out_planes)

    def forward(self, x):
        """Apply the residual block (and attention, if enabled) to ``x``."""
        hidden = torch.relu(self.bn1(self.conv1(x)))
        hidden = self.bn2(self.conv2(hidden))
        merged = torch.relu(hidden + self.shortcut(x))
        if not self.use_attention:
            return merged
        return self.attention(merged)

class ResNet(nn.Module):
    """ResNet backbone with a 3x3 stem, four residual stages and a linear classifier.

    Args:
        block: Residual block class. It is instantiated as
            ``block(in_planes, planes, stride, use_attention)`` and must expose
            an ``expansion`` class attribute.
        num_blocks: Sequence of four ints, the number of blocks in each stage.
        num_classes: Size of the classifier output.
        use_attention: Forwarded to every block.
    """

    def __init__(self, block, num_blocks, num_classes=10, use_attention=False):
        super(ResNet, self).__init__()
        # Running channel count consumed by the next block; updated by _make_layer.
        self.in_planes = 64
        self.use_attention = use_attention

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage: the first block uses ``stride``, the remaining blocks use stride 1."""
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        # Distinct loop name so the ``stride`` argument is not shadowed.
        for block_stride in strides:
            layers.append(block(self.in_planes, planes, block_stride, self.use_attention))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for an image batch ``x``."""
        out = torch.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        # Global average pooling. For 32x32 inputs layer4 emits a 4x4 map, so this
        # equals the former fixed avg_pool2d(out, 4); unlike the fixed window it
        # also yields exactly one value per channel for other input resolutions,
        # keeping the flattened width equal to what self.linear expects.
        out = nn.functional.adaptive_avg_pool2d(out, 1)
        out = torch.flatten(out, 1)
        return self.linear(out)

def ResNet18_Attention():
    """Build a ResNet of BasicBlocks, two per stage, with ``use_attention=True``."""
    stage_depths = [2, 2, 2, 2]
    return ResNet(block=BasicBlock, num_blocks=stage_depths, use_attention=True)

def ResNet18_NoAttention():
    """Build a ResNet of BasicBlocks, two per stage, with ``use_attention=False``."""
    stage_depths = [2, 2, 2, 2]
    return ResNet(block=BasicBlock, num_blocks=stage_depths, use_attention=False)

# --- 3. Ensemble Learning ---
class EnsembleModel(nn.Module):
    """Combine several models by averaging their raw outputs element-wise.

    Args:
        models: Iterable of modules that all accept the same input and return
            tensors of identical shape.
    """

    def __init__(self, models):
        super().__init__()
        # ModuleList registers the members so .to(), .eval() and state_dict() reach them.
        self.models = nn.ModuleList(models)

    def forward(self, x):
        """Run every member on ``x`` and return the mean of their outputs."""
        member_outputs = torch.stack([member(x) for member in self.models])
        return member_outputs.mean(dim=0)

# --- 4. Training Function ---
def train_model(model, dataloader, optimizer, criterion, epoch, model_name="Model"):
    """Train ``model`` for one pass over ``dataloader`` and report loss / accuracy.

    Args:
        model: Module to train; switched to training mode here.
        dataloader: Iterable of ``(inputs, targets)`` batches.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss callable ``criterion(outputs, targets)``. The epoch loss
            is weighted by batch size, which assumes the criterion returns the
            batch mean (the default reduction of ``nn.CrossEntropyLoss``).
        epoch: Epoch number, used only in the progress line.
        model_name: Label used only in the progress line.

    Returns:
        ``(avg_loss, accuracy)``: the per-sample mean loss and the accuracy in
        percent, both measured on the batches as they were trained on.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.train()

    # Move batches to wherever the model lives rather than relying on a
    # notebook-level global; parameter-less models fall back to the CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    loss_sum = 0.0
    correct = 0
    total = 0
    for inputs, targets in dataloader:
        inputs, targets = inputs.to(target_device), targets.to(target_device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        batch_count = targets.size(0)
        # Weight each batch mean by its size so a short final batch is not over-counted.
        loss_sum += loss.item() * batch_count
        _, predicted = outputs.max(1)
        total += batch_count
        correct += predicted.eq(targets).sum().item()

    if total == 0:
        raise ValueError("train_model received a dataloader that yielded no samples")

    avg_loss = loss_sum / total
    accuracy = 100. * correct / total
    print(f'Epoch {epoch} | {model_name} Loss: {avg_loss:.4f} | Acc: {accuracy:.2f}%')
    return avg_loss, accuracy

# --- 5. Evaluation Function ---
def evaluate_model(model, dataloader, criterion, model_name="Model"):
    """Evaluate ``model`` on ``dataloader`` and print / return classification metrics.

    Args:
        model: Module to evaluate; switched to eval mode here.
        dataloader: Iterable of ``(inputs, targets)`` batches.
        criterion: Loss callable ``criterion(outputs, targets)``. The reported
            loss is weighted by batch size, which assumes the criterion returns
            the batch mean (the default reduction of ``nn.CrossEntropyLoss``).
        model_name: Label used only in the printed report.

    Returns:
        ``(accuracy, precision, recall, f1)``: accuracy in percent, and the
        macro-averaged precision, recall and F1 score.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.eval()

    # Move batches to wherever the model lives rather than relying on a
    # notebook-level global; parameter-less models fall back to the CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    loss_sum = 0.0
    correct = 0
    total = 0
    all_preds = []
    all_targets = []
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(target_device), targets.to(target_device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            batch_count = targets.size(0)
            # Weight each batch mean by its size so a short final batch is not over-counted.
            loss_sum += loss.item() * batch_count
            _, predicted = outputs.max(1)
            total += batch_count
            correct += predicted.eq(targets).sum().item()

            all_preds.extend(predicted.cpu().tolist())
            all_targets.extend(targets.cpu().tolist())

    if total == 0:
        raise ValueError("evaluate_model received a dataloader that yielded no samples")

    avg_loss = loss_sum / total
    accuracy = 100. * correct / total
    # zero_division=0: a class that is never predicted contributes 0 instead of a warning.
    precision, recall, f1, _ = precision_recall_fscore_support(all_targets, all_preds, average='macro', zero_division=0)

    print(f'\n{model_name} Test Loss: {avg_loss:.4f} | Test Acc: {accuracy:.2f}%')
    print(f'{model_name} Test Precision: {precision:.4f} | Test Recall: {recall:.4f} | Test F1-Score: {f1:.4f}\n')
    return accuracy, precision, recall, f1

# --- Main Training and Evaluation Loop ---
if __name__ == "__main__":
    # Number of epochs each individual model is trained for; raise for better accuracy.
    num_epochs = 20

    # Four ensemble members: two plain ResNets and two with attention, interleaved.
    print("Initializing individual models...")
    model1, model2, model3, model4 = (
        build().to(device)
        for build in (
            ResNet18_NoAttention,
            ResNet18_Attention,
            ResNet18_NoAttention,
            ResNet18_Attention,
        )
    )

    individual_models = [model1, model2, model3, model4]
    model_names = [
        "ResNet18_NoAttention_1",
        "ResNet18_Attention_1",
        "ResNet18_NoAttention_2",
        "ResNet18_Attention_2",
    ]

    # One loss / optimizer / cosine LR schedule per member.
    criterions = [nn.CrossEntropyLoss() for _ in individual_models]
    optimizers = [
        optim.SGD(member.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
        for member in individual_models
    ]
    schedulers = [
        optim.lr_scheduler.CosineAnnealingLR(opt, T_max=num_epochs)
        for opt in optimizers
    ]

    # Train the members one after another.
    print("\n--- Training Individual Models ---")
    for name, member, opt, loss_fn, sched in zip(
        model_names, individual_models, optimizers, criterions, schedulers
    ):
        print(f"\nTraining {name}:")
        best_acc = 0
        for epoch_idx in range(num_epochs):
            train_loss, train_acc = train_model(
                member, trainloader, opt, loss_fn, epoch_idx + 1, name
            )
            sched.step()  # advance the learning-rate schedule once per epoch

            # NOTE(review): "best" is judged on training accuracy, not on a held-out split.
            if train_acc > best_acc:
                best_acc = train_acc
                torch.save(member.state_dict(), f'{name}_best.pth')
                print(f"  Saved best model for {name} with accuracy: {best_acc:.2f}%")

    # Restore each member's saved checkpoint and switch it to eval mode.
    print("\nLoading best weights for individual models...")
    for name, member in zip(model_names, individual_models):
        member.load_state_dict(torch.load(f'{name}_best.pth'))
        member.eval()

    # Wrap the members in the averaging ensemble.
    print("\n--- Evaluating Ensemble Model ---")
    ensemble_model = EnsembleModel(individual_models).to(device)
    # Used only to report a loss value during evaluation.
    ensemble_criterion = nn.CrossEntropyLoss()

    # Test-set metrics of every member on its own.
    print("\n--- Individual Model Test Performance ---")
    for name, member in zip(model_names, individual_models):
        print(f"Evaluating {name}:")
        evaluate_model(member, testloader, ensemble_criterion, name)

    # Test-set metrics of the ensemble.
    print("\n--- Ensemble Model Test Performance ---")
    ensemble_metrics = evaluate_model(
        ensemble_model, testloader, ensemble_criterion, "Ensemble_Model"
    )
    ensemble_accuracy, ensemble_precision, ensemble_recall, ensemble_f1 = ensemble_metrics

    print("\n--- Final Results ---")
    print(f"Ensemble Model Accuracy: {ensemble_accuracy:.2f}%")
    print(f"Ensemble Model Precision: {ensemble_precision:.4f}")
    print(f"Ensemble Model Recall: {ensemble_recall:.4f}")
    print(f"Ensemble Model F1-Score: {ensemble_f1:.4f}")

    # You can also visualize some predictions if desired
    def visualize_predictions(model, test_loader, classes, num_images=5,
                              mean=(0.4914, 0.4822, 0.4465),
                              std=(0.2470, 0.2435, 0.2616)):
        """Plot the first images of one test batch with their true and predicted labels.

        Args:
            model: Module used to predict; switched to eval mode here.
            test_loader: Iterable of ``(images, labels)`` batches; only the first
                batch is used.
            classes: Sequence mapping a class index to its display name.
            num_images: Number of images to show; capped at the batch size.
            mean: Per-channel mean that was subtracted by the input normalization.
                Defaults to the values passed to ``transforms.Normalize`` in this notebook.
            std: Per-channel std that the input was divided by. Same default source.
        """
        model.eval()
        images, labels = next(iter(test_loader))

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = model(images.to(device))
        _, predicted = torch.max(outputs, 1)

        # Never index past the end of the batch.
        num_images = min(num_images, images.size(0))

        # Shaped (1, 1, C) so they broadcast over images in H x W x C layout.
        mean_hwc = torch.tensor(mean, dtype=images.dtype).view(1, 1, -1)
        std_hwc = torch.tensor(std, dtype=images.dtype).view(1, 1, -1)

        print('\n--- Visualizing Predictions ---')
        plt.figure(figsize=(10, 2))
        for i in range(num_images):
            plt.subplot(1, num_images, i + 1)
            img = images[i].cpu().permute(1, 2, 0)
            # Invert Normalize(mean, std): x = x_norm * std + mean.
            # (The previous `img / 2 + 0.5` only inverts a mean=0.5 / std=0.5 normalization.)
            img = (img * std_hwc + mean_hwc).numpy()
            plt.imshow(img.clip(0, 1))  # clip to the valid RGB range
            plt.title(f'True: {classes[int(labels[i])]}\nPred: {classes[int(predicted[i])]}')
            plt.axis('off')
        plt.show()

    # visualize_predictions(ensemble_model, testloader, classes, num_images=5)

Using device: cuda


100%|██████████| 170M/170M [00:02<00:00, 61.3MB/s] 


Initializing individual models...

--- Training Individual Models ---

Training ResNet18_NoAttention_1:
Epoch 1 | ResNet18_NoAttention_1 Loss: 1.4366 | Acc: 47.67%
  Saved best model for ResNet18_NoAttention_1 with accuracy: 47.67%
Epoch 2 | ResNet18_NoAttention_1 Loss: 0.9369 | Acc: 66.85%
  Saved best model for ResNet18_NoAttention_1 with accuracy: 66.85%
Epoch 3 | ResNet18_NoAttention_1 Loss: 0.7168 | Acc: 74.83%
  Saved best model for ResNet18_NoAttention_1 with accuracy: 74.83%
Epoch 4 | ResNet18_NoAttention_1 Loss: 0.5968 | Acc: 79.29%
  Saved best model for ResNet18_NoAttention_1 with accuracy: 79.29%
Epoch 5 | ResNet18_NoAttention_1 Loss: 0.5163 | Acc: 82.12%
  Saved best model for ResNet18_NoAttention_1 with accuracy: 82.12%
Epoch 6 | ResNet18_NoAttention_1 Loss: 0.4472 | Acc: 84.56%
  Saved best model for ResNet18_NoAttention_1 with accuracy: 84.56%
Epoch 7 | ResNet18_NoAttention_1 Loss: 0.4026 | Acc: 86.01%
  Saved best model for ResNet18_NoAttention_1 with accuracy: 86.01%
