# Activation Maps Probing
To reveal how the different activation functions modulated the attention-weight distributions, quantitative metrics (top-1/top-5 accuracy for classification, average precision for localisation) were complemented by qualitative analysis of attention maps.

## 0. Set-up

In [None]:
pip install sparsemax

Collecting sparsemax
  Downloading sparsemax-0.1.9-py2.py3-none-any.whl.metadata (3.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->sparsemax)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->sparsemax)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->sparsemax)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->sparsemax)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->sparsemax)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch->sparsemax)
  Downloading nvidia_cufft_cu12

In [None]:
import copy
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
from sparsemax import Sparsemax
import pandas as pd
import seaborn as sns
import pandas as pd
from scipy.stats import gmean
import matplotlib.pyplot as plt

In [None]:
# Prefer the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA instead of failing on the
# first `.to(device)` call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## 1. Dataset Preparation

Data loader for CIFAR-10 with resizing to 224x224 to match pretrained ResNet input expectations:

In [None]:
# Data Loading
# Preprocessing shared by both splits: resize to 224, convert to a tensor and
# normalise every channel with mean 0.5 / std 0.5.
transform = transforms.Compose(
    [
        transforms.Resize(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

# Training split first, then the held-out test split (downloaded on first use).
train_set, test_set = (
    CIFAR10(root="./data", train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)

# Only the training loader shuffles; the test loader keeps a fixed order.
train_loader, test_loader = (
    DataLoader(split, batch_size=64, shuffle=reshuffle)
    for split, reshuffle in ((train_set, True), (test_set, False))
)

## 2. Building Attention Enhanced Convolutional Neural Network (CNN) models

In [None]:
class SpatialAttention(nn.Module):
    """Spatial attention gate producing one sigmoid weight per spatial location.

    The input is reduced over the channel axis (dim=1) with both a mean and a
    max, the two resulting single-channel maps are concatenated, and a single
    bias-free convolution followed by a sigmoid turns them into a one-channel
    attention map.

    Args:
        kernel_size: Side length of the square convolution kernel. Must be a
            positive odd integer so that the padding of ``kernel_size // 2``
            keeps the spatial size of the map equal to that of the input.

    Raises:
        ValueError: If ``kernel_size`` is not a positive odd integer.
    """

    def __init__(self, kernel_size=7):
        super(SpatialAttention, self).__init__()
        if kernel_size < 1 or kernel_size % 2 == 0:
            raise ValueError(
                f"kernel_size must be a positive odd integer, got {kernel_size}"
            )
        # "Same" padding for any odd kernel (3 -> 1, 7 -> 3, as before); the
        # attention map must match the input's H x W to be multiplied with it.
        padding = kernel_size // 2
        self.conv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the attention map for ``x``; the channel axis is reduced to 1."""
        avg_out = torch.mean(x, dim=1, keepdim=True)
        max_out, _ = torch.max(x, dim=1, keepdim=True)
        # Stack the two channel-wise summaries as a 2-channel input for the conv.
        x = torch.cat([avg_out, max_out], dim=1)
        x = self.conv1(x)
        return self.sigmoid(x)

In [None]:
# Channel Attention with configurable activation
class ChannelAttention(nn.Module):
    """Channel attention gate with a selectable output activation.

    The input is pooled to 1x1 per channel with both average and max pooling.
    Each pooled tensor goes through the same bottleneck (1x1 conv -> ReLU ->
    1x1 conv); the two results are summed and passed through the chosen
    activation.

    Args:
        in_planes: Number of input (and output) channels.
        ratio: Reduction factor of the bottleneck. The hidden width is
            ``in_planes // ratio``, clamped to at least 1 so that small
            channel counts do not produce an empty bottleneck.
        activation_type: One of ``'sigmoid'``, ``'softmax'``, ``'sparsemax'``,
            ``'scaled_tanh'``, ``'parametric_sigmoid'`` or ``'swish'``. Any
            other value falls back to the plain sigmoid.
    """

    def __init__(self, in_planes, ratio=16, activation_type='sigmoid'):
        super(ChannelAttention, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)
        # Guard against in_planes < ratio, which would give a 0-channel layer.
        hidden_planes = max(1, in_planes // ratio)
        self.fc1 = nn.Conv2d(in_planes, hidden_planes, 1, bias=False)
        self.relu = nn.ReLU()
        self.fc2 = nn.Conv2d(hidden_planes, in_planes, 1, bias=False)
        self.activation_type = activation_type

        # Activation parameters (only created for the activation that uses them)
        self.scale = nn.Parameter(torch.ones(1)) if activation_type == 'scaled_tanh' else None
        self.temp = nn.Parameter(torch.ones(1)) if activation_type == 'parametric_sigmoid' else None
        self.sparsemax = Sparsemax(dim=1) if activation_type == 'sparsemax' else None

        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return per-channel weights with the same shape as the pooled logits."""
        avg_out = self.fc2(self.relu(self.fc1(self.avg_pool(x))))
        max_out = self.fc2(self.relu(self.fc1(self.max_pool(x))))
        out = avg_out + max_out

        if self.activation_type == 'softmax':
            # Flatten to (batch, channels) so the weights of one sample sum to 1.
            return self.softmax(out.view(out.size(0), -1)).view_as(out)
        elif self.activation_type == 'sparsemax':
            # Same flattening as softmax; applied along dim=1.
            return self.sparsemax(out.view(out.size(0), -1)).view_as(out)
        elif self.activation_type == 'scaled_tanh':
            # tanh rescaled from (-1, 1) to (0, 1) with a learnable slope.
            return (torch.tanh(self.scale * out) + 1) / 2
        elif self.activation_type == 'parametric_sigmoid':
            # Sigmoid with a learnable temperature.
            return self.sigmoid(out / self.temp)
        elif self.activation_type == 'swish':
            # NOTE: unlike the other options this is not bounded to [0, 1].
            return out * torch.sigmoid(out)
        else:  # sigmoid
            return self.sigmoid(out)

In [None]:
class CBAM(nn.Module):
    """Attention block that applies a channel gate and then a spatial gate.

    Args:
        planes: Number of channels of the feature map being gated.
        ratio: Bottleneck reduction factor forwarded to the channel gate.
        channel_activation: Activation name forwarded to the channel gate.
    """

    def __init__(self, planes, ratio=16, channel_activation='sigmoid'):
        super().__init__()
        self.ca = ChannelAttention(planes, ratio, channel_activation)
        self.sa = SpatialAttention()

    def forward(self, x):
        """Scale ``x`` by its channel weights, then by its spatial weights."""
        channel_refined = x * self.ca(x)
        # The spatial gate is computed from the already channel-gated features.
        return channel_refined * self.sa(channel_refined)

## 3. Model Building


---



### ResNet with CBAM:

- Custom ResNet18 wrapper that conditionally adds CBAM at different depths

In [None]:
class ResNet18_CBAM(nn.Module):
    """Pretrained ResNet18 with an optional CBAM block at one of three depths.

    Args:
        attention_position: ``"early"`` (CBAM after layer1), ``"middle"``
            (after layer2) or ``"late"`` (after layer4). Any other value
            builds the plain network without attention.
        num_classes: Output size of the replacement classification head.
        channel_activation: Activation name forwarded to the CBAM channel gate.
    """

    def __init__(self, attention_position="middle", num_classes=10, channel_activation='sigmoid'):
        super().__init__()
        backbone = models.resnet18(pretrained=True)

        # Reuse the pretrained feature extractor; only the head is new.
        self.stem = nn.Sequential(
            backbone.conv1, backbone.bn1, backbone.relu, backbone.maxpool
        )
        self.layer1, self.layer2 = backbone.layer1, backbone.layer2
        self.layer3, self.layer4 = backbone.layer3, backbone.layer4
        self.avgpool = backbone.avgpool
        self.fc = nn.Linear(512, num_classes)

        # Add CBAM modules: at most one is created, sized for the stage it follows.
        self.attention_position = attention_position
        if attention_position == "early":
            self.cbam1 = CBAM(64, channel_activation=channel_activation)
        elif attention_position == "middle":
            self.cbam2 = CBAM(128, channel_activation=channel_activation)
        elif attention_position == "late":
            self.cbam3 = CBAM(512, channel_activation=channel_activation)

    def forward(self, x):
        """Run the backbone, inserting the configured CBAM block if there is one."""
        where = self.attention_position

        x = self.layer1(self.stem(x))
        if where == "early":
            x = self.cbam1(x)

        x = self.layer2(x)
        if where == "middle":
            x = self.cbam2(x)

        x = self.layer4(self.layer3(x))
        if where == "late":
            x = self.cbam3(x)

        pooled = torch.flatten(self.avgpool(x), 1)
        return self.fc(pooled)

# Evaluation Metrics

* Entropy: Quantify the uncertainty and spread of attention. Lower entropy indicates sharper focus, linked to interpretability in prior work.

* Mean Attention Value: Reflect global activation strength.

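* Focus Area Ratio: Measures the concentration of high-attention regions, i.e. the share of attention weights exceeding 50% of the maximum attention (inspired by saliency detection). Note: the `focus_%` metric implemented below uses the mean attention value as its threshold instead.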

* Gini Coefficient: Evaluate inequality in attention weight distribution, where values closer to 1 indicate extreme concentration and values near 0 reflect uniformity.

In [None]:
def gini_coefficient(x):
    """Compute Gini coefficient (measure of sparsity).

    The input is flattened, so arrays of any shape are treated as one
    population of values.

    Args:
        x: Array-like of weights. Values are expected to be non-negative;
            with negative entries the result is no longer confined to [0, 1].

    Returns:
        The Gini coefficient as a float: 0 for perfectly uniform values and
        ``(n - 1) / n`` when a single entry holds all the mass. Returns 0.0
        for empty input or when the values sum to zero, where the ratio is
        undefined.
    """
    values = np.sort(np.asarray(x, dtype=float).ravel())
    n = values.size
    if n == 0:
        return 0.0
    cumulative = np.cumsum(values)
    total = cumulative[-1]
    if total == 0:
        # Avoid 0/0 (NaN): all-zero weights carry no inequality to measure.
        return 0.0
    return (n + 1 - 2 * np.sum(cumulative) / total) / n

def compute_metrics(att_weights, test_acc, activation='sigmoid'):
    """Calculate attention distribution metrics.

    Args:
        att_weights: Array-like of attention weights; flattened before use so
            every metric is computed over all entries.
        test_acc: Test accuracy to store alongside the distribution metrics.
        activation: Label stored under the ``'activation'`` key. Defaults to
            ``'sigmoid'``, the value previously hard-coded here.

    Returns:
        Dict with the keys ``'activation'``, ``'avg_score'`` (mean weight),
        ``'focus_%'`` (percentage of weights above the mean), ``'sparsity'``
        (fraction of exactly-zero weights), ``'gini'`` and ``'test_acc'``.
    """
    # Flatten so that list input works and the element count is correct for
    # inputs that are not 1-D (len() would only count the first axis).
    weights = np.asarray(att_weights).ravel()
    mean_weight = np.mean(weights)
    return {
        'activation': activation,
        'avg_score': mean_weight,
        'focus_%': 100 * np.mean(weights > mean_weight),
        'sparsity': 1 - np.count_nonzero(weights) / weights.size,
        'gini': gini_coefficient(weights),
        'test_acc': test_acc
    }

def plot_kde_comparison(all_weights):
    """Draw one filled KDE curve per activation on a shared set of axes.

    Args:
        all_weights: Mapping from activation name (used as the legend label)
            to the attention weights recorded for it.
    """
    _, ax = plt.subplots(figsize=(12, 6))

    for name in all_weights:
        sns.kdeplot(all_weights[name], fill=True, label=name, ax=ax)

    ax.set_title("Attention Weight Distributions")
    ax.set_xlabel("Attention Score")
    ax.set_ylabel("Density")
    ax.legend()
    ax.grid(True, linestyle='--', alpha=0.3)
    plt.show()

def plot_radar_chart(metrics_df):
    """Plot every row of ``metrics_df`` as one closed polygon on a polar chart.

    Args:
        metrics_df: DataFrame with an ``'activation'`` column (legend label)
            and the metric columns listed in ``axes_names`` below. Values are
            plotted as-is, without rescaling.
    """
    axes_names = ['avg_score', 'focus_%', 'sparsity', 'gini', 'test_acc']
    count = len(axes_names)
    spokes = [k / count * 2 * np.pi for k in range(count)]
    # Repeat the first angle at the end so each outline closes on itself.
    closed_angles = spokes + spokes[:1]

    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
    for _, record in metrics_df.iterrows():
        radii = record[axes_names].tolist()
        radii.append(radii[0])
        ax.plot(closed_angles, radii, label=record['activation'])
        ax.fill(closed_angles, radii, alpha=0.1)

    ax.set_xticks(spokes)
    ax.set_xticklabels(axes_names)
    ax.set_title("Activation Mechanism Comparison", size=16)
    ax.legend(loc='upper right')
    plt.show()

## 4. Training and Evaluation

Includes Top-1, Top-5, Average Precision and gradient flow plot & logging alpha values

In [None]:
# Training and Evaluation Functions
def train_model(model, train_loader, test_loader, num_epochs=10, lr=0.001):
    """Train ``model`` with Adam and cross-entropy, keeping the best epoch.

    After every epoch the model is scored with ``evaluate_model`` on
    ``test_loader``; the weights of the highest-scoring epoch are restored
    before returning.

    NOTE(review): the checkpoint is selected on ``test_loader`` itself, so the
    accuracy of the returned model on that loader is an optimistic estimate.

    Args:
        model: Network to train; moved to the module-level ``device``.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        test_loader: Iterable of ``(inputs, labels)`` batches used for the
            per-epoch evaluation and checkpoint selection.
        num_epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.

    Returns:
        The same model object, loaded with the best weights seen. If no epoch
        was run, the model is returned as it was passed in.
    """
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # Start below any reachable accuracy so the first epoch always produces a
    # snapshot; best_state stays None only when no epoch is run at all.
    best_acc = float("-inf")
    best_state = None

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

        # Loss is averaged per batch, accuracy per sample.
        train_loss = running_loss/len(train_loader)
        train_acc = 100.*correct/total

        # Validation
        test_acc, test_loss = evaluate_model(model, test_loader, criterion)

        print(f"Epoch {epoch+1}: "
              f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}% | "
              f"Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%")

        if test_acc > best_acc:
            best_acc = test_acc
            # Deep copy: state_dict() returns references to the live tensors.
            best_state = copy.deepcopy(model.state_dict())

    if best_state is not None:
        model.load_state_dict(best_state)
    return model

In [None]:
def evaluate_model(model, test_loader, criterion):
    """Score ``model`` on ``test_loader`` without tracking gradients.

    Args:
        model: Network to evaluate; switched to eval mode and left there.
        test_loader: Iterable of ``(inputs, labels)`` batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.

    Returns:
        Tuple ``(accuracy, loss)``: top-1 accuracy in percent over all
        samples, and the loss averaged over batches.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch, targets in test_loader:
            batch = batch.to(device)
            targets = targets.to(device)

            logits = model(batch)
            loss_sum += criterion(logits, targets).item()

            # Index of the largest logit is the predicted class.
            predictions = logits.max(1)[1]
            n_seen += targets.size(0)
            n_correct += predictions.eq(targets).sum().item()

    accuracy = 100. * n_correct / n_seen
    mean_batch_loss = loss_sum / len(test_loader)
    return accuracy, mean_batch_loss

In [None]:
# Attention Visualization
def visualize_attention(model, test_loader, channel_activation):
    """Capture the channel-attention output for the first test image.

    A forward hook is attached to the channel-attention submodule (``.ca``) of
    the CBAM block selected by ``model.attention_position``. One image, the
    first of the first batch of ``test_loader``, is passed through the model
    and the hooked output is returned.

    Args:
        model: Model exposing ``attention_position`` and the matching
            ``cbam1`` / ``cbam2`` / ``cbam3`` attribute.
        test_loader: Iterable of ``(images, labels)`` batches.
        channel_activation: Unused; kept so existing callers keep working.

    Returns:
        NumPy array with the squeezed output of the hooked module.

    Raises:
        ValueError: If ``model.attention_position`` is not ``"early"``,
            ``"middle"`` or ``"late"``, i.e. there is no CBAM block to hook.
    """
    cbam_names = {"early": "cbam1", "middle": "cbam2", "late": "cbam3"}
    cbam_name = cbam_names.get(model.attention_position)
    if cbam_name is None:
        raise ValueError(
            "Model has no CBAM block to inspect "
            f"(attention_position={model.attention_position!r})"
        )

    model.eval()
    images, _ = next(iter(test_loader))
    img = images[0].unsqueeze(0).to(device)

    attention_weights = []
    def hook_fn(module, inputs, output):
        # Drop size-1 dims; presumably leaves a [num_channels] vector here.
        attention_weights.append(output.detach().cpu().squeeze())

    # Register hook
    handle = getattr(model, cbam_name).ca.register_forward_hook(hook_fn)
    try:
        with torch.no_grad():
            _ = model(img)
    finally:
        # Always detach the hook, even if the forward pass fails.
        handle.remove()

    return attention_weights[0].numpy()  # Return as numpy array

In [None]:
# Define all channel activation types to test
channel_activation_types = ['sigmoid', 'softmax', 'sparsemax', 'scaled_tanh','parametric_sigmoid','swish']

# Dictionary to store attention weights and metrics
all_weights = {}
results = []

# Train one late-attention model per activation and record its attention
# statistics. The attention weights come from a single test image (see
# visualize_attention), so the distribution metrics describe that one sample.
for channel_act in channel_activation_types:
    print(f"\n{'='*40}")
    print(f"Training with Channel Activation: {channel_act.upper()}")
    print(f"{'='*40}")

    # Initialize and train model
    model = ResNet18_CBAM(attention_position="late", channel_activation=channel_act).to(device)
    trained_model = train_model(model, train_loader, test_loader, num_epochs=5, lr=0.001)

    # Get attention weights and test accuracy
    att_weights = visualize_attention(trained_model, test_loader, channel_act)
    test_acc = evaluate_model(trained_model, test_loader, nn.CrossEntropyLoss())[0]

    # Store results
    all_weights[channel_act] = att_weights
    metrics = compute_metrics(att_weights, test_acc)
    metrics['activation'] = channel_act
    results.append(metrics)

    # Cleanup: train_model returns the same object it was given, so both names
    # must be dropped before the cached GPU memory can actually be released.
    del model, trained_model
    torch.cuda.empty_cache()

# Convert results to DataFrame
results_df = pd.DataFrame(results)
print("\n=== Metrics Summary ===")
print(results_df.to_markdown(index=False))

Using device: cuda

Training with Channel Activation: SIGMOID


Epoch 1: 100%|██████████| 782/782 [01:29<00:00,  8.77it/s]


Epoch 1: Train Loss: 0.5564, Train Acc: 80.99% | Test Loss: 0.4785, Test Acc: 83.26%


Epoch 2: 100%|██████████| 782/782 [01:29<00:00,  8.70it/s]


Epoch 2: Train Loss: 0.3190, Train Acc: 89.00% | Test Loss: 0.3170, Test Acc: 89.49%


Epoch 3: 100%|██████████| 782/782 [01:30<00:00,  8.69it/s]


Epoch 3: Train Loss: 0.2193, Train Acc: 92.47% | Test Loss: 0.3117, Test Acc: 89.53%


Epoch 4: 100%|██████████| 782/782 [01:29<00:00,  8.75it/s]


Epoch 4: Train Loss: 0.1511, Train Acc: 94.77% | Test Loss: 0.3264, Test Acc: 89.72%


Epoch 5: 100%|██████████| 782/782 [01:29<00:00,  8.72it/s]


Epoch 5: Train Loss: 0.1177, Train Acc: 95.92% | Test Loss: 0.3258, Test Acc: 89.91%

Training with Channel Activation: SOFTMAX


Epoch 1: 100%|██████████| 782/782 [01:29<00:00,  8.74it/s]


Epoch 1: Train Loss: 1.5943, Train Acc: 42.15% | Test Loss: 1.3098, Test Acc: 49.13%


Epoch 2: 100%|██████████| 782/782 [01:29<00:00,  8.72it/s]


Epoch 2: Train Loss: 1.2403, Train Acc: 52.56% | Test Loss: 1.3498, Test Acc: 52.98%


Epoch 3: 100%|██████████| 782/782 [01:29<00:00,  8.71it/s]


Epoch 3: Train Loss: 0.9842, Train Acc: 64.32% | Test Loss: 0.8150, Test Acc: 71.80%


Epoch 4: 100%|██████████| 782/782 [01:29<00:00,  8.75it/s]


Epoch 4: Train Loss: 0.7216, Train Acc: 76.40% | Test Loss: 0.6294, Test Acc: 80.63%


Epoch 5: 100%|██████████| 782/782 [01:29<00:00,  8.74it/s]


Epoch 5: Train Loss: 0.5547, Train Acc: 82.80% | Test Loss: 0.5261, Test Acc: 83.48%

Training with Channel Activation: SPARSEMAX


Epoch 1: 100%|██████████| 782/782 [01:31<00:00,  8.51it/s]


Epoch 1: Train Loss: 1.8539, Train Acc: 28.05% | Test Loss: 1.6437, Test Acc: 33.41%


Epoch 2: 100%|██████████| 782/782 [01:31<00:00,  8.52it/s]


Epoch 2: Train Loss: 1.6634, Train Acc: 32.97% | Test Loss: 1.6093, Test Acc: 35.39%


Epoch 3: 100%|██████████| 782/782 [01:31<00:00,  8.55it/s]


Epoch 3: Train Loss: 1.5410, Train Acc: 39.03% | Test Loss: 1.4755, Test Acc: 41.66%


Epoch 4: 100%|██████████| 782/782 [01:31<00:00,  8.53it/s]


Epoch 4: Train Loss: 1.4543, Train Acc: 42.40% | Test Loss: 1.4041, Test Acc: 43.40%


Epoch 5: 100%|██████████| 782/782 [01:31<00:00,  8.51it/s]


Epoch 5: Train Loss: 1.3814, Train Acc: 43.93% | Test Loss: 1.3108, Test Acc: 46.48%

Training with Channel Activation: SCALED_TANH


Epoch 1: 100%|██████████| 782/782 [01:29<00:00,  8.72it/s]


Epoch 1: Train Loss: 0.5649, Train Acc: 80.82% | Test Loss: 0.4817, Test Acc: 83.42%


Epoch 2: 100%|██████████| 782/782 [01:29<00:00,  8.71it/s]


Epoch 2: Train Loss: 0.3181, Train Acc: 89.07% | Test Loss: 0.3841, Test Acc: 87.44%


Epoch 3: 100%|██████████| 782/782 [01:29<00:00,  8.73it/s]


Epoch 3: Train Loss: 0.2219, Train Acc: 92.25% | Test Loss: 0.3726, Test Acc: 88.17%


Epoch 4: 100%|██████████| 782/782 [01:29<00:00,  8.72it/s]


Epoch 4: Train Loss: 0.1594, Train Acc: 94.49% | Test Loss: 0.3447, Test Acc: 89.62%


Epoch 5: 100%|██████████| 782/782 [01:29<00:00,  8.74it/s]


Epoch 5: Train Loss: 0.1148, Train Acc: 95.94% | Test Loss: 0.3499, Test Acc: 89.19%

Training with Channel Activation: PARAMETRIC_SIGMOID


Epoch 1: 100%|██████████| 782/782 [01:29<00:00,  8.74it/s]


Epoch 1: Train Loss: 0.5686, Train Acc: 80.79% | Test Loss: 0.4698, Test Acc: 83.88%


Epoch 2: 100%|██████████| 782/782 [01:29<00:00,  8.74it/s]


Epoch 2: Train Loss: 0.3159, Train Acc: 89.37% | Test Loss: 0.3845, Test Acc: 87.12%


Epoch 3: 100%|██████████| 782/782 [01:29<00:00,  8.76it/s]


Epoch 3: Train Loss: 0.2189, Train Acc: 92.47% | Test Loss: 0.3481, Test Acc: 88.17%


Epoch 4: 100%|██████████| 782/782 [01:29<00:00,  8.70it/s]


Epoch 4: Train Loss: 0.1536, Train Acc: 94.79% | Test Loss: 0.3236, Test Acc: 89.41%


Epoch 5: 100%|██████████| 782/782 [01:29<00:00,  8.72it/s]


Epoch 5: Train Loss: 0.1193, Train Acc: 95.91% | Test Loss: 0.3054, Test Acc: 90.47%

Training with Channel Activation: SWISH


Epoch 1: 100%|██████████| 782/782 [01:29<00:00,  8.76it/s]


Epoch 1: Train Loss: 0.6116, Train Acc: 79.39% | Test Loss: 0.4913, Test Acc: 83.47%


Epoch 2: 100%|██████████| 782/782 [01:29<00:00,  8.74it/s]


Epoch 2: Train Loss: 0.3428, Train Acc: 88.30% | Test Loss: 0.3905, Test Acc: 86.81%


Epoch 3: 100%|██████████| 782/782 [01:29<00:00,  8.75it/s]


Epoch 3: Train Loss: 0.2378, Train Acc: 91.91% | Test Loss: 0.3628, Test Acc: 88.31%


Epoch 4: 100%|██████████| 782/782 [01:29<00:00,  8.74it/s]


Epoch 4: Train Loss: 0.1723, Train Acc: 94.13% | Test Loss: 0.4249, Test Acc: 87.55%


Epoch 5: 100%|██████████| 782/782 [01:29<00:00,  8.70it/s]


Epoch 5: Train Loss: 0.1368, Train Acc: 95.31% | Test Loss: 0.3295, Test Acc: 89.54%

=== Metrics Summary ===
| activation         |   avg_score |   focus_% |   sparsity |     gini |   test_acc |
|:-------------------|------------:|----------:|-----------:|---------:|-----------:|
| sigmoid            |  0.645785   | 64.4531   |  0         | 0.354157 |      89.91 |
| softmax            |  0.00195312 |  0.585938 |  0         | 0.996395 |      83.48 |
| sparsemax          |  0.00195312 |  0.195312 |  0.998047  | 0.998047 |      46.48 |
| scaled_tanh        |  0.619784   | 61.7188   |  0.0878906 | 0.379701 |      89.62 |
| parametric_sigmoid |  0.610935   | 60.7422   |  0         | 0.388816 |      90.47 |
| swish              |  1.27774    | 25.1953   |  0         | 0.896633 |      89.54 |
