# Imports and Installation

In [None]:
!pip install timm

import torch
import timm
from torchvision import transforms
from PIL import Image, ImageFilter, ImageEnhance
import numpy as np
import os
from tqdm import tqdm
import pandas as pd

# Device Specification and Model Loading

In [None]:
# Use the GPU when PyTorch can see one; otherwise run everything on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# timm identifiers of the pretrained classifiers compared in this notebook.
model_names = [
    'resnet50',
    'resnet101',
    'densenet121',
    'efficientnet_b3',
    'mobilenetv3_large_100',
    'convnext_base',
    'regnety_032',
    'vit_base_patch16_224',
    'swin_base_patch4_window7_224',
    'deit_base_patch16_224',
    'resnext50_32x4d',
    'inception_v3',
]

# timm identifier -> pretrained model, switched to eval mode and moved to `device`.
models = {}
for name in model_names:
    print(f'Loading {name}')
    model = timm.create_model(name, pretrained=True)
    model = model.eval().to(device)
    models[name] = model
print("Loaded Models:", list(models.keys()))

# Defining Transforms

In [None]:
# Shared input pipeline: resize to 224x224, convert to a tensor, then
# normalize each channel with the mean/std listed below.
# NOTE(review): every model in `models` is fed through this one pipeline;
# confirm these statistics and this input size match each checkpoint.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

In [None]:
def perturb_clean(img):
    """Identity perturbation: hand back the very same image object.

    Serves as the unperturbed baseline among the perturbation functions.
    """
    unchanged = img
    return unchanged

def perturb_hflip(img):
    """Return a left-right mirrored copy of `img`.

    Args:
        img: Object exposing `transpose` (PIL images are used in this file).
    """
    mirrored = img.transpose(Image.FLIP_LEFT_RIGHT)
    return mirrored

def perturb_rotation(img, angle=15):
    """Return `img` rotated by `angle`.

    Args:
        img: Object exposing `rotate` (PIL images are used in this file).
        angle: Rotation amount handed straight to `img.rotate`.
    """
    rotated = img.rotate(angle)
    return rotated

def perturb_blur(img, radius=2):
    """Return a Gaussian-blurred copy of `img`.

    Args:
        img: Object exposing `filter` (PIL images are used in this file).
        radius: Radius given to `ImageFilter.GaussianBlur`.
    """
    gaussian = ImageFilter.GaussianBlur(radius)
    return img.filter(gaussian)

def perturb_brightness(img, factor=1.5):
    """Return `img` with its brightness scaled by `factor`.

    Args:
        img: Image handed to `ImageEnhance.Brightness`.
        factor: Enhancement factor passed to `enhance`.
    """
    return ImageEnhance.Brightness(img).enhance(factor)

def perturb_gaussian_noise(img, sigma=0.1, rng=None):
    """Add zero-mean Gaussian noise to an image.

    Pixel values are scaled to [0, 1], noise with standard deviation `sigma`
    is added, and the result is clipped to [0, 1] and converted back to uint8.

    Args:
        img: Image accepted by `np.asarray` (PIL images are used in this file).
        sigma: Standard deviation of the noise on the [0, 1] scale.
        rng: Optional object with a `normal(loc, scale, size)` method, e.g. a
            `np.random.Generator`, for reproducible noise. When omitted, the
            global NumPy random state is used, as before.

    Returns:
        A new image built with `Image.fromarray` from the noisy array.
    """
    arr = np.asarray(img).astype(np.float32) / 255.
    if rng is None:
        noise = np.random.normal(0, sigma, arr.shape)
    else:
        noise = rng.normal(0, sigma, arr.shape)
    noisy = np.clip(arr + noise, 0, 1)
    # Round before the cast: a bare astype(uint8) truncates, which can shift
    # pixel values down by one even when no noise is added.
    return Image.fromarray(np.rint(noisy * 255).astype(np.uint8))

In [None]:
# Display name -> callable taking an image and returning the perturbed image.
# The strength of each perturbation is spelled out at the call site.
perturbations = {
    "Clean": perturb_clean,
    "Horizontal Flip": perturb_hflip,
    "Rotation": lambda img: perturb_rotation(img, angle=15),
    "Blur": lambda img: perturb_blur(img, radius=2),
    "Brightness": lambda img: perturb_brightness(img, factor=1.5),
    "Gaussian Noise": lambda img: perturb_gaussian_noise(img, sigma=0.1),
}

In [None]:
from glob import glob
import os

from timm.data import ImageNetInfo

val_dir = '/kaggle/input/imagenet100/val.X'

# The pretrained classifiers predict ImageNet-1k class indices, so each label
# must be looked up from the folder's WordNet id. Using the folder's position
# among the sub-folders of `val_dir` would compare predictions against a
# different index space.
wnid_to_index = {
    wnid: idx for idx, wnid in enumerate(ImageNetInfo().label_names())
}

# Only directories are class folders; stray files in val_dir are ignored.
class_names = sorted(
    entry for entry in os.listdir(val_dir)
    if os.path.isdir(os.path.join(val_dir, entry))
)

unknown = [cls for cls in class_names if cls not in wnid_to_index]
if unknown:
    raise ValueError(
        f"Class folders that are not ImageNet-1k WordNet ids: {unknown[:5]}"
    )

# Folder name (WordNet id) -> ImageNet-1k class index.
label_map = {cls: wnid_to_index[cls] for cls in class_names}

# List of (image path, ImageNet-1k class index), grouped by class folder.
val_images = []
for cls in class_names:
    class_folder = os.path.join(val_dir, cls)
    # A set avoids double-counting a file matched by both patterns (possible
    # on case-insensitive filesystems); sorting keeps the order reproducible.
    img_paths = set(glob(os.path.join(class_folder, '*.jpg')))
    img_paths |= set(glob(os.path.join(class_folder, '*.JPEG')))
    val_images.extend((p, label_map[cls]) for p in sorted(img_paths))

print("Total images loaded:", len(val_images))
print("Sample:", val_images[:5])

In [None]:
def predict(model, img):
    """Return the index of the highest-scoring class for one image.

    The image goes through the module-level `preprocess` pipeline, gets a
    batch dimension, is moved to `device`, and is scored without gradients.

    Args:
        model: Callable mapping a batched tensor to per-class scores.
        img: Image accepted by `preprocess`.

    Returns:
        The argmax over dimension 1 of the model output, as a Python int.
    """
    batch = preprocess(img).unsqueeze(0)
    batch = batch.to(device)
    with torch.no_grad():
        scores = model(batch)
        top_class = scores.argmax(dim=1).item()
    return top_class

import random

# results[model_name][perturbation_name] -> top-1 accuracy in percent.
# NaN marks combinations that have not been evaluated (yet).
results = {
    model_name: {p: float('nan') for p in perturbations}
    for model_name in models
}

sample_size = 1000

# val_images is grouped by class folder, so a head slice would only cover the
# first few classes. Draw a fixed-seed random subset instead so every model
# is scored on the same, class-mixed images.
eval_subset = random.Random(0).sample(
    val_images, min(sample_size, len(val_images))
)

for model_name, model in models.items():
    print(f'==> Evaluating: {model_name}')
    for pert_name, pert_fn in perturbations.items():
        correct = 0
        total = 0
        for img_path, label in tqdm(eval_subset, desc=f'{pert_name}', leave=False):
            # Close the file as soon as the pixel data has been copied out.
            with Image.open(img_path) as handle:
                img = handle.convert('RGB')
            img = pert_fn(img)
            pred = predict(model, img)
            correct += int(pred == label)
            total += 1
        # Guard against an empty image list instead of dividing by zero.
        acc = correct / total * 100 if total else float('nan')
        results[model_name][pert_name] = acc
        print(f'{model_name} | {pert_name} | Acc: {acc:.2f}%')

In [None]:
# Build the table with perturbations as rows, then transpose it so that each
# row is a model and each column a perturbation.
df = pd.DataFrame(results).transpose()
display(df)

In [None]:
!pip install grad-cam

import torch
import timm
import numpy as np
from torchvision import transforms
from PIL import Image
from matplotlib import pyplot as plt
from pytorch_grad_cam import GradCAM, LayerCAM, ScoreCAM, EigenCAM
from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget
from pytorch_grad_cam.utils.image import show_cam_on_image, preprocess_image

In [None]:
# Grad-CAM on ResNet-50 for one validation image, explaining the class the
# model itself predicts.
model = models['resnet50']
model.eval()
# The last block of layer4 is the layer the CAM is computed on.
target_layer = model.layer4[-1]

img_path = '/kaggle/input/imagenet100/val.X/n01440764/ILSVRC2012_val_00000293.JPEG'
img = Image.open(img_path).convert('RGB')
input_tensor = preprocess(img).unsqueeze(0)
input_tensor = input_tensor.to(device)

with torch.no_grad():
    logits = model(input_tensor)
    class_idx = logits.argmax(1).item()

cam = GradCAM(model=model, target_layers=[target_layer])
grayscale_cam = cam(
    input_tensor=input_tensor,
    targets=[ClassifierOutputTarget(class_idx)],
)[0]

# Background for the overlay: the image resized to 224x224, scaled to [0, 1].
rgb_img = np.array(img.resize((224, 224))) / 255.0
visualization = show_cam_on_image(rgb_img, grayscale_cam, use_rgb=True)

plt.imshow(visualization)
plt.title(f"Grad-CAM: ResNet50, Class {class_idx}")
plt.axis('off')
plt.show()

In [None]:
# Compare several CAM methods on ResNet-50 for a single image.
img_path = '/kaggle/input/imagenet100/val.X/n01685808/ILSVRC2012_val_00023693.JPEG'
img = Image.open(img_path).convert('RGB')
img_resized = img.resize((224, 224))
rgb_img = np.array(img_resized) / 255.0
input_tensor = preprocess(img_resized).unsqueeze(0).to(device)

model = models['resnet50']
target_layer = model.layer4[-1]

# Every CAM below explains the class the model predicts for this image.
with torch.no_grad():
    logits = model(input_tensor)
    class_idx = logits.argmax(1).item()

# Panel title -> CAM class (None marks the plain input image).
cam_methods = {
    "Original": None,
    "Grad-CAM": GradCAM,
    "LayerCAM": LayerCAM,
    "ScoreCAM": ScoreCAM,
    "EigenCAM": EigenCAM,
}

# Panel title -> image to draw; the untouched input goes first.
overlays = {"Original": rgb_img}

for cam_name, cam_class in cam_methods.items():
    if cam_class is None:
        # "Original" has no CAM class and is already stored above.
        continue
    cam = cam_class(model=model, target_layers=[target_layer])
    grayscale_cam = cam(
        input_tensor=input_tensor,
        targets=[ClassifierOutputTarget(class_idx)],
    )[0]
    cam_overlay = show_cam_on_image(rgb_img, grayscale_cam, use_rgb=True)
    overlays[cam_name] = cam_overlay

# One row of panels, one panel per entry in `overlays`.
n_methods = len(overlays)
plt.figure(figsize=(4 * n_methods, 4))
for col, title in enumerate(overlays):
    plt.subplot(1, n_methods, col + 1)
    plt.imshow(overlays[title])
    plt.title(title)
    plt.axis('off')
plt.tight_layout()
plt.show()
print("Overlays for resnet50" )

In [None]:
# Compare several CAM methods on ResNet-101 for a single image.
img_path = '/kaggle/input/imagenet100/val.X/n01685808/ILSVRC2012_val_00023693.JPEG'
img = Image.open(img_path).convert('RGB')
img_resized = img.resize((224, 224))
rgb_img = np.array(img_resized) / 255.0
input_tensor = preprocess(img_resized).unsqueeze(0).to(device)

model = models['resnet101']
target_layer = model.layer4[-1]

# Every CAM below explains the class the model predicts for this image.
with torch.no_grad():
    logits = model(input_tensor)
    class_idx = logits.argmax(1).item()

# Panel title -> CAM class (None marks the plain input image).
cam_methods = {
    "Original": None,
    "Grad-CAM": GradCAM,
    "LayerCAM": LayerCAM,
    "ScoreCAM": ScoreCAM,
    "EigenCAM": EigenCAM,
}

# Panel title -> image to draw; the untouched input goes first.
overlays = {"Original": rgb_img}

for cam_name, cam_class in cam_methods.items():
    if cam_class is None:
        # "Original" has no CAM class and is already stored above.
        continue
    cam = cam_class(model=model, target_layers=[target_layer])
    grayscale_cam = cam(
        input_tensor=input_tensor,
        targets=[ClassifierOutputTarget(class_idx)],
    )[0]
    cam_overlay = show_cam_on_image(rgb_img, grayscale_cam, use_rgb=True)
    overlays[cam_name] = cam_overlay

# One row of panels, one panel per entry in `overlays`.
n_methods = len(overlays)
plt.figure(figsize=(4 * n_methods, 4))
for col, title in enumerate(overlays):
    plt.subplot(1, n_methods, col + 1)
    plt.imshow(overlays[title])
    plt.title(title)
    plt.axis('off')
plt.tight_layout()
plt.show()
print("Overlays for resnet101" )

In [None]:
# Compare several CAM methods on DenseNet-121 for a single image.
model = models['densenet121']
# CAMs are computed on the last module of `model.features`.
target_layer = model.features[-1]
img_path = '/kaggle/input/imagenet100/val.X/n01685808/ILSVRC2012_val_00023693.JPEG'

img = Image.open(img_path).convert('RGB')
img_resized = img.resize((224, 224))
rgb_img = np.array(img_resized) / 255.0
input_tensor = preprocess(img_resized).unsqueeze(0).to(device)

# Every CAM below explains the class the model predicts for this image.
with torch.no_grad():
    logits = model(input_tensor)
    class_idx = logits.argmax(1).item()

# Panel title -> CAM class (None marks the plain input image).
cam_methods = {
    "Original": None,
    "Grad-CAM": GradCAM,
    "LayerCAM": LayerCAM,
    "ScoreCAM": ScoreCAM,
    "EigenCAM": EigenCAM,
}
overlays = {"Original": rgb_img}
for cam_name, cam_class in cam_methods.items():
    if cam_class is None:
        # The "Original" panel is already in `overlays`.
        continue
    try:
        cam = cam_class(model=model, target_layers=[target_layer])
        grayscale_cam = cam(
            input_tensor=input_tensor,
            targets=[ClassifierOutputTarget(class_idx)],
        )[0]
        cam_overlay = show_cam_on_image(rgb_img, grayscale_cam, use_rgb=True)
    except Exception as e:
        # Best effort: report the failure and show a blank white panel.
        print(f"Error with {cam_name}: {e}")
        overlays[cam_name] = np.ones_like(rgb_img)
    else:
        overlays[cam_name] = cam_overlay

# One row of panels, one panel per entry in `overlays`.
n_methods = len(overlays)
plt.figure(figsize=(4*n_methods, 4))
for col, name in enumerate(overlays):
    plt.subplot(1, n_methods, col + 1)
    plt.imshow(overlays[name])
    plt.title(name)
    plt.axis('off')
plt.tight_layout()
plt.show()
print("Overlays for densenet121" )

In [None]:
# Compare several CAM methods on EfficientNet-B3 for a single image.
model = models['efficientnet_b3']
# CAMs are computed on the last entry of `model.blocks`.
target_layer = model.blocks[-1]
img_path = '/kaggle/input/imagenet100/val.X/n01685808/ILSVRC2012_val_00023693.JPEG'

img = Image.open(img_path).convert('RGB')
img_resized = img.resize((224, 224))
rgb_img = np.array(img_resized) / 255.0
input_tensor = preprocess(img_resized).unsqueeze(0).to(device)

# Every CAM below explains the class the model predicts for this image.
with torch.no_grad():
    logits = model(input_tensor)
    class_idx = logits.argmax(1).item()

# Panel title -> CAM class (None marks the plain input image).
cam_methods = {
    "Original": None,
    "Grad-CAM": GradCAM,
    "LayerCAM": LayerCAM,
    "ScoreCAM": ScoreCAM,
    "EigenCAM": EigenCAM,
}
overlays = {"Original": rgb_img}
for cam_name, cam_class in cam_methods.items():
    if cam_class is None:
        # The "Original" panel is already in `overlays`.
        continue
    try:
        cam = cam_class(model=model, target_layers=[target_layer])
        grayscale_cam = cam(
            input_tensor=input_tensor,
            targets=[ClassifierOutputTarget(class_idx)],
        )[0]
        cam_overlay = show_cam_on_image(rgb_img, grayscale_cam, use_rgb=True)
    except Exception as e:
        # Best effort: report the failure and show a blank white panel.
        print(f"Error with {cam_name}: {e}")
        overlays[cam_name] = np.ones_like(rgb_img)
    else:
        overlays[cam_name] = cam_overlay

# One row of panels, one panel per entry in `overlays`.
n_methods = len(overlays)
plt.figure(figsize=(4*n_methods, 4))
for col, name in enumerate(overlays):
    plt.subplot(1, n_methods, col + 1)
    plt.imshow(overlays[name])
    plt.title(name)
    plt.axis('off')
plt.tight_layout()
plt.show()
print("Overlays for efficientnet_b3" )

In [None]:
# Compare several CAM methods on MobileNetV3-Large for a single image.
model = models['mobilenetv3_large_100']
# CAMs are computed on the last entry of `model.blocks`.
target_layer = model.blocks[-1]
img_path = '/kaggle/input/imagenet100/val.X/n01685808/ILSVRC2012_val_00023693.JPEG'

img = Image.open(img_path).convert('RGB')
img_resized = img.resize((224, 224))
rgb_img = np.array(img_resized) / 255.0
input_tensor = preprocess(img_resized).unsqueeze(0).to(device)

# Every CAM below explains the class the model predicts for this image.
with torch.no_grad():
    logits = model(input_tensor)
    class_idx = logits.argmax(1).item()

# Panel title -> CAM class (None marks the plain input image).
cam_methods = {
    "Original": None,
    "Grad-CAM": GradCAM,
    "LayerCAM": LayerCAM,
    "ScoreCAM": ScoreCAM,
    "EigenCAM": EigenCAM,
}
overlays = {"Original": rgb_img}
for cam_name, cam_class in list(cam_methods.items())[1:]:
    try:
        cam = cam_class(model=model, target_layers=[target_layer])
        grayscale_cam = cam(input_tensor=input_tensor, targets=[ClassifierOutputTarget(class_idx)])[0]
        cam_overlay = show_cam_on_image(rgb_img, grayscale_cam, use_rgb=True)
        overlays[cam_name] = cam_overlay
    except Exception as e:
        # Best effort: report the failure and show a blank white panel.
        print(f"Error with {cam_name}: {e}")
        overlays[cam_name] = np.ones_like(rgb_img)

# One row of panels, one panel per entry in `overlays`.
n_methods = len(overlays)
plt.figure(figsize=(4*n_methods, 4))
for i, (name, panel) in enumerate(overlays.items()):
    plt.subplot(1, n_methods, i+1)
    plt.imshow(panel)
    plt.title(name)
    plt.axis('off')
plt.tight_layout()
plt.show()
# The label used to carry a stray opening quote before the model name.
print("Overlays for mobilenetv3_large_100")

In [None]:
# Compare several CAM methods on ConvNeXt-Base for a single image.
model = models['convnext_base']
# CAMs are computed on the last block of the last stage.
target_layer = model.stages[-1].blocks[-1]
img_path = '/kaggle/input/imagenet100/val.X/n01685808/ILSVRC2012_val_00023693.JPEG'

img = Image.open(img_path).convert('RGB')
img_resized = img.resize((224, 224))
rgb_img = np.array(img_resized) / 255.0
input_tensor = preprocess(img_resized).unsqueeze(0).to(device)

# Every CAM below explains the class the model predicts for this image.
with torch.no_grad():
    logits = model(input_tensor)
    class_idx = logits.argmax(1).item()

# Panel title -> CAM class (None marks the plain input image).
cam_methods = {
    "Original": None,
    "Grad-CAM": GradCAM,
    "LayerCAM": LayerCAM,
    "ScoreCAM": ScoreCAM,
    "EigenCAM": EigenCAM,
}
overlays = {"Original": rgb_img}
for cam_name, cam_class in cam_methods.items():
    if cam_class is None:
        # The "Original" panel is already in `overlays`.
        continue
    try:
        cam = cam_class(model=model, target_layers=[target_layer])
        grayscale_cam = cam(
            input_tensor=input_tensor,
            targets=[ClassifierOutputTarget(class_idx)],
        )[0]
        cam_overlay = show_cam_on_image(rgb_img, grayscale_cam, use_rgb=True)
    except Exception as e:
        # Best effort: report the failure and show a blank white panel.
        print(f"Error with {cam_name}: {e}")
        overlays[cam_name] = np.ones_like(rgb_img)
    else:
        overlays[cam_name] = cam_overlay

# One row of panels, one panel per entry in `overlays`.
n_methods = len(overlays)
plt.figure(figsize=(4*n_methods, 4))
for col, name in enumerate(overlays):
    plt.subplot(1, n_methods, col + 1)
    plt.imshow(overlays[name])
    plt.title(name)
    plt.axis('off')
plt.tight_layout()
plt.show()
print("Overlays for convnext_base" )

In [None]:
# Load a pretrained ViT-B/16 in eval mode on the best available device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = timm.create_model("vit_base_patch16_224", pretrained=True)
model = model.eval().to(device)
# CAMs are computed on the `norm1` module of the last transformer block.
target_layer = model.blocks[-1].norm1


def vit_reshape(tensor, h=14, w=14):
    """Turn a ViT token sequence into a channels-first 2-D feature map.

    The first token along dim 1 is dropped, then the remaining tokens are laid
    out on an `h` x `w` grid with the last dim moved to position 1.

    Args:
        tensor: 3-D tensor; dim 1 indexes tokens and must hold `h * w + 1`.
        h: Grid height.
        w: Grid width.

    Returns:
        Tensor of shape (B, C, h, w).
    """
    patch_tokens = tensor[:, 1:, :]
    batch, _, channels = patch_tokens.shape
    channels_first = patch_tokens.permute(0, 2, 1)
    return channels_first.reshape(batch, channels, h, w)


# Compare several CAM methods on the ViT model for a single image.
img_path = "/kaggle/input/imagenet100/val.X/n01685808/ILSVRC2012_val_00023693.JPEG"
img = Image.open(img_path).convert("RGB")
# Background for the overlays: the image at 224x224, scaled to [0, 1].
rgb_img = np.array(img.resize((224, 224))) / 255.0

# Cell-local pipeline normalizing every channel with mean 0.5 and std 0.5.
preproc = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,0.5,0.5), std=(0.5,0.5,0.5)),
])
input_tensor = preproc(img).unsqueeze(0).to(device)

# Every CAM below explains the class the model predicts for this image.
with torch.no_grad():
    class_idx = model(input_tensor).argmax(1).item()

# Panel title -> CAM class (None marks the plain input image).
cam_methods = {
    "Original": None,
    "Grad-CAM": GradCAM,
    "LayerCAM": LayerCAM,
    "ScoreCAM": ScoreCAM,
    "EigenCAM": EigenCAM,
}

overlays = {"Original": rgb_img}
for name, cam_cls in cam_methods.items():
    if cam_cls is None:
        # The "Original" panel is already in `overlays`.
        continue
    try:
        # vit_reshape converts token activations to a 2-D map for the CAM.
        cam = cam_cls(
            model=model,
            target_layers=[target_layer],
            reshape_transform=vit_reshape,
        )
        cam_targets = [ClassifierOutputTarget(class_idx)]
        grayscale = cam(input_tensor=input_tensor, targets=cam_targets)[0]
        overlays[name] = show_cam_on_image(rgb_img, grayscale, use_rgb=True)
    except Exception as e:
        # Best effort: report the failure and show a blank white panel.
        print(f"{name} failed: {e}")
        overlays[name] = np.ones_like(rgb_img)

# One row of panels, one panel per entry in `overlays`.
n = len(overlays)
plt.figure(figsize=(4*n, 4))
for i, title in enumerate(overlays, 1):
    img_ = overlays[title]
    plt.subplot(1, n, i)
    plt.imshow(img_)
    plt.title(title, fontsize=12)
    plt.axis("off")
plt.tight_layout()
plt.show()
print("Overlays for vit_base_patch16_224" )

In [None]:
# Load a pretrained Swin-B in eval mode on the best available device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = timm.create_model("swin_base_patch4_window7_224", pretrained=True)
model = model.eval().to(device)
# CAMs are computed on the `norm1` module of the last block of the last layer.
target_layer = model.layers[-1].blocks[-1].norm1


def swin_reshape(t):
    """Rearrange Swin activations into a channels-first (B, C, H, W) map.

    Args:
        t: Either a 3-D tensor treated as (B, L, C), where L must be a
            perfect square, or a 4-D tensor treated as (B, H, W, C).

    Returns:
        Tensor with the last dim moved to position 1: (B, C, H, W).

    Raises:
        ValueError: If `t` is neither 3-D nor 4-D, or if a 3-D input's token
            count cannot be arranged on a square grid.
    """
    if t.ndim == 3:
        B, L, C = t.shape
        side = int(round(L ** 0.5))
        # Fail with a clear message instead of an opaque reshape error.
        if side * side != L:
            raise ValueError(
                f"Token count {L} is not a perfect square; "
                f"cannot reshape tensor of shape {tuple(t.shape)}"
            )
        return t.transpose(1, 2).reshape(B, C, side, side)
    if t.ndim == 4:
        return t.permute(0, 3, 1, 2)
    raise ValueError(f"Unexpected tensor shape {tuple(t.shape)}")


# Compare several CAM methods on the Swin model for a single image.
img_path = "/kaggle/input/imagenet100/val.X/n01685808/ILSVRC2012_val_00023693.JPEG"
img = Image.open(img_path).convert("RGB")
# Background for the overlays: the image at 224x224, scaled to [0, 1].
rgb = np.array(img.resize((224, 224))) / 255.0

# Take the normalization statistics from the model's own pretrained config.
# A hard-coded 0.5/0.5 is only right for checkpoints trained with exactly
# those values, and feeding differently normalized inputs skews both the
# prediction and the CAM.
data_cfg = timm.data.resolve_data_config({}, model=model)
prep = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(data_cfg["mean"], data_cfg["std"]),
])
tensor = prep(img).unsqueeze(0).to(device)

# Every CAM below explains the class the model predicts for this image.
with torch.no_grad():
    cls_idx = model(tensor).argmax(1).item()
print("pred class:", cls_idx)

# Panel title -> CAM class (None marks the plain input image).
cams = {
    "Original": None,
    "Grad-CAM":  GradCAM,
    "LayerCAM": LayerCAM,
    "ScoreCAM": ScoreCAM,
    "EigenCAM": EigenCAM,
}

overlay = {"Original": rgb}
for name, cam_cls in list(cams.items())[1:]:
    try:
        # swin_reshape converts the activations to (B, C, H, W) for the CAM.
        cam = cam_cls(
            model=model,
            target_layers=[target_layer],
            reshape_transform=swin_reshape
        )
        gray = cam(tensor, targets=[ClassifierOutputTarget(cls_idx)])[0]
        overlay[name] = show_cam_on_image(rgb, gray, use_rgb=True)
    except Exception as e:
        # Best effort: report the failure and show a blank white panel.
        print(f"{name} failed: {e}")
        overlay[name] = np.ones_like(rgb)

# One row of panels, one panel per entry in `overlay`.
plt.figure(figsize=(4*len(overlay), 4))
for i, (k, v) in enumerate(overlay.items(), 1):
    plt.subplot(1, len(overlay), i)
    plt.imshow(v)
    plt.title(k)
    plt.axis("off")
plt.tight_layout()
plt.show()
print("Overlays for swin_base_patch4_window7_224" )

In [None]:
# Load a pretrained DeiT-B/16 in eval mode on the best available device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = timm.create_model("deit_base_patch16_224", pretrained=True)
model = model.eval().to(device)
# CAMs are computed on the `norm1` module of the last transformer block.
target_layer = model.blocks[-1].norm1


def vit_reshape(t, h=14, w=14):
    """Turn a token sequence into a channels-first 2-D feature map.

    The first token along dim 1 is dropped, then the remaining tokens are laid
    out on an `h` x `w` grid with the last dim moved to position 1.

    Args:
        t: 3-D tensor; dim 1 indexes tokens and must hold `h * w + 1`.
        h: Grid height.
        w: Grid width.

    Returns:
        Tensor of shape (B, C, h, w).
    """
    patch_tokens = t[:, 1:, :]
    batch, _, channels = patch_tokens.shape
    channels_first = patch_tokens.permute(0, 2, 1)
    return channels_first.reshape(batch, channels, h, w)


# Compare several CAM methods on the DeiT model for a single image.
img_path = "/kaggle/input/imagenet100/val.X/n01685808/ILSVRC2012_val_00023693.JPEG"
img = Image.open(img_path).convert("RGB")
# Background for the overlays: the image at 224x224, scaled to [0, 1].
rgb = np.array(img.resize((224,224))) / 255.0

# Take the normalization statistics from the model's own pretrained config.
# A hard-coded 0.5/0.5 is only right for checkpoints trained with exactly
# those values, and feeding differently normalized inputs skews both the
# prediction and the CAM.
data_cfg = timm.data.resolve_data_config({}, model=model)
prep = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(data_cfg["mean"], data_cfg["std"]),
])
tensor = prep(img).unsqueeze(0).to(device)

# Every CAM below explains the class the model predicts for this image.
with torch.no_grad():
    cls_idx = model(tensor).argmax(1).item()

# Panel title -> CAM class (None marks the plain input image).
cams = {
    "Original": None,
    "Grad-CAM": GradCAM,
    "LayerCAM": LayerCAM,
    "ScoreCAM": ScoreCAM,
    "EigenCAM": EigenCAM,
}
overlay = {"Original": rgb}
for name, cam_cls in list(cams.items())[1:]:
    try:
        # vit_reshape converts token activations to a 2-D map for the CAM.
        cam = cam_cls(model, [target_layer], reshape_transform=vit_reshape)
        gray = cam(tensor, targets=[ClassifierOutputTarget(cls_idx)])[0]
        overlay[name] = show_cam_on_image(rgb, gray, use_rgb=True)
    except Exception as e:
        # Best effort: report the failure and show a blank white panel.
        print(f"{name} failed: {e}")
        overlay[name] = np.ones_like(rgb)

# One row of panels, one panel per entry in `overlay`.
plt.figure(figsize=(4*len(overlay),4))
for i, (k, v) in enumerate(overlay.items(), 1):
    plt.subplot(1, len(overlay), i)
    plt.imshow(v)
    plt.title(k)
    plt.axis("off")
plt.tight_layout()
plt.show()
print("Overlays for deit_base_patch16_224" )

In [None]:

# Compare several CAM methods on ResNeXt-50 (32x4d) for a single image.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = timm.create_model("resnext50_32x4d", pretrained=True).eval().to(device)
# CAMs are computed on the last block of layer4.
target_layer = model.layer4[-1]

img_path = "/kaggle/input/imagenet100/val.X/n01685808/ILSVRC2012_val_00023693.JPEG"
img = Image.open(img_path).convert("RGB")
# Background for the overlays: the image at 224x224, scaled to [0, 1].
rgb = np.array(img.resize((224, 224))) / 255.0

# Take the normalization statistics from the model's own pretrained config.
# A hard-coded 0.5/0.5 is only right for checkpoints trained with exactly
# those values, and feeding differently normalized inputs skews both the
# prediction and the CAM.
data_cfg = timm.data.resolve_data_config({}, model=model)
prep = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(data_cfg["mean"], data_cfg["std"]),
])
tensor = prep(img).unsqueeze(0).to(device)

# Every CAM below explains the class the model predicts for this image.
with torch.no_grad():
    cls_idx = model(tensor).argmax(1).item()

# Panel title -> CAM class (None marks the plain input image).
cams = {
    "Original": None,
    "Grad-CAM": GradCAM,
    "LayerCAM": LayerCAM,
    "ScoreCAM": ScoreCAM,
    "EigenCAM": EigenCAM,
}
overlay = {"Original": rgb}
for name, cam_cls in list(cams.items())[1:]:
    try:
        cam = cam_cls(model=model, target_layers=[target_layer])
        gray = cam(tensor, targets=[ClassifierOutputTarget(cls_idx)])[0]
        overlay[name] = show_cam_on_image(rgb, gray, use_rgb=True)
    except Exception as e:
        # Best effort: report the failure and show a blank white panel.
        print(f"{name} failed: {e}")
        overlay[name] = np.ones_like(rgb)

# One row of panels, one panel per entry in `overlay`.
plt.figure(figsize=(4*len(overlay), 4))
for i, (k, v) in enumerate(overlay.items(), 1):
    plt.subplot(1, len(overlay), i)
    plt.imshow(v)
    plt.title(k)
    plt.axis("off")
plt.tight_layout()
plt.show()
print("Overlays for resnext50_32x4d" )

In [None]:
# Compare several CAM methods on Inception V3 for a single image.
model = timm.create_model("inception_v3", pretrained=True)
model = model.eval().to(device)
# CAMs are computed on the `Mixed_7c` module.
target_layer = model.Mixed_7c

img_path = "/kaggle/input/imagenet100/val.X/n01685808/ILSVRC2012_val_00023693.JPEG"
img = Image.open(img_path).convert("RGB")
# This cell works at 299x299 for both the overlay background and the input.
rgb = np.array(img.resize((299, 299))) / 255.0
prep = transforms.Compose([
    transforms.Resize((299, 299)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,0.5,0.5), std=(0.5,0.5,0.5)),
])
tensor = prep(img).unsqueeze(0).to(device)

# Every CAM below explains the class the model predicts for this image.
with torch.no_grad():
    cls_idx = model(tensor).argmax(1).item()

# Panel title -> CAM class (None marks the plain input image).
cams = {
    "Original": None,
    "Grad-CAM": GradCAM,
    "LayerCAM": LayerCAM,
    "ScoreCAM": ScoreCAM,
    "EigenCAM": EigenCAM,
}
overlay = {"Original": rgb}
for name, cam_cls in cams.items():
    if cam_cls is None:
        # The "Original" panel is already in `overlay`.
        continue
    try:
        cam = cam_cls(model=model, target_layers=[target_layer])
        gray = cam(tensor, targets=[ClassifierOutputTarget(cls_idx)])[0]
        overlay[name] = show_cam_on_image(rgb, gray, use_rgb=True)
    except Exception as e:
        # Best effort: report the failure and show a blank white panel.
        print(f"{name} failed: {e}")
        overlay[name] = np.ones_like(rgb)

# One row of panels, one panel per entry in `overlay`.
plt.figure(figsize=(4*len(overlay), 4))
for i, (k, v) in enumerate(overlay.items(), 1):
    plt.subplot(1, len(overlay), i)
    plt.imshow(v)
    plt.title(k)
    plt.axis("off")
plt.tight_layout()
plt.show()
print("Overlays for inception_v3" )

# GradCAM Overlays under Perturbations (CNNs)

In [None]:
import torch
import timm
import numpy as np
from torchvision import transforms
from PIL import Image, ImageFilter, ImageEnhance
from pytorch_grad_cam import GradCAM
from pytorch_grad_cam.utils.image import show_cam_on_image
from scipy.spatial.distance import cosine


def horizontal_flip(img):
    """Return a left-right mirrored copy of `img`.

    Args:
        img: Object exposing `transpose` (PIL images are used in this file).
    """
    mirrored = img.transpose(Image.FLIP_LEFT_RIGHT)
    return mirrored

def rotate(img, degrees=30):
    """Return `img` rotated by `degrees`.

    Args:
        img: Object exposing `rotate` (PIL images are used in this file).
        degrees: Rotation amount handed straight to `img.rotate`.
    """
    rotated = img.rotate(degrees)
    return rotated

def blur(img, radius=2):
    """Return a Gaussian-blurred copy of `img`.

    Args:
        img: Object exposing `filter` (PIL images are used in this file).
        radius: Radius given to `ImageFilter.GaussianBlur`.
    """
    gaussian = ImageFilter.GaussianBlur(radius)
    return img.filter(gaussian)

def brighten(img, factor=1.5):
    """Return `img` with its brightness scaled by `factor`.

    Args:
        img: Image handed to `ImageEnhance.Brightness`.
        factor: Enhancement factor passed to `enhance`.
    """
    enhancer = ImageEnhance.Brightness(img)
    return enhancer.enhance(factor)

def add_gaussian_noise(img, sigma=0.2, rng=None):
    """Add zero-mean Gaussian noise to an image.

    Pixel values are scaled to [0, 1], noise with standard deviation `sigma`
    is added, and the result is clipped to [0, 1] and converted back to uint8.

    Args:
        img: Image accepted by `np.asarray` (PIL images are used in this file).
        sigma: Standard deviation of the noise on the [0, 1] scale.
        rng: Optional object with a `normal(loc, scale, size)` method, e.g. a
            `np.random.Generator`, for reproducible noise. When omitted, the
            global NumPy random state is used, as before.

    Returns:
        A new image built with `Image.fromarray` from the noisy array.
    """
    arr = np.asarray(img) / 255.0
    if rng is None:
        noise = np.random.normal(0, sigma, arr.shape)
    else:
        noise = rng.normal(0, sigma, arr.shape)
    noisy = np.clip(arr + noise, 0, 1)
    # Round before the cast: a bare astype('uint8') truncates, which can
    # shift pixel values down by one even when no noise is added.
    return Image.fromarray(np.rint(noisy * 255).astype('uint8'))

# Display name -> callable taking an image and returning the perturbed image.
# Each function is used with its default strength.
perturbations = {
    'Clean': (lambda x: x),
    'Horizontal Flip': horizontal_flip,
    'Rotation': rotate,
    'Blur': blur,
    'Brightness': brighten,
    'Gaussian Noise': add_gaussian_noise,
}


# The single image every model in this section is evaluated on.
img_path = "/kaggle/input/imagenet100/val.X/n01685808/ILSVRC2012_val_00023693.JPEG"
img = Image.open(img_path).convert('RGB')

# Input pipeline: resize to 224x224, convert to a tensor, normalize per
# channel with the mean/std below.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# Device name as a plain string: GPU when available, CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


# One entry per CNN: (display name, timm identifier, function that picks the
# CAM target layer out of the instantiated model).
models_and_layers = [
    (
        'ResNet-50',
        'resnet50',
        lambda m: m.layer4[-1],
    ),
    (
        'ResNet-101',
        'resnet101',
        lambda m: m.layer4[-1],
    ),
    (
        'DenseNet-121',
        'densenet121',
        lambda m: m.features[-2],
    ),
    (
        'EfficientNet-B3',
        'efficientnet_b3',
        lambda m: m.blocks[-1],
    ),
    (
        'MobileNetV3-L',
        'mobilenetv3_large_100',
        lambda m: m.blocks[-1],
    ),
    (
        'ConvNeXt-Base',
        'convnext_base',
        lambda m: m.stages[-1].blocks[-1],
    ),
    (
        'ResNeXt50-32x4d',
        'resnext50_32x4d',
        lambda m: m.layer4[-1],
    ),
    (
        'Inception V3',
        'inception_v3',
        lambda m: m.Mixed_7c,
    ),
]

def cam_similarity(cam1, cam2):
    """Return the cosine similarity between two CAM heatmaps.

    Both maps are flattened and compared as vectors, so they must hold the
    same number of elements.

    Args:
        cam1: Array-like heatmap.
        cam2: Array-like heatmap with as many elements as `cam1`.

    Returns:
        The cosine of the angle between the flattened maps, as a float.
        Returns 0.0 when either map is all zeros: the cosine is undefined
        there and would otherwise come out as NaN.
    """
    vec1 = np.asarray(cam1, dtype=np.float64).ravel()
    vec2 = np.asarray(cam2, dtype=np.float64).ravel()
    # Cosine similarity is scale-invariant, so no pre-normalization is needed.
    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denom == 0:
        return 0.0
    return float(np.dot(vec1, vec2) / denom)

import matplotlib.pyplot as plt  # imported once instead of on every iteration

# For every CNN: compute Grad-CAM on the clean image and on each perturbed
# version, report whether the prediction is unchanged and how similar the
# CAM is to the clean one, then plot the overlays.
for print_name, timm_name, layer_func in models_and_layers:
    print("\n" + "="*35)
    print(f"Evaluating Model: {print_name}")
    print("="*35)
    # --- Load model ---
    model = timm.create_model(timm_name, pretrained=True).eval().to(device)
    target_layer = layer_func(model)
    cam = GradCAM(model=model, target_layers=[target_layer])

    # Normalize with the statistics stored in this model's pretrained config.
    # One fixed mean/std for every model does not fit all of them; the
    # dedicated Inception V3 cell in this notebook, for one, uses 0.5/0.5.
    # The 224x224 size is kept so the CAM matches the 224x224 overlay below.
    data_cfg = timm.data.resolve_data_config({}, model=model)
    model_preprocess = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=data_cfg['mean'], std=data_cfg['std']),
    ])

    input_tensor_clean = model_preprocess(img).unsqueeze(0).to(device)
    with torch.no_grad():
        logits_clean = model(input_tensor_clean)
        class_idx_clean = logits_clean.argmax(1).item()

    grayscale_cam_clean = cam(input_tensor=input_tensor_clean, targets=None)[0]
    cams = {'Clean': grayscale_cam_clean}
    perturbed_imgs = {}
    accuracies = {}
    similarities = {}

    for name, perturb in perturbations.items():
        perturbed_img = perturb(img)
        # Keep the exact image the CAM is computed on. Re-applying a random
        # perturbation when plotting would show a different noise draw.
        perturbed_imgs[name] = perturbed_img
        input_tensor = model_preprocess(perturbed_img).unsqueeze(0).to(device)
        with torch.no_grad():
            logits = model(input_tensor)
            class_idx = logits.argmax(1).item()
            # For a single image, 1 if same as clean class, else 0
            top1_acc = 1 if class_idx == class_idx_clean else 0
        grayscale_cam = cam(input_tensor=input_tensor, targets=None)[0]
        cams[name] = grayscale_cam
        accuracies[name] = top1_acc
        similarities[name] = cam_similarity(grayscale_cam_clean, grayscale_cam) if name != 'Clean' else 1.0

    print(f"{'Perturbation':<18} {'Top-1 Match':<12} {'CAM Similarity':<14}")
    for name in perturbations:
        print(f"{name:<18} {accuracies[name]:<12} {similarities[name]:<14.2f}")

    plt.figure(figsize=(18, 4))
    for i, name in enumerate(perturbations):
        cam_map = cams[name]
        rgb_img = np.array(perturbed_imgs[name].resize((224, 224))) / 255.0
        overlay = show_cam_on_image(rgb_img, cam_map, use_rgb=True)
        plt.subplot(1, len(perturbations), i+1)
        plt.imshow(overlay)
        plt.title(name, fontsize=10)
        plt.axis('off')
    plt.tight_layout()
    plt.suptitle(f"{print_name} CAM overlays", fontsize=14)
    plt.show()


# GradCAM Overlays for Transformer Models

In [None]:
import torch, timm, numpy as np, matplotlib.pyplot as plt
from PIL import Image, ImageFilter, ImageEnhance
from torchvision import transforms
from pytorch_grad_cam import GradCAM
from pytorch_grad_cam.utils.image import show_cam_on_image
from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget
from skimage.metrics import structural_similarity as ssim


# Grad-CAM stability of ViT-B/16 under image perturbations, on one image.
img_path = '/kaggle/input/imagenet100/val.X/n01685808/ILSVRC2012_val_00023693.JPEG'
img = Image.open(img_path).convert("RGB")


def hflip(x):
    """Return a left-right mirrored copy of `x`."""
    return x.transpose(Image.FLIP_LEFT_RIGHT)


def rot(x, d=30):
    """Return `x` rotated by `d` (handed to `x.rotate`)."""
    return x.rotate(d)


def gblur(x, r=2):
    """Return `x` blurred with a Gaussian filter of radius `r`."""
    return x.filter(ImageFilter.GaussianBlur(r))


def bright(x, f=1.5):
    """Return `x` with its brightness enhanced by factor `f`."""
    return ImageEnhance.Brightness(x).enhance(f)


def gnoise(x, sigma=0.2):
    """Return `x` with Gaussian noise (std `sigma` on the [0, 1] scale)."""
    a = np.array(x) / 255.
    a = np.clip(a + np.random.normal(0, sigma, a.shape), 0, 1)
    return Image.fromarray((a * 255).astype('uint8'))


# Display name -> perturbation callable.
perturbs = {
    "Clean": (lambda x: x),
    "Horizontal Flip": hflip,
    "Rotation": rot,
    "Blur": gblur,
    "Brightness": bright,
    "Gaussian Noise": gnoise,
}

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = timm.create_model('vit_base_patch16_224', pretrained=True).eval().to(device)
# CAMs are computed on the `norm1` module of the last block.
t_layer = model.blocks[-1].norm1


def vit_reshape(t, h=14, w=14):
    """Drop the first token and lay the rest out as a (B, C, h, w) map."""
    t = t[:, 1:, :]
    B, N, C = t.shape
    return t.transpose(1, 2).reshape(B, C, h, w)


prep = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,) * 3, (0.5,) * 3),
])

# The prediction on the clean image is the reference label for every
# perturbation below.
clean_t = prep(img).unsqueeze(0).to(device)
with torch.no_grad():
    gt_lbl = model(clean_t).argmax(1).item()

print("Evaluating Model: ViT-B/16")
print(f"{'Perturbation':<16} {'Top 1 Match':>5} {'CAM Similarity':>6}")

cam_engine = GradCAM(model=model, target_layers=[t_layer], reshape_transform=vit_reshape)
clean_cam = cam_engine(clean_t, targets=[ClassifierOutputTarget(gt_lbl)])[0]
clean_gray = clean_cam / clean_cam.max()

overlays = []
for name, func in perturbs.items():
    p_img = func(img).resize((224, 224))
    tensor = prep(p_img).unsqueeze(0).to(device)

    with torch.no_grad():
        pred = model(tensor).argmax(1).item()
    # 1 when the perturbed prediction equals the clean prediction, else 0.
    acc = int(pred == gt_lbl)

    # The CAM explains whatever class is predicted for the perturbed image.
    cam = cam_engine(tensor, targets=[ClassifierOutputTarget(pred)])[0]
    cam_norm = cam / cam.max()

    # SSIM between the clean and the perturbed CAM, in percent.
    sim = ssim(clean_gray, cam_norm, data_range=1.0) * 100

    print(f"{name:<16} {acc:>5} {sim:>6.1f}")

    overlays.append(show_cam_on_image(np.array(p_img) / 255., cam, use_rgb=True))

plt.figure(figsize=(18, 3))
for i, (nm, ov) in enumerate(zip(perturbs, overlays)):
    ax = plt.subplot(1, len(overlays), i + 1)
    ax.imshow(ov)
    ax.set_title(nm, fontsize=9)
    ax.axis('off')
plt.suptitle("ViT-B/16  Grad-CAM overlays", y=1.05, fontsize=13)
plt.tight_layout()
plt.show()

In [None]:
# Grad-CAM stability of DeiT-B/16 under image perturbations, on one image.
model = timm.create_model('deit_base_patch16_224', pretrained=True).eval().to(device)
# CAMs are computed on the `norm1` module of the last block.
t_layer = model.blocks[-1].norm1


def vit_reshape(t, h=14, w=14):
    """Drop the first token and lay the rest out as a (B, C, h, w) map."""
    t = t[:, 1:, :]
    B, N, C = t.shape
    return t.transpose(1, 2).reshape(B, C, h, w)


def hflip(x):
    """Return a left-right mirrored copy of `x`."""
    return x.transpose(Image.FLIP_LEFT_RIGHT)


def rot(x, d=30):
    """Return `x` rotated by `d` (handed to `x.rotate`)."""
    return x.rotate(d)


def gblur(x, r=2):
    """Return `x` blurred with a Gaussian filter of radius `r`."""
    return x.filter(ImageFilter.GaussianBlur(r))


def bright(x, f=1.5):
    """Return `x` with its brightness enhanced by factor `f`."""
    return ImageEnhance.Brightness(x).enhance(f)


def gnoise(x, sigma=0.2):
    """Return `x` with Gaussian noise (std `sigma` on the [0, 1] scale)."""
    a = np.array(x) / 255.
    a = np.clip(a + np.random.normal(0, sigma, a.shape), 0, 1)
    return Image.fromarray((a * 255).astype('uint8'))


# Display name -> perturbation callable.
perturbs = {
    "Clean": (lambda x: x),
    "Horizontal Flip": hflip,
    "Rotation": rot,
    "Blur": gblur,
    "Brightness": bright,
    "Gaussian Noise": gnoise,
}

# Take the normalization statistics from the model's own pretrained config.
# A hard-coded 0.5/0.5 is only right for checkpoints trained with exactly
# those values, and feeding differently normalized inputs skews both the
# prediction and the CAM.
data_cfg = timm.data.resolve_data_config({}, model=model)
prep = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(data_cfg["mean"], data_cfg["std"]),
])

# Decode the image once; the perturbation functions return new images and
# leave their input untouched, so it can be reused for every perturbation.
src_img = Image.open(img_path).convert("RGB")

# The prediction on the clean image is the reference label for every
# perturbation below.
clean_t = prep(src_img).unsqueeze(0).to(device)
with torch.no_grad():
    gt_lbl = model(clean_t).argmax(1).item()

cam_engine = GradCAM(model=model,
                     target_layers=[t_layer],
                     reshape_transform=vit_reshape)

clean_cam = cam_engine(clean_t, targets=[ClassifierOutputTarget(gt_lbl)])[0]
clean_gray = clean_cam / clean_cam.max()

print("Evaluating Model: DeiT-B/16")
print("Top-1 Accuracy and Grad-CAM SSIM (%)")
print(f"{'Perturbation':<16} {'Acc':>5} {'Sim':>6}")

overlays = []
for nm, func in perturbs.items():
    p_img = func(src_img).resize((224, 224))
    t_in = prep(p_img).unsqueeze(0).to(device)

    with torch.no_grad():
        pred = model(t_in).argmax(1).item()
    # 1 when the perturbed prediction equals the clean prediction, else 0.
    acc = int(pred == gt_lbl)

    # The CAM explains whatever class is predicted for the perturbed image.
    cam = cam_engine(t_in, targets=[ClassifierOutputTarget(pred)])[0]
    cam_norm = cam / cam.max()
    # SSIM between the clean and the perturbed CAM, in percent.
    sim = ssim(clean_gray, cam_norm, data_range=1.0) * 100

    print(f"{nm:<16} {acc:>5} {sim:>6.1f}")
    overlays.append(show_cam_on_image(np.array(p_img) / 255., cam, use_rgb=True))

plt.figure(figsize=(18, 3))
for i, (nm, ov) in enumerate(zip(perturbs, overlays)):
    ax = plt.subplot(1, len(overlays), i + 1)
    ax.imshow(ov)
    ax.set_title(nm, fontsize=9)
    ax.axis('off')
plt.suptitle("DeiT-B/16   Grad-CAM overlays", y=1.05, fontsize=13)
plt.tight_layout()
plt.show()

In [None]:
model    = timm.create_model('swin_base_patch4_window7_224', pretrained=True).eval().to(device)
t_layer  = model.layers[-1].blocks[-1].norm1          # last Swin block norm

def swin_reshape(t):
    if t.ndim == 3:                       
        B, L, C = t.shape
        H = W = int(L ** 0.5)             
        return t.transpose(1,2).reshape(B, C, H, W)
    elif t.ndim == 4:                     
        return t.permute(0, 3, 1, 2)      
    else:
        raise ValueError("Unexpected tensor shape", t.shape)


def hflip(x):
    """Mirror the PIL image left-to-right."""
    return x.transpose(Image.FLIP_LEFT_RIGHT)


def rot(x, d=30):
    """Rotate the PIL image by ``d`` degrees."""
    return x.rotate(d)


def gblur(x, r=2):
    """Blur the PIL image with a Gaussian kernel of radius ``r``."""
    kernel = ImageFilter.GaussianBlur(r)
    return x.filter(kernel)


def bright(x, f=1.5):
    """Scale the image brightness by factor ``f``."""
    enhancer = ImageEnhance.Brightness(x)
    return enhancer.enhance(f)


def gnoise(x, sigma=0.2):
    """Add zero-mean Gaussian noise (std ``sigma`` on a 0-1 scale) and clip to the valid range."""
    pixels = np.array(x) / 255.
    noisy = np.clip(pixels + np.random.normal(0, sigma, pixels.shape), 0, 1)
    return Image.fromarray((noisy * 255).astype('uint8'))


# Display name -> perturbation callable; insertion order drives the report and plot order.
perturbs = {
    "Clean": lambda x: x,
    "Horizontal Flip": hflip,
    "Rotation": rot,
    "Blur": gblur,
    "Brightness": bright,
    "Gaussian Noise": gnoise,
}

# Take the normalization statistics from the checkpoint's pretrained config
# instead of hard-coding 0.5/0.5: the Swin weights are configured with the
# ImageNet mean/std, so 0.5-normalized inputs do not match the distribution
# the model was trained on.
from timm.data import resolve_data_config

_data_cfg = resolve_data_config({}, model=model)
prep = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(_data_cfg['mean'], _data_cfg['std'])
])

# Preprocess the unperturbed image and use the model's own top-1 prediction on
# it as the reference label (no dataset ground truth is consulted here).
clean_t = prep(Image.open(img_path).convert("RGB")).unsqueeze(0).to(device)
with torch.no_grad(): gt_lbl = model(clean_t).argmax(1).item()

# Grad-CAM on the chosen layer; swin_reshape converts its activations to
# channels-first (B, C, H, W).
cam_engine = GradCAM(model=model,
                     target_layers=[t_layer],
                     reshape_transform=swin_reshape)

# Reference CAM for the clean image, rescaled so that its maximum is 1.
# NOTE(review): an all-zero CAM would make this division produce NaNs — confirm it cannot occur.
clean_cam  = cam_engine(clean_t, targets=[ClassifierOutputTarget(gt_lbl)])[0]
clean_gray = clean_cam/clean_cam.max()

print("Evaluating Model: Swin-B")
print("Top-1 Accuracy and Grad-CAM SSIM (%)")
print(f"{'Perturbation':<16} {'Acc':>5} {'Sim':>6}")

overlays=[]
for nm,func in perturbs.items():
    # Re-open the image for every perturbation so each one starts from the original.
    p_img = func(Image.open(img_path).convert("RGB")).resize((224,224))
    t_in  = prep(p_img).unsqueeze(0).to(device)

    # "Acc" is 1 when the prediction on the perturbed image equals the clean prediction.
    with torch.no_grad(): pred = model(t_in).argmax(1).item()
    acc = int(pred==gt_lbl)

    # The CAM explains the class predicted on the perturbed image, not gt_lbl.
    cam = cam_engine(t_in, targets=[ClassifierOutputTarget(pred)])[0]
    cam_norm = cam/cam.max()
    # SSIM against the clean CAM; geometric perturbations (flip, rotation) are
    # not undone before the comparison.
    sim = ssim(clean_gray, cam_norm, data_range=1.0)*100

    print(f"{nm:<16} {acc:>5} {sim:>6.1f}")
    overlays.append(show_cam_on_image(np.array(p_img)/255., cam, use_rgb=True))

# One panel per perturbation, in dictionary order.
plt.figure(figsize=(18,3))
for i,(nm,ov) in enumerate(zip(perturbs,overlays)):
    ax=plt.subplot(1,len(overlays),i+1); ax.imshow(ov); ax.set_title(nm,fontsize=9); ax.axis('off')
plt.suptitle("Swin-B   Grad-CAM overlays",y=1.05,fontsize=13); plt.tight_layout(); plt.show()

# Transformer Specific Methods

## ViT Attention Rollout

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import timm
import torchvision.transforms as transforms

# Select the device and load the pretrained ViT-B/16 in inference mode.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = timm.create_model('vit_base_patch16_224', pretrained=True).eval().to(device)
# NOTE(review): redundant — the line above already calls .eval().to(device).
model.eval().to(device)
img_path ='/kaggle/input/imagenet100/val.X/n01685808/ILSVRC2012_val_00023693.JPEG'  

# Keep a 224x224 float copy in [0, 1] for plotting.
img = Image.open(img_path).convert('RGB')
img = img.resize((224, 224))
rgb_img = np.array(img) / 255.0

# Model input: resize, convert to tensor, normalize each channel with mean 0.5 / std 0.5.
preproc = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3),
])
# Add a batch dimension and move to the model's device.
input_tensor = preproc(img).unsqueeze(0).to(device)

# Filled by the hook below: one attention tensor per hooked module call.
attn_weights = []


def get_attn_hook(module, input, output):
    """Forward hook that recomputes softmax attention from the module's qkv projection.

    The module's own output is ignored; the attention probabilities are rebuilt
    from ``module.qkv``, ``module.num_heads`` and ``module.scale`` and appended
    (detached, on CPU) to the global ``attn_weights`` list with shape
    (batch, heads, tokens, tokens).
    """
    tokens = input[0]  # [batch, tokens, embed_dim]
    batch, n_tokens, dim = tokens.shape
    heads = module.num_heads
    projected = module.qkv(tokens)
    projected = projected.reshape(batch, n_tokens, 3, heads, dim // heads)
    # Split the leading q/k/v axis; the value projection is not needed here.
    query, key, _value = projected.permute(2, 0, 3, 1, 4).unbind(0)
    scores = (query @ key.transpose(-2, -1)) * module.scale
    probs = scores.softmax(dim=-1)
    attn_weights.append(probs.detach().cpu())

# Attach the capture hook to the attention module of every transformer block.
hooks = []
for blk in model.blocks:
    hooks.append(blk.attn.register_forward_hook(get_attn_hook))

# A single forward pass fills attn_weights with one entry per block.
with torch.no_grad():
    _ = model(input_tensor)

# Detach the hooks so later forward passes do not keep appending.
for h in hooks:
    h.remove()

attn_mat = torch.stack(attn_weights)      # (layers, batch, heads, tokens, tokens)
attn_mat = attn_mat.squeeze(1).mean(1)    # drop the size-1 batch axis, average over heads

# Rollout: add the identity to each layer's attention, renormalize the rows,
# and multiply the layers together from first to last.
num_tokens = attn_mat.shape[-1]
result = torch.eye(num_tokens)
for i in range(attn_mat.shape[0]):
    attn = attn_mat[i] + torch.eye(num_tokens)
    attn = attn / attn.sum(dim=-1, keepdim=True)
    result = attn @ result

# Row 0 is the first (CLS) token's row; keep its weights over the remaining tokens.
mask = result[0, 1:]   # Exclude CLS token
mask_length = mask.shape[0]
side = int(np.sqrt(mask_length))

print('Mask length:', mask_length)
print('Calculated side:', side, 'side*side:', side*side)

# The remaining tokens must form a square grid to be reshaped into a 2-D map.
if side * side != mask_length:
    raise ValueError(f"Mask length {mask_length} is not a perfect square. Check patch size and model.")

mask = mask.reshape(side, side).numpy()


# Side-by-side: the original image and the raw (patch-resolution) rollout overlay.
img_h, img_w = rgb_img.shape[:2]
plt.figure(figsize=(8, 4))
plt.subplot(1, 2, 1)
plt.imshow(rgb_img)
plt.title('Original')
plt.axis('off')
plt.subplot(1, 2, 2)
plt.imshow(rgb_img, alpha=0.5)
# The mask has only side x side cells. Without an explicit extent it is drawn
# in its own pixel coordinates and covers just the top-left corner of the
# image, so stretch it over the full image area.
plt.imshow(mask, cmap='jet', alpha=0.5,
           extent=(-0.5, img_w - 0.5, img_h - 0.5, -0.5))
plt.title('Attention Rollout')
plt.axis('off')
plt.tight_layout()
plt.show()

In [None]:
from scipy.ndimage import zoom

# Bilinear upsampling of the patch-level mask to the image resolution. The zoom
# factors come from the actual shapes rather than an assumed 14x14 grid, so the
# cell keeps working if the patch grid or image size changes.
mask_up = zoom(
    mask,
    (rgb_img.shape[0] / mask.shape[0], rgb_img.shape[1] / mask.shape[1]),
    order=1,
)

plt.figure(figsize=(8, 4))
plt.subplot(1, 2, 1)
plt.imshow(rgb_img)
plt.title('Original')
plt.axis('off')

plt.subplot(1, 2, 2)
plt.imshow(rgb_img, alpha=0.7)
plt.imshow(mask_up, cmap='jet', alpha=0.5)
plt.title('Attention Rollout Overlay')
plt.axis('off')
plt.tight_layout()
plt.show()

In [None]:
import torch
import numpy as np
from PIL import Image, ImageFilter, ImageEnhance
from scipy.ndimage import zoom
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from skimage.metrics import structural_similarity as ssim

def compute_attention_rollout(model, img_pil, device):
    """Compute an attention-rollout map for a timm ViT-style model.

    Attention probabilities are recomputed in a forward hook on every
    ``blk.attn`` of ``model.blocks``, averaged over heads, combined with the
    identity, row-normalized and multiplied across layers. The first token's
    row (minus its own entry) is reshaped to a square grid and upsampled.

    Args:
        model: model exposing ``blocks`` whose ``attn`` modules provide
            ``qkv``, ``num_heads`` and ``scale``.
        img_pil: PIL image; it is resized to 224x224 here.
        device: torch device the model lives on.

    Returns:
        2-D numpy array of shape (224, 224).

    Raises:
        ValueError: if the number of non-CLS tokens is not a perfect square.
    """
    from timm.data import resolve_data_config  # timm is already a notebook dependency

    # Normalize with the statistics stored in the checkpoint's pretrained config
    # so the same function is correct for checkpoints that do not use 0.5/0.5.
    data_cfg = resolve_data_config({}, model=model)
    preproc = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(data_cfg['mean'], data_cfg['std']),
    ])
    input_tensor = preproc(img_pil).unsqueeze(0).to(device)

    attn_weights = []
    def get_attn_hook(module, input, output):
        x = input[0]
        B, N, C = x.shape
        qkv = module.qkv(x).reshape(B, N, 3, module.num_heads, C // module.num_heads)
        q, k, v = qkv.permute(2, 0, 3, 1, 4)
        attn = (q @ k.transpose(-2, -1)) * module.scale
        attn = attn.softmax(dim=-1)
        attn_weights.append(attn.detach().cpu())

    hooks = [blk.attn.register_forward_hook(get_attn_hook) for blk in model.blocks]
    try:
        with torch.no_grad():
            _ = model(input_tensor)
    finally:
        # Always detach the hooks, even if the forward pass fails, so they do
        # not stay attached to the shared model.
        for h in hooks:
            h.remove()

    attn_mat = torch.stack(attn_weights)      # (layers, batch, heads, tokens, tokens)
    attn_mat = attn_mat.squeeze(1).mean(1)    # drop the batch axis, average heads
    num_tokens = attn_mat.shape[-1]
    result = torch.eye(num_tokens)
    for i in range(attn_mat.shape[0]):
        attn = attn_mat[i] + torch.eye(num_tokens)
        attn = attn / attn.sum(dim=-1, keepdim=True)
        result = attn @ result
    mask = result[0, 1:]  # Exclude CLS
    mask_length = mask.shape[0]
    side = int(np.sqrt(mask_length))
    if side * side != mask_length:
        raise ValueError(f"Mask length {mask_length} is not a perfect square. Check patch size and model.")
    mask = mask.reshape(side, side).numpy()
    mask_up = zoom(mask, (224/side, 224/side), order=1)
    return mask_up

def horizontal_flip(img):
    """Return the image mirrored left-to-right."""
    return img.transpose(Image.FLIP_LEFT_RIGHT)


def rotate(img, degrees=30):
    """Return the image rotated by ``degrees``."""
    return img.rotate(degrees)


def blur(img, radius=2):
    """Return the image after a Gaussian blur of the given radius."""
    gaussian = ImageFilter.GaussianBlur(radius)
    return img.filter(gaussian)


def brighten(img, factor=1.5):
    """Return the image with its brightness scaled by ``factor``."""
    enhancer = ImageEnhance.Brightness(img)
    return enhancer.enhance(factor)


def add_gaussian_noise(img, sigma=0.2):
    """Return the image with zero-mean Gaussian noise added on a 0-1 scale, clipped to [0, 1]."""
    scaled = np.array(img) / 255.0
    perturbed = np.clip(scaled + np.random.normal(0, sigma, scaled.shape), 0, 1)
    as_bytes = (perturbed * 255).astype('uint8')
    return Image.fromarray(as_bytes)


def similarity(a, b):
    """Return the mean product of the two standardized maps (a correlation-style score).

    Each input is centered on its mean and divided by its standard deviation
    plus 1e-5, which keeps constant maps from dividing by zero.
    """
    def _standardize(arr):
        centered = arr - np.mean(arr)
        return centered / (np.std(arr) + 1e-5)

    return np.mean(_standardize(a) * _standardize(b))


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = timm.create_model('vit_base_patch16_224', pretrained=True).eval().to(device)
img_path = '/kaggle/input/imagenet100/val.X/n01685808/ILSVRC2012_val_00023693.JPEG'
img = Image.open(img_path).convert('RGB')

# Display name -> perturbation callable; insertion order drives the report and plot order.
perturbations = {
    'Clean': lambda x: x,
    'Horizontal Flip': horizontal_flip,
    'Rotation': lambda x: rotate(x, degrees=30),
    'Blur': blur,
    'Brightness': brighten,
    'Gaussian Noise': add_gaussian_noise,
}

# Rollout map per perturbation. The exact image that produced each map is kept
# too (the noise perturbation is random, so it cannot be regenerated later).
results = {}
pert_images = {}
for name, func in perturbations.items():
    pert_img = func(img)
    mask_up = compute_attention_rollout(model, pert_img, device)
    results[name] = mask_up
    pert_images[name] = pert_img


# Correlation-style similarity of every map against the clean map.
clean_mask = results['Clean']
sim_scores = {}
for name, mask in results.items():
    sim_scores[name] = similarity(clean_mask, mask) * 100  

print("Attention Rollout Similarity (%)")
print("{:<18} {:>10}".format("Perturbation", "Similarity"))
print("-"*32)
for name in results.keys():
    print("{:<18} {:10.2f}".format(name, sim_scores[name]))

plt.figure(figsize=(15, 2.5))
for i, (name, mask) in enumerate(results.items()):
    plt.subplot(1, len(results), i+1)
    # Draw each map over the perturbed image it was computed from, resized to
    # the map's resolution. Overlaying on the un-resized clean image misaligns
    # the two layers and shows flip/rotation maps on the wrong picture.
    background = pert_images[name].resize((mask.shape[1], mask.shape[0]))
    plt.imshow(np.array(background), alpha=0.5)
    plt.imshow(mask, cmap='jet', alpha=0.5)
    plt.title(f"{name}\n{sim_scores[name]:.1f}%")
    plt.axis('off')
plt.suptitle('ViT-B/16 Attention Rollout Similarity Across Perturbations')
plt.tight_layout()
plt.show()

In [None]:
from skimage.metrics import structural_similarity as ssim

def normalize_mask(m):
    """Shift the map so its minimum is 0, then divide by (its new maximum + 1e-5)."""
    shifted = m - np.min(m)
    return shifted / (np.max(shifted) + 1e-5)

def ssim_similarity(a, b):
    """Return the SSIM of the two maps after passing each through normalize_mask."""
    a = normalize_mask(a)
    b = normalize_mask(b)
    return ssim(a, b, data_range=1)


# Score every rollout map against the clean one.
# Both arguments are normalized here and again inside ssim_similarity.
clean_mask = normalize_mask(results['Clean'])
sim_scores = {}
for name, mask in results.items():
    sim_scores[name] = ssim_similarity(clean_mask, normalize_mask(mask)) * 100
    print(f"{name}: {sim_scores[name]:.2f}%")

In [None]:
plt.figure(figsize=(18,3))
for i, (name, mask) in enumerate(results.items()):
    plt.subplot(1, len(results), i+1)
    # Resize the background to the map's resolution; drawing the map over the
    # full-size image leaves the two layers misaligned.
    # NOTE(review): the background is the clean image, so flip/rotation maps
    # are still shown over an untransformed picture.
    background = img.resize((mask.shape[1], mask.shape[0]))
    plt.imshow(np.array(background), alpha=0.5)
    plt.imshow(mask, cmap='jet', alpha=0.5)
    plt.title(f"{name}\n{sim_scores[name]:.1f}%")
    plt.axis('off')
plt.suptitle('ViT-B/16 Attention Rollout Similarity Across Perturbations')
plt.tight_layout()
plt.show()

# DeiT Attention Rollout

In [None]:
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from skimage.metrics import structural_similarity as ssim

def compute_attention_rollout(model, img_pil, device):
    """Compute an attention-rollout map for a timm ViT-style model (used here with DeiT).

    Attention probabilities are recomputed in a forward hook on every
    ``blk.attn`` of ``model.blocks``, averaged over heads, combined with the
    identity, row-normalized and multiplied across layers. The first token's
    row (minus its own entry) is reshaped to a square grid and upsampled.

    Args:
        model: model exposing ``blocks`` whose ``attn`` modules provide
            ``qkv``, ``num_heads`` and ``scale``.
        img_pil: PIL image; it is resized to 224x224 here.
        device: torch device the model lives on.

    Returns:
        2-D numpy array of shape (224, 224).

    Raises:
        ValueError: if the number of non-CLS tokens is not a perfect square.
    """
    from timm.data import resolve_data_config  # timm is already a notebook dependency

    # Use the normalization statistics stored in the checkpoint's pretrained
    # config. A hard-coded 0.5/0.5 does not match checkpoints (such as DeiT)
    # that are configured with the ImageNet mean/std.
    data_cfg = resolve_data_config({}, model=model)
    preproc = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(data_cfg['mean'], data_cfg['std']),
    ])
    input_tensor = preproc(img_pil).unsqueeze(0).to(device)

    attn_weights = []
    def get_attn_hook(module, input, output):
        x = input[0]
        B, N, C = x.shape
        qkv = module.qkv(x).reshape(B, N, 3, module.num_heads, C // module.num_heads)
        q, k, v = qkv.permute(2, 0, 3, 1, 4)
        attn = (q @ k.transpose(-2, -1)) * module.scale
        attn = attn.softmax(dim=-1)
        attn_weights.append(attn.detach().cpu())

    hooks = [blk.attn.register_forward_hook(get_attn_hook) for blk in model.blocks]
    try:
        with torch.no_grad():
            _ = model(input_tensor)
    finally:
        # Always detach the hooks, even if the forward pass fails.
        for h in hooks:
            h.remove()

    attn_mat = torch.stack(attn_weights)      # (layers, batch, heads, tokens, tokens)
    attn_mat = attn_mat.squeeze(1).mean(1)    # drop the batch axis, average heads
    num_tokens = attn_mat.shape[-1]
    result = torch.eye(num_tokens)
    for i in range(attn_mat.shape[0]):
        attn = attn_mat[i] + torch.eye(num_tokens)
        attn = attn / attn.sum(dim=-1, keepdim=True)
        result = attn @ result
    mask = result[0, 1:]  # Exclude CLS
    mask_length = mask.shape[0]
    side = int(np.sqrt(mask_length))
    if side * side != mask_length:
        raise ValueError(f"Mask length {mask_length} is not a perfect square. Check patch size and model.")
    mask = mask.reshape(side, side).numpy()
    mask_up = zoom(mask, (224/side, 224/side), order=1)
    return mask_up

def horizontal_flip(img):
    """Mirror the image along its vertical axis."""
    return img.transpose(Image.FLIP_LEFT_RIGHT)


def rotate(img, degrees=30):
    """Rotate the image by ``degrees``."""
    return img.rotate(degrees)


def blur(img, radius=2):
    """Gaussian-blur the image with the given radius."""
    return img.filter(ImageFilter.GaussianBlur(radius))


def brighten(img, factor=1.5):
    """Multiply the image brightness by ``factor``."""
    brightness = ImageEnhance.Brightness(img)
    return brightness.enhance(factor)


def add_gaussian_noise(img, sigma=0.2):
    """Add N(0, sigma) noise on a 0-1 pixel scale, clip to [0, 1] and convert back to uint8."""
    unit = np.array(img) / 255.0
    jitter = np.random.normal(0, sigma, unit.shape)
    clipped = np.clip(unit + jitter, 0, 1)
    return Image.fromarray((clipped * 255).astype('uint8'))


def similarity(a, b):
    """Mean of the element-wise product of the two z-scored maps.

    The 1e-5 added to each standard deviation guards against constant inputs.
    """
    eps = 1e-5
    z_a = (a - np.mean(a)) / (np.std(a) + eps)
    z_b = (b - np.mean(b)) / (np.std(b) + eps)
    product = z_a * z_b
    return np.mean(product)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = timm.create_model('deit_base_patch16_224', pretrained=True).eval().to(device)
img_path = '/kaggle/input/imagenet100/val.X/n01685808/ILSVRC2012_val_00023693.JPEG'
img = Image.open(img_path).convert('RGB')

# Display name -> perturbation callable; insertion order drives the report and plot order.
perturbations = {
    'Clean': lambda x: x,
    'Horizontal Flip': horizontal_flip,
    'Rotation': lambda x: rotate(x, degrees=30),
    'Blur': blur,
    'Brightness': brighten,
    'Gaussian Noise': add_gaussian_noise,
}

# Rollout map per perturbation. The exact image that produced each map is kept
# too (the noise perturbation is random, so it cannot be regenerated later).
results = {}
pert_images = {}
for name, func in perturbations.items():
    pert_img = func(img)
    mask_up = compute_attention_rollout(model, pert_img, device)
    results[name] = mask_up
    pert_images[name] = pert_img

# Correlation-style similarity of every map against the clean map.
clean_mask = results['Clean']
sim_scores = {}
for name, mask in results.items():
    sim_scores[name] = similarity(clean_mask, mask) * 100  # as percentage


print("Attention Rollout Similarity (%)")
print("{:<18} {:>10}".format("Perturbation", "Similarity"))
print("-"*32)
for name in results.keys():
    print("{:<18} {:10.2f}".format(name, sim_scores[name]))

plt.figure(figsize=(18,3))
for i, (name, mask) in enumerate(results.items()):
    plt.subplot(1, len(results), i+1)
    # Draw each map over the perturbed image it was computed from, resized to
    # the map's resolution. Overlaying on the un-resized clean image misaligns
    # the two layers and shows flip/rotation maps on the wrong picture.
    background = pert_images[name].resize((mask.shape[1], mask.shape[0]))
    plt.imshow(np.array(background), alpha=0.5)
    plt.imshow(mask, cmap='jet', alpha=0.5)
    plt.title(f"{name}\n{sim_scores[name]:.1f}%")
    plt.axis('off')
plt.suptitle('DeiT/16 Attention Rollout Similarity Across Perturbations')
plt.tight_layout()
plt.show()

In [None]:
from skimage.metrics import structural_similarity as ssim

def normalize_mask(m):
    """Subtract the minimum, then scale by 1 / (maximum of the shifted map + 1e-5)."""
    zero_based = m - np.min(m)
    scale = np.max(zero_based) + 1e-5
    return zero_based / scale

def ssim_similarity(a, b):
    """Return the SSIM of the two maps after passing each through normalize_mask."""
    a = normalize_mask(a)
    b = normalize_mask(b)
    return ssim(a, b, data_range=1)


# Score every DeiT rollout map against the clean one.
# Both arguments are normalized here and again inside ssim_similarity.
clean_mask = normalize_mask(results['Clean'])
sim_scores = {}
for name, mask in results.items():
    sim_scores[name] = ssim_similarity(clean_mask, normalize_mask(mask)) * 100
    print(f"{name}: {sim_scores[name]:.2f}%")

In [None]:
plt.figure(figsize=(18,3))
for i, (name, mask) in enumerate(results.items()):
    plt.subplot(1, len(results), i+1)
    # Resize the background to the map's resolution; drawing the map over the
    # full-size image leaves the two layers misaligned.
    # NOTE(review): the background is the clean image, so flip/rotation maps
    # are still shown over an untransformed picture.
    background = img.resize((mask.shape[1], mask.shape[0]))
    plt.imshow(np.array(background), alpha=0.5)
    plt.imshow(mask, cmap='jet', alpha=0.5)
    plt.title(f"{name}\n{sim_scores[name]:.1f}%")
    plt.axis('off')
plt.suptitle('DeiT/16 Attention Rollout Similarity Across Perturbations')
plt.tight_layout()
plt.show()

In [None]:
import torch
import timm
import numpy as np
from PIL import Image, ImageEnhance, ImageFilter
from scipy.ndimage import zoom
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from skimage.metrics import structural_similarity as ssim

def horizontal_flip(img):
    """Flip the image left-to-right."""
    return img.transpose(Image.FLIP_LEFT_RIGHT)


def rotate(img, degrees=30):
    """Rotate the image by ``degrees``."""
    return img.rotate(degrees)


def blur(img, radius=2):
    """Apply a Gaussian blur with the given radius."""
    smoothing = ImageFilter.GaussianBlur(radius)
    return img.filter(smoothing)


def brighten(img, factor=1.5):
    """Scale brightness by ``factor``."""
    return ImageEnhance.Brightness(img).enhance(factor)


def add_gaussian_noise(img, sigma=0.2):
    """Perturb the 0-1 scaled pixels with N(0, sigma) noise, clip, and return a uint8 image."""
    base = np.array(img) / 255.0
    noisy = np.clip(base + np.random.normal(0, sigma, base.shape), 0, 1)
    return Image.fromarray((noisy * 255).astype('uint8'))


# Display name -> perturbation callable; insertion order drives the report and plot order.
perturbations = {
    'Clean': (lambda x: x),
    'Horizontal Flip': horizontal_flip,
    'Rotation': (lambda x: rotate(x, degrees=30)),
    'Blur': blur,
    'Brightness': brighten,
    'Gaussian Noise': add_gaussian_noise,
}


def compute_swin_attention_rollout(model, img_pil, device, stage_idx=1):
    """Build a spatial attention map from the window attention of one Swin stage.

    For every block of ``model.layers[stage_idx]`` the softmax attention
    probabilities are captured, averaged over heads and over query tokens
    (giving the attention each key token receives inside its window), put
    back on the stage's feature grid by undoing the window partition and the
    block's cyclic shift, and finally averaged over the blocks of the stage.
    This is a per-stage average of window attention, not a multi-layer
    rollout product.

    Args:
        model: Swin model exposing ``layers[stage_idx].blocks``, each block
            having an ``attn`` module with a ``softmax`` submodule.
        img_pil: PIL image; it is resized to 224x224 here.
        device: torch device the model lives on.
        stage_idx: index of the stage whose attention is visualized.

    Returns:
        2-D numpy array of shape (224, 224), min-max normalized before upsampling.

    Raises:
        RuntimeError: if a block produced no softmax output (for example when
            a fused attention kernel bypasses the softmax module).
        ValueError: if the windows or the window grid are not square.
    """
    from timm.data import resolve_data_config  # timm is already a notebook dependency

    # Normalize with the statistics stored in the checkpoint's pretrained config
    # rather than a hard-coded 0.5/0.5.
    data_cfg = resolve_data_config({}, model=model)
    preproc = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(data_cfg['mean'], data_cfg['std']),
    ])
    input_tensor = preproc(img_pil).unsqueeze(0).to(device)

    blocks = list(model.layers[stage_idx].blocks)
    captured = [[] for _ in blocks]
    hooks = []
    for slot, blk in zip(captured, blocks):
        # Hook the softmax inside the window-attention module: its output is
        # the attention probabilities. The output of blk.attn itself is the
        # attended feature tensor, which carries no token-to-token weights.
        hooks.append(blk.attn.softmax.register_forward_hook(
            lambda m, i, o, slot=slot: slot.append(o.detach().cpu())))
    try:
        with torch.no_grad():
            _ = model(input_tensor)
    finally:
        # Always detach the hooks, even if the forward pass fails.
        for h in hooks:
            h.remove()

    stage_maps = []
    for blk, slot in zip(blocks, captured):
        if not slot:
            raise RuntimeError(
                "No attention probabilities were captured; the window attention "
                "may be using a fused kernel that bypasses its softmax module"
            )
        attn = slot[0]                      # (num_windows, heads, N, N) for a batch of one
        num_windows, _, n_tok, _ = attn.shape
        win = int(round(n_tok ** 0.5))      # window side length in tokens
        grid = int(round(num_windows ** 0.5))  # windows per image side
        if win * win != n_tok or grid * grid != num_windows:
            raise ValueError(
                f"Expected square windows on a square grid, got {num_windows} windows of {n_tok} tokens"
            )
        # Attention received by each key token: mean over heads, then over queries.
        received = attn.mean(dim=1).mean(dim=1)            # (num_windows, N)
        # Undo the window partition: (grid, grid, win, win) -> (grid*win, grid*win).
        fmap = received.reshape(grid, grid, win, win).permute(0, 2, 1, 3)
        fmap = fmap.reshape(grid * win, grid * win)
        # Shifted blocks roll the feature map by -shift before partitioning;
        # roll back so all blocks share the same spatial frame.
        shift = getattr(blk, 'shift_size', 0)
        if isinstance(shift, (tuple, list)):
            shift_h, shift_w = shift
        else:
            shift_h = shift_w = shift
        if shift_h or shift_w:
            fmap = torch.roll(fmap, shifts=(int(shift_h), int(shift_w)), dims=(0, 1))
        stage_maps.append(fmap)

    mask = torch.stack(stage_maps).mean(0).numpy()
    side = mask.shape[0]
    # Min-max normalization
    mask = (mask - mask.min()) / (mask.max() - mask.min() + 1e-8)
    mask_up = zoom(mask, (224/side, 224/side), order=1)
    return mask_up


def normalize_mask(mask):
    """Min-max scale the map: subtract the minimum, divide by (range + 1e-6)."""
    shifted = mask - np.min(mask)
    span = np.max(shifted) - np.min(shifted)
    return shifted / (span + 1e-6)

def similarity(a, b):
    """SSIM between the two maps after min-max normalizing each with normalize_mask."""
    norm_a, norm_b = normalize_mask(a), normalize_mask(b)
    return ssim(norm_a, norm_b, data_range=1.0)

# Load Swin-B in inference mode and a 224x224 copy of the test image.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = timm.create_model('swin_base_patch4_window7_224', pretrained=True).eval().to(device)
img_path = '/kaggle/input/imagenet100/val.X/n01685808/ILSVRC2012_val_00023693.JPEG'
img = Image.open(img_path).convert('RGB').resize((224, 224))

# One map per perturbation, keyed by display name.
results = {}
for name, func in perturbations.items():
    pert_img = func(img)
    mask_up = compute_swin_attention_rollout(model, pert_img, device, stage_idx=1)  # use stage_idx=1 for 28x28, or 0 for 56x56
    results[name] = mask_up


# SSIM (in percent) of every map against the clean map.
clean_mask = results['Clean']
sim_scores = {}
for name, mask in results.items():
    sim_scores[name] = similarity(clean_mask, mask) * 100

print("Swin-B Attention Rollout Similarity (%)")
print("{:<18} {:>10}".format("Perturbation", "Similarity"))
print("-"*32)
for name in results.keys():
    print("{:<18} {:10.2f}".format(name, sim_scores[name]))


# NOTE(review): every map is drawn over the clean image, so the flip and
# rotation panels show a map computed on a transformed input over an
# untransformed picture.
plt.figure(figsize=(18,3))
for i, (name, mask) in enumerate(results.items()):
    plt.subplot(1, len(results), i+1)
    plt.imshow(np.array(img), alpha=0.5)
    plt.imshow(mask, cmap='jet', alpha=0.5, vmin=0, vmax=1)
    plt.title(f"{name}\n{sim_scores[name]:.1f}%")
    plt.axis('off')
plt.suptitle('Swin-B Attention Rollout Similarity Across Perturbations (stage 1, 28x28)', fontsize=14)
plt.tight_layout()
plt.show()

**Applying on DINOV2**

In [None]:
!pip install dinov2

In [None]:
!pip install grad-cam --q

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image, ImageEnhance, ImageFilter
import timm
import torchvision.transforms as transforms
from skimage.metrics import structural_similarity as ssim
from scipy.ndimage import zoom
import torch


def horizontal_flip(img):
    """Return a left-right mirrored copy of the image."""
    return img.transpose(Image.FLIP_LEFT_RIGHT)


def rotate(img, degrees=30):
    """Return the image rotated by ``degrees``."""
    return img.rotate(degrees)


def blur(img, radius=2):
    """Return a Gaussian-blurred copy of the image."""
    return img.filter(ImageFilter.GaussianBlur(radius))


def brighten(img, factor=1.5):
    """Return the image with brightness multiplied by ``factor``."""
    adjuster = ImageEnhance.Brightness(img)
    return adjuster.enhance(factor)


def add_gaussian_noise(img, sigma=0.2):
    """Return the image with clipped N(0, sigma) noise added on a 0-1 pixel scale."""
    normalized = np.array(img) / 255.0
    noise = np.random.normal(0, sigma, normalized.shape)
    bounded = np.clip(normalized + noise, 0, 1)
    return Image.fromarray((bounded * 255).astype('uint8'))


# Display name -> perturbation callable; insertion order drives the report and plot order.
perturbations = {
    'Clean': (lambda x: x),
    'Horizontal Flip': horizontal_flip,
    'Rotation': (lambda x: rotate(x, degrees=30)),
    'Blur': blur,
    'Brightness': brighten,
    'Gaussian Noise': add_gaussian_noise,
}


def compute_vit_attention_rollout(model, img_pil, device):
    """Compute an attention-rollout map for a timm ViT-style model.

    Attention probabilities are recomputed in a forward hook on every
    ``blk.attn`` of ``model.blocks``, averaged over heads, combined with the
    identity, row-normalized and multiplied across layers. The first token's
    row (minus its own entry) is reshaped to a square grid and upsampled.

    Args:
        model: model exposing ``blocks`` whose ``attn`` modules provide
            ``qkv``, ``num_heads`` and ``scale``.
        img_pil: PIL image; it is resized to 224x224 here.
        device: torch device the model lives on.

    Returns:
        2-D numpy array of shape (224, 224).

    Raises:
        ValueError: if the number of non-CLS tokens is not a perfect square.
    """
    from timm.data import resolve_data_config  # timm is already a notebook dependency

    # Use the normalization statistics stored in the checkpoint's pretrained
    # config. A hard-coded 0.5/0.5 does not match checkpoints that are
    # configured with the ImageNet mean/std.
    data_cfg = resolve_data_config({}, model=model)
    preproc = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(data_cfg['mean'], data_cfg['std']),
    ])
    input_tensor = preproc(img_pil).unsqueeze(0).to(device)

    attn_weights = []
    def get_attn_hook(module, input, output):
        x = input[0]  
        B, N, C = x.shape
        qkv = module.qkv(x).reshape(B, N, 3, module.num_heads, C // module.num_heads)
        q, k, v = qkv.permute(2, 0, 3, 1, 4)  
        attn = (q @ k.transpose(-2, -1)) * module.scale  
        attn = attn.softmax(dim=-1)
        attn_weights.append(attn.detach().cpu())

    hooks = [blk.attn.register_forward_hook(get_attn_hook) for blk in model.blocks]
    try:
        with torch.no_grad():
            _ = model(input_tensor)
    finally:
        # Always detach the hooks, even if the forward pass fails.
        for h in hooks:
            h.remove()

    # Each captured tensor has a batch axis of size 1, so concatenating along
    # it yields one (heads, tokens, tokens) slice per layer.
    attn_mat = torch.cat(attn_weights, dim=0)  
    attn_mat = attn_mat.mean(1)                # average over heads
    num_tokens = attn_mat.shape[-1]
    result = torch.eye(num_tokens)
    for i in range(attn_mat.shape[0]):
        attn = attn_mat[i] + torch.eye(num_tokens)
        attn = attn / attn.sum(dim=-1, keepdim=True)
        result = attn @ result

    mask = result[0, 1:]
    mask_length = mask.shape[0]
    side = int(np.sqrt(mask_length))
    if side * side != mask_length:
        raise ValueError(f"Mask length {mask_length} is not a perfect square. Got {mask_length}")
    mask = mask.reshape(side, side).numpy()
    mask_up = zoom(mask, (224/side, 224/side), order=1)
    return mask_up

def normalize_mask(mask):
    """Min-max scale the map: subtract the minimum, divide by (range + 1e-6)."""
    zero_based = mask - np.min(mask)
    value_range = np.max(zero_based) - np.min(zero_based)
    denominator = value_range + 1e-6
    return zero_based / denominator

def ssim_similarity(a, b):
    """SSIM between the two maps after min-max normalizing each with normalize_mask."""
    first = normalize_mask(a)
    second = normalize_mask(b)
    return ssim(first, second, data_range=1.0)

# Load the DINO-pretrained ViT-B/16 in inference mode and a 224x224 copy of the test image.
# NOTE(review): the checkpoint name says "dino", while the labels printed
# below say "DINOv2" — confirm which model is intended.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = timm.create_model('vit_base_patch16_224_dino', pretrained=True).eval().to(device)
img_path = '/kaggle/input/imagenet100/val.X/n01685808/ILSVRC2012_val_00023693.JPEG'
img = Image.open(img_path).convert('RGB').resize((224, 224))

# One rollout map per perturbation, keyed by display name.
results = {}
for name, func in perturbations.items():
    pert_img = func(img)
    mask_up = compute_vit_attention_rollout(model, pert_img, device)
    results[name] = mask_up

# Similarity scores (SSIM)
clean_mask = results['Clean']
sim_scores = {}
for name, mask in results.items():
    sim_scores[name] = ssim_similarity(clean_mask, mask) * 100

print("ViT-B/16 DINOv2  Attention Rollout Similarity (%)")
print("{:<18} {:>10}".format("Perturbation", "Similarity"))
print("-"*32)
for name in results.keys():
    print("{:<18} {:10.2f}".format(name, sim_scores[name]))

# NOTE(review): every map is drawn over the clean image, so the flip and
# rotation panels show a map computed on a transformed input over an
# untransformed picture.
plt.figure(figsize=(18,3))
for i, (name, mask) in enumerate(results.items()):
    plt.subplot(1, len(results), i+1)
    plt.imshow(np.array(img), alpha=0.5)
    plt.imshow(mask, cmap='jet', alpha=0.5)
    plt.title(f"{name}\n{sim_scores[name]:.1f}%")
    plt.axis('off')
plt.suptitle('ViT-B/16 DINOv2  Attention Rollout Similarity Across Perturbations')
plt.tight_layout()
plt.show()

In [None]:
import torch, timm, numpy as np, matplotlib.pyplot as plt
from PIL import Image, ImageFilter, ImageEnhance
from torchvision import transforms
from pytorch_grad_cam import GradCAM
from pytorch_grad_cam.utils.image import show_cam_on_image
from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget
from skimage.metrics import structural_similarity as ssim


# Test image used for every perturbation below.
img_path = '/kaggle/input/imagenet100/val.X/n01685808/ILSVRC2012_val_00023693.JPEG'
img = Image.open(img_path).convert("RGB")


def hflip(x):
    """Mirror the PIL image left-to-right."""
    return x.transpose(Image.FLIP_LEFT_RIGHT)


def rot(x, d=30):
    """Rotate the PIL image by ``d`` degrees."""
    return x.rotate(d)


def gblur(x, r=2):
    """Blur the PIL image with a Gaussian kernel of radius ``r``."""
    return x.filter(ImageFilter.GaussianBlur(r))


def bright(x, f=1.5):
    """Scale the image brightness by factor ``f``."""
    return ImageEnhance.Brightness(x).enhance(f)


def gnoise(x, sigma=0.2):
    """Add zero-mean Gaussian noise (std ``sigma`` on a 0-1 scale) and clip to the valid range."""
    unit = np.array(x) / 255.
    unit = np.clip(unit + np.random.normal(0, sigma, unit.shape), 0, 1)
    return Image.fromarray((unit * 255).astype('uint8'))


# Display name -> perturbation callable; insertion order drives the report and plot order.
perturbs = {
    "Clean": lambda x: x,
    "Horizontal Flip": hflip,
    "Rotation": rot,
    "Blur": gblur,
    "Brightness": bright,
    "Gaussian Noise": gnoise,
}


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): presumably this self-supervised checkpoint ships without a
# classification head; if so, argmax over the model output later in this cell
# indexes an embedding dimension rather than a class — confirm before reading
# the "Top 1 Match" column as accuracy.
model  = timm.create_model('vit_base_patch16_224_dino', pretrained=True).eval().to(device)
# Grad-CAM target layer: norm1 of the last transformer block.
t_layer = model.blocks[-1].norm1          # final block norm

def vit_reshape(t, h=14, w=14):
    """Drop the first (CLS) token and lay the remaining tokens out as a (B, C, h, w) map."""
    patches = t[:, 1:, :]  # everything after the CLS token
    batch, _, channels = patches.shape
    channels_first = patches.permute(0, 2, 1)
    return channels_first.reshape(batch, channels, h, w)

# Take the normalization statistics from the checkpoint's pretrained config
# instead of hard-coding 0.5/0.5: the DINO weights are configured with the
# ImageNet mean/std, so 0.5-normalized inputs do not match the distribution
# the model was trained on.
from timm.data import resolve_data_config

_data_cfg = resolve_data_config({}, model=model)
prep = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(_data_cfg['mean'], _data_cfg['std'])
])

# Preprocess the clean image and take the argmax of the model output on it as
# the reference index (no dataset ground truth is consulted here).
clean_t = prep(img).unsqueeze(0).to(device)
with torch.no_grad(): gt_lbl = model(clean_t).argmax(1).item()

# NOTE(review): the labels say "DINOv2" while the checkpoint name says "dino" — confirm.
print("Evaluating Model: ViT-B/16 DINOv2")
print(f"{'Perturbation':<16} {'Top 1 Match':>5} {'CAM Similarity':>6}")

# Reference CAM for the clean image, rescaled so that its maximum is 1.
cam_engine = GradCAM(model=model, target_layers=[t_layer], reshape_transform=vit_reshape)
clean_cam  = cam_engine(clean_t, targets=[ClassifierOutputTarget(gt_lbl)])[0]
clean_gray = clean_cam / clean_cam.max()

overlays=[]
for name,func in perturbs.items():
    p_img = func(img).resize((224,224))
    tensor= prep(p_img).unsqueeze(0).to(device)

    # accuracy
    # (1 when the argmax on the perturbed image equals the clean argmax)
    with torch.no_grad(): pred = model(tensor).argmax(1).item()
    acc = int(pred==gt_lbl)

    # CAM
    # (computed for the index predicted on the perturbed image, not gt_lbl)
    cam = cam_engine(tensor, targets=[ClassifierOutputTarget(pred)])[0]
    cam_norm = cam/cam.max()

    # similarity w.r.t. clean CAM
    # (geometric perturbations are not undone before the comparison)
    sim = ssim(clean_gray, cam_norm, data_range=1.0)*100

    print(f"{name:<16} {acc:>5} {sim:>6.1f}")
    overlays.append(show_cam_on_image(np.array(p_img)/255., cam, use_rgb=True))


# One panel per perturbation, in dictionary order.
plt.figure(figsize=(18,3))
for i,(nm,ov) in enumerate(zip(perturbs,overlays)):
    ax=plt.subplot(1,len(overlays),i+1); ax.imshow(ov); ax.set_title(nm,fontsize=9); ax.axis('off')
plt.suptitle("ViT-B/16 DINOv2  Grad-CAM overlays",y=1.05,fontsize=13); plt.tight_layout(); plt.show()

In [None]:
# Show the timm model names that contain "dino" (displayed as the cell output).
timm.list_models('*dino*')

# Chefer et al.'s method (Transformer Attribution) on CNN and Transformers

In [None]:
!pip install torch torchvision timm scikit-image einops matplotlib imageio pillow tqdm

In [None]:
# Source repository and the local checkout location.
REPO_URL = "https://github.com/hila-chefer/Transformer-Explainability.git"
ROOT     = "/kaggle/working/Transformer-Explainability"


import subprocess, pathlib, importlib.util, sys, os, types, textwrap
# ROOT is rebound from a string to a Path from here on.
ROOT = pathlib.Path(ROOT)
# Clone only when the checkout is missing; the "baselines" directory is used as the marker.
# check=True makes a failed clone raise instead of continuing silently.
if not (ROOT / "baselines").is_dir():
    subprocess.run(["git", "clone", REPO_URL, str(ROOT)], check=True)
print("Repo ready:", ROOT)

In [None]:
import sys
sys.path.append('/kaggle/working/Transformer-Explainability')  

import torch
from PIL import Image, ImageEnhance, ImageFilter
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
from skimage.metrics import structural_similarity as ssim
from scipy.ndimage import zoom
import numpy as np
import cv2

from baselines.ViT.ViT_LRP import vit_base_patch16_224 as vit_LRP
from baselines.ViT.ViT_explanation_generator import LRP


def horizontal_flip(img):
    """Mirror the image left-to-right."""
    return img.transpose(Image.FLIP_LEFT_RIGHT)


def rotate(img, degrees=30):
    """Rotate the image by ``degrees``."""
    return img.rotate(degrees)


def blur(img, radius=2):
    """Gaussian-blur the image with the given radius."""
    gaussian = ImageFilter.GaussianBlur(radius)
    return img.filter(gaussian)


def brighten(img, factor=1.5):
    """Scale the image brightness by ``factor``."""
    return ImageEnhance.Brightness(img).enhance(factor)


def add_gaussian_noise(img, sigma=0.2):
    """Add clipped N(0, sigma) noise on a 0-1 pixel scale and return a uint8 image."""
    values = np.array(img) / 255.0
    values = np.clip(values + np.random.normal(0, sigma, values.shape), 0, 1)
    return Image.fromarray((values * 255).astype('uint8'))


# Display name -> perturbation callable; insertion order drives the report and plot order.
perturbations = {
    'Clean': (lambda x: x),
    'Horizontal Flip': horizontal_flip,
    'Rotation': (lambda x: rotate(x, degrees=30)),
    'Blur': blur,
    'Brightness': brighten,
    'Gaussian Noise': add_gaussian_noise,
}


# Input pipeline: resize the shorter side to 224, center-crop 224x224, convert
# to a tensor and normalize each channel with mean 0.5 / std 0.5.
normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
transform = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])


# LRP-enabled ViT-B/16 from the cloned repository, wrapped in its attribution generator.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = vit_LRP(pretrained=True).to(device).eval()
attribution_generator = LRP(model)


def show_cam_on_image(img, mask):
    """Blend a JET heatmap of ``mask`` onto an RGB image.

    Args:
        img: RGB image as a float array in [0, 1], shape (H, W, 3).
        mask: 2-D array in [0, 1] with the same height and width as ``img``.

    Returns:
        Float array of shape (H, W, 3) in RGB order, scaled so its maximum is 1.
    """
    heatmap = cv2.applyColorMap(np.uint8(255 * mask), cv2.COLORMAP_JET)
    # OpenCV returns the colour map in BGR order, while `img` is RGB and the
    # result is displayed with matplotlib. Convert so that high attribution
    # appears red rather than blue.
    heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
    heatmap = np.float32(heatmap) / 255
    cam = heatmap + np.float32(img)
    cam = cam / np.max(cam)
    return cam


def get_chefer_attr(img_pil):
    """Return the transformer-attribution relevance map for a PIL image.

    The image goes through the module-level ``transform`` and the
    ``attribution_generator`` (method "transformer_attribution", explaining
    the predicted class because ``index=None``). The relevance is min-max
    normalized and upsampled to 224x224.

    Raises:
        RuntimeError: if the attribution output is neither 2-D nor 3-D.
    """
    img_tensor = transform(img_pil).unsqueeze(0).to(device)
    attr_map = attribution_generator.generate_LRP(
        img_tensor, method="transformer_attribution", index=None
    ).detach().cpu().numpy()
    # Robust to shape (1, 196), (1, 14, 14), (14, 14)
    if attr_map.ndim == 3:
        if attr_map.shape[-1] == 196:
            mask = attr_map.reshape(14, 14)
        else:
            mask = attr_map[0]
    elif attr_map.ndim == 2 and attr_map.shape[-1] == 196:
        mask = attr_map.reshape(14, 14)
    elif attr_map.ndim == 2:
        mask = attr_map
    else:
        raise RuntimeError(f"Unexpected attr_map shape: {attr_map.shape}")
    mask = (mask - mask.min()) / (mask.max() - mask.min() + 1e-6)
    # Derive the zoom factors from the mask itself: the branches above can
    # yield grids other than 14x14, which a fixed 224/14 factor would not
    # bring to 224x224.
    mask = zoom(mask, (224 / mask.shape[0], 224 / mask.shape[1]), order=1)
    return mask

def ssim_similarity(a, b):
    """SSIM between the two maps after min-max normalizing each (1e-6 added to the range)."""
    def _minmax(arr):
        return (arr - np.min(arr)) / (np.max(arr) - np.min(arr) + 1e-6)

    return ssim(_minmax(a), _minmax(b), data_range=1.0)

# Test image, resized up front so that the overlays below match the 224x224 maps.
img_path = '/kaggle/input/imagenet100/val.X/n01685808/ILSVRC2012_val_00023693.JPEG'
img = Image.open(img_path).convert('RGB').resize((224, 224))

# For each perturbation keep both the attribution map and the image it was computed on.
results = {}
for name, func in perturbations.items():
    pert_img = func(img)
    mask = get_chefer_attr(pert_img)
    results[name] = {'mask': mask, 'img': pert_img}

# SSIM (in percent) of every map against the clean map.
clean_mask = results['Clean']['mask']
sim_scores = {}
for name, res in results.items():
    sim_scores[name] = ssim_similarity(clean_mask, res['mask']) * 100


print("Chefer Attribution ViT (SSIM Similarity %)")
print("{:<18} {:>10}".format("Perturbation", "Similarity"))
print("-"*32)
for name in results.keys():
    print("{:<18} {:10.2f}".format(name, sim_scores[name]))


# Each panel overlays the map on the perturbed image it belongs to.
plt.figure(figsize=(18, 3))
for i, (name, res) in enumerate(results.items()):
    img_np = np.array(res['img']) / 255.0
    mask = res['mask']
    overlay = show_cam_on_image(img_np, mask)
    plt.subplot(1, len(results), i+1)
    plt.imshow(overlay)
    plt.title(f"{name}\n{sim_scores[name]:.1f}%")
    plt.axis('off')
plt.suptitle('Chefer Attribution ViT Similarity & Overlays')
plt.tight_layout()
plt.show()

In [None]:
import sys
sys.path.append('/kaggle/working/Transformer-Explainability')  

import torch
from PIL import Image, ImageEnhance, ImageFilter
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
from skimage.metrics import structural_similarity as ssim
from scipy.ndimage import zoom
import numpy as np
import cv2

from baselines.ViT.ViT_LRP import deit_base_patch16_224 as deit_LRP
from baselines.ViT.ViT_explanation_generator import LRP


def horizontal_flip(img):
    """Return the image flipped left-to-right."""
    return img.transpose(Image.FLIP_LEFT_RIGHT)


def rotate(img, degrees=30):
    """Return the image rotated by ``degrees``."""
    return img.rotate(degrees)


def blur(img, radius=2):
    """Return the image blurred with a Gaussian kernel of the given radius."""
    return img.filter(ImageFilter.GaussianBlur(radius))


def brighten(img, factor=1.5):
    """Return the image with brightness scaled by ``factor``."""
    brightness = ImageEnhance.Brightness(img)
    return brightness.enhance(factor)


def add_gaussian_noise(img, sigma=0.2):
    """Return the image with N(0, sigma) noise added on a 0-1 pixel scale, clipped to [0, 1]."""
    scaled = np.array(img) / 255.0
    noise = np.random.normal(0, sigma, scaled.shape)
    result = np.clip(scaled + noise, 0, 1) * 255
    return Image.fromarray(result.astype('uint8'))


# Display name -> perturbation callable; insertion order drives the report and plot order.
perturbations = {
    'Clean': (lambda x: x),
    'Horizontal Flip': horizontal_flip,
    'Rotation': (lambda x: rotate(x, degrees=30)),
    'Blur': blur,
    'Brightness': brighten,
    'Gaussian Noise': add_gaussian_noise,
}


# DeiT checkpoints are trained on inputs normalized with the ImageNet channel
# statistics. The 0.5/0.5 values used for the ViT cell do not apply here and
# would shift the input distribution the model sees.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# Input pipeline: resize the shorter side to 224, center-crop 224x224, to tensor, normalize.
transform = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])


# LRP-enabled DeiT-B/16 from the cloned repository, wrapped in its attribution generator.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = deit_LRP(pretrained=True).to(device).eval()
attribution_generator = LRP(model)


def show_cam_on_image(img, mask):
    """Blend a JET heatmap of ``mask`` onto an RGB image.

    Args:
        img: RGB image as a float array in [0, 1], shape (H, W, 3).
        mask: 2-D array in [0, 1] with the same height and width as ``img``.

    Returns:
        Float array of shape (H, W, 3) in RGB order, scaled so its maximum is 1.
    """
    heatmap = cv2.applyColorMap(np.uint8(255 * mask), cv2.COLORMAP_JET)
    # OpenCV returns the colour map in BGR order, while `img` is RGB and the
    # result is displayed with matplotlib. Convert so that high attribution
    # appears red rather than blue.
    heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
    heatmap = np.float32(heatmap) / 255
    cam = heatmap + np.float32(img)
    cam = cam / np.max(cam)
    return cam


def get_chefer_attr(img_pil):
    """Return the transformer-attribution relevance map for a PIL image.

    The image goes through the module-level ``transform`` and the
    ``attribution_generator`` (method "transformer_attribution", explaining
    the predicted class because ``index=None``). The relevance is min-max
    normalized and upsampled to 224x224.

    Raises:
        RuntimeError: if the attribution output is neither 2-D nor 3-D.
    """
    img_tensor = transform(img_pil).unsqueeze(0).to(device)
    attr_map = attribution_generator.generate_LRP(
        img_tensor, method="transformer_attribution", index=None
    ).detach().cpu().numpy()
    # Robust to shape (1, 196), (1, 14, 14), (14, 14)
    if attr_map.ndim == 3:
        if attr_map.shape[-1] == 196:
            mask = attr_map.reshape(14, 14)
        else:
            mask = attr_map[0]
    elif attr_map.ndim == 2 and attr_map.shape[-1] == 196:
        mask = attr_map.reshape(14, 14)
    elif attr_map.ndim == 2:
        mask = attr_map
    else:
        raise RuntimeError(f"Unexpected attr_map shape: {attr_map.shape}")
    mask = (mask - mask.min()) / (mask.max() - mask.min() + 1e-6)
    # Derive the zoom factors from the mask itself: the branches above can
    # yield grids other than 14x14, which a fixed 224/14 factor would not
    # bring to 224x224.
    mask = zoom(mask, (224 / mask.shape[0], 224 / mask.shape[1]), order=1)
    return mask

def ssim_similarity(a, b):
    """Return the SSIM of two maps after min-max scaling each one.

    Both inputs are independently rescaled with
    (x - min) / (max - min + 1e-6) and then compared with `ssim` using
    `data_range=1.0`.
    """
    rescaled = []
    for arr in (a, b):
        lo = np.min(arr)
        hi = np.max(arr)
        rescaled.append((arr - lo) / (hi - lo + 1e-6))
    first, second = rescaled
    return ssim(first, second, data_range=1.0)

# Load the evaluation image as RGB and bring it to 224x224 before any
# perturbation is applied.
img_path = '/kaggle/input/imagenet100/val.X/n01685808/ILSVRC2012_val_00023693.JPEG'
img = Image.open(img_path)
img = img.convert('RGB')
img = img.resize((224, 224))

# For every perturbation keep the perturbed image together with the
# attribution mask computed on it.
results = {}
for name, func in perturbations.items():
    pert_img = func(img)
    mask = get_chefer_attr(pert_img)
    results[name] = dict(mask=mask, img=pert_img)


# SSIM (as a percentage) between the clean attribution mask and the mask
# of every perturbation, the clean one included.
clean_mask = results['Clean']['mask']
sim_scores = {
    name: ssim_similarity(clean_mask, res['mask']) * 100
    for name, res in results.items()
}


# Text report of the similarity scores. The header names DeiT so that it
# agrees with the model built from `deit_LRP` and with the figure title
# used for the same results.
print("Chefer Attribution DeiT (SSIM Similarity %)")
print(f"{'Perturbation':<18} {'Similarity':>10}")
print("-" * 32)
for name in results:
    print(f"{name:<18} {sim_scores[name]:10.2f}")


# One row of panels: each perturbed image with its attribution overlay and
# the similarity score in the panel title.
plt.figure(figsize=(18, 3))
for i, (name, res) in enumerate(results.items()):
    plt.subplot(1, len(results), i + 1)
    img_np = np.array(res['img']) / 255.0
    mask = res['mask']
    overlay = show_cam_on_image(img_np, mask)
    plt.imshow(overlay)
    plt.title(f"{name}\n{sim_scores[name]:.1f}%")
    plt.axis('off')
plt.suptitle('Chefer Attribution DeiT Similarity & Overlays')
plt.tight_layout()
plt.show()

In [None]:
import os

# Print every function (`def `) and class (`class `) definition line found
# in ViT_LRP.py, with surrounding whitespace stripped.
with open('/kaggle/working/Transformer-Explainability/baselines/ViT/ViT_LRP.py', 'r') as f:
    for line in f:
        stripped = line.strip()
        if stripped.startswith(('def ', 'class ')):
            print(stripped)

# ScoreCAM, LayerCAM, and EigenCAM on CNNs

In [None]:
!pip  install grad-cam torchcam --quiet

In [None]:
import timm
import torch
from PIL import Image, ImageEnhance, ImageFilter
import numpy as np
import matplotlib.pyplot as plt
from skimage.metrics import structural_similarity as ssim
from pytorch_grad_cam import ScoreCAM, LayerCAM, EigenCAM
from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget
from pytorch_grad_cam.utils.image import show_cam_on_image


# One entry per CNN: (display name, timm model id, picker that returns the
# layer the CAM methods attach to, given the instantiated model).
models_and_layers = [
    ('ResNet-50', 'resnet50', lambda net: net.layer4[-1]),
    ('ResNet-101', 'resnet101', lambda net: net.layer4[-1]),
    ('DenseNet-121', 'densenet121', lambda net: net.features[-1]),
    ('EfficientNet-B3', 'efficientnet_b3', lambda net: net.blocks[-1]),
    ('MobileNetV3-L', 'mobilenetv3_large_100', lambda net: net.blocks[-1]),
    ('ConvNeXt-Base', 'convnext_base', lambda net: net.stages[-1].blocks[-1]),
    ('ResNeXt50', 'resnext50_32x4d', lambda net: net.layer4[-1]),
    ('Inception V3', 'inception_v3', lambda net: net.Mixed_7c),
]


def horizontal_flip(img):
    """Return `img` mirrored left-to-right."""
    mirrored = img.transpose(Image.FLIP_LEFT_RIGHT)
    return mirrored
def rotate(img, degrees=30):
    """Return the result of `img.rotate(degrees)` (default: 30 degrees)."""
    rotated = img.rotate(degrees)
    return rotated
def blur(img, radius=2):
    """Return `img` after applying a Gaussian blur filter of `radius`."""
    blur_filter = ImageFilter.GaussianBlur(radius)
    return img.filter(blur_filter)
def brighten(img, factor=1.5):
    """Return `img` with its brightness enhanced by `factor`."""
    brightness = ImageEnhance.Brightness(img)
    return brightness.enhance(factor)
def add_gaussian_noise(img, sigma=0.2, rng=None):
    """Return a new image equal to `img` plus zero-mean Gaussian noise.

    Pixel values are scaled to [0, 1], perturbed, clipped back to [0, 1]
    and converted to 8-bit again.

    Args:
        img: image convertible with `np.array` (callers pass PIL images).
        sigma: standard deviation of the noise on the [0, 1] scale.
        rng: optional `np.random.Generator` (or any object exposing a
            compatible `normal` method). Pass a seeded generator to make
            the perturbation reproducible. When None, NumPy's global
            random state is used, exactly as before.

    Returns:
        A PIL image built from the noisy uint8 array.
    """
    # Fall back to the module-level sampler so existing callers (and any
    # np.random.seed they rely on) keep their behaviour.
    sampler = np.random if rng is None else rng
    arr = np.array(img) / 255.0
    noise = sampler.normal(0, sigma, arr.shape)
    noisy = np.clip(arr + noise, 0, 1)
    return Image.fromarray((noisy * 255).astype('uint8'))

# Perturbation registry used by the CAM evaluation: display name ->
# callable mapping an image to its perturbed version. Insertion order is
# the order in which results are reported and plotted.
perturbations = {
    'Clean': lambda image: image,  # identity baseline
    'Horizontal Flip': horizontal_flip,
    'Rotation': lambda image: rotate(image, degrees=30),
    'Blur': blur,  # called with its default radius
    'Brightness': brighten,  # called with its default factor
    'Gaussian Noise': add_gaussian_noise,  # called with its default sigma
}


# CNN input pipeline: force a square input_size x input_size image, convert
# to a tensor and normalise with the mean/std given below.
input_size = 224
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
preprocess = transforms.Compose(
    [
        transforms.Resize((input_size,) * 2),
        transforms.ToTensor(),
        normalize,
    ]
)

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


# Load the evaluation image as RGB. It is kept at its native resolution
# here; resizing happens later, after each perturbation has been applied.
img_path = "/kaggle/input/imagenet100/val.X/n01685808/ILSVRC2012_val_00023693.JPEG"
img = Image.open(img_path)
img = img.convert('RGB')


# CAM variants under evaluation: display name -> CAM class.
cam_methods = dict(
    ScoreCAM=ScoreCAM,
    LayerCAM=LayerCAM,
    EigenCAM=EigenCAM,
)

# Evaluate every CAM method on every CNN. For each (method, model) pair the
# CAM mask is computed on the clean image and on each perturbed version,
# the SSIM of every mask against the clean mask is printed, and the
# overlays are plotted.

# Pretrained models keyed by timm id. Building a model does not depend on
# the CAM method, so each one is created once and reused instead of being
# rebuilt for every method.
_cnn_cache = {}

for cam_name, cam_class in cam_methods.items():
    print(f"\n===== {cam_name} =====")
    for print_name, timm_name, layer_func in models_and_layers:
        print(f"\n---- {print_name} ----")
        if timm_name not in _cnn_cache:
            _cnn_cache[timm_name] = timm.create_model(timm_name, pretrained=True).eval().to(device)
        model = _cnn_cache[timm_name]
        target_layer = layer_func(model)

        results, overlays = {}, {}

        # Used as a context manager, the CAM object removes the hooks it
        # registered on `target_layer` when the block exits, so the shared
        # model is clean for the next CAM method.
        with cam_class(model, target_layers=[target_layer]) as cam:
            for pert_name, pert_func in perturbations.items():
                pert_img = pert_func(img)
                img_tensor = preprocess(pert_img).unsqueeze(0).to(device)
                # Explain the class predicted for this (possibly perturbed) input.
                with torch.no_grad():
                    output = model(img_tensor)
                    class_idx = output.argmax().item()

                mask = cam(input_tensor=img_tensor, targets=[ClassifierOutputTarget(class_idx)])[0]
                # Min-max scale so masks are comparable with data_range=1.0.
                mask = (mask - mask.min()) / (mask.max() - mask.min() + 1e-6)
                results[pert_name] = mask

                img_np = np.array(pert_img.resize((input_size, input_size))) / 255.0
                overlay = show_cam_on_image(img_np, mask, use_rgb=True)
                overlays[pert_name] = overlay

        # Similarity (in percent) of every mask to the clean mask.
        clean_mask = results['Clean']
        sim_scores = {n: ssim(clean_mask, m, data_range=1.0) * 100 for n, m in results.items()}
        print("{:<18} {:>10}".format("Perturbation", "Similarity"))
        print("-"*32)
        for n in results:
            print("{:<18} {:10.2f}".format(n, sim_scores[n]))

        # One row of overlays, one panel per perturbation.
        plt.figure(figsize=(18, 3))
        for i, n in enumerate(results):
            plt.subplot(1, len(results), i+1)
            plt.imshow(overlays[n])
            plt.title(f"{n}\n{sim_scores[n]:.1f}%")
            plt.axis('off')
        plt.suptitle(f"{print_name} - {cam_name} Similarity & Overlays")
        plt.tight_layout()
        plt.show()