# Part III-Eval: Explanation-Aware Backdoors against Gradient-based Explanations

This script evaluates the poisoned model.

##### Metrics:
- Clean Accuracy
- Attack Success Rate
- Explanation Dissimilarity

Finally, plots are created for visualization.

##### Results:
- ACC = 93.88%
- ASR = 99.98%
- d_orig = 0.0017160234
- d_trigger = 0.0018715919

## 0. Environment Setup

The resources for this session will be shared with you via Google Drive. To access the data, follow these steps:

1. Log in to your Google account
2. Open the shared link to access the folder
3. The folder should appear under the Shared with me section
4. Additionally, create a folder named `SharedImports` in your Drive
5. Right-click the shared folder and select Organize > Add shortcut
6. In the pop-up window that appears, select SharedImports and click Add
7. Finally, execute the cells below to give the Colab notebook access to Google Drive

In [1]:
# Make Google Drive available inside the Colab runtime so that the shared
# course folder can be read from the mount point below.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
import os
import sys

# Name of the Drive folder that contains the shortcut to the shared material.
shortcut_name = "SharedImports"

# Root of this session's material inside the mounted Drive, and its data folder.
repo_path = "/content/drive/MyDrive/{}/AISEC-SummerSchool-2025/XAI for Security/part3_xaisec".format(shortcut_name)
data_path = os.path.join(repo_path, "data")

# Make the helper sources importable, both from the Drive copy and from a
# local checkout located next to this notebook.
sys.path.extend([repo_path + "/src", "../src"])

#### Run these cells to install and load necessary packages

We start by importing a few libraries, including the summer school `xaisec_utils` package, which abstracts away a few crucial steps.

In [3]:
import copy

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torchvision import models

from xaisec_utils import *

## 1. Let's start for real


In [None]:
# -----------------------------
# Load CIFAR-10 (test split)
# -----------------------------
# Preprocessing applied to every image: resize to 224, convert to a tensor,
# then normalize each channel. Despite its name, `transform_train` is only
# used for the test split in this cell.
transform_train = transforms.Compose(
    [
        transforms.Resize(224),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Test split only; it is downloaded into `data_path` if it is not there yet.
testset = torchvision.datasets.CIFAR10(
    root=data_path,
    train=False,
    download=True,
    transform=transform_train,
)

# Fixed order (no shuffling) so that repeated runs see the same batches.
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size=64,
    shuffle=False,
    num_workers=2,
)

# -----------------------------
# Load Pretrained ResNet-18
# -----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _load_full_model(path):
    """Load a fully pickled model from ``path`` and prepare it for evaluation.

    The checkpoint is deserialized onto the CPU first and then moved to
    ``device``. The model is switched to eval mode so that layers whose
    behavior differs between training and evaluation (e.g. batch
    normalization, dropout) run in evaluation mode for every metric and
    explanation computed below.

    SECURITY NOTE: ``weights_only=False`` unpickles arbitrary Python objects
    and can therefore execute code embedded in the file. Only load
    checkpoints that come from a trusted source.

    Args:
        path: Filesystem path of the ``.pth`` checkpoint.

    Returns:
        The loaded model, placed on ``device`` and set to eval mode.
    """
    loaded = torch.load(path, weights_only=False, map_location=torch.device("cpu"))
    loaded = loaded.to(device)
    loaded.eval()
    return loaded


# Load poisoned model
model_path = os.path.join(data_path, "xaisec/models/Poisoned_Summerschool.pth")
model = _load_full_model(model_path)

# Load the base model; it is passed alongside the poisoned model to the
# explanation comparison and to the plots further down.
original_model_path = os.path.join(data_path, "xaisec/models/Basemodel_Summerschool.pth")
original_model = _load_full_model(original_model_path)

# Class index used as the backdoor target in the evaluation below.
target_label = 0


# -----------------------------
# Evaluation
# -----------------------------
# Clean Accuracy
# -----------------------------
# Share of unmodified test images that the poisoned model classifies correctly.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch_images, batch_labels in testloader:
        batch_images = batch_images.to(device)
        batch_labels = batch_labels.to(device)

        # The index of the largest output along dim 1 is the predicted class.
        predictions = model(batch_images).max(dim=1).indices

        total += batch_labels.size(0)
        correct += (predictions == batch_labels).sum().item()

print(f"Accuracy on test set: {100 * correct / total:.2f}%")

#--------------------
# Attack Success Rate
#--------------------
# Share of triggered test samples whose prediction equals the label returned
# by badnets for the triggered sample.
# NOTE(review): filter_test_data presumably drops samples whose true class
# already is `target_label` — confirm in xaisec_utils.
filtered_testset = filter_test_data(testset, target_label)
filtered_testloader = torch.utils.data.DataLoader(
    filtered_testset,
    batch_size=64,
    shuffle=False,
    num_workers=2,
)

correct_targeted = 0
total = 0
with torch.no_grad():
    for batch_images, batch_labels in filtered_testloader:
        batch_images = batch_images.to(device)
        batch_labels = batch_labels.to(device)

        # Add the trigger to every sample of the batch, i.e. poisoning
        # probability 1. Image and label are both replaced by what badnets returns.
        for idx in range(batch_images.size(0)):
            batch_images[idx], batch_labels[idx] = badnets(
                batch_images[idx],
                target_label,
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
                device=device,
                data_path=data_path,
            )

        predictions = model(batch_images).max(dim=1).indices
        total += batch_labels.size(0)
        correct_targeted += (predictions == batch_labels).sum().item()

print(f"ASR on test set: {100 * correct_targeted / total:.2f}%")

#--------------------------
# Explanation Dissimilarity
#--------------------------
# Both values come straight from the xaisec_utils helper and are reported as-is.
d_orig, d_trigger = eval_expl_similarity(testloader, model, original_model, device)

for caption, distance in (
    ("dissimilarity to original explanation: ", d_orig),
    ("dissimilarity to trigger explanation: ", d_trigger),
):
    print(caption, distance)

#----------------------
# Plot an explanation!
#----------------------
# Visual check on one test image: build its clean and its triggered version,
# compute gradient explanations of both models for both inputs, and show the
# images.

# Same per-channel statistics as the Normalize step of the test transform.
norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]


def _to_numpy(tensor):
    """Detach ``tensor`` from autograd, move it to the CPU and return a NumPy array."""
    return tensor.detach().cpu().numpy()


def _reduce_explanation(expl):
    """Average ``expl`` over dim 1, squeeze singleton dims and return a NumPy array."""
    # NOTE(review): dim 1 is presumably the channel axis of an (N, C, H, W)
    # explanation — confirm against `gradient` in xaisec_utils.
    return _to_numpy(expl.mean(dim=1).squeeze())


input_tensor, label = testset[1]

# Use the configured target label rather than a hard-coded 0 so that the
# plotted trigger always matches the backdoor evaluated above.
input_tensor_triggered, _ = badnets(
    input_tensor,
    target_label,
    mean=norm_mean,
    std=norm_std,
    device=device,
    data_path=data_path,
)

# Add a batch dimension and move both variants to the device of the models.
original_trigger_image = input_tensor_triggered.unsqueeze(0).to(device)
original_image = input_tensor.unsqueeze(0).to(device)  # Original image without trigger

# Explanations of the base model and the poisoned model on both inputs.
# NOTE(review): create_graph=True is kept from the original calls; the results
# are detached right below, so it may be unnecessary here — confirm.
expls_benign, _, _ = gradient(original_model, original_image, create_graph=True)
expls_poised, _, _ = gradient(model, original_image, create_graph=True)
expls_benign_triggered, _, _ = gradient(original_model, original_trigger_image, create_graph=True)
expls_poised_triggered, _, _ = gradient(model, original_trigger_image, create_graph=True)

expls_reduced_benign = _reduce_explanation(expls_benign)
expls_reduced_poised = _reduce_explanation(expls_poised)
expls_reduced_benign_triggered = _reduce_explanation(expls_benign_triggered)
expls_reduced_poised_triggered = _reduce_explanation(expls_poised_triggered)

# Undo the normalization for display; from here on the two names hold the
# denormalized images instead of the batched model inputs.
original_image = util_denormalize(input_tensor, mean=norm_mean, std=norm_std)
original_trigger_image = util_denormalize(input_tensor_triggered, mean=norm_mean, std=norm_std)

clean_chw = _to_numpy(original_image.squeeze())
triggered_chw = _to_numpy(original_trigger_image.squeeze())

# Convert to greyscale for background
gray_original_image = np.mean(clean_chw, axis=0)
gray_original_trigger_image = np.mean(triggered_chw, axis=0)

# Move the first axis to the end for the RGB plots.
rgb_original_image = np.transpose(clean_chw, (1, 2, 0))
rgb_original_trigger_image = np.transpose(triggered_chw, (1, 2, 0))

# NOTE(review): the expls_reduced_* arrays computed above are never plotted and
# each greyscale image is shown twice. The duplicated calls were probably meant
# to overlay the explanations of the base and the poisoned model on the
# greyscale background — confirm the signature of show_single_plot and pass
# them in. The calls are left unchanged so the output stays the same.
show_single_plot(rgb_original_image)
show_single_plot(gray_original_image)
show_single_plot(gray_original_image)
show_single_plot(rgb_original_trigger_image)
show_single_plot(gray_original_trigger_image)
show_single_plot(gray_original_trigger_image)

Accuracy on test set: 93.88%
