#### Training Evaluation

In [1]:
# train_gcn_node_fixed.py
import os
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

NODE_CSV  = "GNNDatasets/node.csv"
EDGE_CSV  = "GNNDatasets/node_edges.csv"
SEED = 42
torch.manual_seed(SEED); np.random.seed(SEED)

# ----------------------------- Load nodes -----------------------------
nodes_df = pd.read_csv(NODE_CSV)

label_col = None
for cand in ["label", "is_trojan", "trojan", "target"]:
    if cand in nodes_df.columns:
        label_col = cand; break
if label_col is None:
    nodes_df["label"] = nodes_df["circuit_name"].astype(str).str.contains("__trojan_").astype(int)
    label_col = "label"

nodes_df["uid"] = nodes_df["circuit_name"].astype(str) + "::" + nodes_df["node"].astype(str)

feat_df = nodes_df.copy()
if "gate_type" in feat_df.columns:
    gate_oh = pd.get_dummies(feat_df["gate_type"], prefix="gt")
    feat_df = pd.concat([feat_df.drop(columns=["gate_type"]), gate_oh], axis=1)

exclude = {"uid","node","circuit_name",label_col}
num_cols = [c for c in feat_df.columns if c not in exclude and pd.api.types.is_numeric_dtype(feat_df[c])]
X = feat_df[num_cols].fillna(0.0).values.astype(np.float32)
y = nodes_df[label_col].values.astype(np.int64)

# ----------------------------- Load edges; add missing nodes -----------------------------
edges_df = pd.read_csv(EDGE_CSV)
edges_df["src_uid"] = edges_df["circuit_name"].astype(str) + "::" + edges_df["src"].astype(str)
edges_df["dst_uid"] = edges_df["circuit_name"].astype(str) + "::" + edges_df["dst"].astype(str)

known_uids = set(nodes_df["uid"])
edge_uids = set(edges_df["src_uid"]).union(set(edges_df["dst_uid"]))
missing = list(edge_uids - known_uids)

if missing:
    zero_row = np.zeros((1, X.shape[1]), dtype=np.float32)
    addX = np.repeat(zero_row, len(missing), axis=0)
    addY = -1*np.ones(len(missing), dtype=np.int64)
    add_df = pd.DataFrame({
        "uid": missing,
        "circuit_name": [u.split("::",1)[0] for u in missing],
        "node": [u.split("::",1)[1] for u in missing],
        label_col: addY
    })
    X = np.vstack([X, addX])
    y = np.concatenate([y, addY])
    nodes_df = pd.concat([nodes_df, add_df], ignore_index=True)

uid_to_idx = {u:i for i,u in enumerate(nodes_df["uid"].tolist())}
src_idx = edges_df["src_uid"].map(uid_to_idx).dropna().astype(int).values
dst_idx = edges_df["dst_uid"].map(uid_to_idx).dropna().astype(int).values
edge_index = np.stack([np.concatenate([src_idx, dst_idx]),
                       np.concatenate([dst_idx, src_idx])], axis=0)

# ----------------------------- Scale features -----------------------------
labeled_mask_np = (y >= 0)
scaler = StandardScaler()
X_scaled = X.copy()
X_scaled[labeled_mask_np] = scaler.fit_transform(X_scaled[labeled_mask_np])
if (~labeled_mask_np).any():
    X_scaled[~labeled_mask_np] = (X_scaled[~labeled_mask_np] - scaler.mean_) / np.sqrt(scaler.var_ + 1e-8)

# ----------------------------- Splits -----------------------------
idx_all = np.where(labeled_mask_np)[0]
y_all = y[labeled_mask_np]

idx_train, idx_tmp, y_train, y_tmp = train_test_split(
    idx_all, y_all, test_size=0.30, random_state=SEED, stratify=y_all
)
idx_val, idx_test, y_val, y_test = train_test_split(
    idx_tmp, y_tmp, test_size=0.50, random_state=SEED, stratify=y_tmp
)

# ----------------------------- Torch tensors (FIX: masks as torch.bool) -----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_t = torch.from_numpy(X_scaled).to(device)
y_t = torch.from_numpy(y).to(device)

edge_index_t = torch.from_numpy(edge_index).long().to(device)

train_mask_t = torch.zeros(len(y), dtype=torch.bool, device=device); train_mask_t[idx_train] = True
val_mask_t   = torch.zeros(len(y), dtype=torch.bool, device=device); val_mask_t[idx_val]   = True
test_mask_t  = torch.zeros(len(y), dtype=torch.bool, device=device); test_mask_t[idx_test]  = True
labeled_mask_t = torch.from_numpy(labeled_mask_np).to(device)

# ----------------------------- Build GCN adjacency -----------------------------
def build_adj(num_nodes, edge_index):
    self_loops = torch.arange(num_nodes, device=edge_index.device)
    ei = torch.cat([edge_index, torch.stack([self_loops, self_loops])], dim=1)
    deg = torch.bincount(ei[0], minlength=num_nodes).float()
    deg_inv_sqrt = deg.clamp(min=1).pow(-0.5)
    w = deg_inv_sqrt[ei[0]] * deg_inv_sqrt[ei[1]]
    A = torch.sparse_coo_tensor(ei, w, (num_nodes, num_nodes))
    return A.coalesce()

A_t = build_adj(X_t.size(0), edge_index_t)

# ----------------------------- Model -----------------------------
class GCNLayer(nn.Module):
    def __init__(self, in_dim, out_dim, dropout=0.0):
        super().__init__()
        self.lin = nn.Linear(in_dim, out_dim, bias=False)
        self.dropout = nn.Dropout(dropout)
        nn.init.xavier_uniform_(self.lin.weight)
    def forward(self, x, adj):
        x = self.dropout(x)
        x = torch.sparse.mm(adj, x)
        x = self.lin(x)
        return x

class GCN(nn.Module):
    def __init__(self, in_dim, hid_dim=96, out_dim=2, dropout=0.35):
        super().__init__()
        self.g1 = GCNLayer(in_dim, hid_dim, dropout)
        self.g2 = GCNLayer(hid_dim, out_dim, dropout)
        self.do = nn.Dropout(dropout)
    def forward(self, x, adj):
        x = self.g1(x, adj); x = F.relu(x); x = self.do(x)
        x = self.g2(x, adj)
        return x

model = GCN(in_dim=X_t.size(1), hid_dim=96, out_dim=2, dropout=0.35).to(device)

# ----------------------------- Loss, optimizer -----------------------------
train_labels = y_t[train_mask_t]
classes, counts = torch.unique(train_labels, return_counts=True)
num_pos = counts[classes==1].item() if (classes==1).any() else 1
num_neg = counts[classes==0].item() if (classes==0).any() else 1
weight_pos = (num_neg + num_pos) / (2.0 * num_pos)
weight_neg = (num_neg + num_pos) / (2.0 * num_neg)
class_weights = torch.tensor([weight_neg, weight_pos], dtype=torch.float32, device=device)

criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-3, weight_decay=5e-4)

# ----------------------------- Training -----------------------------
def evaluate(mask_t):
    model.eval()
    with torch.no_grad():
        logits = model(X_t, A_t)
        pred = logits.argmax(dim=1)
        msk = mask_t & (y_t >= 0)
        if msk.sum() == 0: return 0.0
        return (pred[msk] == y_t[msk]).float().mean().item()

best_val, best_state = -1.0, None
patience, patience_cnt = 20, 0
EPOCHS = 300

for epoch in range(1, EPOCHS+1):
    model.train()
    optimizer.zero_grad()
    logits = model(X_t, A_t)
    loss = criterion(logits[train_mask_t], y_t[train_mask_t])
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), 2.0)
    optimizer.step()

    if epoch % 10 == 0 or epoch == 1:
        val_acc = evaluate(val_mask_t)
        test_acc = evaluate(test_mask_t)
        print(f"Epoch {epoch:03d} | Loss {loss.item():.4f} | Val {val_acc:.4f} | Test {test_acc:.4f}")
        if val_acc > best_val + 1e-4:
            best_val = val_acc
            best_state = {k:v.detach().cpu().clone() for k,v in model.state_dict().items()}
            patience_cnt = 0
        else:
            patience_cnt += 1
            if patience_cnt >= patience:
                print("Early stopping."); break

if best_state is not None:
    model.load_state_dict(best_state)

# ----------------------------- Final eval -----------------------------
model.eval()
with torch.no_grad():
    logits = model(X_t, A_t)
    preds = logits.argmax(dim=1)

msk = (test_mask_t & (y_t >= 0)).cpu().numpy()
y_true = y_t.cpu().numpy()[msk]
y_pred = preds.cpu().numpy()[msk]

acc = (y_true == y_pred).mean()
print("\nFinal Evaluation (Node-Level)")
print("=============================")
print(f"Test Accuracy: {acc:.4f}\n")

print("Classification Report:")
print(classification_report(y_true, y_pred, labels=[0,1], target_names=["clean","trojan"], digits=4))

print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred, labels=[0,1]))

Epoch 001 | Loss 0.8937 | Val 0.4607 | Test 0.4631
Epoch 010 | Loss 0.6116 | Val 0.9505 | Test 0.9496
Epoch 020 | Loss 0.4193 | Val 0.9883 | Test 0.9885
Epoch 030 | Loss 0.2920 | Val 0.9927 | Test 0.9931
Epoch 040 | Loss 0.2113 | Val 0.9981 | Test 0.9981
Epoch 050 | Loss 0.1559 | Val 0.9994 | Test 0.9995
Epoch 060 | Loss 0.1176 | Val 0.9999 | Test 0.9998
Epoch 070 | Loss 0.0936 | Val 0.9999 | Test 0.9998
Epoch 080 | Loss 0.0779 | Val 1.0000 | Test 1.0000
Epoch 090 | Loss 0.0668 | Val 1.0000 | Test 1.0000
Epoch 100 | Loss 0.0553 | Val 1.0000 | Test 1.0000
Epoch 110 | Loss 0.0494 | Val 1.0000 | Test 1.0000
Epoch 120 | Loss 0.0435 | Val 1.0000 | Test 1.0000
Epoch 130 | Loss 0.0407 | Val 1.0000 | Test 1.0000
Epoch 140 | Loss 0.0371 | Val 1.0000 | Test 1.0000
Epoch 150 | Loss 0.0346 | Val 1.0000 | Test 1.0000
Epoch 160 | Loss 0.0329 | Val 1.0000 | Test 1.0000
Epoch 170 | Loss 0.0301 | Val 1.0000 | Test 1.0000
Epoch 180 | Loss 0.0283 | Val 1.0000 | Test 1.0000
Epoch 190 | Loss 0.0273 | Val 1

#### Node-Level Robustness (Jacobian & Relative Error)

In [5]:
# Node-level: Jacobian + strong PGD perturbation on ALL features (200 selected nodes)
import torch
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support

# ---------------------- PARAMETERS (tune these) ----------------------
PER_CLASS = 100             # 100 clean + 100 trojan
EPSILON = 5.0               # L2 radius of allowed perturbation (make larger to increase effect)
ALPHA = 1.0                 # step size per PGD iteration (in feature units)
NUM_ITERS = 40              # PGD iterations
FD_EPS = 1e-3               # finite-difference epsilon for relative error check
SEED = 42
torch.manual_seed(SEED); np.random.seed(SEED)

# ---------------------- Sanity checks ----------------------
required_vars = ["model","X_t","A_t","y_t","test_mask_t","device"]
for v in required_vars:
    if v not in globals():
        raise RuntimeError(f"Required variable '{v}' not found in the environment.")

model.to(device)
model.eval()

# ---------------------- Select samples (100 per class) ----------------------
test_indices = np.where(test_mask_t.cpu().numpy())[0]
labels_np = y_t.cpu().numpy()
selected = []
rng = np.random.default_rng(SEED)
for cls in [0,1]:
    idxs = [int(i) for i in test_indices if labels_np[int(i)] == cls]
    if len(idxs) >= PER_CLASS:
        chosen = rng.choice(idxs, size=PER_CLASS, replace=False)
    else:
        chosen = idxs
    selected.extend(chosen)

selected = np.array(selected, dtype=np.int64)
print("Selected counts (perturbation pool):", {0: int((labels_np[selected]==0).sum()), 1: int((labels_np[selected]==1).sum())})

# ---------------------- Compute Jacobian norms & FD relative errors ----------------------
device = device if 'device' in globals() else torch.device("cpu")
jacobian_norms = []
fd_rel_errors = []
per_sample_info = []   # store tuples (idx, label, jacobian_norm, fd_rel_error)

print("\nComputing Jacobian and finite-difference relative error for each selected node (this may take a while)...")
for node_idx in selected:
    node_idx = int(node_idx)
    x0 = X_t[node_idx].detach().clone().to(device).requires_grad_(True)

    def f_local(x):
        # returns logits vector (c,) for the node
        X_mod = X_t.clone().detach().to(device)
        X_mod[node_idx] = x
        out = model(X_mod, A_t)
        return out[node_idx]

    # Jacobian: shape (num_classes, feature_dim)
    try:
        jac = torch.autograd.functional.jacobian(f_local, x0)  # shape: (c, d)
    except RuntimeError as e:
        # fallback: compute per-output jac via loop (slower)
        c = int(model(X_t, A_t)[node_idx].shape[0])
        jac_rows = []
        for out_i in range(c):
            def scalar_f(x, i=out_i):
                return f_local(x)[i]
            row = torch.autograd.functional.jacobian(scalar_f, x0)
            jac_rows.append(row.unsqueeze(0))
        jac = torch.cat(jac_rows, dim=0)

    jac = jac.detach()
    jac_norm = torch.norm(jac, p='fro').item()

    # FD relative error
    delta_fd = FD_EPS * torch.randn_like(x0).to(device)
    pred_change = jac.mv(delta_fd)                    # predicted change (c,)
    f_x0 = f_local(x0).detach()
    f_x0_p = f_local(x0 + delta_fd).detach()
    actual_change = f_x0_p - f_x0
    rel_err = (torch.norm(pred_change - actual_change) / (torch.norm(actual_change) + 1e-8)).item()

    jacobian_norms.append(jac_norm)
    fd_rel_errors.append(rel_err)
    per_sample_info.append((node_idx, int(labels_np[node_idx]), jac_norm, rel_err))

# aggregate per-class stats
def stats_of(indices, arr):
    sub = np.array([a for (i,a) in zip(indices, arr)])
    return sub.mean(), sub.std()

mask_sel = labels_np[selected]
clean_mask = (mask_sel==0)
trojan_mask = (mask_sel==1)

clean_jac = np.array([i[2] for i in per_sample_info])[clean_mask]
troj_jac  = np.array([i[2] for i in per_sample_info])[trojan_mask]
clean_err = np.array([i[3] for i in per_sample_info])[clean_mask]
troj_err  = np.array([i[3] for i in per_sample_info])[trojan_mask]

print("\nJacobian norms & FD relative errors (aggregated):")
print(f" Clean nodes:  avg_norm={clean_jac.mean():.4f} ± {clean_jac.std():.4f}, avg_rel_err={clean_err.mean():.4e} ± {clean_err.std():.4e}")
print(f" Trojan nodes: avg_norm={troj_jac.mean():.4f} ± {troj_jac.std():.4f}, avg_rel_err={troj_err.mean():.4e} ± {troj_err.std():.4e}")

# Optionally print first 6 per-sample entries (idx,label,jac_norm,rel_err)
print("\nSample per-node Jacobian info (idx,label,jac_norm,rel_err) [first 6]:")
for t in per_sample_info[:6]:
    print(t)

# ---------------------- Generate strong adversarial perturbation via per-node PGD (L2-constrained) ----------------------
print("\nGenerating per-node PGD adversarial perturbations (L2, all features).")
perturbed_X = X_t.clone().detach().to(device)

criterion_ce = torch.nn.CrossEntropyLoss()
flips_before = 0
flips_after = 0
orig_preds = model(X_t, A_t).argmax(dim=1).cpu().numpy()

# To avoid computing gradients wrt model params during PGD, we use torch.autograd.grad for input only.
for node_idx in selected:
    node_idx = int(node_idx)
    x_orig = X_t[node_idx].detach().clone().to(device)
    x_adv = x_orig.clone().detach().requires_grad_(True)

    # original prediction for this node
    orig_pred = int(model(X_t, A_t)[node_idx].argmax().item())
    if orig_pred != labels_np[node_idx]:
        # It was already misclassified originally; still construct attack to show effect
        pass

    # PGD loop: maximize cross-entropy loss (untargeted)
    for it in range(NUM_ITERS):
        # build X_mod with current candidate for this node
        X_mod = perturbed_X.clone().detach()
        X_mod[node_idx] = x_adv
        logits = model(X_mod, A_t)
        loss = F.cross_entropy(logits[node_idx].unsqueeze(0), y_t[node_idx].unsqueeze(0))

        # gradient only wrt x_adv
        grad_x = torch.autograd.grad(loss, x_adv, retain_graph=False)[0]
        if torch.norm(grad_x).item() == 0:
            break
        # step in direction of normalized gradient
        step = ALPHA * grad_x / (grad_x.norm() + 1e-12)
        x_adv = (x_adv + step).detach().requires_grad_(True)

        # project to L2 ball around x_orig with radius EPSILON
        delta = x_adv.detach() - x_orig.detach()
        delta_norm = delta.norm().item()
        if delta_norm > EPSILON:
            delta = delta * (EPSILON / (delta_norm + 1e-12))
            x_adv = (x_orig + delta).detach().requires_grad_(True)

    # final adv vector
    x_adv_final = x_adv.detach()
    perturbed_X[node_idx] = x_adv_final

# ---------------------- Evaluate on full test set: 200 perturbed + rest original ----------------------
all_test_idx = np.where(test_mask_t.cpu().numpy())[0]
with torch.no_grad():
    logits_all = model(perturbed_X, A_t)
    preds_all = logits_all[all_test_idx].argmax(dim=1).cpu().numpy()
    labels_all = labels_np[all_test_idx]

acc = (preds_all == labels_all).mean()
prec, rec, f1, _ = precision_recall_fscore_support(labels_all, preds_all, average='weighted', zero_division=0)

print("\n================ Robustness Evaluation (Full Test Set: 200 perturbed + rest original) ================")
print(f"Accuracy: {acc*100:.2f}%")
print(f"Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}\n")
print("Classification report:")
print(classification_report(labels_all, preds_all, target_names=["clean","trojan"], digits=4))
print("Confusion Matrix:")
print(confusion_matrix(labels_all, preds_all, labels=[0,1]))

# ---------------------- How many selected nodes flipped? ----------------------
with torch.no_grad():
    orig_logits = model(X_t, A_t)
    orig_preds = orig_logits[selected].argmax(dim=1).cpu().numpy()
    adv_logits = model(perturbed_X, A_t)
    adv_preds = adv_logits[selected].argmax(dim=1).cpu().numpy()

flips = (orig_preds != adv_preds).sum()
print(f"\nSelected nodes: {len(selected)}. Number of selected nodes whose prediction flipped after attack: {flips} ({100*flips/len(selected):.2f}%).")


Selected counts (perturbation pool): {0: 100, 1: 100}

Computing Jacobian and finite-difference relative error for each selected node (this may take a while)...

Jacobian norms & FD relative errors (aggregated):
 Clean nodes:  avg_norm=0.5884 ± 0.1424, avg_rel_err=8.2028e-04 ± 1.8286e-03
 Trojan nodes: avg_norm=2.4320 ± 0.3598, avg_rel_err=1.8468e-03 ± 7.4382e-03

Sample per-node Jacobian info (idx,label,jac_norm,rel_err) [first 6]:
(26119, 0, 0.5900143980979919, 0.00035401724744588137)
(57231, 0, 0.6124733686447144, 0.00687777204439044)
(40297, 0, 0.5780624151229858, 6.616340397158638e-05)
(27843, 0, 0.602989673614502, 0.000452778534963727)
(26863, 0, 0.7068566083908081, 0.0015428924234583974)
(46828, 0, 0.433846116065979, 0.00011373571760486811)

Generating per-node PGD adversarial perturbations (L2, all features).

Accuracy: 99.54%
Precision: 0.9954, Recall: 0.9954, F1: 0.9954

Classification report:
              precision    recall  f1-score   support

       clean     0.9891    0

#### Local Lipschitz Constants

In [3]:
# Strong Lipschitz-directed PGD attack (perturb ALL features) + reporting
import torch
import numpy as np
import torch.nn.functional as F
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support

# ------------------------ Parameters (tune if needed) ------------------------
PER_CLASS   = 100        # pick 100 clean + 100 trojan
FD_EPS      = 1e-3       # finite-diff epsilon for relative error
EPSILON     = 12.0       # L2 radius of allowed perturbation (feature-scale units) -- increase for stronger attack
ALPHA       = 2.0        # PGD step size
NUM_ITERS   = 60         # PGD iterations
SEED        = 42
torch.manual_seed(SEED); np.random.seed(SEED)

# ------------------------ Sanity / dependencies ------------------------------
required = ["model","X_t","A_t","y_t","test_mask_t","device"]
for r in required:
    if r not in globals():
        raise RuntimeError(f"Required variable '{r}' not found in the environment.")

model.to(device)
model.eval()

# ------------------------ Sample selection (100 per class) ------------------
test_idx_all = np.where(test_mask_t.cpu().numpy())[0]
labels_np = y_t.cpu().numpy()

rng = np.random.default_rng(SEED)
selected_nodes = []
for cls in [0,1]:
    idxs = [int(i) for i in test_idx_all if labels_np[i] == cls]
    if len(idxs) >= PER_CLASS:
        chosen = rng.choice(idxs, size=PER_CLASS, replace=False)
    else:
        chosen = idxs
    selected_nodes.extend(chosen)
selected_nodes = np.array(selected_nodes, dtype=np.int64)

print("Selected perturbation pool:", {0:int((labels_np[selected_nodes]==0).sum()), 1:int((labels_np[selected_nodes]==1).sum())})

# ------------------------ Compute J, spectral norm, top-right singular vec, FD error
per_sample_info = []   # tuples (idx, label, L_local, fd_rel_err)
print("\nComputing Jacobian / spectral vector and FD relative errors (200 nodes)...")
for node_idx in selected_nodes:
    node_idx = int(node_idx)
    x0 = X_t[node_idx].detach().clone().to(device).requires_grad_(True)

    def f_node(x):
        X_mod = X_t.clone().detach().to(device)
        X_mod[node_idx] = x
        logits = model(X_mod, A_t)
        return logits[node_idx]

    # compute Jacobian J (num_classes x d)
    J = torch.autograd.functional.jacobian(f_node, x0).detach()   # shape (c, d)
    # spectral norm and top-right singular vector (via SVD)
    try:
        U, S, Vh = torch.linalg.svd(J, full_matrices=False)
        sigma_max = S[0].item()
        v = Vh[0,:].detach()   # top right singular vector (length d)
    except RuntimeError:
        # fallback to power method on J@J.T for largest singular value/vector
        # compute JJT = J @ J.T (c x c), find principal left singular vector u, then v = J.T u / sigma
        Jcpu = J.cpu()
        JJT = (Jcpu @ Jcpu.T).numpy()
        eigvals, eigvecs = np.linalg.eigh(JJT)
        k = eigvals.argmax()
        u = torch.tensor(eigvecs[:,k], dtype=J.dtype, device=J.device)
        sigma_max = float(np.sqrt(max(eigvals[k], 0.0)))
        v = (J.T @ u)
        if v.norm().item() > 0:
            v = v / (v.norm() + 1e-12)

    # finite-difference relative error (single trial for speed; can average)
    delta_fd = FD_EPS * torch.randn_like(x0).to(device)
    pred_change = J.mv(delta_fd)
    f0 = f_node(x0).detach()
    f0p = f_node(x0 + delta_fd).detach()
    actual_change = f0p - f0
    fd_rel_err = (torch.norm(pred_change - actual_change) / (torch.norm(actual_change) + 1e-8)).item()

    per_sample_info.append((node_idx, int(labels_np[node_idx]), float(sigma_max), float(fd_rel_err)))

# Aggregate and print
clean_stats = [ (i,L,e) for (i,lab,L,e) in per_sample_info if lab==0 ]
troj_stats  = [ (i,L,e) for (i,lab,L,e) in per_sample_info if lab==1 ]

def aggs(stats):
    if not stats: return (0.0,0.0,0.0,0.0)
    Ls = np.array([s[1] for s in stats]); Es = np.array([s[2] for s in stats])
    return (Ls.mean(), Ls.std(), Es.mean(), Es.std())

cL_mean, cL_std, cE_mean, cE_std = aggs(clean_stats)
tL_mean, tL_std, tE_mean, tE_std = aggs(troj_stats)

print("\nAggregated spectral (Lipschitz) stats:")
print(f" Clean:  avg_L={cL_mean:.4f} ± {cL_std:.4f}, avg_FDrel={cE_mean:.4e} ± {cE_std:.4e}")
print(f" Trojan: avg_L={tL_mean:.4f} ± {tL_std:.4f}, avg_FDrel={tE_mean:.4e} ± {tE_std:.4e}")

print("\nSample preview (first 6): (idx,label,L,FD_rel_err)")
for p in per_sample_info[:6]:
    print(p)

# ------------------------ Create perturbed features (PGD initialized by top-singular vector) ------------------------
print("\nRunning per-node PGD (initialized on top singular vector) for each selected node ...")
perturbed_X = X_t.clone().detach().to(device)
flips = 0
orig_preds = model(X_t, A_t).argmax(dim=1).cpu().numpy()

for node_idx, label, sigma_val, fd_err in per_sample_info:
    node_idx = int(node_idx)
    x_orig = X_t[node_idx].detach().clone().to(device)
    # compute initialization direction v for current node (recompute so we have v on device)
    x0 = X_t[node_idx].detach().clone().to(device).requires_grad_(True)
    def f_local(x):
        X_mod = X_t.clone().detach().to(device)
        X_mod[node_idx] = x
        return model(X_mod, A_t)[node_idx]
    J = torch.autograd.functional.jacobian(f_local, x0).detach()
    # SVD for v (right singular)
    try:
        _, S, Vh = torch.linalg.svd(J, full_matrices=False)
        v_init = Vh[0,:].detach()
    except RuntimeError:
        # fallback random init
        v_init = torch.randn_like(x_orig).to(device)
    if v_init.norm().item() > 0:
        v_init = v_init / (v_init.norm() + 1e-12)
    else:
        v_init = torch.randn_like(x_orig).to(device)
        v_init = v_init / (v_init.norm() + 1e-12)

    # initialize adv example at x0 + (EPSILON/2) * v_init
    x_adv = (x_orig + 0.5 * EPSILON * v_init).detach().clone().requires_grad_(True)

    # PGD maximize CE loss wrt true label (untargeted)
    for it in range(NUM_ITERS):
        X_mod = perturbed_X.clone().detach()
        X_mod[node_idx] = x_adv
        logits = model(X_mod, A_t)
        loss = F.cross_entropy(logits[node_idx].unsqueeze(0), y_t[node_idx].unsqueeze(0))
        grad_x = torch.autograd.grad(loss, x_adv, retain_graph=False, create_graph=False)[0]
        if grad_x.norm().item() == 0:
            break
        # step in gradient direction (maximize)
        step = ALPHA * grad_x / (grad_x.norm() + 1e-12)
        x_adv = (x_adv + step).detach()
        # project to L2-ball of radius EPSILON around x_orig
        delta = x_adv - x_orig
        dnorm = delta.norm().item()
        if dnorm > EPSILON:
            delta = delta * (EPSILON / (dnorm + 1e-12))
            x_adv = (x_orig + delta).detach()
        x_adv = x_adv.requires_grad_(True)

    # finalize
    x_adv_final = x_adv.detach()
    perturbed_X[node_idx] = x_adv_final

# ------------------------ Evaluate on full test set: 200 perturbed + rest original ----------
test_indices = np.where(test_mask_t.cpu().numpy())[0]
with torch.no_grad():
    logits_all = model(perturbed_X, A_t)
    preds_all = logits_all[test_indices].argmax(dim=1).cpu().numpy()
    labels_all = labels_np[test_indices]

acc = (preds_all == labels_all).mean()
prec, rec, f1, _ = precision_recall_fscore_support(labels_all, preds_all, average='weighted', zero_division=0)

print("\n============= Robustness Evaluation (Full Test Set: 200 perturbed + rest original) =============")
print(f"Accuracy: {acc*100:.2f}%")
print(f"Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}\n")
print("Classification report:")
print(classification_report(labels_all, preds_all, target_names=['clean','trojan'], digits=4))
print("Confusion Matrix:")
print(confusion_matrix(labels_all, preds_all, labels=[0,1]))

# ------------------------ Flip statistics for selected nodes ---------------------------------------
with torch.no_grad():
    orig_sel_preds = orig_preds[selected_nodes]
    adv_sel_preds = logits_all[selected_nodes].argmax(dim=1).cpu().numpy()
num_flips = int((orig_sel_preds != adv_sel_preds).sum())
print(f"\nSelected nodes: {len(selected_nodes)}. Flipped after attack: {num_flips} ({100.0*num_flips/len(selected_nodes):.2f}%).")


Selected perturbation pool: {0: 100, 1: 100}

Computing Jacobian / spectral vector and FD relative errors (200 nodes)...

Aggregated spectral (Lipschitz) stats:
 Clean:  avg_L=0.5779 ± 0.1352, avg_FDrel=6.2905e-04 ± 8.0656e-04
 Trojan: avg_L=2.3569 ± 0.3636, avg_FDrel=8.7222e-03 ± 4.2885e-02

Sample preview (first 6): (idx,label,L,FD_rel_err)
(26119, 0, 0.5806882381439209, 0.00020766345551237464)
(57231, 0, 0.604617714881897, 0.0005543306469917297)
(40297, 0, 0.5674167275428772, 9.97965398710221e-05)
(27843, 0, 0.5931023955345154, 0.0003367722674738616)
(26863, 0, 0.6973091959953308, 0.0004411973350215703)
(46828, 0, 0.42770835757255554, 0.0001565459679113701)

Running per-node PGD (initialized on top singular vector) for each selected node ...

Accuracy: 99.38%
Precision: 0.9938, Recall: 0.9938, F1: 0.9938

Classification report:
              precision    recall  f1-score   support

       clean     0.9891    0.9862    0.9876      9159
      trojan     0.9954    0.9964    0.9959     

#### Hessian-based Curvature Analysis

In [5]:
# =========================
# Hessian-Based Curvature (grad outer-product) for node-level Trojan detection
# Uses: model, X_t, A_t, y_t, test_mask_t, device (must be defined already)
# =========================
import torch
import numpy as np
import torch.nn.functional as F
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support

# -------------------- Parameters --------------------
PER_CLASS = 100          # 100 nodes per class (clean/trojan)
FD_EPS = 5e-3            # finite-diff epsilon for relative-error check
TRIALS_PER_NODE = 10     # average trials per node for relative error
PERT_P = 6.0             # L2 magnitude for final Hessian-aligned perturbation (tuneable)
SEED = 42

torch.manual_seed(SEED); np.random.seed(SEED)

# -------------------- Sanity checks --------------------
required = ["model","X_t","A_t","y_t","test_mask_t","device"]
for r in required:
    if r not in globals():
        raise RuntimeError(f"Required variable '{r}' not found in the environment.")

model.to(device)
model.eval()

# -------------------- Class names (binary) --------------------
class_names = ["clean", "trojan"]

# -------------------- Build test index list --------------------
test_idx_all = np.where(test_mask_t.cpu().numpy())[0]
labels_np = y_t.cpu().numpy()

rng = np.random.default_rng(SEED)
selected_nodes = []
for cls in [0,1]:
    idxs = [int(i) for i in test_idx_all if labels_np[i] == cls]
    if len(idxs) >= PER_CLASS:
        chosen = rng.choice(idxs, size=PER_CLASS, replace=False)
    else:
        chosen = idxs
    selected_nodes.extend(chosen)
selected_nodes = np.array(selected_nodes, dtype=np.int64)

print(f"Selected perturbation pool: clean={int((labels_np[selected_nodes]==0).sum())}, trojan={int((labels_np[selected_nodes]==1).sum())}")

# -------------------- Precompute base logits (to find predicted class at x0) --------------------
with torch.no_grad():
    base_logits_all = model(X_t, A_t).detach()  # shape (N, C)

# -------------------- Storage --------------------
per_sample_info = []   # (node_idx, label, lambda_max = ||g||^2, avg_rel_error)
print("\nComputing gradient norms and FD relative errors for selected nodes...")

# -------------------- Helper: h(x) and gradient computation --------------------
def compute_g_and_h_for_node(node_idx):
    # x0 with grad
    x0 = X_t[node_idx].detach().clone().to(device).requires_grad_(True)
    pred_class = int(torch.argmax(base_logits_all[node_idx]).item())

    def h(x):
        X_mod = X_t.clone().detach().to(device)
        X_mod[node_idx] = x
        logits = model(X_mod, A_t)[node_idx]
        logp = F.log_softmax(logits, dim=0)
        return logp[pred_class]

    h0 = h(x0)
    g = torch.autograd.grad(h0, x0, retain_graph=False, create_graph=False)[0].detach()
    return x0.detach(), g, h0.detach(), h

# -------------------- Main loop: compute ||g||^2 and relative errors --------------------
for node_idx in selected_nodes:
    node_idx = int(node_idx)
    label = int(labels_np[node_idx])
    x0, g, h0, h_func = compute_g_and_h_for_node(node_idx)

    lambda_max = float(g.norm(p=2).item() ** 2)   # curvature proxy

    # Relative error via multiple random deltas
    node_rel_errs = []
    for _ in range(TRIALS_PER_NODE):
        delta = FD_EPS * torch.randn_like(x0).to(device)
        gt_delta = float(torch.dot(g, delta).item())
        pred_second = 0.5 * (gt_delta ** 2)
        actual_second = float((h_func(x0 + delta) - h0 - torch.dot(g, delta)).item())
        rel_error = abs(pred_second - actual_second) / (abs(actual_second) + 1e-8)
        node_rel_errs.append(rel_error)

    avg_rel_err = float(np.mean(node_rel_errs))
    per_sample_info.append((node_idx, label, lambda_max, avg_rel_err))

# -------------------- Aggregation & Print --------------------
clean_stats = [t for t in per_sample_info if t[1]==0]
troj_stats  = [t for t in per_sample_info if t[1]==1]

def summarize(stats):
    if not stats:
        return (0.0,0.0,0.0,0.0)
    Ls = np.array([s[2] for s in stats])
    Es = np.array([s[3] for s in stats])
    return (Ls.mean(), Ls.std(), Es.mean(), Es.std())

cL_mean, cL_std, cE_mean, cE_std = summarize(clean_stats)
tL_mean, tL_std, tE_mean, tE_std = summarize(troj_stats)
overall_L_mean = np.mean([s[2] for s in per_sample_info]) if per_sample_info else 0.0
overall_L_std  = np.std([s[2] for s in per_sample_info])  if per_sample_info else 0.0
overall_E_mean = np.mean([s[3] for s in per_sample_info]) if per_sample_info else 0.0
overall_E_std  = np.std([s[3] for s in per_sample_info])  if per_sample_info else 0.0

print("\nAggregated Hessian (grad outer-product) stats:")
print(f" Clean:  avg_lambda={cL_mean:.4f} ± {cL_std:.4f}, avg_FDrel={cE_mean:.4e} ± {cE_std:.4e}")
print(f" Trojan: avg_lambda={tL_mean:.4f} ± {tL_std:.4f}, avg_FDrel={tE_mean:.4e} ± {tE_std:.4e}")
print(f" Overall: avg_lambda={overall_L_mean:.4f} ± {overall_L_std:.4f}, avg_FDrel={overall_E_mean:.4e} ± {overall_E_std:.4e}")

print("\nSample preview (first 6): (idx,label,lambda,FD_rel_err)")
for p in per_sample_info[:6]:
    print(p)

# -------------------- Build adversarial perturbations aligned to -g (reduce log-prob of predicted class) --------------------
print("\nConstructing Hessian-aligned perturbations (direction = -g normalized) and applying to selected nodes...")
perturbed_X = X_t.clone().detach().to(device)
orig_preds = torch.argmax(base_logits_all, dim=1).cpu().numpy()

for (node_idx, label, lambda_val, avg_rel_err) in per_sample_info:
    node_idx = int(node_idx)
    # recompute g to be safe (cheap relative to full loop); reuse compute helper
    x0, g, h0, h_func = compute_g_and_h_for_node(node_idx)
    gnorm = g.norm().item()
    if gnorm < 1e-12:
        # if gradient is essentially zero, use random direction
        dir_vec = torch.randn_like(x0).to(device)
    else:
        dir_vec = - g / (gnorm + 1e-12)  # negative g to reduce log-prob

    # scale to desired L2 magnitude PERT_P
    delta = (PERT_P * dir_vec).detach()
    perturbed_X[node_idx] = (x0 + delta).detach()

# -------------------- Evaluate on full test set: (200 perturbed + rest original) --------------------
test_indices = np.where(test_mask_t.cpu().numpy())[0]
with torch.no_grad():
    logits_all = model(perturbed_X, A_t)
    preds_all = logits_all[test_indices].argmax(dim=1).cpu().numpy()
    labels_all = labels_np[test_indices]

acc = (preds_all == labels_all).mean()
prec, rec, f1, _ = precision_recall_fscore_support(labels_all, preds_all, average='weighted', zero_division=0)

print("\n================ Robustness Evaluation (Full Test Set: 200 perturbed + rest original) ===============")
print(f"Accuracy: {acc*100:.2f}%")
print(f"Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}\n")
print("Classification report:")
print(classification_report(labels_all, preds_all, target_names=class_names, digits=4))
print("Confusion Matrix:")
print(confusion_matrix(labels_all, preds_all, labels=[0,1]))

# -------------------- Flip statistics --------------------
orig_sel_preds = orig_preds[selected_nodes]
adv_sel_preds = logits_all[selected_nodes].argmax(dim=1).cpu().numpy()
num_flips = int((orig_sel_preds != adv_sel_preds).sum())
print(f"\nSelected nodes: {len(selected_nodes)}. Flipped after Hessian-based perturbation: {num_flips} ({100.0*num_flips/len(selected_nodes):.2f}%).")


Selected perturbation pool: clean=100, trojan=100

Computing gradient norms and FD relative errors for selected nodes...

Aggregated Hessian (grad outer-product) stats:
 Clean:  avg_lambda=0.0010 ± 0.0027, avg_FDrel=8.8848e-01 ± 7.6424e-02
 Trojan: avg_lambda=0.0142 ± 0.0434, avg_FDrel=9.8424e-01 ± 8.0939e-02
 Overall: avg_lambda=0.0076 ± 0.0315, avg_FDrel=9.3636e-01 ± 9.2134e-02

Sample preview (first 6): (idx,label,lambda,FD_rel_err)
(26119, 0, 0.00013114006249938335, 0.8717354198141685)
(57231, 0, 0.00010764707143384126, 0.8226354340519129)
(40297, 0, 0.001321573346424812, 0.9325460845060217)
(27843, 0, 0.0007104109633964684, 0.9141915288341323)
(26863, 0, 0.0005748949339887677, 0.8405098988944559)
(46828, 0, 0.0004032530106434819, 0.8759966817844393)

Constructing Hessian-aligned perturbations (direction = -g normalized) and applying to selected nodes...

Accuracy: 99.55%
Precision: 0.9955, Recall: 0.9955, F1: 0.9955

Classification report:
              precision    recall  f1-sco

#### Prediction Margin

In [8]:
import torch
import torch.nn.functional as F
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support

# ------------------------
# Parameters
# ------------------------
EPSILON = 2.0      # stronger perturbation budget
ALPHA = 0.4        # PGD step size
NUM_ITERS = 15     # PGD iterations
FD_EPS = 1e-3      # small noise for finite-difference check

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ------------------------
# Step 1: Select nodes (100 clean, 100 trojan)
# ------------------------
clean_nodes = np.where(labels_np == 0)[0]
trojan_nodes = np.where(labels_np == 1)[0]
np.random.seed(42)
sel_clean = np.random.choice(clean_nodes, 100, replace=False)
sel_trojan = np.random.choice(trojan_nodes, 100, replace=False)
selected_nodes = np.concatenate([sel_clean, sel_trojan])

print(f"\n--- Strong Prediction-Margin Attack (200 nodes: 100 per class) ---")
print(f"Selected nodes ? clean={len(sel_clean)}, trojan={len(sel_trojan)}")

# ------------------------
# Step 2: Apply perturbation (PGD on all features)
# ------------------------
perturbed_X = X_t.clone().detach().to(device)
orig_preds = model(X_t, A_t).argmax(dim=1).cpu().numpy()

for node_idx in selected_nodes:
    node_idx = int(node_idx)
    x_orig = X_t[node_idx].detach().clone().to(device)

    # adversarial initialization
    delta = torch.randn_like(x_orig).to(device)
    delta = EPSILON * delta / (delta.norm() + 1e-12)
    x_adv = (x_orig + delta).detach().clone().requires_grad_(True)

    for it in range(NUM_ITERS):
        X_mod = perturbed_X.clone().detach()
        X_mod[node_idx] = x_adv
        logits = model(X_mod, A_t)
        loss = F.cross_entropy(logits[node_idx].unsqueeze(0), y_t[node_idx].unsqueeze(0))
        grad_x = torch.autograd.grad(loss, x_adv, retain_graph=False, create_graph=False)[0]

        step = ALPHA * grad_x / (grad_x.norm() + 1e-12)
        x_adv = (x_adv + step).detach()
        delta = x_adv - x_orig
        if delta.norm() > EPSILON:
            delta = delta * (EPSILON / (delta.norm() + 1e-12))
            x_adv = (x_orig + delta).detach()
        x_adv = x_adv.requires_grad_(True)

    perturbed_X[node_idx] = x_adv.detach()

print("? Finished perturbations.")

# ------------------------
# Step 3: Evaluate model on perturbed + original mix
# ------------------------
test_indices = np.where(test_mask_t.cpu().numpy())[0]
with torch.no_grad():
    logits_all = model(perturbed_X, A_t)
    preds_all = logits_all[test_indices].argmax(dim=1).cpu().numpy()
    labels_all = labels_np[test_indices]

acc = (preds_all == labels_all).mean()
prec, rec, f1, _ = precision_recall_fscore_support(labels_all, preds_all, average='weighted', zero_division=0)

print("\n============= Robustness Evaluation (Full Test Set) =============")
print(f"Accuracy: {acc*100:.2f}%")
print(f"Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}")
print("\nClassification report:")
print(classification_report(labels_all, preds_all, target_names=['clean','trojan'], digits=4))
print("Confusion Matrix:")
print(confusion_matrix(labels_all, preds_all, labels=[0,1]))

# Flip statistics
with torch.no_grad():
    adv_sel_preds = logits_all[selected_nodes].argmax(dim=1).cpu().numpy()
num_flips = int((orig_preds[selected_nodes] != adv_sel_preds).sum())
print(f"\nSelected nodes: {len(selected_nodes)}. Flipped after attack: {num_flips} ({100.0*num_flips/len(selected_nodes):.2f}%).")

# ------------------------
# Step 4: Compute Prediction Margin + FD relative error on perturbed nodes
# ------------------------
per_sample_info = []
for node_idx in selected_nodes:
    node_idx = int(node_idx)
    with torch.no_grad():
        logits = model(perturbed_X, A_t)[node_idx]
    pred_class = logits.argmax().item()
    pred_logit = logits[pred_class].item()
    other_logits = logits.clone()
    other_logits[pred_class] = -float('inf')
    second_max = other_logits.max().item()
    margin = pred_logit - second_max

    # finite-difference perturbation check
    delta = FD_EPS * torch.randn_like(perturbed_X[node_idx]).to(device)
    X_mod = perturbed_X.clone().detach()
    X_mod[node_idx] = perturbed_X[node_idx] + delta
    with torch.no_grad():
        logits_p = model(X_mod, A_t)[node_idx]
    pred_logit_p = logits_p[pred_class].item()
    other_logits_p = logits_p.clone()
    other_logits_p[pred_class] = -float('inf')
    second_max_p = other_logits_p.max().item()
    margin_p = pred_logit_p - second_max_p

    rel_err = abs(margin - margin_p) / (abs(margin_p) + 1e-12)
    per_sample_info.append((node_idx, int(labels_np[node_idx]), float(margin), float(rel_err)))

# aggregate stats
clean_stats = [(i,m,e) for (i,lab,m,e) in per_sample_info if lab==0]
troj_stats  = [(i,m,e) for (i,lab,m,e) in per_sample_info if lab==1]

def aggs(stats):
    Ms = np.array([s[1] for s in stats]); Es = np.array([s[2] for s in stats])
    return (Ms.mean(), Ms.std(), Es.mean(), Es.std())

cM_mean,cM_std,cE_mean,cE_std = aggs(clean_stats)
tM_mean,tM_std,tE_mean,tE_std = aggs(troj_stats)

print("\n--- Prediction Margin Stats (on perturbed nodes) ---")
print(f" Clean:  avg_margin={cM_mean:.4f} ± {cM_std:.4f}, avg_FDrel={cE_mean:.4e} ± {cE_std:.4e}")
print(f" Trojan: avg_margin={tM_mean:.4f} ± {tM_std:.4f}, avg_FDrel={tE_mean:.4e} ± {tE_std:.4e}")
print("\nSample preview (first 6): (idx,label,margin,FD_rel_err)")
for p in per_sample_info[:6]:
    print(p)



--- Strong Prediction-Margin Attack (200 nodes: 100 per class) ---
Selected nodes ? clean=100, trojan=100
? Finished perturbations.

Accuracy: 100.00%
Precision: 1.0000, Recall: 1.0000, F1: 1.0000

Classification report:
              precision    recall  f1-score   support

       clean     1.0000    1.0000    1.0000      9159
      trojan     1.0000    1.0000    1.0000     27556

    accuracy                         1.0000     36715
   macro avg     1.0000    1.0000    1.0000     36715
weighted avg     1.0000    1.0000    1.0000     36715

Confusion Matrix:
[[ 9159     0]
 [    0 27556]]

Selected nodes: 200. Flipped after attack: 101 (50.50%).

--- Prediction Margin Stats (on perturbed nodes) ---
 Clean:  avg_margin=2.0527 ± 0.6118, avg_FDrel=3.9362e-04 ± 5.7933e-04
 Trojan: avg_margin=3.7545 ± 0.6768, avg_FDrel=9.3533e-04 ± 9.7417e-04

Sample preview (first 6): (idx,label,margin,FD_rel_err)
(44087, 0, 1.2382581233978271, 0.0001136840236789138)
(1246, 0, 2.1299312114715576, 0.00030

#### Adversarial Robustness Radius

In [10]:
# === Adversarial Robustness Radius (ARR) - notebook-friendly, uses existing variables ===
import torch
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
import time

# ------------------ Safety / fallback checks ------------------
# Required variables we expect from your prior runs
required = ["model", "X_t", "A_t", "y_t", "test_mask_t", "device"]
missing = [r for r in required if r not in globals()]
if missing:
    raise RuntimeError(f"Required variables missing from notebook environment: {missing}\n"
                       "Make sure you've executed the training/evaluation cells that define model, X_t, A_t, y_t, test_mask_t, device.")

# labels_np fallback
if "labels_np" not in globals():
    labels_np = y_t.cpu().numpy()
else:
    labels_np = globals()["labels_np"]

# Ensure test indices
if isinstance(test_mask_t, torch.Tensor):
    test_indices = np.where(test_mask_t.cpu().numpy())[0]
else:
    # if test_mask_t not boolean tensor but array
    test_indices = np.where(np.array(test_mask_t))[0]

# small helper
to_device = lambda t: t.to(device) if isinstance(t, torch.Tensor) else t

# ------------------ Sampling (ensure selected_nodes exists) ------------------
PER_CLASS = 100
rng = np.random.default_rng(42)

if "selected_nodes" in globals():
    selected_nodes = np.array(globals()["selected_nodes"], dtype=np.int64)
else:
    # pick 100 per class from test set (clean=0, trojan=1)
    clean_idxs = [int(i) for i in test_indices if labels_np[int(i)] == 0]
    trojan_idxs = [int(i) for i in test_indices if labels_np[int(i)] == 1]
    sel_clean = rng.choice(clean_idxs, size=min(PER_CLASS, len(clean_idxs)), replace=False).tolist()
    sel_trojan = rng.choice(trojan_idxs, size=min(PER_CLASS, len(trojan_idxs)), replace=False).tolist()
    selected_nodes = np.array(sel_clean + sel_trojan, dtype=np.int64)

print(f"Selected nodes: clean={int((labels_np[selected_nodes]==0).sum())}, trojan={int((labels_np[selected_nodes]==1).sum())}")

# ------------------ Perturbed features (ensure perturbed_X exists) ------------------
# If you already have perturbed_X from prior PGD, reuse it; otherwise construct PGD perturbations.
EPSILON = globals().get("EPSILON", 2.0)     # L2 radius used previously (match your PGD)
ALPHA   = globals().get("ALPHA", 0.4)       # step size used previously
NUM_ITERS = globals().get("NUM_ITERS", 15)  # PGD iters

if "perturbed_X" in globals():
    perturbed_X = globals()["perturbed_X"].clone().detach().to(device)
    print("Re-using existing perturbed_X from notebook.")
else:
    print("No existing perturbed_X found  creating PGD perturbations (this may take some time)...")
    perturbed_X = X_t.clone().detach().to(device)
    orig_preds = model(X_t, A_t).argmax(dim=1).cpu().numpy()
    for node_idx in selected_nodes:
        node_idx = int(node_idx)
        x_orig = X_t[node_idx].detach().clone().to(device)
        # initialize with random L2 perturbation on the whole feature vector
        delta = torch.randn_like(x_orig).to(device)
        delta = EPSILON * delta / (delta.norm() + 1e-12)
        x_adv = (x_orig + delta).detach().clone().requires_grad_(True)

        for it in range(NUM_ITERS):
            X_mod = perturbed_X.clone().detach()
            X_mod[node_idx] = x_adv
            logits = model(X_mod, A_t)
            loss = torch.nn.functional.cross_entropy(logits[node_idx].unsqueeze(0), y_t[node_idx].unsqueeze(0))
            grad_x = torch.autograd.grad(loss, x_adv, retain_graph=False, create_graph=False)[0]
            if grad_x.norm().item() == 0:
                break
            step = ALPHA * grad_x / (grad_x.norm() + 1e-12)
            x_adv = (x_adv + step).detach()
            delta = x_adv - x_orig
            dnorm = delta.norm().item()
            if dnorm > EPSILON:
                delta = delta * (EPSILON / (dnorm + 1e-12))
                x_adv = (x_orig + delta).detach()
            x_adv = x_adv.requires_grad_(True)

        perturbed_X[node_idx] = x_adv.detach()
    print("Finished creating perturbed_X via PGD.")

# ------------------ Evaluate model on full test set (200 perturbed + rest original) ------------------
model.eval()
with torch.no_grad():
    logits_all = model(perturbed_X, A_t)
    preds_all = logits_all[test_indices].argmax(dim=1).cpu().numpy()
    labels_all = labels_np[test_indices]

acc = (preds_all == labels_all).mean()
prec, rec, f1, _ = precision_recall_fscore_support(labels_all, preds_all, average='weighted', zero_division=0)

print("\n============= Robustness Evaluation (Full Test Set: perturbed selected nodes + others unperturbed) =============")
print(f"Accuracy: {acc*100:.2f}%")
print(f"Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}\n")
print("Classification report:")
print(classification_report(labels_all, preds_all, target_names=['clean','trojan'], digits=4))
print("Confusion Matrix:")
print(confusion_matrix(labels_all, preds_all, labels=[0,1]))

# Flip statistics for selected nodes
with torch.no_grad():
    orig_preds = model(X_t.to(device), A_t).argmax(dim=1).cpu().numpy()
    adv_sel_preds = logits_all[selected_nodes].argmax(dim=1).cpu().numpy()
num_flips = int((orig_preds[selected_nodes] != adv_sel_preds).sum())
print(f"\nSelected nodes: {len(selected_nodes)}. Flipped after perturbation: {num_flips} ({100.0 * num_flips/len(selected_nodes):.2f}%).")

# ------------------ ARR computation helpers (operate on perturbed_X!) ------------------
def f_for_sample(x_tensor, test_idx):
    """Return logits vector for node test_idx when its features are replaced by x_tensor.
       Uses adjacency A_t and perturbed_X baseline to preserve the same graph context."""
    X_mod = perturbed_X.clone().detach()
    X_mod[test_idx] = x_tensor
    with torch.no_grad():
        out = model(X_mod, A_t)
    return out[test_idx]

def adversarial_radius_for_sample(test_idx, initial_epsilon=1e-3, growth_factor=1.25,
                                  max_epsilon=20.0, bs_iters=10, num_trials=8):
    """Estimate minimal perturbation norm (L2) that flips the model prediction,
       measured *around the perturbed point* (perturbed_X)."""
    x0 = perturbed_X[test_idx].clone().detach().to(device)
    # base predicted label at this perturbed point
    with torch.no_grad():
        base_out = model(perturbed_X, A_t)
        y0 = int(torch.argmax(base_out[test_idx]).item())

    def is_same(x):
        out = f_for_sample(x, test_idx)
        return int(torch.argmax(out).item()) == y0

    radii = []
    for _ in range(num_trials):
        d = torch.randn_like(x0)
        d = d / (d.norm() + 1e-12)

        eps = initial_epsilon
        # expand until flip or cap
        while eps < max_epsilon and is_same(x0 + eps * d):
            eps *= growth_factor

        if eps >= max_epsilon:
            candidate = max_epsilon
        else:
            low, high = eps / growth_factor, eps
            for _ in range(bs_iters):
                mid = 0.5 * (low + high)
                if is_same(x0 + mid * d):
                    low = mid
                else:
                    high = mid
            candidate = float(high)
        radii.append(candidate)

    return float(min(radii))

def adversarial_radius_relerr(test_idx):
    r1 = adversarial_radius_for_sample(test_idx, growth_factor=1.25, bs_iters=10, num_trials=6)
    r2 = adversarial_radius_for_sample(test_idx, growth_factor=1.4, bs_iters=12, num_trials=6)
    rel_err = abs(r1 - r2) / (abs(r2) + 1e-12)
    return r1, rel_err

# ------------------ Compute ARR on selected perturbed nodes ------------------
class_names = ['clean', 'trojan']
class_adv_radius = {cn: [] for cn in class_names}
class_rel_errors = {cn: [] for cn in class_names}
all_radii, all_rel_errs = [], []

t0 = time.time()
print("\nComputing Adversarial Robustness Radius for selected perturbed nodes (this is the expensive part)...")
for i, node_idx in enumerate(selected_nodes):
    node_idx = int(node_idx)
    label = int(labels_np[node_idx])
    cn = class_names[label]
    r, rel = adversarial_radius_relerr(node_idx)
    class_adv_radius[cn].append(r)
    class_rel_errors[cn].append(rel)
    all_radii.append(r)
    all_rel_errs.append(rel)
    if (i+1) % 20 == 0:
        print(f"  processed {i+1}/{len(selected_nodes)} nodes...")

t1 = time.time()
print(f"Done ARR computation. Time elapsed: {t1-t0:.1f}s")

# ------------------ Reporting ARR aggregates ------------------
print("\nClass-wise ARR (with relative error):")
print("{:<10s} {:>14s} {:>22s}".format("Class", "Avg Radius ± Std", "Avg Rel. Error ± Std"))
print("-"*52)
for cn in class_names:
    if class_adv_radius[cn]:
        print("{:<10s} {:>7.4f} ± {:<7.4f} {:>14.4e} ± {:<10.4e}".format(
            cn, np.mean(class_adv_radius[cn]), np.std(class_adv_radius[cn]),
            np.mean(class_rel_errors[cn]), np.std(class_rel_errors[cn])
        ))
    else:
        print("{:<10s} {:>10s}".format(cn, "-"))

print("\nOverall ARR: Avg Radius: {:.4f} ± {:.4f}".format(np.mean(all_radii), np.std(all_radii)))
print("Overall ARR: Avg Relative Error: {:.4e} ± {:.4e}".format(np.mean(all_rel_errs), np.std(all_rel_errs)))


Selected nodes: clean=100, trojan=100
Re-using existing perturbed_X from notebook.

Accuracy: 100.00%
Precision: 1.0000, Recall: 1.0000, F1: 1.0000

Classification report:
              precision    recall  f1-score   support

       clean     1.0000    1.0000    1.0000      9159
      trojan     1.0000    1.0000    1.0000     27556

    accuracy                         1.0000     36715
   macro avg     1.0000    1.0000    1.0000     36715
weighted avg     1.0000    1.0000    1.0000     36715

Confusion Matrix:
[[ 9159     0]
 [    0 27556]]

Selected nodes: 200. Flipped after perturbation: 101 (50.50%).

Computing Adversarial Robustness Radius for selected perturbed nodes (this is the expensive part)...
  processed 20/200 nodes...
  processed 40/200 nodes...
  processed 60/200 nodes...
  processed 80/200 nodes...
  processed 100/200 nodes...
  processed 120/200 nodes...
  processed 140/200 nodes...
  processed 160/200 nodes...
  processed 180/200 nodes...
  processed 200/200 nodes...


#### Stability Under Input Noise

In [12]:
import torch
import torch.nn.functional as F
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support

# ========================
# Parameters
# ========================
# PGD (to create the evaluation perturbations)
EPSILON     = 2.0     # L2 budget
ALPHA       = 0.4     # step size
NUM_ITERS   = 15      # iterations

# Stability metric (computed AFTER perturbations, around the perturbed point)
NOISE_SIGMA        = 0.05   # Gaussian noise stddev for stability metric
NUM_NOISE_SAMPLES  = 20     # Monte Carlo samples per node
RELERR_RESAMPLES   = 5      # re-estimate stability this many times for relative error

# ========================
# Step 1: Select nodes (100 clean, 100 trojan)
# ========================
clean_nodes  = np.where(labels_np == 0)[0]
trojan_nodes = np.where(labels_np == 1)[0]
np.random.seed(42)
sel_clean   = np.random.choice(clean_nodes,  100, replace=False)
sel_trojan  = np.random.choice(trojan_nodes, 100, replace=False)
selected_nodes = np.concatenate([sel_clean, sel_trojan])

print("\n--- Stability Under Input Noise (PGD-first, then metric) ---")
print(f"Selected nodes ? clean={len(sel_clean)}, trojan={len(sel_trojan)}")

# ========================
# Step 2: Apply perturbation (PGD on ALL features of selected nodes)
# ========================
perturbed_X = X_t.clone().detach().to(device)
with torch.no_grad():
    orig_preds = model(X_t, A_t).argmax(dim=1).cpu().numpy()

for node_idx in selected_nodes:
    node_idx = int(node_idx)
    x_orig = X_t[node_idx].detach().clone().to(device)

    # random init within L2-ball
    delta = torch.randn_like(x_orig).to(device)
    delta = EPSILON * delta / (delta.norm() + 1e-12)
    x_adv = (x_orig + delta).detach().clone().requires_grad_(True)

    for _ in range(NUM_ITERS):
        X_mod = perturbed_X.clone().detach()
        X_mod[node_idx] = x_adv
        logits = model(X_mod, A_t)
        loss = F.cross_entropy(logits[node_idx].unsqueeze(0), y_t[node_idx].unsqueeze(0))

        grad_x = torch.autograd.grad(loss, x_adv, retain_graph=False, create_graph=False)[0]
        step = ALPHA * grad_x / (grad_x.norm() + 1e-12)
        x_adv = (x_adv + step).detach()

        # project back to L2 ball
        delta = x_adv - x_orig
        dnorm = delta.norm()
        if dnorm > EPSILON:
            delta = delta * (EPSILON / (dnorm + 1e-12))
            x_adv = (x_orig + delta).detach()
        x_adv = x_adv.requires_grad_(True)

    perturbed_X[node_idx] = x_adv.detach()

print("? Finished perturbations.")

# ========================
# Step 3: Robustness evaluation on FULL test set (200 perturbed + rest original)
# ========================
test_indices = np.where(test_mask_t.cpu().numpy())[0]
with torch.no_grad():
    logits_all = model(perturbed_X, A_t)
    preds_all  = logits_all[test_indices].argmax(dim=1).cpu().numpy()
    labels_all = labels_np[test_indices]

acc = (preds_all == labels_all).mean()
prec, rec, f1, _ = precision_recall_fscore_support(labels_all, preds_all, average='weighted', zero_division=0)

print("\n============= Robustness Evaluation (Full Test Set) =============")
print(f"Accuracy: {acc*100:.2f}%")
print(f"Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}\n")
print("Classification report:")
print(classification_report(labels_all, preds_all, target_names=['clean','trojan'], digits=4))
print("Confusion Matrix:")
print(confusion_matrix(labels_all, preds_all, labels=[0,1]))

# Flip statistics on selected nodes only (for quick sanity)
with torch.no_grad():
    adv_sel_preds = logits_all[selected_nodes].argmax(dim=1).cpu().numpy()
num_flips = int((orig_preds[selected_nodes] != adv_sel_preds).sum())
print(f"\nSelected nodes: {len(selected_nodes)}. Flipped after attack: {num_flips} ({100.0*num_flips/len(selected_nodes):.2f}%).")

# ========================
# Step 4: Stability Under Input Noise (computed AFTER perturbations)
#         Measures avg L2 change in logits when adding Gaussian noise
#         around the perturbed feature vector.
# ========================
def stability_for_node(node_idx, sigma, num_samples):
    """
    Average L2 change in logits between (perturbed_X baseline) and
    noisy versions (perturbed_X + noise) for this node.
    """
    node_idx = int(node_idx)
    with torch.no_grad():
        f_orig = logits_all[node_idx]  # baseline logits at the perturbed point (already computed)

    diffs = []
    for _ in range(num_samples):
        noise = sigma * torch.randn_like(perturbed_X[node_idx]).to(device)
        X_mod = perturbed_X.clone().detach()
        X_mod[node_idx] = perturbed_X[node_idx] + noise
        with torch.no_grad():
            f_noisy = model(X_mod, A_t)[node_idx]
        diffs.append(torch.norm(f_noisy - f_orig).item())
    return float(np.mean(diffs))

# Compute stability and relative error on perturbed nodes
per_sample_info = []  # (idx, label, stability, rel_err)
for node_idx in selected_nodes:
    s_val = stability_for_node(node_idx, NOISE_SIGMA, NUM_NOISE_SAMPLES)
    # relative error: re-estimate a few times and compare
    re_vals = [stability_for_node(node_idx, NOISE_SIGMA, NUM_NOISE_SAMPLES) for _ in range(RELERR_RESAMPLES)]
    s_ref = float(np.mean(re_vals))
    rel_err = abs(s_val - s_ref) / (abs(s_ref) + 1e-12)
    per_sample_info.append((int(node_idx), int(labels_np[int(node_idx)]), float(s_val), float(rel_err)))

# ========================
# Step 5: Aggregate and report metric stats
# ========================
clean_stats = [(i,s,e) for (i,lab,s,e) in per_sample_info if lab==0]
troj_stats  = [(i,s,e) for (i,lab,s,e) in per_sample_info if lab==1]

def aggs(stats):
    if not stats: return (0.0,0.0,0.0,0.0)
    Ss = np.array([s for (_,s,_) in [(i,v,e) for (i,v,e) in [(i,s,e) for (i,_,s,e) in stats]]])  # robust indexing
    Es = np.array([e for (_,_,e) in stats])
    # The above line is overly defensive; simpler:
    Ss = np.array([s for (_,s,_) in [(i,s,e) for (i,_,s,e) in stats]])
    Es = np.array([e for (_,_,e) in [(i,s,e) for (i,_,s,e) in stats]])
    return (Ss.mean(), Ss.std(), Es.mean(), Es.std())

# (Fix the helper to be clean & simple)
def aggs(stats):
    if not stats: return (0.0,0.0,0.0,0.0)
    Ss = np.array([s for (_,_,s,_) in [(i,lab,s,e) for (i,lab,s,e) in stats]])
    Es = np.array([e for (_,_,_,e) in [(i,lab,s,e) for (i,lab,s,e) in stats]])
    return (Ss.mean(), Ss.std(), Es.mean(), Es.std())

cS_mean, cS_std, cE_mean, cE_std = aggs(clean_stats)
tS_mean, tS_std, tE_mean, tE_std = aggs(troj_stats)

print("\n--- Stability Under Input Noise (on perturbed nodes) ---")
print(f" Clean:  avg_stability={cS_mean:.4f} ± {cS_std:.4f}, avg_relerr={cE_mean:.4e} ± {cE_std:.4e}")
print(f" Trojan: avg_stability={tS_mean:.4f} ± {tS_std:.4f}, avg_relerr={tE_mean:.4e} ± {tE_std:.4e}")

print("\nSample preview (first 6): (idx,label,stability,rel_err)")
for p in per_sample_info[:6]:
    print(p)



--- Stability Under Input Noise (PGD-first, then metric) ---
Selected nodes ? clean=100, trojan=100
? Finished perturbations.

Accuracy: 100.00%
Precision: 1.0000, Recall: 1.0000, F1: 1.0000

Classification report:
              precision    recall  f1-score   support

       clean     1.0000    1.0000    1.0000      9159
      trojan     1.0000    1.0000    1.0000     27556

    accuracy                         1.0000     36715
   macro avg     1.0000    1.0000    1.0000     36715
weighted avg     1.0000    1.0000    1.0000     36715

Confusion Matrix:
[[ 9159     0]
 [    0 27556]]

Selected nodes: 200. Flipped after attack: 101 (50.50%).


ValueError: not enough values to unpack (expected 4, got 3)

In [14]:
# ========================
# Step 5: Aggregate and report metric stats (FIXED for 3-tuples)
# ========================
def aggs(stats):
    if not stats:
        return (0.0, 0.0, 0.0)
    Ss = np.array([s for (_, s, _) in stats])  # stability values
    Es = np.array([e for (_, _, e) in stats])  # relative errors
    return (Ss.mean(), Ss.std(), Es.mean(), Es.std())

cS_mean, cS_std, cE_mean, cE_std = aggs(clean_stats)
tS_mean, tS_std, tE_mean, tE_std = aggs(troj_stats)

print("\n--- Stability Under Input Noise (on perturbed nodes) ---")
print(f" Clean:  avg_stability={cS_mean:.4f} ± {cS_std:.4f}, avg_relerr={cE_mean:.4e} ± {cE_std:.4e}")
print(f" Trojan: avg_stability={tS_mean:.4f} ± {tS_std:.4f}, avg_relerr={tE_mean:.4e} ± {tE_std:.4e}")

print("\nSample preview (first 6): (idx,stability,rel_err)")
for p in per_sample_info[:6]:
    print(p)



--- Stability Under Input Noise (on perturbed nodes) ---
 Clean:  avg_stability=0.0257 ± 0.0095, avg_relerr=1.6208e-01 ± 1.1808e-01
 Trojan: avg_stability=0.1172 ± 0.0236, avg_relerr=1.3512e-01 ± 9.6998e-02

Sample preview (first 6): (idx,stability,rel_err)
(44087, 0, 0.015042922904831358, 0.04401712210750706)
(1246, 0, 0.022505388408899308, 0.2220340311144904)
(55526, 0, 0.02417376406956464, 0.22841417608479572)
(3698, 0, 0.021043947315774858, 0.03313790320565221)
(23454, 0, 0.02797463204478845, 0.12765440160094868)
(58015, 0, 0.03421499626711011, 0.27192676340429556)


#### All in One, same perturbation across all metric.

In [2]:
# ================================
# Unified Robustness Evaluation
# ================================
import torch, numpy as np
import torch.nn.functional as F
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support

# ---------------- Parameters ----------------
PER_CLASS = 100
EPSILON   = 5.0     # L2 budget for PGD
ALPHA     = 1.0     # PGD step size
NUM_ITERS = 40      # PGD iterations
FD_EPS    = 1e-3    # finite-difference epsilon
SEED      = 42

torch.manual_seed(SEED); np.random.seed(SEED)

required_vars = ["model","X_t","A_t","y_t","test_mask_t","device"]
for v in required_vars:
    if v not in globals():
        raise RuntimeError(f"Required var '{v}' not found.")

model.to(device); model.eval()
labels_np = y_t.cpu().numpy()

# ---------------- Node Selection ----------------
test_indices = np.where(test_mask_t.cpu().numpy())[0]
rng = np.random.default_rng(SEED)
selected_nodes = []
for cls in [0,1]:
    idxs = [int(i) for i in test_indices if labels_np[i]==cls]
    chosen = rng.choice(idxs, size=min(PER_CLASS, len(idxs)), replace=False)
    selected_nodes.extend(chosen)
selected_nodes = np.array(selected_nodes, dtype=np.int64)

print(f"Selected: clean={int((labels_np[selected_nodes]==0).sum())}, "
      f"trojan={int((labels_np[selected_nodes]==1).sum())}")

# ---------------- Shared PGD Perturbations ----------------
perturbed_X = X_t.clone().detach().to(device)
for node_idx in selected_nodes:
    node_idx = int(node_idx)
    x_orig = X_t[node_idx].detach().clone().to(device)
    x_adv = (x_orig + 1e-3*torch.randn_like(x_orig)).detach().requires_grad_(True)

    for _ in range(NUM_ITERS):
        X_mod = perturbed_X.clone().detach()
        X_mod[node_idx] = x_adv
        logits = model(X_mod, A_t)
        loss = F.cross_entropy(logits[node_idx].unsqueeze(0), y_t[node_idx].unsqueeze(0))
        grad_x = torch.autograd.grad(loss, x_adv)[0]
        if grad_x.norm().item()==0: break
        step = ALPHA * grad_x / (grad_x.norm() + 1e-12)
        x_adv = (x_adv + step).detach()
        delta = x_adv - x_orig
        if delta.norm() > EPSILON:
            delta = delta * (EPSILON/(delta.norm()+1e-12))
            x_adv = (x_orig + delta).detach()
        x_adv = x_adv.requires_grad_(True)
    perturbed_X[node_idx] = x_adv.detach()
print("? Shared PGD perturbations done.")

# ---------------- Eval Helper ----------------
def evaluate_model(name, perturbed_X, selected_nodes):
    test_idx = np.where(test_mask_t.cpu().numpy())[0]
    with torch.no_grad():
        logits_all = model(perturbed_X, A_t)
        preds_all = logits_all[test_idx].argmax(dim=1).cpu().numpy()
        labels_all = labels_np[test_idx]

    acc = (preds_all == labels_all).mean()
    prec, rec, f1, _ = precision_recall_fscore_support(
        labels_all, preds_all, average="weighted", zero_division=0)

    print(f"\n=== Robustness Eval ({name}) ===")
    print(f"Accuracy={acc*100:.2f} | Precision={prec:.4f} | Recall={rec:.4f} | F1={f1:.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(labels_all, preds_all, labels=[0,1]))
    print("Classification Report:")
    print(classification_report(labels_all, preds_all, target_names=["clean","trojan"], digits=4))

    # Flip stats on selected only
    with torch.no_grad():
        orig_preds = model(X_t, A_t).argmax(dim=1).cpu().numpy()
        adv_preds  = logits_all[selected_nodes].argmax(dim=1).cpu().numpy()
    flips = (orig_preds[selected_nodes] != adv_preds).sum()
    print(f"Flipped {flips}/{len(selected_nodes)} ({100*flips/len(selected_nodes):.2f}%)")
    return logits_all

# ---------------- Metric 1: Jacobian Sensitivity ----------------
jac_info = []
for node_idx in selected_nodes:
    x0 = perturbed_X[node_idx].detach().clone().requires_grad_(True)
    def f_local(x):
        X_mod = perturbed_X.clone().detach()
        X_mod[node_idx] = x
        return model(X_mod, A_t)[node_idx]
    J = torch.autograd.functional.jacobian(f_local, x0)
    jac_norm = torch.norm(J, p='fro').item()
    delta_fd = FD_EPS * torch.randn_like(x0)
    pred_change = J.mv(delta_fd)
    f0, f0p = f_local(x0).detach(), f_local(x0+delta_fd).detach()
    actual_change = f0p - f0
    rel_err = (torch.norm(pred_change-actual_change)/(torch.norm(actual_change)+1e-8)).item()
    jac_info.append((int(labels_np[node_idx]), jac_norm, rel_err))
print("\nJacobian Sensitivity:")
for cls in [0,1]:
    vals = [j[1] for j in jac_info if j[0]==cls]
    errs = [j[2] for j in jac_info if j[0]==cls]
    print(f" Class {cls}: norm={np.mean(vals):.4f}±{np.std(vals):.4f}, relerr={np.mean(errs):.4e}±{np.std(errs):.4e}")

evaluate_model("Jacobian", perturbed_X, selected_nodes)

# ---------------- Metric 2: Lipschitz (Spectral Norm) ----------------
lip_info = []
for node_idx in selected_nodes:
    x0 = perturbed_X[node_idx].detach().clone().requires_grad_(True)
    def f_node(x):
        X_mod = perturbed_X.clone().detach()
        X_mod[node_idx] = x
        return model(X_mod, A_t)[node_idx]
    J = torch.autograd.functional.jacobian(f_node, x0).detach()
    U, S, Vh = torch.linalg.svd(J, full_matrices=False)
    sigma_max = S[0].item()
    delta_fd = FD_EPS*torch.randn_like(x0)
    pred_change = J.mv(delta_fd)
    f0, f0p = f_node(x0).detach(), f_node(x0+delta_fd).detach()
    actual_change = f0p-f0
    rel_err = (torch.norm(pred_change-actual_change)/(torch.norm(actual_change)+1e-8)).item()
    lip_info.append((int(labels_np[node_idx]), sigma_max, rel_err))
print("\nLipschitz Constant:")
for cls in [0,1]:
    vals = [j[1] for j in lip_info if j[0]==cls]
    errs = [j[2] for j in lip_info if j[0]==cls]
    print(f" Class {cls}: L={np.mean(vals):.4f}±{np.std(vals):.4f}, relerr={np.mean(errs):.4e}±{np.std(errs):.4e}")

evaluate_model("Lipschitz", perturbed_X, selected_nodes)

Selected: clean=100, trojan=100
? Shared PGD perturbations done.

Jacobian Sensitivity:
 Class 0: norm=0.5691±0.2002, relerr=1.9832e-03±8.1588e-03
 Class 1: norm=2.3851±0.2951, relerr=5.6784e-04±7.1576e-04

=== Robustness Eval (Jacobian) ===
Accuracy=99.54 | Precision=0.9954 | Recall=0.9954 | F1=0.9954
Confusion Matrix:
[[ 9091    68]
 [  100 27456]]
Classification Report:
              precision    recall  f1-score   support

       clean     0.9891    0.9926    0.9908      9159
      trojan     0.9975    0.9964    0.9969     27556

    accuracy                         0.9954     36715
   macro avg     0.9933    0.9945    0.9939     36715
weighted avg     0.9954    0.9954    0.9954     36715

Flipped 168/200 (84.00%)

Lipschitz Constant:
 Class 0: L=0.5544±0.1906, relerr=4.6020e-03±2.2572e-02
 Class 1: L=2.3476±0.3050, relerr=5.8848e-03±4.3060e-02

=== Robustness Eval (Lipschitz) ===
Accuracy=99.54 | Precision=0.9954 | Recall=0.9954 | F1=0.9954
Confusion Matrix:
[[ 9091    68]
 [  100

RuntimeError: One of the differentiated Tensors appears to not have been used in the graph. Set allow_unused=True if this is the desired behavior.

In [3]:
# =========================
# Hessian-Based Curvature (grad outer-product) for node-level Trojan detection
# =========================
import torch
import numpy as np
import torch.nn.functional as F
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support

# -------------------- Parameters --------------------
PER_CLASS = 100          # 100 nodes per class (clean/trojan)
FD_EPS = 5e-3            # finite-diff epsilon for relative-error check
TRIALS_PER_NODE = 10     # average trials per node for relative error
PERT_P = 6.0             # L2 magnitude for final Hessian-aligned perturbation (tuneable)
SEED = 42

torch.manual_seed(SEED); np.random.seed(SEED)

model.to(device)
model.eval()

# -------------------- Class names --------------------
class_names = ["clean", "trojan"]


# -------------------- Helper: compute g(x) --------------------
def compute_gradient(node_idx):
    """
    Returns gradient g = ?_x log p(y_hat|x) at node_idx.
    """
    x0 = X_t[node_idx].detach().clone().to(device).requires_grad_(True)

    # Forward pass with x0 replacing features of node_idx
    X_mod = X_t.clone().detach().to(device)
    X_mod[node_idx] = x0
    logits = model(X_mod, A_t)[node_idx]

    # Use predicted class
    pred_class = logits.argmax().item()
    logp = F.log_softmax(logits, dim=0)
    loss = logp[pred_class]

    g = torch.autograd.grad(loss, x0, retain_graph=False, create_graph=False, allow_unused=False)[0]
    return x0.detach(), g.detach(), pred_class

# -------------------- Storage --------------------
per_sample_info = []   # (node_idx, label, lambda_max, avg_rel_error)

print("\nComputing Hessian curvature proxy for selected nodes...")

for node_idx in selected_nodes:
    node_idx = int(node_idx)
    label = int(labels_np[node_idx])

    x0, g, pred_class = compute_gradient(node_idx)
    if g is None:
        lambda_max = 0.0
        avg_rel_err = 0.0
    else:
        # curvature proxy = ||g||^2
        lambda_max = float(g.norm(p=2).item() ** 2)

        # relative error by finite-difference
        rel_errs = []
        for _ in range(TRIALS_PER_NODE):
            delta = FD_EPS * torch.randn_like(x0).to(device)
            gt_delta = torch.dot(g, delta).item()
            pred_second = 0.5 * (gt_delta ** 2)

            # recompute logits at perturbed input
            X_mod = X_t.clone().detach().to(device)
            X_mod[node_idx] = x0 + delta
            logits_p = model(X_mod, A_t)[node_idx]
            logp_p = F.log_softmax(logits_p, dim=0)
            actual_second = float((logp_p[pred_class] - F.log_softmax(model(X_t, A_t)[node_idx], dim=0)[pred_class]).item() - torch.dot(g, delta).item())

            rel_error = abs(pred_second - actual_second) / (abs(actual_second) + 1e-8)
            rel_errs.append(rel_error)

        avg_rel_err = float(np.mean(rel_errs))

    per_sample_info.append((node_idx, label, lambda_max, avg_rel_err))

# -------------------- Aggregate stats --------------------
clean_stats = [t for t in per_sample_info if t[1]==0]
troj_stats  = [t for t in per_sample_info if t[1]==1]

def summarize(stats):
    if not stats: return (0.0,0.0,0.0,0.0)
    Ls = np.array([s[2] for s in stats])
    Es = np.array([s[3] for s in stats])
    return (Ls.mean(), Ls.std(), Es.mean(), Es.std())

cL_mean, cL_std, cE_mean, cE_std = summarize(clean_stats)
tL_mean, tL_std, tE_mean, tE_std = summarize(troj_stats)

print("\nAggregated Hessian curvature stats:")
print(f" Clean:  avg_lambda={cL_mean:.4f} ± {cL_std:.4f}, avg_FDrel={cE_mean:.4e} ± {cE_std:.4e}")
print(f" Trojan: avg_lambda={tL_mean:.4f} ± {tL_std:.4f}, avg_FDrel={tE_mean:.4e} ± {tE_std:.4e}")

print("\nSample preview (first 6): (idx,label,lambda,FD_rel_err)")
for p in per_sample_info[:6]:
    print(p)



Computing Hessian curvature proxy for selected nodes...

Aggregated Hessian curvature stats:
 Clean:  avg_lambda=0.0010 ± 0.0027, avg_FDrel=8.8466e-01 ± 8.1834e-02
 Trojan: avg_lambda=0.0142 ± 0.0433, avg_FDrel=9.8816e-01 ± 7.6240e-02

Sample preview (first 6): (idx,label,lambda,FD_rel_err)
(26119, 0, 0.00013120630157356997, 0.9184905936223066)
(57231, 0, 0.00010541675260007741, 0.9369352447942576)
(40297, 0, 0.001319116215862129, 0.8764956242259994)
(27843, 0, 0.0007114012510761741, 0.8963298585350504)
(26863, 0, 0.0005749628200755197, 0.8580783213599984)
(46828, 0, 0.0004019299185979014, 0.8816316362759506)


In [4]:
evaluate_model("Margin", perturbed_X, selected_nodes)


=== Robustness Eval (Margin) ===
Accuracy=99.54 | Precision=0.9954 | Recall=0.9954 | F1=0.9954
Confusion Matrix:
[[ 9091    68]
 [  100 27456]]
Classification Report:
              precision    recall  f1-score   support

       clean     0.9891    0.9926    0.9908      9159
      trojan     0.9975    0.9964    0.9969     27556

    accuracy                         0.9954     36715
   macro avg     0.9933    0.9945    0.9939     36715
weighted avg     0.9954    0.9954    0.9954     36715

Flipped 168/200 (84.00%)


tensor([[ 1.8780, -1.8055],
        [ 1.8517, -1.5237],
        [ 1.7920, -1.3890],
        ...,
        [ 1.2558, -1.5113],
        [ 0.9869, -1.1436],
        [ 1.2348, -1.3873]])

In [5]:
# ---------------- Metric 4: Prediction Margin ----------------
margin_info = []
for node_idx in selected_nodes:
    logits = model(perturbed_X, A_t)[node_idx]
    pred_class = logits.argmax().item()
    margin = logits[pred_class].item() - logits[[j for j in range(len(logits)) if j!=pred_class]].max().item()
    delta = FD_EPS*torch.randn_like(perturbed_X[node_idx])
    logits_p = model(perturbed_X.clone().detach(), A_t)[node_idx]
    margin_p = logits_p[pred_class].item() - logits_p[[j for j in range(len(logits_p)) if j!=pred_class]].max().item()
    rel_err = abs(margin-margin_p)/(abs(margin_p)+1e-12)
    margin_info.append((int(labels_np[node_idx]), margin, rel_err))
print("\nPrediction Margin:")
for cls in [0,1]:
    vals = [j[1] for j in margin_info if j[0]==cls]
    errs = [j[2] for j in margin_info if j[0]==cls]
    print(f" Class {cls}: margin={np.mean(vals):.4f}±{np.std(vals):.4f}, relerr={np.mean(errs):.4e}±{np.std(errs):.4e}")

evaluate_model("Margin", perturbed_X, selected_nodes)

# ---------------- Metric 5: ARR ----------------
# (kept simplified: min perturbation until flip)
def adversarial_radius(node_idx):
    x0 = perturbed_X[node_idx].detach().clone()
    base_pred = int(model(perturbed_X, A_t)[node_idx].argmax().item())
    eps, growth = 1e-3, 1.2
    while eps < 20:
        x_try = x0 + eps*torch.randn_like(x0)
        with torch.no_grad():
            pred = int(model(perturbed_X.clone().detach(), A_t).argmax().item())
        if pred != base_pred: return eps
        eps *= growth
    return 20.0
arr_info = [(int(labels_np[n]), adversarial_radius(n)) for n in selected_nodes]
print("\nAdversarial Robustness Radius:")
for cls in [0,1]:
    vals = [j[1] for j in arr_info if j[0]==cls]
    print(f" Class {cls}: radius={np.mean(vals):.4f}±{np.std(vals):.4f}")

evaluate_model("ARR", perturbed_X, selected_nodes)

# ---------------- Metric 6: Stability ----------------
stability_info = []
for node_idx in selected_nodes:
    base_logits = model(perturbed_X, A_t)[node_idx].detach()
    diffs = []
    for _ in range(10):
        noise = 0.05*torch.randn_like(perturbed_X[node_idx])
        X_mod = perturbed_X.clone().detach()
        X_mod[node_idx] = perturbed_X[node_idx]+noise
        with torch.no_grad():
            logits_n = model(X_mod, A_t)[node_idx]
        diffs.append(torch.norm(logits_n-base_logits).item())
    stability_info.append((int(labels_np[node_idx]), np.mean(diffs)))
print("\nStability Under Noise:")
for cls in [0,1]:
    vals = [j[1] for j in stability_info if j[0]==cls]
    print(f" Class {cls}: stability={np.mean(vals):.4f}±{np.std(vals):.4f}")

evaluate_model("Stability", perturbed_X, selected_nodes)


Prediction Margin:
 Class 0: margin=0.8551±1.1146, relerr=0.0000e+00±0.0000e+00
 Class 1: margin=14.4402±1.9893, relerr=0.0000e+00±0.0000e+00

=== Robustness Eval (Margin) ===
Accuracy=99.54 | Precision=0.9954 | Recall=0.9954 | F1=0.9954
Confusion Matrix:
[[ 9091    68]
 [  100 27456]]
Classification Report:
              precision    recall  f1-score   support

       clean     0.9891    0.9926    0.9908      9159
      trojan     0.9975    0.9964    0.9969     27556

    accuracy                         0.9954     36715
   macro avg     0.9933    0.9945    0.9939     36715
weighted avg     0.9954    0.9954    0.9954     36715

Flipped 168/200 (84.00%)

Adversarial Robustness Radius:
 Class 0: radius=0.0010±0.0000
 Class 1: radius=0.0010±0.0000

=== Robustness Eval (ARR) ===
Accuracy=99.54 | Precision=0.9954 | Recall=0.9954 | F1=0.9954
Confusion Matrix:
[[ 9091    68]
 [  100 27456]]
Classification Report:
              precision    recall  f1-score   support

       clean     0.9891

tensor([[ 1.8780, -1.8055],
        [ 1.8517, -1.5237],
        [ 1.7920, -1.3890],
        ...,
        [ 1.2558, -1.5113],
        [ 0.9869, -1.1436],
        [ 1.2348, -1.3873]])

In [8]:
# ---------------- Metric 5: ARR ----------------
# (kept simplified: min perturbation until flip)
def adversarial_radius(node_idx):
    x0 = perturbed_X[node_idx].detach().clone()
    base_pred = int(model(perturbed_X, A_t)[node_idx].argmax().item())
    eps, growth = 1e-3, 1.2
    while eps < 20:
        x_try = x0 + eps*torch.randn_like(x0)
        with torch.no_grad():
            pred = int(model(perturbed_X.clone().detach(), A_t)[node_idx].argmax().item())
        if pred != base_pred: return eps
        eps *= growth
    return 20.0

arr_info = []
for n in selected_nodes:
    arr_val = adversarial_radius(n)
    # finite-difference style perturbation for ARR
    delta = FD_EPS * torch.randn_like(perturbed_X[n])
    arr_val_p = adversarial_radius(n)  # here you could recompute with perturbed input if desired
    rel_err = abs(arr_val - arr_val_p) / (abs(arr_val_p) + 1e-12)
    arr_info.append((int(labels_np[n]), arr_val, rel_err))

print("\nAdversarial Robustness Radius:")
for cls in [0,1]:
    vals = [j[1] for j in arr_info if j[0] == cls]
    errs = [j[2] for j in arr_info if j[0] == cls]
    print(f" Class {cls}: radius={np.mean(vals):.4f}±{np.std(vals):.4f}, relerr={np.mean(errs):.4e}±{np.std(errs):.4e}")

evaluate_model("ARR", perturbed_X, selected_nodes)

# ---------------- Metric 6: Stability ----------------
stability_info = []
for node_idx in selected_nodes:
    base_logits = model(perturbed_X, A_t)[node_idx].detach()
    diffs = []
    for _ in range(10):
        noise = 0.05 * torch.randn_like(perturbed_X[node_idx])
        X_mod = perturbed_X.clone().detach()
        X_mod[node_idx] = perturbed_X[node_idx] + noise
        with torch.no_grad():
            logits_n = model(X_mod, A_t)[node_idx]
        diffs.append(torch.norm(logits_n - base_logits).item())
    stability_val = np.mean(diffs)
    # finite-difference style perturbation for stability
    noise_fd = 0.05 * torch.randn_like(perturbed_X[node_idx])
    X_fd = perturbed_X.clone().detach()
    X_fd[node_idx] = perturbed_X[node_idx] + noise_fd
    with torch.no_grad():
        logits_fd = model(X_fd, A_t)[node_idx]
    diffs_fd = [torch.norm(logits_fd - base_logits).item()]
    stability_val_p = np.mean(diffs_fd)
    rel_err = abs(stability_val - stability_val_p) / (abs(stability_val_p) + 1e-12)
    stability_info.append((int(labels_np[node_idx]), stability_val, rel_err))

print("\nStability Under Noise:")
for cls in [0,1]:
    vals = [j[1] for j in stability_info if j[0] == cls]
    errs = [j[2] for j in stability_info if j[0] == cls]
    print(f" Class {cls}: stability={np.mean(vals):.4f}±{np.std(vals):.4f}, relerr={np.mean(errs):.4e}±{np.std(errs):.4e}")

evaluate_model("Stability", perturbed_X, selected_nodes)



Adversarial Robustness Radius:
 Class 0: radius=20.0000±0.0000, relerr=0.0000e+00±0.0000e+00
 Class 1: radius=20.0000±0.0000, relerr=0.0000e+00±0.0000e+00

=== Robustness Eval (ARR) ===
Flipped 168/200 (84.00%)
Accuracy=16.00 | Precision=0.1212 | Recall=0.1600 | F1=0.1379
Confusion Matrix:
[[ 32  68]
 [100   0]]
Classification Report:
              precision    recall  f1-score   support

       clean     0.2424    0.3200    0.2759       100
      trojan     0.0000    0.0000    0.0000       100

    accuracy                         0.1600       200
   macro avg     0.1212    0.1600    0.1379       200
weighted avg     0.1212    0.1600    0.1379       200


Stability Under Noise:
 Class 0: stability=0.0230±0.0093, relerr=9.3952e-01±9.3345e-01
 Class 1: stability=0.0971±0.0260, relerr=1.2967e+00±2.4180e+00

=== Robustness Eval (Stability) ===
Flipped 168/200 (84.00%)
Accuracy=16.00 | Precision=0.1212 | Recall=0.1600 | F1=0.1379
Confusion Matrix:
[[ 32  68]
 [100   0]]
Classification Rep

tensor([[ 1.8780, -1.8055],
        [ 1.8517, -1.5237],
        [ 1.7920, -1.3890],
        ...,
        [ 1.2558, -1.5113],
        [ 0.9869, -1.1436],
        [ 1.2348, -1.3873]])

In [7]:
# ---------------- Refined Eval Helper ----------------
def evaluate_model(name, perturbed_X, selected_nodes):
    with torch.no_grad():
        # Predictions on original and perturbed inputs
        orig_logits = model(X_t, A_t)
        pert_logits = model(perturbed_X, A_t)

        orig_preds = orig_logits.argmax(dim=1).cpu().numpy()
        pert_preds = pert_logits.argmax(dim=1).cpu().numpy()

    # Restrict to selected perturbed samples only
    sel_idx = np.array(selected_nodes)
    sel_labels = labels_np[sel_idx]
    sel_orig_preds = orig_preds[sel_idx]
    sel_pert_preds = pert_preds[sel_idx]

    # Flip count first
    flips = (sel_orig_preds != sel_pert_preds).sum()
    print(f"\n=== Robustness Eval ({name}) ===")
    print(f"Flipped {flips}/{len(sel_idx)} ({100*flips/len(sel_idx):.2f}%)")

    # Accuracy, precision, recall, F1 on perturbed subset only
    acc = (sel_pert_preds == sel_labels).mean()
    prec, rec, f1, _ = precision_recall_fscore_support(
        sel_labels, sel_pert_preds, average="weighted", zero_division=0)

    print(f"Accuracy={acc*100:.2f} | Precision={prec:.4f} | Recall={rec:.4f} | F1={f1:.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(sel_labels, sel_pert_preds, labels=[0, 1]))
    print("Classification Report:")
    print(classification_report(sel_labels, sel_pert_preds,
                                target_names=["clean", "trojan"], digits=4))

    return pert_logits
evaluate_model("Stability", perturbed_X, selected_nodes)


=== Robustness Eval (Stability) ===
Flipped 168/200 (84.00%)
Accuracy=16.00 | Precision=0.1212 | Recall=0.1600 | F1=0.1379
Confusion Matrix:
[[ 32  68]
 [100   0]]
Classification Report:
              precision    recall  f1-score   support

       clean     0.2424    0.3200    0.2759       100
      trojan     0.0000    0.0000    0.0000       100

    accuracy                         0.1600       200
   macro avg     0.1212    0.1600    0.1379       200
weighted avg     0.1212    0.1600    0.1379       200



tensor([[ 1.8780, -1.8055],
        [ 1.8517, -1.5237],
        [ 1.7920, -1.3890],
        ...,
        [ 1.2558, -1.5113],
        [ 0.9869, -1.1436],
        [ 1.2348, -1.3873]])