In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import shap
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from typing import Optional
from IPython.display import Markdown, display, clear_output
from aif360.sklearn.datasets import fetch_german


  from .autonotebook import tqdm as notebook_tqdm


# Data

## Data Loading

In [2]:
# ---------------------------
# Data Loading and Preparation
# ---------------------------
X, y = fetch_german()

X.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)

# Binarize labels
y = y.map({'good': 1, 'bad': 0})

In [3]:
y = y.to_numpy(dtype=int)

In [4]:
# Add binary_age column based on the condition, value = aged if age >= 25 else young
X['age_group'] = X['age'].apply(lambda x: 'aged' if x >= 25 else 'young')

## Preprocessing

### Label Encoding

In [5]:
from sklearn.preprocessing import LabelEncoder
X_enc = X.copy()
cat_cols = X_enc.select_dtypes(['object','category']).columns
cardinalities = []
label_mappings = {}
for col in cat_cols:
    le = LabelEncoder()
    X_enc[col] = le.fit_transform(X_enc[col])
    cardinalities.append(len(le.classes_))
    label_mappings[col] = dict(zip(le.classes_, le.transform(le.classes_)))

num_cols = [c for c in X_enc.columns if c not in cat_cols]

In [6]:
print(f"\nCardinalities of categorical columns: {cardinalities}")


Cardinalities of categorical columns: [4, 5, 10, 5, 5, 3, 4, 3, 3, 4, 2, 2, 2, 4, 2]


In [7]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_enc, y, test_size=0.1, random_state=7
)

# Convert to numpy / tensors
X_train_num = X_train[num_cols].to_numpy(dtype=np.float32)
X_test_num  = X_test[num_cols].to_numpy(dtype=np.float32)
X_train_cat = X_train[cat_cols].to_numpy(dtype=np.int64)
X_test_cat  = X_test[cat_cols].to_numpy(dtype=np.int64)
y_train_np = y_train.astype(np.int64).values if isinstance(y_train, pd.Series) else y_train.astype(np.int64)
y_test_np = y_test.astype(np.int64).values if isinstance(y_test, pd.Series) else y_test.astype(np.int64)

X_train_num_t = torch.from_numpy(X_train_num)
X_train_cat_t = torch.from_numpy(X_train_cat)
X_test_num_t = torch.from_numpy(X_test_num)
X_test_cat_t = torch.from_numpy(X_test_cat)

y_train_t = torch.from_numpy(y_train_np)
y_test_t  = torch.from_numpy(y_test_np)

# Loading model

## FTTransformer

### Model

In [8]:
from rtdl_revisiting_models import FTTransformer

# Input dims
d_num = X_train_num.shape[1]
cat_cardinalities = cardinalities
n_cont_features = len(num_cols)
d_out = 2

default_kwargs = FTTransformer.get_default_kwargs()
# default_kwargs['n_blocks'] = 2
default_kwargs['d_block'] = 32
default_kwargs['attention_n_heads'] = 1
# default_kwargs['attention_dropout'] = 0.2   # originally ~0.1
# default_kwargs['ffn_dropout']       = 0.2   # originally ~0.05
# default_kwargs['ffn_residual_dropout']  = 0.05   # originally 0.0
# default_kwargs['residual_dropout']  = 0.05   # originally 0.0 or tiny

model = FTTransformer(
    n_cont_features=n_cont_features,
    cat_cardinalities=cat_cardinalities,
    d_out=d_out,
    **default_kwargs,
)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

FTTransformer(
  (cls_embedding): _CLSEmbedding()
  (cont_embeddings): LinearEmbeddings()
  (cat_embeddings): CategoricalEmbeddings(
    (embeddings): ModuleList(
      (0): Embedding(4, 32)
      (1): Embedding(5, 32)
      (2): Embedding(10, 32)
      (3-4): 2 x Embedding(5, 32)
      (5): Embedding(3, 32)
      (6): Embedding(4, 32)
      (7-8): 2 x Embedding(3, 32)
      (9): Embedding(4, 32)
      (10-12): 3 x Embedding(2, 32)
      (13): Embedding(4, 32)
      (14): Embedding(2, 32)
    )
  )
  (backbone): FTTransformerBackbone(
    (blocks): ModuleList(
      (0): ModuleDict(
        (attention): MultiheadAttention(
          (W_q): Linear(in_features=32, out_features=32, bias=True)
          (W_k): Linear(in_features=32, out_features=32, bias=True)
          (W_v): Linear(in_features=32, out_features=32, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (attention_residual_dropout): Dropout(p=0.0, inplace=False)
        (ffn_normalization): LayerNo

### Eval

In [9]:
# Load the model
model.load_state_dict(torch.load("saved_model/fttransformer_german.pth", map_location=torch.device('cpu')))

  model.load_state_dict(torch.load("saved_model/fttransformer_german.pth", map_location=torch.device('cpu')))


<All keys matched successfully>

In [10]:
X_test_num_t = X_test_num_t.to(device)
X_test_cat_t = X_test_cat_t.to(device)
y_test_t  = y_test_t .to(device)
model.eval()
with torch.no_grad():
    logits = model(X_test_num_t, X_test_cat_t)
    probs = torch.softmax(logits, dim=1)
    preds = torch.argmax(probs, dim=1)
    accuracy = (preds == y_test_t).float().mean().item()
    print(f"\nTest Accuracy: {accuracy:.4f}")


Test Accuracy: 0.8000


# Robustness

## Dependencies

In [11]:
from art.estimators.classification import PyTorchClassifier
from art.attacks.evasion import FastGradientMethod
from art.metrics import clever_u, loss_sensitivity
from tqdm import tqdm

In [13]:
X_train_comb = np.concatenate([X_train_num, X_train_cat], axis=1)
X_test_comb  = np.concatenate([X_test_num,  X_test_cat ], axis=1)

In [14]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
clip_values = (X_test_comb.min(), X_test_comb.max())

num_samples = X_test_comb.shape[0]

### Wrapped Model

In [12]:
class CombinedFT(nn.Module):
    def __init__(self, ft_model, d_num):
        super().__init__()
        self.ft = ft_model
        self.d_num = d_num
    def forward(self, x):
        # x: torch.Tensor of shape (n, d_num + d_cat)
        x = x.to(torch.float32)
        x_num = x[:, : self.d_num]
        x_cat = x[:, self.d_num :].to(torch.long)
        return self.ft(x_num, x_cat)

combined_model = CombinedFT(model, d_num).to(device)

In [17]:
# Create ART classifier for TabResNet
art_classifier = PyTorchClassifier(
    model=combined_model,
    loss=criterion,
    input_shape=(X_train_comb.shape[1],),
    nb_classes=2,
    optimizer=optimizer,
    clip_values=clip_values,
    device_type=device
)

# Train the ART classifier with TabResNet
# art_classifier.fit(X_train_np, y_train_np, batch_size=64, nb_epochs=100, verbose=True)

### Accuraacy Gap

In [27]:
# 5) Evaluate on benign examples
preds_benign = art_classifier.predict(X_test_comb)
acc_benign = accuracy_score(y_test, np.argmax(preds_benign, axis=1))
print(f"Benign accuracy:           {acc_benign:.4f}")

# 6) Generate & evaluate adversarial
attack = FastGradientMethod(estimator=art_classifier, eps=0.3)
X_test_adv = attack.generate(X_test_comb)
preds_adv = art_classifier.predict(X_test_adv)
acc_adv = accuracy_score(y_test, np.argmax(preds_adv, axis=1))
print(f"Adversarial accuracy:      {acc_adv:.4f}")
print(f"Accuracy gap (benign–adv): {acc_benign - acc_adv:.4f}")

Benign accuracy:           0.8000
Adversarial accuracy:      0.7900
Accuracy gap (benign–adv): 0.0100


### CLEVER-u

Clever-u Score:

score is a lower bound -> minimum perturbation size required to change the model's output to any wrong label

“directional probes” (here 10*20=200) -> to get a reliable worst‐case slope estimate

A higher score -> stronger local robustness

range value = [0.0, radius]

In [22]:
clever_scores = []
for x in tqdm(X_test_comb, desc="CLEVER-U samples"):
    c = clever_u(
        classifier=art_classifier,
        x=x,
        nb_batches=20,
        batch_size=1,
        norm=2,
        radius=0.2,
        verbose=False
    )
    clever_scores.append(c)
print(f"Mean CLEVER-U:             {np.mean(clever_scores):.4f}")

CLEVER-U samples: 100%|██████████| 100/100 [00:05<00:00, 19.30it/s]

Mean CLEVER-U:             0.1839





### Loss Sensitivity

Loss Sensitivity

A **higher** sensitivity -> small input perturbations can cause **larger** changes in the loss -> indicating a "steeper" or potentially **less** robust local region

In [26]:
num_classes = 2
# Convert integer labels to one-hot:
y_test_onehot = np.eye(num_classes)[y_test_np]

ls = loss_sensitivity(
    classifier=art_classifier,
    x=X_test_comb,
    y=y_test_onehot
)
print(f"Mean loss sensitivity:     {np.mean(ls):.6f}")

Mean loss sensitivity:     0.004076
