In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import shap
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from typing import Optional
from IPython.display import Markdown, display, clear_output
from aif360.sklearn.datasets import fetch_german


  from .autonotebook import tqdm as notebook_tqdm


# Data

## Data Loading

In [2]:
# ---------------------------
# Data Loading and Preparation (label-encoded)
# ---------------------------
# Load the Adult dataset bundled with SHAP as a (features, labels) pair.
# NOTE(review): the categorical columns are presumably already integer-encoded by
# SHAP (see the "Label Encoding" heading below) — confirm before adding another encoder.
X, y = shap.datasets.adult()

## Preprocessing

### Label Encoding (already done by SHAP)

### Splitting and formatting

In [3]:
# Single seed reused by both train_test_split calls and by torch.manual_seed,
# so the data splits and the model initialisation are repeatable.
random_state = 7

In [4]:
# Two-stage stratified split: first hold out 20% as the test set (used for
# evaluation and explanations), then hold out 20% of the remainder for validation.
split_options = {"test_size": 0.2, "random_state": random_state}

X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, stratify=y, **split_options
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, stratify=y_trainval, **split_options
)

In [5]:
# Report the shape of every split as a quick sanity check on the split sizes.
for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
    ("X_val", X_val),
    ("y_val", y_val),
):
    print(f"{split_name} shape:", split_data.shape)

X_train shape: (20838, 12)
X_test shape: (6513, 12)
y_train shape: (20838,)
y_test shape: (6513,)
X_val shape: (5210, 12)
y_val shape: (5210,)


In [6]:
def _features_to_float32(frame):
    """Return the values of a feature DataFrame as a float32 NumPy array."""
    return frame.values.astype(np.float32)


def _labels_to_int64(labels):
    """Return labels as an int64 NumPy array (accepts a pandas Series or an ndarray)."""
    as_int = labels.astype(np.int64)
    return as_int.values if isinstance(labels, pd.Series) else as_int


# NumPy versions of each split: float32 features and int64 labels
# (CrossEntropyLoss needs integer class labels).
X_train_np, X_val_np, X_test_np = (
    _features_to_float32(part) for part in (X_train, X_val, X_test)
)
y_train_np, y_val_np, y_test_np = (
    _labels_to_int64(part) for part in (y_train, y_val, y_test)
)

# PyTorch tensors built from the NumPy arrays above.
X_train_t, y_train_t = torch.from_numpy(X_train_np), torch.from_numpy(y_train_np)
X_val_t, y_val_t = torch.from_numpy(X_val_np), torch.from_numpy(y_val_np)
X_test_t, y_test_t = torch.from_numpy(X_test_np), torch.from_numpy(y_test_np)

# Loading model

## MLP

### MLP model

In [10]:
# Hyper-parameters of the MLP.
# INPUT_DIM is the number of feature columns; NUM_CLASSES is the number of output logits.
INPUT_DIM     = X_test_np.shape[1]
NUM_CLASSES   = 2
HIDDEN_DIM    = 50
LEARNING_RATE = 1e-3

# Seed torch's global RNG before the model is constructed so that the random
# weight initialisation is repeatable.
torch.manual_seed(random_state)

class MLP(nn.Module):
    """Single-hidden-layer perceptron (Linear -> ReLU -> Linear) returning raw logits.

    Both weight matrices are re-initialised uniformly in
    [-1/sqrt(fan_in), +1/sqrt(fan_in)] and both biases are set to zero.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

        # Re-initialise fc1 first and fc2 second so random numbers are drawn
        # in a fixed order for a given seed.
        for layer, fan_in in ((self.fc1, input_dim), (self.fc2, hidden_dim)):
            bound = 1.0 / (fan_in**0.5)
            nn.init.uniform_(layer.weight, a=-bound, b=+bound)
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Map inputs `x` to per-class logits; no softmax is applied."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Instantiate the MLP and move it to the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
mlp_model = MLP(INPUT_DIM, HIDDEN_DIM, NUM_CLASSES).to(device)

# Cross-entropy loss on the raw logits, and an Adam optimizer over the MLP's parameters.
# NOTE: `device`, `criterion` and `optimizer` are generic names that the TabResNet
# cell assigns again later; after that cell runs, `optimizer` no longer refers to the MLP.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(mlp_model.parameters(), lr=LEARNING_RATE)

### Eval

In [11]:
# Restore the trained MLP weights from disk.
# weights_only=True restricts unpickling to tensors and plain containers, which is
# all a state_dict needs; it prevents arbitrary code execution from a tampered
# checkpoint and avoids the warning torch emits when the flag is left unset.
mlp_state_dict = torch.load(
    "saved_model/mlp_adults.pth",
    map_location=torch.device('cpu'),
    weights_only=True,
)
# Last expression of the cell: displays the key-matching report.
mlp_model.load_state_dict(mlp_state_dict)

  mlp_model.load_state_dict(torch.load("saved_model/mlp_adults.pth", map_location=torch.device('cpu')))


<All keys matched successfully>

In [12]:
# Score the MLP on the test split; inputs and labels must sit on the model's device.
X_test_t = X_test_t.to(device)
y_test_t = y_test_t.to(device)

mlp_model.eval()  # switch to evaluation mode
with torch.no_grad():
    logits = mlp_model(X_test_t)
    probs = logits.softmax(dim=1)
    preds = probs.argmax(dim=1)
    accuracy = preds.eq(y_test_t).float().mean().item()

print(f"\nTest Accuracy: {accuracy:.4f}")


Test Accuracy: 0.8248


## TabResNet

### TabResNet model

In [13]:
# ---------------------------
# Model Definition: TabResNet (using residual blocks)
# ---------------------------
class TabResBlock(nn.Module):
    """Residual block: x + Dropout(Linear(Dropout(ReLU(Linear(BatchNorm(x))))))."""

    def __init__(self, d_block, d_hidden, dropout1, dropout2):
        super().__init__()
        # First half: normalise, map to the hidden width, activate, apply dropout.
        self.bn1 = nn.BatchNorm1d(d_block)
        self.ln1 = nn.Linear(d_block, d_hidden)
        self.relu1 = nn.ReLU()
        self.drop1 = nn.Dropout(dropout1)
        # Second half: map back to the block width so the skip connection lines up.
        self.ln2 = nn.Linear(d_hidden, d_block)
        self.drop2 = nn.Dropout(dropout2)

    def forward(self, x):
        """Return `x` plus the transformed branch; the output has the shape of `x`."""
        branch = self.drop1(self.relu1(self.ln1(self.bn1(x))))
        branch = self.drop2(self.ln2(branch))
        return x + branch

class TabResNet(nn.Module):
    """ResNet-style network for tabular inputs.

    Structure: a linear input projection to `d_block`, then `n_blocks`
    `TabResBlock`s, then an optional BatchNorm -> ReLU -> Linear prediction head.
    """

    def __init__(
        self,
        d_in: int,
        d_out: Optional[int],
        n_blocks: int,
        d_block: int,
        d_hidden: Optional[int],
        d_hidden_multiplier: Optional[float] = 2,
        dropout1: float = 0.2,
        dropout2: float = 0
    ):
        """
        Args:
            d_in: Number of input features.
            d_out: Number of outputs (set to 2 for binary classification).
                If None, no prediction head is built and `forward` returns the
                `d_block`-wide representation.
            n_blocks: Number of residual blocks.
            d_block: Block width (input and output dimension of each block).
            d_hidden: Hidden dimension inside a block. If None, it is derived as
                `int(d_block * d_hidden_multiplier)`.
            d_hidden_multiplier: Used only when `d_hidden` is None (see above).
            dropout1: Dropout probability after the hidden activation of each block.
            dropout2: Dropout probability after the second linear layer of each block.

        Raises:
            ValueError: If both `d_hidden` and `d_hidden_multiplier` are None.
        """
        super().__init__()
        # Resolve the hidden width. An explicit `d_hidden` always wins, so existing
        # callers that pass it see no change.
        if d_hidden is None:
            if d_hidden_multiplier is None:
                raise ValueError(
                    "Either d_hidden or d_hidden_multiplier must be provided."
                )
            d_hidden = int(d_block * d_hidden_multiplier)
        # Plain int attribute (not a parameter/buffer), so state_dict keys are unchanged.
        self.d_hidden = d_hidden

        # Project input to block width
        self.input_projection = nn.Linear(d_in, d_block)
        # Residual blocks
        self.resblocks = nn.ModuleList([
            TabResBlock(d_block, d_hidden, dropout1, dropout2) for _ in range(n_blocks)
        ])
        # Prediction block: outputs logits for d_out classes.
        self.predblock = (
            nn.Sequential(
                nn.BatchNorm1d(d_block),
                nn.ReLU(),
                nn.Linear(d_block, d_out)
            ) if d_out is not None else None
        )

    def forward(self, x):
        """Run the projection, the residual blocks and, if present, the head."""
        x = self.input_projection(x)
        for block in self.resblocks:
            x = block(x)
        if self.predblock is not None:
            x = self.predblock(x)
        return x  # logits when a prediction head exists, block features otherwise


# Build the TabResNet on the available device; its input width is the number of
# feature columns in X_test_np.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
d_in = X_test_np.shape[1]

tabresnet_kwargs = dict(
    d_in=d_in,
    d_out=2,  # two logits for binary classification
    n_blocks=2,
    d_block=16,
    d_hidden=32,
    dropout1=0.2,
    dropout2=0.05,
)
model = TabResNet(**tabresnet_kwargs).to(device)

# Loss and optimizer for this model; these assignments rebind the generic
# names `criterion` and `optimizer`.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

### Eval

In [14]:
# Restore the trained TabResNet weights from disk.
# weights_only=True restricts unpickling to tensors and plain containers, which is
# all a state_dict needs; it prevents arbitrary code execution from a tampered
# checkpoint and avoids the warning torch emits when the flag is left unset.
tabresnet_state_dict = torch.load(
    "saved_model/tabresnet_adults.pth",
    map_location=torch.device('cpu'),
    weights_only=True,
)
# Last expression of the cell: displays the key-matching report.
model.load_state_dict(tabresnet_state_dict)

  model.load_state_dict(torch.load("saved_model/tabresnet_adults.pth", map_location=torch.device('cpu')))


<All keys matched successfully>

In [15]:
# Score TabResNet on the test split; inputs and labels must sit on the model's device.
X_test_t = X_test_t.to(device)
y_test_t = y_test_t.to(device)

model.eval()  # evaluation mode: dropout off, BatchNorm uses its running statistics
with torch.no_grad():
    logits = model(X_test_t)
    probs = logits.softmax(dim=1)
    preds = probs.argmax(dim=1)
    accuracy = preds.eq(y_test_t).float().mean().item()

print(f"\nTest Accuracy: {accuracy:.4f}")


Test Accuracy: 0.8474


# Robustness

## Dependencies

In [16]:
from art.estimators.classification import PyTorchClassifier
from art.attacks.evasion import FastGradientMethod
from art.metrics import clever_u, loss_sensitivity
from tqdm import tqdm

In [17]:
# From here on `device` is a plain string (the model cells bound it to a torch.device);
# it is forwarded to the ART wrappers as `device_type`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# A single (min, max) pair taken over every value in the test matrix.
# NOTE(review): if the features are on different scales, this one global range will
# barely constrain the small-scale ones — consider per-feature bounds; confirm intent.
clip_values = (X_test_np.min(), X_test_np.max())

# Number of rows in the test set.
num_samples = X_test_np.shape[0]

## MLP

### Wrapped Model

In [24]:
# Create the ART classifier that wraps the MLP.
# A dedicated optimizer is built here: the global `optimizer` is rebound to
# TabResNet's parameters when that model is defined, so passing it would tie this
# wrapper to the wrong network if it were ever fitted.
mlp_optimizer = optim.Adam(mlp_model.parameters(), lr=LEARNING_RATE)

mlp_art_classifier = PyTorchClassifier(
    model=mlp_model,
    loss=criterion,
    # ART expects the per-sample input shape as a tuple, not a bare int.
    input_shape=(X_test_np.shape[1],),
    nb_classes=2,
    optimizer=mlp_optimizer,
    clip_values=clip_values,
    # ART documents "cpu" / "gpu" for device_type; translate the torch-style name.
    device_type="gpu" if str(device).startswith("cuda") else "cpu",
)

# Optional: train through ART instead of relying on the loaded weights.
# mlp_art_classifier.fit(X_train_np, y_train_np, batch_size=64, nb_epochs=100, verbose=True)

### Accuracy Gap

In [25]:
# Accuracy on the unperturbed test set.
# ART estimators work on NumPy arrays, so pass X_test_np rather than the torch tensor
# X_test_t (which has been moved to `device` and may live on the GPU).
predictions = mlp_art_classifier.predict(X_test_np)
accuracy = np.mean(np.argmax(predictions, axis=1) == y_test_np)
print(f"Accuracy on benign test examples: {accuracy:.4f}")

# Generate adversarial test examples with FGSM (perturbation budget eps=0.2).
# No labels are passed to `generate`.
attack = FastGradientMethod(estimator=mlp_art_classifier, eps=0.2)
x_test_adv = attack.generate(x=X_test_np)

# Evaluate the ART classifier on the adversarial test examples.
predictions = mlp_art_classifier.predict(x_test_adv)
accuracy_adv = np.mean(np.argmax(predictions, axis=1) == y_test_np)
print(f"Accuracy on adversarial test examples: {accuracy_adv:.4f}")

# Accuracy gap: how much accuracy the attack removes.
print(f"Accuracy difference: {accuracy - accuracy_adv:.4f}")

Accuracy on benign test examples: 0.8248
Accuracy on adversarial test examples: 0.8191
Accuracy difference: 0.0057


### CLEVER-u

Clever-u Score:

score is a lower bound -> minimum perturbation size required to change the model's output to any wrong label

“Directional probes” (here nb_batches × batch_size = 20 × 10 = 200 sampled points per test sample) -> used to get a reliable worst-case slope estimate

A higher score -> stronger local robustness

range value = [0.0, radius]

In [26]:

# Untargeted CLEVER score (L2 norm) per test sample, averaged over the test set.
clever_scores = []
num_samples = X_test_np.shape[0]
print(f"Number of samples in test set: {num_samples}")
for sample in tqdm(X_test_np[:num_samples], desc="Computing CLEVER-U"):
    # Each call scores one unbatched sample of shape (D,).
    c = clever_u(
        classifier=mlp_art_classifier,
        x=sample,
        nb_batches=20,
        batch_size=10,             # 20 batches x 10 points = 200 points per sample
        norm=2,
        radius=0.2,                # scores fall in [0.0, radius]
        verbose=False
    )
    clever_scores.append(c)

clever = float(np.mean(clever_scores))
# This section evaluates the MLP wrapper, so label the result accordingly.
print(f"CLEVER-U score for MLP: {clever:.4f}")

Number of samples in test set: 6513


Computing CLEVER-U: 100%|██████████| 6513/6513 [04:59<00:00, 21.74it/s]

CLEVER-U score for TabResNet: 0.1956





### Loss Sensitivity

Loss Sensitivity

A **higher** sensitivity -> small input perturbations can cause **larger** changes in the loss -> indicating a "steeper" or potentially **less** robust local region

In [29]:
num_classes = 2
# One-hot encode the integer labels: row i has a 1.0 in column y_test_np[i].
y_test_onehot = np.zeros((y_test_np.shape[0], num_classes))
y_test_onehot[np.arange(y_test_np.shape[0]), y_test_np] = 1.0

# Loss sensitivity of the MLP wrapper over the whole test set.
sensitivity = loss_sensitivity(
    classifier=mlp_art_classifier, x=X_test_np, y=y_test_onehot
)

print(f"Average loss sensitivity over test set: {sensitivity:.6f}")

Average loss sensitivity over test set: 0.000025


## TabResNet

### Wrapped Model

In [21]:
# Create the ART classifier that wraps TabResNet.
art_classifier = PyTorchClassifier(
    model=model,
    loss=criterion,
    # ART expects the per-sample input shape as a tuple, not a bare int.
    input_shape=(X_test_np.shape[1],),
    nb_classes=2,
    optimizer=optimizer,
    clip_values=clip_values,
    # ART documents "cpu" / "gpu" for device_type; translate the torch-style name.
    device_type="gpu" if str(device).startswith("cuda") else "cpu",
)

# Optional: train through ART instead of relying on the loaded weights.
# art_classifier.fit(X_train_np, y_train_np, batch_size=64, nb_epochs=100, verbose=True)

### Accuracy Gap

In [22]:
# Accuracy on the unperturbed test set.
# ART estimators work on NumPy arrays, so pass X_test_np rather than the torch tensor
# X_test_t (which has been moved to `device` and may live on the GPU).
predictions = art_classifier.predict(X_test_np)
accuracy = np.mean(np.argmax(predictions, axis=1) == y_test_np)
print(f"Accuracy on benign test examples: {accuracy:.4f}")

# Generate adversarial test examples with FGSM (perturbation budget eps=0.2).
# No labels are passed to `generate`.
attack = FastGradientMethod(estimator=art_classifier, eps=0.2)
x_test_adv = attack.generate(x=X_test_np)

# Evaluate the ART classifier on the adversarial test examples.
predictions = art_classifier.predict(x_test_adv)
accuracy_adv = np.mean(np.argmax(predictions, axis=1) == y_test_np)
print(f"Accuracy on adversarial test examples: {accuracy_adv:.4f}")

# Accuracy gap: how much accuracy the attack removes.
print(f"Accuracy difference: {accuracy - accuracy_adv:.4f}")

Accuracy on benign test examples: 0.8474
Accuracy on adversarial test examples: 0.8457
Accuracy difference: 0.0017


### CLEVER-u

Clever-u Score:

score is a lower bound -> minimum perturbation size required to change the model's output to any wrong label

“Directional probes” (here nb_batches × batch_size = 20 × 10 = 200 sampled points per test sample) -> used to get a reliable worst-case slope estimate

A higher score -> stronger local robustness

range value = [0.0, radius]

In [23]:

# Untargeted CLEVER score (L2 norm) per test sample, averaged over the test set.
# Settings shared by every call: 20 batches x 10 points = 200 points per sample.
clever_settings = dict(
    nb_batches=20,
    batch_size=10,
    norm=2,
    radius=0.2,
    verbose=False,
)

num_samples = X_test_np.shape[0]
print(f"Number of samples in test set: {num_samples}")

clever_scores = []
for sample in tqdm(X_test_np[:num_samples], desc="Computing CLEVER-U"):
    # Each call scores one unbatched sample of shape (D,).
    clever_scores.append(
        clever_u(classifier=art_classifier, x=sample, **clever_settings)
    )

clever = float(np.mean(clever_scores))
print(f"CLEVER-U score for TabResNet: {clever:.4f}")

Number of samples in test set: 6513


Computing CLEVER-U: 100%|██████████| 6513/6513 [09:10<00:00, 11.83it/s]

CLEVER-U score for TabResNet: 0.1947





### Loss Sensitivity

Loss Sensitivity

A **higher** sensitivity -> small input perturbations can cause **larger** changes in the loss -> indicating a "steeper" or potentially **less** robust local region

In [30]:
num_classes = 2
# One-hot encode the integer labels: row i has a 1.0 in column y_test_np[i].
y_test_onehot = np.zeros((y_test_np.shape[0], num_classes))
y_test_onehot[np.arange(y_test_np.shape[0]), y_test_np] = 1.0

# Loss sensitivity of the TabResNet wrapper over the whole test set.
sensitivity = loss_sensitivity(
    classifier=art_classifier, x=X_test_np, y=y_test_onehot
)

print(f"Average loss sensitivity over test set: {sensitivity:.6f}")

Average loss sensitivity over test set: 0.000028
