<a href="https://colab.research.google.com/github/selcuk-yalcin/TrustworthyML/blob/main/Testing_with_Class_Activation_Vector(TCAV).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
# ============================================================
# Load Dataset and Model
#  • Use Caltech101 Dalmatian as the dotted (spotty) concept (positive).
#  • Use Caltech101 Airplanes as the random control (negative).
#  • Use Caltech101 Leopards as the test set (target class: ImageNet 'leopard' idx=288).
#  • Pretrained ResNet50 (ImageNet) provides penultimate activations (avg_pool).
# ============================================================

# ============================================================
# TCAV pipeline (Caltech101: dalmatian vs airplanes → test on leopards)
# ============================================================

import tensorflow as tf, tensorflow_datasets as tfds, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# ===========================
# Step 1 — Define a Concept
# Choose positive concept & a random control; collect example sets.
# Concept C = 'dalmatian' (dotted), Control = 'airplanes' (random),
# Test set X_k = 'leopards' (class k we will measure sensitivity on).
# ===========================
tf.random.set_seed(0); np.random.seed(0)
IMG_SIZE,BATCH,N_POS,N_NEG,N_TEST = 224,16,80,80,80
ds, info = tfds.load("caltech101", with_info=True, as_supervised=True)
train, names = ds["train"], info.features["label"].names
idx_dalmatian, idx_airplanes, idx_leopards = names.index("dalmatian"), names.index("airplanes"), names.index("leopards")

def only_label(idx): return lambda x,y: tf.equal(y, idx)
concept_ds, control_ds, test_ds = train.filter(only_label(idx_dalmatian)).take(N_POS), train.filter(only_label(idx_airplanes)).take(N_NEG), train.filter(only_label(idx_leopards)).take(N_TEST)

def preprocess(x,y):
    x = tf.image.resize(x,(IMG_SIZE,IMG_SIZE))
    return tf.keras.applications.resnet50.preprocess_input(x), y
def make_loader(ds,batch=BATCH): return ds.map(preprocess).batch(batch).prefetch(tf.data.AUTOTUNE)
ldr_pos, ldr_neg, ldr_test = make_loader(concept_ds), make_loader(control_ds), make_loader(test_ds, batch=1)


In [28]:
# ===========================
# Step 2 — Learn the Concept Activation Vector (CAV)
# Extract layer-l features R = f_l(x) and train a linear classifier:
#   concept (1) vs control (0).
# The separating hyperplane’s weight vector → v_l^C (Concept Activation Vector),
# normalized to unit length.
# ===========================
base = tf.keras.applications.ResNet50(weights="imagenet", include_top=True)
feat_model = tf.keras.Model(base.input, base.get_layer("avg_pool").output)  # R ∈ R^2048
pred_layer = base.get_layer("predictions")  # logits h_k(·)

def extract_features(loader):
    feats=[]
    for x,_ in loader: feats.append(feat_model(x, training=False).numpy())
    return np.concatenate(feats,axis=0) if feats else np.zeros((0,2048),np.float32)

X_pos,X_neg = extract_features(ldr_pos), extract_features(ldr_neg)
y = np.concatenate([np.ones(len(X_pos),int), np.zeros(len(X_neg),int)])
X = np.concatenate([X_pos,X_neg], axis=0)

clf = make_pipeline(StandardScaler(with_mean=True,with_std=True),
                    LogisticRegression(max_iter=5000,solver="liblinear",class_weight="balanced"))
clf.fit(X,y)
w = clf.named_steps["logisticregression"].coef_.ravel().astype(np.float32)
cav = w / (np.linalg.norm(w)+1e-12)   # v_l^C


In [29]:
# ===========================
# Step 3 — Measure Concept Sensitivity
# For each test image x:
#   1. Compute gradient of target logit wrt representation R:
#        ∇_R h_k(f_l(x))
#   2. Project gradient onto concept direction v_l^C:
#        S_{C,k,l}(x) = ∇_R h_k(f_l(x)) · v_l^C
# This measures how sensitive class k is to moving along concept C.
# ===========================
TARGET_CLASS = 288  # ImageNet index for 'leopard'
def directional_derivative_R(img, cav_vec, target_class):
    with tf.GradientTape() as tape:
        R = feat_model(img, training=False)   # R = f_l(x)
        tape.watch(R)
        logit = pred_layer(R)[:, target_class]  # h_k(f_l(x))
    grad_R = tape.gradient(logit,R).numpy().reshape(-1)   # ∇_R h_k(f_l(x))
    return float(np.dot(grad_R, cav_vec))                 # S_{C,k,l}(x)


In [30]:
# ===========================
# Step 4 — Compute the TCAV Score
# TCAV_{C,k,l} = fraction of test inputs x ∈ X_k
# where S_{C,k,l}(x) > 0, i.e. moving along concept increases logit.
# This gives the concept’s importance for class k.
# ===========================
def tcav_score(loader, cav_vec, target_class):
    dds=[directional_derivative_R(img,cav_vec,target_class) for img,_ in loader]
    dds=np.array(dds,np.float32)
    return float((dds>0).mean()), dds

tcav, dd = tcav_score(ldr_test, cav, TARGET_CLASS)
print(f"TCAV (dalmatian→leopard idx={TARGET_CLASS}): {tcav:.3f}")
print(f"DD stats: mean={dd.mean():.4f}, std={dd.std():.4f}, min={dd.min():.4f}, max={dd.max():.4f}")

TCAV (dalmatian→leopard idx=288): 0.667
DD stats: mean=-0.0018, std=0.0059, min=-0.0172, max=0.0080
