In [3]:
#!/usr/bin/env python3
"""
activation_probe.py — Phase 3.1–3.2 Activation Extraction
Captures hidden activations from all transformer layers of Qwen2.5-3B-Instruct
and saves sequence-mean-pooled activations as NumPy `.npy` arrays, one file
per prompt and layer (safe hook version).
"""

import os
import json
import torch
import numpy as np
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForCausalLM


# ---------------------------------------------------------------------
# 1. Model Loading
# ---------------------------------------------------------------------
def load_model(model_name="Qwen/Qwen2.5-3B-Instruct"):
    """Load a causal LM and its tokenizer and put the model in eval mode.

    Args:
        model_name: Model identifier passed to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer, device, n_layers)`` tuple, where ``device`` is
        the string ``"cuda"`` or ``"cpu"`` and ``n_layers`` is
        ``len(model.model.layers)``.
    """
    # Half precision on GPU, full precision on CPU.
    if torch.cuda.is_available():
        device, dtype = "cuda", torch.float16
    else:
        device, dtype = "cpu", torch.float32

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=dtype,
        device_map="auto",
    )
    model.eval()

    n_layers = len(model.model.layers)
    print(f"[INFO] Loaded {model_name} on {device} with {n_layers} transformer layers.")
    return model, tokenizer, device, n_layers


# ---------------------------------------------------------------------
# 2. Hook registration (safe version)
# ---------------------------------------------------------------------
def register_hooks(model, store):
    """Attach a forward hook to every layer in ``model.model.layers``.

    After each forward pass, ``store[layer_index]`` holds a detached CPU copy
    of that layer's hidden-state output. The hooks only read the output; they
    return nothing, so the forward computation itself is unchanged.

    Args:
        model: Object exposing the decoder layers as ``model.model.layers``.
        store: Mutable mapping that is filled in place, keyed by layer index.

    Returns:
        A list of hook handles, one per layer; call ``.remove()`` on each to
        detach the hooks.
    """

    def make_hook(layer_idx):
        # A factory binds layer_idx per layer and avoids the late-binding
        # closure pitfall of defining the hook directly inside the loop.
        def hook_fn(module, inp, out):
            # Some transformers releases return a tuple whose first element
            # is the hidden state, others return the tensor itself. Indexing
            # a bare tensor with [0] would silently drop the batch dimension,
            # so only unwrap real tuples/lists.
            hidden = out[0] if isinstance(out, (tuple, list)) else out
            # Detach and move to CPU right away so no GPU memory is retained.
            store[layer_idx] = hidden.detach().cpu()

        return hook_fn

    return [
        layer.register_forward_hook(make_hook(idx))
        for idx, layer in enumerate(model.model.layers)
    ]


# ---------------------------------------------------------------------
# 3. Activation Capture
# ---------------------------------------------------------------------
def capture_activations(model, tokenizer, device, prompts, save_dir="activations"):
    """Run each prompt through the model and save per-layer pooled activations.

    For every prompt (numbered from 1) and every hooked layer, the captured
    hidden state is averaged over dimension 1 and written to
    ``<save_dir>/promptNN_layerMM.npy``.

    Args:
        model: Model whose layers are hooked via ``register_hooks``.
        tokenizer: Callable that tokenizes one prompt with
            ``return_tensors="pt"``.
        device: Device the tokenized inputs are moved to.
        prompts: Iterable of prompt strings.
        save_dir: Output directory; created if it does not exist.
    """
    os.makedirs(save_dir, exist_ok=True)
    store = {}

    hooks = register_hooks(model, store)
    try:
        with torch.no_grad():
            for i, prompt in enumerate(prompts, start=1):
                # Drop the previous prompt's activations before the next pass.
                store.clear()
                inputs = tokenizer(prompt, return_tensors="pt").to(device)
                model(**inputs)

                # Mean-pool the sequence dimension to reduce size.
                # Assumes each stored tensor is (1, seq_len, hidden_dim), so
                # the result is (hidden_dim,) — TODO confirm for batched input.
                for layer_idx, tensor in store.items():
                    act = tensor.mean(dim=1).squeeze(0).numpy()
                    out_path = os.path.join(
                        save_dir, f"prompt{i:02d}_layer{layer_idx:02d}.npy"
                    )
                    np.save(out_path, act)
                print(f"[INFO] Saved activations for prompt {i}")
    finally:
        # Always detach the hooks, even if tokenization, the forward pass or
        # saving raised; otherwise they stay attached to the model.
        for h in hooks:
            h.remove()
    print(f"[INFO] Activation extraction complete → {save_dir}/")


# ---------------------------------------------------------------------
# 4. Main entry
# ---------------------------------------------------------------------
def main():
    """Extract activations for the prompts in the newest recommender log.

    Looks in ``logs/`` for files named ``recommender_*.json``, picks the one
    whose name sorts last, reads the ``"question"`` field of every entry
    under ``"records"``, then loads the model and captures activations.

    Raises:
        FileNotFoundError: If no matching log file is present.
    """
    log_dir = "logs"
    candidates = [
        name
        for name in os.listdir(log_dir)
        if name.startswith("recommender_") and name.endswith(".json")
    ]
    if not candidates:
        raise FileNotFoundError("No recommender_*.json log found.")

    # The lexicographically greatest name is treated as the latest log
    # (presumably the names embed a sortable timestamp — verify).
    newest = max(candidates)
    latest_log = os.path.join(log_dir, newest)

    with open(latest_log, "r", encoding="utf-8") as fh:
        payload = json.load(fh)

    prompts = []
    for record in payload["records"]:
        prompts.append(record["question"])

    # The layer count is returned by load_model but not needed here.
    model, tokenizer, device, _ = load_model()
    capture_activations(model, tokenizer, device, prompts)


# Run the extraction pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.33it/s]


[INFO] Loaded Qwen/Qwen2.5-3B-Instruct on cuda with 36 transformer layers.
[INFO] Saved activations for prompt 1
[INFO] Saved activations for prompt 2
[INFO] Saved activations for prompt 3
[INFO] Saved activations for prompt 4
[INFO] Saved activations for prompt 5
[INFO] Activation extraction complete → activations/
