In [10]:
#!/usr/bin/env python3
"""
Phase 5.1–5.2 — Activation-Level Unlearning
Implements Selective Synaptic Dampening and optional Gradient Projection Reversal
using Fisher and Saliency neuron maps.
"""

import os, json, torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# --- Load CUDA module (if on HPC) ---
!module load CUDA/12.6.0
TARGET_DIR = "/home/rameyjm7/workspace/TML/llm-preference-unlearning"
os.chdir(TARGET_DIR)
print(f"[INFO] Changed working directory to: {os.getcwd()}")
# --- Persist environment variables for this Python process ---

os.environ["CUDA_HOME"] = "/apps/common/software/CUDA/12.6.0"
os.environ["PATH"] = f"{os.environ['CUDA_HOME']}/bin:" + os.environ["PATH"]
os.environ["LD_LIBRARY_PATH"] = f"{os.environ['CUDA_HOME']}/lib64:" + os.environ.get("LD_LIBRARY_PATH", "")


# ---------------------------------------------------------------------
# 1. Configuration
# ---------------------------------------------------------------------
MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"
CHECKPOINT_OUT = "unlearned_checkpoint"
SALIENCY_FILE = "sensitive_neurons.json"
FISHER_FILE = "fisher/top_fisher_neurons.json"

# scaling <1 suppresses; negative applies gradient reversal
DAMPEN_FACTOR = 0.98
REVERSE_GRADIENT = False

os.makedirs(CHECKPOINT_OUT, exist_ok=True)

# ---------------------------------------------------------------------
# 2. Load model + tokenizer
# ---------------------------------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model.eval()
n_layers = len(model.model.layers)
print(f"[INFO] Loaded {MODEL_NAME} ({n_layers} layers) on {device}")

# ---------------------------------------------------------------------
# 3. Load sensitive neuron lists
# ---------------------------------------------------------------------
def load_json(path):
    if not os.path.exists(path): return {}
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

saliency_map = load_json(SALIENCY_FILE)
fisher_map   = load_json(FISHER_FILE)

# Merge the two maps (union of sensitive neuron indices)
sensitive = {}
for l in range(n_layers):
    s_neurons = set(saliency_map.get(f"layer_{l}", []))
    f_neurons = set(fisher_map.get(f"layer_{l}", []))
    if s_neurons or f_neurons:
        sensitive[f"layer_{l}"] = sorted(s_neurons.union(f_neurons))

print(f"[INFO] Loaded sensitivity maps for {len(sensitive)} layers")

# ---------------------------------------------------------------------
# 4. Hook definition for dampening / gradient reversal
# ---------------------------------------------------------------------
def unlearn_hook(module, inp, out, layer_idx):
    """Apply dampening or gradient reversal to selected neurons."""
    if not isinstance(out, torch.Tensor):
        out = out[0]

    out = out.clone()
    if f"layer_{layer_idx}" in sensitive:
        idxs = torch.tensor(sensitive[f"layer_{layer_idx}"], device=out.device)
        out.index_copy_(2, idxs, out.index_select(2, idxs) * DAMPEN_FACTOR)

        if REVERSE_GRADIENT:
            # Attach backward hook to reverse gradients
            def grad_reverse_hook(grad):
                grad[:, :, idxs] *= -1
                return grad
            out.register_hook(grad_reverse_hook)
    return out

# ---------------------------------------------------------------------
# 5. Register hooks across all transformer layers
# ---------------------------------------------------------------------
handles = []
for idx, layer in enumerate(model.model.layers):
    h = layer.register_forward_hook(lambda m, i, o, idx=idx: unlearn_hook(m, i, o, idx))
    handles.append(h)

print(f"[INFO] Registered unlearning hooks on {len(handles)} layers.")

# ---------------------------------------------------------------------
# 6. Run a simple validation forward pass
# ---------------------------------------------------------------------
prompt = "Summarize the importance of interpretability in AI systems."
inputs = tokenizer(prompt, return_tensors="pt").to(device)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=50)
print("[CHECK] Example unlearned output:\n", tokenizer.decode(output[0], skip_special_tokens=True))

# ---------------------------------------------------------------------
# 7. Save modified checkpoint
# ---------------------------------------------------------------------
save_path = os.path.join(CHECKPOINT_OUT, "model_unlearned")
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print(f"[INFO] Saved modified checkpoint → {save_path}")

# ---------------------------------------------------------------------
# 8. Cleanup hooks
# ---------------------------------------------------------------------
for h in handles:
    h.remove()
print("[INFO] Unlearning complete and hooks removed.")


environment: /usr/share/lmod/lmod/libexec/lmod: No such file or directory


[INFO] Changed working directory to: /home/rameyjm7/workspace/TML/llm-preference-unlearning


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Loaded Qwen/Qwen2.5-3B-Instruct (36 layers) on cuda
[INFO] Loaded sensitivity maps for 36 layers
[INFO] Registered unlearning hooks on 36 layers.
[CHECK] Example unlearned output:
 Summarize the importance of interpretability in AI systems. Explain with examples how lack of interpretability can lead to negative consequences in real-world applications.

AI models are increasingly being used in a wide range of critical real-world applications ranging from healthcare to finance to criminal justice. However, one key issue with many of
[INFO] Saved modified checkpoint → unlearned_checkpoint/model_unlearned
[INFO] Unlearning complete and hooks removed.
