In [2]:
#!/usr/bin/env python3
# ──────────────────────────────────────────────────────────────────────────────
# RAST  —  Redundancy-Aware Steering Technique (token-efficiency experiment)
# Uses your steerit.SteeringVector / SteeringModel definitions.
#
# PSEUDOCODE
# 1.  Load DeepSeek-R1-Distill-Qwen-1.5B and wrap with SteeringModel.
# 2.  For each difficulty level L in {1…5}:
#     a.  Generate N_TRAIN traces (step-by-step answers) with *no steering*.
#     b.  For every token t≥k:
#         • ΔKL = KL(p_t  ||  p_{t-k})   (compare the current logits with those from k tokens earlier)
#         • If ΔKL < τ   → low-gain  → save hidden h_t in LOW
#           else          high-gain → save hidden h_t in HIGH
#     c.  Vector v_L = mean(HIGH) − mean(LOW)   (layer STEER_LAY only)
# 3.  Inference with ΔKL gate:
#       keep sliding buffer of logits; if current ΔKL<τ → set coeff α∈[α_lo,α_hi],
#       else coeff 0; SteeringModel hook adds α·v_L to layer activations.
# 4.  Record tokens/answer & accuracy for baseline vs RAST; plot %–saving vs level.
# ──────────────────────────────────────────────────────────────────────────────

import os, random, time, math, warnings, collections
import numpy as np
import torch, torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

#──────────────────── YOUR LIBRARY ────────────────────#
from steerit.steerit import SteeringVector, SteeringModel     # ← re-use your code
#───────────────────────────────────────────────────────#

# -------------------- CONFIG --------------------------
MODEL_NAME     = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# An unset or empty HF_TOKEN must become None rather than "": an empty string
# is still forwarded as a credential, and a rejected credential can surface as
# the misleading "is not a valid model identifier" OSError even for public
# repositories.  None lets the loader fall back to anonymous / cached login.
HF_TOKEN       = os.getenv("HF_TOKEN") or None
DEVICE         = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE          = torch.float16 if DEVICE == "cuda" else torch.float32  # half precision only on GPU
STEER_LAY      = 20         # layer chosen for steering
K_WIN          = 6          # how many tokens to roll back for ΔKL
DKL_THR        = 0.05       # ΔKL < τ → low-gain
ALPHA_LO, ALPHA_HI = 0.0, 1.0   # steering coefficient for high-gain / low-gain steps
MAX_NEW_TOKENS = 256        # generation budget per answer
N_TRAIN_PER_LVL= 60         # traces to build vector
SEED           = 42
# Seed every RNG the script touches so runs are repeatable.
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)
# NOTE: this silences *all* warnings for the rest of the process.
warnings.filterwarnings("ignore")

# -------------------- LOAD MODEL ----------------------
# Fetch tokenizer + causal LM, then wrap the LM so layer STEER_LAY can be steered.
print(f"Loading {MODEL_NAME} …")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)

# Automatic weight placement is requested only for GPU runs; on CPU no device
# map is passed.
if DEVICE == "cuda":
    _device_map = "auto"
else:
    _device_map = None

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map=_device_map,
    torch_dtype=DTYPE,
    token=HF_TOKEN,
)

# SteeringModel comes from the project's steerit package; the arguments are
# (wrapped model, list of steerable layer indices, device).
model = SteeringModel(base_model, [STEER_LAY], DEVICE)
print("Model ready.\n")

# -------------------- HELPERS -------------------------
def kl_div(p_logits, q_logits):
    """Return KL(p || q) for the distributions defined by two logit tensors.

    Args:
        p_logits: logits of the reference distribution ``p``; callers in this
            file pass a ``(1, vocab)`` tensor.
        q_logits: logits of the distribution ``q`` that ``p`` is compared to,
            same shape as ``p_logits``.

    Returns:
        The divergence as a Python float, summed over the last dimension and
        averaged over the leading (batch) dimension.

    ``torch.nn.functional.kl_div(input, target)`` measures KL(target || input),
    so ``p`` has to be supplied as the *target* and ``q`` as the *input*.
    Passing them the other way round silently yields KL(q || p).
    """
    # Work in float32 and stay in log-space: half-precision softmax can
    # underflow to 0 and turn the divergence into inf/NaN.
    log_p = F.log_softmax(p_logits.float(), dim=-1)
    log_q = F.log_softmax(q_logits.float(), dim=-1)
    return F.kl_div(log_q, log_p, reduction="batchmean", log_target=True).item()

@torch.no_grad()
def generate_trace(model, prompt, max_new=MAX_NEW_TOKENS):
    """Greedily decode a continuation of *prompt* and keep the per-step logits.

    Args:
        model: callable taking ``input_ids`` / ``use_cache`` and returning an
            object with a ``.logits`` attribute.
        prompt: text to tokenize with the module-level ``tokenizer``.
        max_new: upper bound on the number of tokens to append.

    Returns:
        ``(ids, step_logits)``: the squeezed id tensor holding prompt plus
        generated tokens, and a list with one last-position logits tensor per
        generated token (so the list is shorter than ``ids`` by the prompt
        length).
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    sequence = encoded["input_ids"]
    step_logits = []
    eos_id = tokenizer.eos_token_id

    remaining = max_new
    while remaining > 0:
        remaining -= 1
        # The whole prefix is fed again on every step; no cache object is
        # passed back in, so use_cache=True does not shorten the input.
        last_logits = model(input_ids=sequence, use_cache=True).logits[:, -1, :]  # (1,V)
        step_logits.append(last_logits.detach())
        # Greedy choice: most likely next token.
        chosen = last_logits.argmax(dim=-1, keepdim=True)
        sequence = torch.cat([sequence, chosen], dim=-1)
        if chosen.item() == eos_id:
            break

    return sequence.squeeze(), step_logits

def _safe_numeric_eval(expr):
    """Evaluate a plain arithmetic expression without executing arbitrary code.

    Only int/float literals, unary +/- and the binary operators
    ``+ - * / // %`` are accepted.  Anything else (names, calls, attribute
    access, strings, ``**``) raises ``ValueError``.
    """
    import ast
    import operator

    binary_ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
        ast.Mod: operator.mod,
    }
    unary_ops = {ast.UAdd: operator.pos, ast.USub: operator.neg}

    def walk(node):
        if isinstance(node, ast.Constant):
            value = node.value
            # bool is a subclass of int; reject it so "True" is not a number.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                raise ValueError(f"non-numeric literal: {value!r}")
            return value
        if isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
            return unary_ops[type(node.op)](walk(node.operand))
        if isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
            return binary_ops[type(node.op)](walk(node.left), walk(node.right))
        raise ValueError(f"unsupported expression element: {type(node).__name__}")

    if not isinstance(expr, str):
        raise TypeError("expression must be a string")
    return walk(ast.parse(expr.strip(), mode="eval").body)


def is_correct(pred_text, sol):
    """Very naive numeric check; adjust to your evaluation needs.

    Compares the last whitespace-separated token of *pred_text* with *sol*,
    both interpreted as simple arithmetic expressions.

    SECURITY: this function used to call ``eval`` on both strings.
    *pred_text* is model-generated and *sol* comes from a downloaded dataset,
    so ``eval`` allowed arbitrary code execution.  Evaluation is now limited
    to numeric literals and basic arithmetic; everything else counts as
    "not correct".

    NOTE(review): if *sol* is a worked solution in prose/LaTeX rather than a
    bare number, this check can never succeed — confirm what the caller passes.

    Returns:
        True when both sides parse and compare equal, otherwise False.
    """
    try:
        pred_num = _safe_numeric_eval(pred_text.strip().split()[-1])
        sol_num = _safe_numeric_eval(sol)
    except (
        SyntaxError,      # not a valid expression
        ValueError,       # disallowed construct / non-numeric literal
        TypeError,        # non-string input
        AttributeError,   # pred_text has no .strip()
        IndexError,       # empty prediction text
        ArithmeticError,  # division by zero, overflow
        RecursionError,   # absurdly nested expression
        MemoryError,      # parser gave up on an oversized expression
    ):
        return False
    return pred_num == sol_num

# -------------------- BUILD RAST VECTOR PER LEVEL -----
def build_vector(dataset_subset):
    """Build the steering vector mean(HIGH) − mean(LOW) for layer STEER_LAY.

    For each of the first ``N_TRAIN_PER_LVL`` rows an unsteered trace is
    generated.  Each generation step from the ``K_WIN``-th onward is labelled
    high-gain (ΔKL ≥ ``DKL_THR``) or low-gain by comparing its logits with
    those from ``K_WIN`` steps earlier, and the layer's hidden state that
    produced those logits is stored in the matching bucket.

    Args:
        dataset_subset: sliceable sequence of rows with a ``'problem'`` key.

    Returns:
        ``SteeringVector({STEER_LAY: direction})`` with a float32 direction.

    Raises:
        ValueError: if either bucket ends up empty, since the mean of an
            empty bucket would otherwise yield a NaN direction.
    """
    hi, lo = [], []
    for row in dataset_subset[:N_TRAIN_PER_LVL]:
        prompt = f"Problem: {row['problem']} Answer step by step."
        ids, logits_hist = generate_trace(model, prompt)
        # generate_trace stores one logits entry per *generated* token, while
        # ids also contains the prompt, so the two are offset by the prompt
        # length.  Indexing logits_hist with a position in ids overruns it.
        n_prompt = len(ids) - len(logits_hist)
        # hidden states (need one pass)
        # NOTE(review): assumes the SteeringModel wrapper forwards
        # output_hidden_states to the underlying HF model — confirm.
        with torch.no_grad():
            hs = model(
                input_ids=ids.unsqueeze(0).to(DEVICE),
                output_hidden_states=True,
            ).hidden_states[STEER_LAY+1][0]            # +1 b/c embeds layer 0
        # iterate generation steps ≥ K_WIN
        for step in range(K_WIN, len(logits_hist)):
            dkl = kl_div(logits_hist[step], logits_hist[step - K_WIN])
            # logits_hist[step] was read off the last position of a sequence
            # of n_prompt + step tokens, i.e. position n_prompt + step - 1.
            # Cast to float32 so the later mean is not taken in half precision.
            hidden = hs[n_prompt + step - 1].float().cpu().numpy()
            (hi if dkl >= DKL_THR else lo).append(hidden)
    print(f"  collected {len(hi)} high-gain / {len(lo)} low-gain vecs.")
    if not hi or not lo:
        raise ValueError(
            "cannot build steering vector: need at least one high-gain and one "
            f"low-gain sample (got {len(hi)} / {len(lo)})"
        )
    H_plus = np.mean(hi, axis=0); H_minus = np.mean(lo, axis=0)
    direction = (H_plus - H_minus).astype(np.float32)
    return SteeringVector({STEER_LAY: direction})

# -------------------- INFERENCE WITH RAST -------------
@torch.no_grad()
def rast_generate(prompt, vec):
    """Greedily decode *prompt* while gating the steering vector on ΔKL.

    After each step the current logits are compared with those from ``K_WIN``
    steps earlier.  A small divergence (ΔKL < ``DKL_THR``) switches the
    coefficient to ``ALPHA_HI`` for the following forward pass, otherwise to
    ``ALPHA_LO``.

    Args:
        prompt: text to tokenize with the module-level ``tokenizer``.
        vec: steering vector handed to ``model.set_steering``.

    Returns:
        Squeezed id tensor holding prompt plus generated tokens.

    Runs under ``torch.no_grad()`` (like ``generate_trace``) so no autograd
    graph is kept per step.  Steering is always reset on exit, even when
    generation raises, so a failure cannot leave the shared model steered.
    """
    ids = tokenizer(prompt, return_tensors="pt").to(DEVICE)["input_ids"]
    # Only the last K_WIN+1 logits are ever read, so keep a bounded window
    # instead of every step's full-vocabulary tensor.
    logits_buf = collections.deque(maxlen=K_WIN + 1)
    model.set_steering(vec, coeff=0.0)                  # start with no push
    try:
        for _ in range(MAX_NEW_TOKENS):
            out = model(input_ids=ids, use_cache=True)
            logits = out.logits[:, -1, :]
            logits_buf.append(logits.detach())          # store BEFORE choosing α
            # ΔKL gate (needs K_WIN saved logits); once the window is full,
            # logits_buf[0] is the entry from K_WIN steps back.
            if len(logits_buf) > K_WIN:
                dkl = kl_div(logits, logits_buf[0])
                coeff = ALPHA_HI if dkl < DKL_THR else ALPHA_LO
                # NOTE(review): assumes SteeringModel reads `coeff` on every
                # forward pass — confirm against steerit.
                model.coeff = coeff
            next_tok = logits.argmax(dim=-1, keepdim=True)
            ids = torch.cat([ids, next_tok], dim=-1)
            if next_tok.item() == tokenizer.eos_token_id:
                break
    finally:
        model.reset_steering()
    return ids.squeeze()

# -------------------- DATASET -------------------------
# Test split of the MATH benchmark plus the sorted list of distinct difficulty
# labels found in it.
# NOTE(review): the dataset id/config ("hendrycks_math", "default") and the
# "difficulty" field name are taken as given — verify them against the dataset
# card actually used.
math_ds = load_dataset("hendrycks_math", "default")["test"]
levels = sorted({example["difficulty"] for example in math_ds})

# -------------------- MAIN LOOP -----------------------
# For every difficulty level: build the steering vector on the first
# N_TRAIN_PER_LVL problems, then compare baseline vs RAST generation on the
# problems that follow them.
N_EVAL_PER_LVL = 120   # held-out problems evaluated per level
results = []   # one (level, mean_tok_base, mean_tok_rast, eff%) tuple per level
for lvl in levels:
    print(f"\n=== Difficulty {lvl} ===")
    subset = [r for r in math_ds if r["difficulty"] == lvl]
    # Evaluate only on problems that were NOT used to build the vector; the
    # first N_TRAIN_PER_LVL rows are consumed by build_vector.
    eval_rows = subset[N_TRAIN_PER_LVL:N_TRAIN_PER_LVL + N_EVAL_PER_LVL]
    if not eval_rows:
        print("  no held-out problems left for evaluation; skipping level.")
        continue
    vec = build_vector(subset)
    tok_b, tok_r, acc_b, acc_r = [], [], [], []
    for row in eval_rows:
        prompt = f"Problem: {row['problem']} Answer step by step."
        # Both generators return prompt + continuation; count only the
        # generated part so the prompt length does not dilute the saving %.
        prompt_len = tokenizer(prompt, return_tensors="pt")["input_ids"].shape[-1]
        # baseline
        ids_base, _ = generate_trace(model, prompt)
        text_base = tokenizer.decode(ids_base, skip_special_tokens=True)
        # RAST steered
        ids_rast  = rast_generate(prompt, vec)
        text_rast = tokenizer.decode(ids_rast, skip_special_tokens=True)
        # stats
        tok_b.append(ids_base.numel() - prompt_len)
        tok_r.append(ids_rast.numel() - prompt_len)
        acc_b.append(is_correct(text_base, row["solution"]))
        acc_r.append(is_correct(text_rast, row["solution"]))
    # Each run emits at least one token, so the baseline mean is never zero.
    eff = 100*(np.mean(tok_b) - np.mean(tok_r)) / np.mean(tok_b)
    print(f"  token-saving = {eff:5.1f}% | acc Δ = {np.mean(acc_r)-np.mean(acc_b):+.2f}")
    results.append((lvl, np.mean(tok_b), np.mean(tok_r), eff))

# -------------------- PLOT ----------------------------
# Each results row is (level, mean baseline tokens, mean RAST tokens, saving %);
# only the level and the saving percentage are drawn.
lv, tb, tr, effs = zip(*results)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(lv, effs, marker="o")
ax.set_xlabel("Difficulty Level")
ax.set_ylabel("Token-saving % (RAST)")
ax.set_title("RAST Scaling on Hendrycks MATH")
ax.grid(True)
ax.set_xticks(lv)          # one tick per difficulty level
fig.tight_layout()
fig.savefig("rast_scaling_math.png")
print("\nPlot saved to rast_scaling_math.png")


Loading deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B …


OSError: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`