In [None]:
#!/usr/bin/env python
import os, json, torch, numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM

# ——— CONFIG ———
# Local directory the tokenizer and model are loaded from.
# NOTE(review): os.path.expanduser only expands a leading "~", so this
# relative path is returned unchanged and resolves against the current
# working directory at run time.
MODEL_DIR = os.path.expanduser("../models/mistral_cache")

# Prompts to run. Each entry has an integer "id" and the raw "prompt" text.
# The text is passed to the tokenizer verbatim, including the embedded
# newlines and any leading spaces inside the triple-quoted strings.
PROMPTS   = [
  {"id": 0, "prompt": """Case: A 21-year-old sexually active male presents with fever, dysuria, and right-knee pain. Joint fluid culture grows a non-maltose-fermenting, non-capsulated bacterium.
Question: Which antibiotic was given, knowing it blocks cell-wall synthesis?"""},
  {"id": 1, "prompt": """Question: What causes Glaucoma ?
    Best Shortest Answer:"""},
]
# Destination JSONL file; one JSON object is written per prompt.
# NOTE(review): same caveat as MODEL_DIR, expanduser is a no-op here.
OUT_PATH  = os.path.expanduser("../data/activations/layer16_proto.jsonl")
# —————————————————

# Load the tokenizer and model a single time, restricted to local files.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)

load_options = dict(
    local_files_only=True,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,   # keep peak host RAM down while loading
    device_map=None,          # no accelerate-driven placement or offload
)

# Build the model, then move all of it onto the GPU in one step.
model = AutoModelForCausalLM.from_pretrained(MODEL_DIR, **load_options).to("cuda")

# Hook layer16
# Holds the activation captured for the prompt currently being processed.
# The consumer is expected to remove the "raw" entry once it has read it.
act_buffer = {}


def hook(m, inp, out):
    """Store the hooked module's output for the first forward pass only.

    ``generate`` runs the model once per decoding step, so this hook fires
    many times for a single prompt. Overwriting the buffer on every call
    would leave only the final step's output, which (with key/value caching)
    covers just the newest token instead of the prompt's ``T`` tokens. The
    first call is therefore kept and later calls are ignored until the
    ``"raw"`` entry is removed from ``act_buffer``.

    Args:
        m: The module the hook is attached to (unused).
        inp: Positional inputs passed to the module (unused).
        out: Module output tensor, expected shape ``[1, T, D]``.

    Returns:
        None, so the module's output is passed through unchanged.
    """
    if "raw" in act_buffer:
        # A capture for the current prompt already exists; skip the
        # per-step decoding passes (and their GPU->CPU copies).
        return
    # Drop the batch dimension: [1, T, D] -> [T, D].
    act_buffer["raw"] = out.detach().cpu().numpy()[0]
# Attach the capture hook to layer 16's post-attention layernorm.
model.model.layers[16].post_attention_layernorm.register_forward_hook(hook)

# Run every prompt and stream one JSON record per prompt to OUT_PATH.
out_dir = os.path.dirname(OUT_PATH)
os.makedirs(out_dir, exist_ok=True)
with open(OUT_PATH, "w") as fout:
    for ex in PROMPTS:
        prompt_text = ex["prompt"]

        # Encode the prompt on the GPU and keep a plain-list copy of its ids.
        batch = tokenizer(prompt_text, return_tensors="pt").to("cuda")
        tokens = batch["input_ids"][0].cpu().tolist()

        # Generate up to 50 new tokens without tracking gradients.
        with torch.no_grad():
            sequences = model.generate(**batch, max_new_tokens=50)
        text_out = tokenizer.decode(sequences[0], skip_special_tokens=True)

        # Take whatever the hook left in the buffer and average it over
        # its first axis to get a single vector.
        captured = act_buffer.pop("raw")
        pooled = captured.mean(axis=0).tolist()

        # One JSON object per line.
        record = {
            "id":      ex["id"],
            "prompt":  prompt_text,
            "output":  text_out,
            "tokens":  tokens,
            "vector":  pooled,
        }
        fout.write(json.dumps(record) + "\n")

        # Drop per-prompt tensors before asking CUDA to release cached blocks.
        del batch, sequences, captured, pooled
        torch.cuda.empty_cache()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
