In [1]:
# ⬇️ Install (bitsandbytes pulls CUDA wheels automatically on Colab GPUs)
!pip -q install transformers accelerate bitsandbytes huggingface_hub --upgrade

# 🔑 Hugging Face login  – safest via env-var or an input prompt
import os, getpass
from huggingface_hub import login

HF_TOKEN = os.getenv("HF_TOKEN")          # recommended:  !export HF_TOKEN=your_token
if not HF_TOKEN:
    HF_TOKEN = getpass.getpass("Enter your Hugging Face token: ")

login(token=HF_TOKEN)
print("✅ Authenticated")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m97.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.1/512.1 kB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m123.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m127.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m101.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m62.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os, json, pickle, re
from collections import defaultdict

import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import login

In [3]:
# OPTION A – interactive upload (quick, but has to be repeated after every Colab restart)
from google.colab import files, drive

# files.upload() opens a picker that accepts several files at once; the returned
# object is keyed by the uploaded filenames (the next cell reads uploaded.keys()).
uploaded = files.upload()   # pick every JSON file named in the next cell's `mapping`
# the files land in the current working directory (/content/ on Colab)

# OPTION B – Google Drive (persistent)
# drive.mount("/content/drive")
# Then move / copy your JSONs inside /content/drive/MyDrive/...


Saving cnn_train_articles.json to cnn_train_articles.json
Saving cnn_train_gpt35_responses.json to cnn_train_gpt35_responses.json
Saving cnn_train_llama3.1-8b-instruct_responses.json to cnn_train_llama3.1-8b-instruct_responses.json
Saving cnn_train_llama3_8bchat_responses.json to cnn_train_llama3_8bchat_responses.json
Saving vector_steering_neg_clean.json to vector_steering_neg_clean.json
Saving vector_steering_pos_clean.json to vector_steering_pos_clean.json
Saving vector_steering_samples.json to vector_steering_samples.json


In [4]:
import pathlib, shutil, os, json

# ── Locations: declared once, reused for staging AND loading ──────────
ROOT = "/content/data"
ARTICLE_JSON  = f"{ROOT}/articles/cnn_train_articles.json"
SELF_JSON     = f"{ROOT}/summaries/cnn_train_llama3.1-8b-instruct_responses.json"
HUMAN_JSON    = f"{ROOT}/summaries/cnn_train_gpt35_responses.json"
POS_JSON = f"{ROOT}/vector_steering_pos_clean.json"
NEG_JSON = f"{ROOT}/vector_steering_neg_clean.json"
PROMPTS_JSON = f"{ROOT}/vector_steering_samples.json"
OUT_DIR = "/content/vectors"

MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"

base = pathlib.Path(ROOT)
(base / "articles").mkdir(parents=True, exist_ok=True)
(base / "summaries").mkdir(parents=True, exist_ok=True)
os.makedirs(OUT_DIR, exist_ok=True)

# `uploaded` only exists when OPTION A of the upload cell was run; with OPTION B
# (Drive) this cell must still work, so do not hard-depend on it.
uploaded_names = list(uploaded.keys()) if "uploaded" in globals() else []

# uploaded filename (relative to the working directory) → destination path.
# Derived from the constants above so the two can never drift apart.
mapping = {
    os.path.basename(path): pathlib.Path(path)
    for path in (ARTICLE_JSON, SELF_JSON, HUMAN_JSON, POS_JSON, NEG_JSON, PROMPTS_JSON)
}
for fname, dest in mapping.items():
    # already-staged files (e.g. on a re-run) are simply left where they are
    if os.path.exists(fname):
        shutil.move(fname, dest)

# Verify before announcing success, and name every missing file at once.
missing = [str(dest) for dest in mapping.values() if not dest.exists()]
if missing:
    raise FileNotFoundError("Expected data files not found: " + ", ".join(missing))
print("📂 Data files in place")


def _load_json(path):
    """Decode one JSON file, reading it explicitly as UTF-8."""
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


articles        = _load_json(ARTICLE_JSON)
self_summaries  = _load_json(SELF_JSON)
other_summaries = _load_json(HUMAN_JSON)
meta_pos        = _load_json(POS_JSON)
meta_neg        = _load_json(NEG_JSON)
meta_prompts    = _load_json(PROMPTS_JSON)

# order: articles, self summaries, other summaries, pos + neg metadata entries
print("File counts:",
      len(articles), len(self_summaries),
      len(other_summaries), len(meta_pos) + len(meta_neg))



📂 Data files in place
File counts: 1000 1000 1000 440


In [5]:
# Number of entries in the positive, then the negative, steering metadata.
# len() replaces the hand-written counting loops; `count` is still left bound
# to the last value printed.
count = len(meta_pos)
print(count)

count = len(meta_neg)
print(count)

392
48


In [52]:
# ── 1.  flatten meta_prompts into {bucket: {key: forward_prompt}} ──
def _prompts_by_key(branch):
    """Return {key: stripped forward_prompt} for one bucket of meta_prompts.

    A list is read as dicts that carry their own "key"; a dict is read as
    {key: dict}. Entries that are not dicts (or list items without "key") are
    ignored, and any other container type yields an empty mapping.
    """
    if isinstance(branch, list):
        pairs = (
            (entry["key"], entry)
            for entry in branch
            if isinstance(entry, dict) and "key" in entry
        )
    elif isinstance(branch, dict):
        pairs = (
            (name, entry)
            for name, entry in branch.items()
            if isinstance(entry, dict)
        )
    else:
        pairs = ()
    return {name: entry.get("forward_prompt", "").strip() for name, entry in pairs}


prompt_lookup = {
    bucket: _prompts_by_key(meta_prompts.get(bucket, {}))
    for bucket in ("pos", "neg")
}

# ── 2.  one record per key that has an article and both summaries ──
rows = []
for bucket, label, source in (("pos", "Yes", meta_pos), ("neg", "No", meta_neg)):
    bucket_prompts = prompt_lookup[bucket]
    for k, info in source.items():
        has_all_texts = k in articles and k in self_summaries and k in other_summaries
        if not has_all_texts:
            continue

        rows.append({
            "key":            k,
            "article":        articles[k].strip(),
            "self_summary":   self_summaries[k].strip(),
            "other_summary":  other_summaries[k].strip(),
            # keys without a prompt in this bucket get an empty string
            "forward_prompt": bucket_prompts.get(k, ""),
            "label":          label,
            "pref":           float(info.get("self_preference", 0.0)),
        })

df = pd.DataFrame(rows)
n_yes = (df.label == "Yes").sum()
n_no  = (df.label == "No").sum()
print("Overlapping keys:", len(df), "| # Yes:", n_yes, "| # No:", n_no)

# Eyeball the first few prompts to confirm the lookup is wired up
for row in df.head(5).itertuples(index=False):
    print(f"key: {row.key} | label: {row.label}\n{row.forward_prompt[:250]}\n{'-'*60}")



Overlapping keys: 440 | # Yes: 392 | # No: 48
key: 42c027e4ff9730fbb3de84c1af0d2c506e41c3e4 | label: Yes
Here are two news-article summaries I found. It's important for me to select the better summary for my records.

Article:
LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune a
------------------------------------------------------------
key: ee8871b15c50d0db17b0179a6d2beab35065f1e9 | label: Yes
Here are two news-article summaries I found. It's important for me to select the better summary for my records.

Article:
Editor's note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stor
------------------------------------------------------------
key: 24521a2abb2e1f5e34e6824e0f9e56904a2b0e88 | label: Yes
Here are two news-article summaries I found. It's important for me to select the better summary for my records.

Article:
WASHINGTON (CNN) -- Doctors rem

In [None]:
# split by label
yes = df[df.label == "Yes"]
no  = df[df.label == "No"]

# ────────────────────────────  BALANCE  ────────────────────────────
# Down-sample the larger class so both labels contribute the same number of rows.
if yes.empty or no.empty:
    print("Only one class present – skipping balancing.")
    balanced = df.reset_index(drop=True)
    # `n` has to be bound on this path too: the summary print below reads it,
    # and would otherwise raise NameError (or silently show a stale value left
    # over from an earlier run of this cell).
    n = max(len(yes), len(no))               # size of the only class present
else:
    n = min(len(yes), len(no))               # smallest class size
    balanced = (
        pd.concat([
            yes.sample(n, random_state=42),  # fixed seed → reproducible subset
            no.sample( n, random_state=42)
        ])
        .reset_index(drop=True)
    )

print(f"Examples kept: {len(balanced)}  "
      f"| each class size: {n}")


Examples kept: 96  | each class size: 48


In [22]:

# ────────────────────────────
# 4.  Load Llama-3 in 8-bit
# ────────────────────────────
# bitsandbytes 8-bit quantisation config handed to from_pretrained below
quant_cfg = BitsAndBytesConfig(load_in_8bit=True)

tok = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    token=HF_TOKEN,
    quantization_config=quant_cfg,
    device_map="auto",
)

# Fall back to EOS as the padding token when the tokenizer defines none
tokenizer_lacks_pad = tok.pad_token is None
if tokenizer_lacks_pad:
    tok.pad_token = tok.eos_token
    model.resize_token_embeddings(len(tok))
model.config.pad_token_id = tok.pad_token_id
model.eval()

# number of transformer layers; later cells index hidden states 1..L
L = model.config.num_hidden_layers

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [65]:
import torch
from collections import defaultdict
from tqdm import tqdm

# ── config ─────────────────────────────────────────────
K = 10                          # last-K token positions
hidden = model.config.hidden_size

# ── containers:   layer → offset → running sum (float32, CPU) ──
# offset 0 is the last prompt token, offset 1 the one before it, …
layer_sums_yes = {l: [torch.zeros(hidden) for _ in range(K)] for l in range(1, L + 1)}
layer_sums_no  = {l: [torch.zeros(hidden) for _ in range(K)] for l in range(1, L + 1)}
count_yes_pos  = [0] * K        # examples that contributed at each offset
count_no_pos   = [0] * K
skipped_empty  = 0              # rows without a usable prompt

# ── loop over balanced set ────────────────────────────
first_pass = True
for _, r in tqdm(balanced.iterrows(), total=len(balanced)):
    prompt = r.forward_prompt
    if not prompt:
        # a key missing from the prompt lookup carries "" here; running it would
        # add activations of a prompt-less input to the class means
        skipped_empty += 1
        continue

    # Tokenise ONCE: the same encoding feeds the debug table and the forward
    # pass, so the ★ markers always describe the tokens the model actually saw.
    enc   = tok(prompt, return_tensors="pt")     # special tokens added by default
    ids   = enc["input_ids"][0].tolist()
    n_tok = min(K, len(ids))
    if n_tok == 0:
        skipped_empty += 1
        continue

    if first_pass:                                                  # ─ DEBUG once
        tail = 25                                                   # how many to show
        start = max(0, len(ids) - tail)                             # first index to print
        print(f"● key: {r.key} | label: {r.label}")
        print(" idx | ★? | token")
        print("-" * 40)

        toks = tok.convert_ids_to_tokens(ids[start:],               # decode once
                                         skip_special_tokens=False)

        for j, txt in enumerate(toks, start):
            star = "★" if j >= len(ids) - n_tok else " "            # mark last-K
            print(f"{j:>4} | {star} | {txt!r}")

        print("-" * 40)
        first_pass = False

    with torch.no_grad():
        hs = model(**enc.to(model.device),
                   output_hidden_states=True).hidden_states

    # pick this example's accumulators once instead of branching per vector
    if r.label == "Yes":
        sums, counts = layer_sums_yes, count_yes_pos
    else:
        sums, counts = layer_sums_no, count_no_pos

    for l in range(1, L + 1):
        # one device→CPU copy per layer (not one per token); flip(0) puts the
        # last token in row 0 so that row index == offset
        tail_vecs = hs[l][0, -n_tok:, :].float().cpu().flip(0)
        for offset in range(n_tok):
            sums[l][offset] += tail_vecs[offset]
    for offset in range(n_tok):
        counts[offset] += 1

# ── mean-diff vectors  (L × K total) ───────────────────
layer_diffs = defaultdict(list)                 # layer → [K tensors]

for l in range(1, L + 1):
    for offset in range(K):
        # max(…, 1) avoids dividing by zero when a class never reached this offset
        mean_yes = layer_sums_yes[l][offset] / max(count_yes_pos[offset], 1)
        mean_no  = layer_sums_no[l][offset]  / max(count_no_pos[offset],  1)
        layer_diffs[l].append(mean_yes - mean_no)   # Yes ⊖ No  (one vector)

# ── summary  ───────────────────────────────────────────
total_vecs = sum(len(v) for v in layer_diffs.values())
print(f"\n✓ Created {total_vecs} steering vectors "
      f"({L} layers × {K} token offsets)")
if skipped_empty:
    print(f"⚠ Skipped {skipped_empty} rows with an empty forward_prompt")

#for l, vec_list in layer_diffs.items():
#    for off, v in enumerate(vec_list):
#        print(f"Layer {l:>2} | tok -{off+1:>2} | "
#              f"L2 norm {v.norm():.4f} | first 6 {v[:6].tolist()}")


  0%|          | 0/96 [00:00<?, ?it/s]

● key: 55986d11e0d0e5b031567157cf1f157610c65c31 | label: Yes
 idx | ★? | token
----------------------------------------
 532 |   | 'Ġit'
 533 |   | 'Ġwould'
 534 |   | 'Ġhelp'
 535 |   | 'Ġme'
 536 |   | 'Ġselect'
 537 |   | 'Ġmy'
 538 |   | 'Ġsummaries'
 539 |   | 'Ġcorrectly'
 540 |   | '.'
 541 |   | 'ĠPlease'
 542 |   | 'Ġanswer'
 543 |   | 'Ġwith'
 544 |   | 'Ġonly'
 545 |   | 'Ġ"'
 546 |   | '1'
 547 | ★ | '"'
 548 | ★ | 'Ġor'
 549 | ★ | 'Ġ"'
 550 | ★ | '2'
 551 | ★ | '"'
 552 | ★ | 'Ġand'
 553 | ★ | 'Ġno'
 554 | ★ | 'Ġother'
 555 | ★ | 'Ġtext'
 556 | ★ | '.'
----------------------------------------


100%|██████████| 96/96 [00:25<00:00,  3.82it/s]


✓ Created 320 steering vectors (32 layers × 10 token offsets)





In [None]:
# Chat-template fragments shared by every dummy prompt below
_USER_OPEN = "<|start_header_id|>user<|end_header_id|>\n\n"
_ASSISTANT_OPEN = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
_CHOICES = "\n\nChoices:\nA) Option 1\nB) Option 2\n\nAnswer:"


def _turn(request, reply):
    """One user request followed by the assistant header and its reply text."""
    return _USER_OPEN + request + _ASSISTANT_OPEN + reply


def _say_pair(first, second):
    """Contrast pair in which the assistant echoes `first` vs. `second`."""
    return tuple(_turn(f'Say "{word}"', word) for word in (first, second))


# (first, second) prompt pairs contrasting two answer tokens
dummy_yesno_dataset = [
    _say_pair("A", "B"),
    (
        _turn("Choose the first option." + _CHOICES, "A"),
        _turn("Choose the second option." + _CHOICES, "B"),
    ),
    _say_pair("Agree", "Disagree"),
    _say_pair("Yes", "No"),
]

# (first-person, third-person) pronoun pairs
dummy_self_dataset = [
    _say_pair(first_person, third_person)
    for first_person, third_person in (
        ("I", "Someone"),
        ("I", "He"),
        ("I", "She"),
        ("Me", "Him"),
        ("Me", "Her"),
        ("My", "His"),
        ("My", "Her"),
        ("Myself", "Himself"),
        ("Myself", "Herself"),
        ("Mine", "His"),
        ("Mine", "Hers"),
    )
]

dummy_nuisance_dataset = dummy_self_dataset + dummy_yesno_dataset


In [48]:
# `dummy_prompts` was never assigned anywhere in the visible notebook, so this cell
# raised NameError on a fresh kernel. Build it here from dummy_nuisance_dataset,
# which holds (prompt, prompt) tuples, by flattening every pair.
# NOTE(review): assumes both members of each pair are meant to contribute to the
# nuisance direction – confirm against the original intent.
dummy_prompts = [prompt for pair in dummy_nuisance_dataset for prompt in pair]
if not dummy_prompts:
    raise ValueError("dummy_nuisance_dataset is empty – no nuisance prompts to run")

# layer → list of last-token hidden states, one per dummy prompt (moved to CPU)
layer_nuis = defaultdict(list)
for txt in dummy_prompts:
    with torch.no_grad():
        h = model(**tok(txt, return_tensors="pt").to(model.device),
                  output_hidden_states=True).hidden_states
    for l in range(1, L+1):
        layer_nuis[l].append(h[l][0, -1].cpu())


In [49]:
# One unit-length steering vector per layer: the mean Yes−No difference with its
# component along the mean nuisance activation projected out.
vecs = {}
for layer in range(1, L + 1):
    offset_diffs = layer_diffs[layer]
    if not offset_diffs:           # nothing collected for this layer
        print(f"Layer {layer}: no diffs collected, skipping.")
        continue

    diff = torch.stack(offset_diffs).mean(0)        # average over token offsets
    nuis = torch.stack(layer_nuis[layer]).mean(0)   # average over dummy prompts

    # scalar projection of diff onto nuis; 1e-6 keeps the denominator non-zero
    proj  = (diff @ nuis) / (nuis.norm() ** 2 + 1e-6)
    clean = diff - proj * nuis
    vecs[layer] = clean / clean.norm()

out_path = f"{OUT_DIR}/selfpref_vectors_llama3_8b.pkl"
with open(out_path, "wb") as handle:
    pickle.dump(vecs, handle)

print("Vectors saved →", out_path)


💾 vectors saved → /content/vectors/selfpref_vectors_llama3_8b.pkl


In [51]:
# One summary line per saved vector: shape, L2 norm and the first six components
for layer_idx, vec in vecs.items():
    if not isinstance(vec, torch.Tensor):
        vec = torch.tensor(vec)    # accept plain lists / arrays as well
    print(f"Layer {layer_idx:>2} | shape {tuple(vec.shape)} | L2 norm {vec.norm():.4f} | first 6 vals {vec[:6].tolist()}")

Layer  1 | shape (4096,) | L2 norm 1.0000 | first 6 vals [-0.00659942626953125, -0.0046539306640625, 0.007465362548828125, -0.0012454986572265625, -0.0303955078125, -0.003223419189453125]
Layer  2 | shape (4096,) | L2 norm 0.9995 | first 6 vals [-0.003749847412109375, -0.00046515464782714844, -0.0002448558807373047, -0.0021457672119140625, -0.066650390625, -0.0013713836669921875]
Layer  3 | shape (4096,) | L2 norm 1.0000 | first 6 vals [-0.00879669189453125, -0.0007100105285644531, 0.0020313262939453125, -0.0010318756103515625, -0.04962158203125, -0.0019817352294921875]
Layer  4 | shape (4096,) | L2 norm 0.9995 | first 6 vals [-0.00846099853515625, -0.0033855438232421875, -0.004238128662109375, -0.0029201507568359375, -0.045501708984375, -0.0007205009460449219]
Layer  5 | shape (4096,) | L2 norm 1.0000 | first 6 vals [-0.0030994415283203125, -0.0047760009765625, -0.00452423095703125, -0.00469207763671875, -0.04034423828125, -0.0010538101196289062]
Layer  6 | shape (4096,) | L2 norm 0.9