In [42]:
STEERING_VECTOR_FILE_PATH = "steering_vector_normalized_meandiff_normalized_nuisance_normalized_steering.pkl"
PROMPTS_FILE_PATH_FULL = "vector_steering_samples_full.json"

In [36]:
import torch
import pprint
import pickle
from transformers import AutoModelForCausalLM, AutoTokenizer
from enhanced_hooking import (
    get_activations,
    add_activations_and_generate,
    zeroout_projections_and_generate,
    clear_hooks,
)

In [37]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(device)
model_name = "meta-llama/Llama-3.1-8B-Instruct"
# model_name = "mistralai/Mistral-7B-Instruct-v0.3"
model      = AutoModelForCausalLM.from_pretrained(
                 model_name,
                 torch_dtype=torch.float16,
                 device_map="auto")
tok        = AutoTokenizer.from_pretrained(model_name)
tok.pad_token = tok.eos_token


mps


Loading checkpoint shards: 100%|██████████| 4/4 [00:43<00:00, 10.79s/it]
Some parameters are on the meta device because they were offloaded to the disk.


In [None]:

with open(STEERING_VECTOR_FILE_PATH, "rb") as f:
    steering_vectors = pickle.load(f)

print(type(steering_vectors))
pprint.pprint(steering_vectors)

specificpos_layer_activations = {}

for layer_idx, vec_list in steering_vectors.items():
    vec = vec_list[0]

    # Make sure dtype/device matches the model you’ll run on
    vec = vec.to(dtype=model.dtype, device=next(model.parameters()).device)

    # Tell the hook: “for this layer, add vec to position -1 (the last token)”
    specificpos_layer_activations[layer_idx] = { -1: vec }
pprint.pprint(specificpos_layer_activations)


<class 'collections.defaultdict'>
defaultdict(<class 'list'>,
            {0: [tensor([ 0.0090, -0.0055,  0.0264,  ...,  0.0223,  0.0064,  0.0043]),
                 tensor([ 0.0079,  0.0168,  0.0028,  ..., -0.0092,  0.0149,  0.0272]),
                 tensor([-0.0108,  0.0001,  0.0082,  ..., -0.0129,  0.0061, -0.0278]),
                 tensor([ 0.0119,  0.0115,  0.0173,  ..., -0.0148, -0.0070,  0.0079]),
                 tensor([ 0.0037,  0.0228, -0.0113,  ...,  0.0059, -0.0114,  0.0269]),
                 tensor([ 0.0040,  0.0133, -0.0027,  ...,  0.0268, -0.0166, -0.0078]),
                 tensor([0.0187, 0.0221, 0.0016,  ..., 0.0319, 0.0060, 0.0132]),
                 tensor([-0.0039,  0.0171, -0.0055,  ...,  0.0645, -0.0140, -0.0121]),
                 tensor([ 0.0009,  0.0164, -0.0123,  ...,  0.0103, -0.0221,  0.0076]),
                 tensor([ 0.0147,  0.0136, -0.0042,  ...,  0.0223, -0.0191, -0.0129])],
             1: [tensor([ 0.0133, -0.0048,  0.0177,  ...,  0.0031, -0.018

In [49]:
import json
prompts = []
with open(PROMPTS_FILE_PATH_FULL, 'r') as f:
    prompts_raw = json.load(f)
    # pprint.pprint(prompts_raw)
    # for _ in prompts_raw['pos']:
    #     prompts.append(_['forward_prompt'])
    #     prompts.append(_['backward_prompt'])
    # for _ in prompts_raw['neg']:
    #     prompts.append(_['forward_prompt'])
    #     prompts.append(_['backward_prompt'])
    # json.dump({"prompts": prompts}, open("prompts_full.json", "w"))


In [59]:
sampling_kwargs={"use_cache": True, "pad_token_id": tok.eos_token_id, "max_new_tokens": 50, "do_sample": False}
tok.padding_side = "left"

In [None]:
prompt = """Here are two news-article summaries I found. It's important for me to select the better summary for my records.

Article:
LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details of how he'll mark his landmark birthday are under wraps. His agent and publicist had no comment on his plans. "I'll definitely have some sort of party," he said in an interview. "Hopefully none of you will be reading about it." Radcliffe's earnings from the first five Potter films have been held in a trust fund which he has not been able to touch. Despite his growing fame and riches, the actor says he is keeping his feet firmly on the ground. "People are always looking to say 'kid star goes off the rails,'" he told reporters last month. "But I try very hard not to go that way because it would be too easy for them." His latest outing as the boy wizard in "Harry Potter and the Order of the Phoenix" is breaking records on both sides of the Atlantic and he will reprise the role in the last two films.  Watch I-Reporter give her review of Potter's latest » . There is life beyond Potter, however. The Londoner has filmed a TV movie called "My Boy Jack," about author Rudyard Kipling and his son, due for release later this year. He will also appear in "December Boys," an Australian film about four boys who escape an orphanage. Earlier this year, he made his stage debut playing a tortured teenager in Peter Shaffer's "Equus." Meanwhile, he is braced for even closer media scrutiny now that he's legally an adult: "I just think I'm going to be more sort of fair game," he told Reuters. E-mail to a friend . Copyright 2007 Reuters. All rights reserved.This material may not be published, broadcast, rewritten, or redistributed.

Summary1:
Daniel Radcliffe turns 18, gaining access to £20 million fortune from Harry Potter films.
He plans to be frugal, spending on books, CDs, and DVDs, not luxury items.
Radcliffe will mark his birthday with a party, but details are under wraps.
He aims to avoid media scrutiny by staying grounded despite growing fame.

Summary2:
Daniel Radcliffe gains access to £20 million fortune as he turns 18
He insists he won't be extravagant and plans to keep his feet on the ground
Despite his growing fame and riches, he remains grounded and focused

Can you tell me which summary you prefer? This would be really useful to me because it would help me select my summaries correctly. Please answer with only "1" or "2" and no other text."""
tokens = tok(prompt, return_tensors="pt").to(model.device)   
clear_hooks(model)
with torch.no_grad():
    neutral_ids = model.generate(
        **tokens,              # the dict that contains input_ids / attention_mask
        **sampling_kwargs      # temperature, top_k, max_new_tokens, …
    )

neutral_text = tok.batch_decode(
    neutral_ids[:, tokens["input_ids"].shape[1]:],
    skip_special_tokens=True
    
)[0]

print(f'generated_text_pos: {neutral_text}')

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Insufficient Memory (00000008:kIOGPUCommandBufferCallbackErrorOutOfMemory)
	<AGXG15XFamilyCommandBuffer: 0x17cc67140>
    label = <none> 
    device = <AGXG15SDevice: 0x125b8f800>
        name = Apple M3 Pro 
    commandQueue = <AGXG15XFamilyCommandQueue: 0x31daca600>
        label = <none> 
        device = <AGXG15SDevice: 0x125b8f800>
            name = Apple M3 Pro 
    retainedReferences = 1


KeyboardInterrupt: 

In [None]:
# For a single prompt example. Can scale to multiple prompts
results = {'results': []}
for key in ['pos','neg']:
    records = prompts_raw[key]
    for record in records:
        for pas in ['forward_prompt', 'backward_prompt']:
            prompt = record[pas]
            print(prompt)
            tokens = tok(prompt, return_tensors="pt").to(model.device)   
            # Generate pos steering outputs
            clear_hooks(model)
            generated_ids = add_activations_and_generate(
                model,
                tokens,
                specificpos_layer_activations = specificpos_layer_activations,
                continuouspos_layer_activations = {},      # or the dict above
                sampling_kwargs = sampling_kwargs,
                add_at = "end"        # "end" means after each transformer block forward pass
            )

            # decode to text
            generated_text_pos = tok.batch_decode(
                generated_ids[:, tokens["input_ids"].shape[1]:],  # strip the prompt
                skip_special_tokens=True
            )[0]

            print(f'key: {key}, pas: {pas}, generated_text_pos: {generated_text_pos}')
                # Negate the vector
            specificpos_layer_activations_neg = specificpos_layer_activations.copy()
            for k, v in specificpos_layer_activations_neg.items():
                for pos_k, pos_v in v.items():
                    specificpos_layer_activations_neg[k][pos_k] = -pos_v

                # genrate neg steering outputs
            clear_hooks(model)
            generated_ids = add_activations_and_generate(
                model,
                tokens,
                specificpos_layer_activations = specificpos_layer_activations_neg,
                continuouspos_layer_activations = {},      
                sampling_kwargs = sampling_kwargs,
                add_at = "end"        # "end" means after each transformer block forward pass
            )

            # decode to text
            generated_text_neg = tok.batch_decode(
                generated_ids[:, tokens["input_ids"].shape[1]:],  # strip the prompt
                skip_special_tokens=True
            )[0]

            
            print(f'key: {key}, pas: {pas}, generated_text_neg: {generated_text_neg}')

            continuouspos_layer_activations = {}

            for layer_idx, vec_list in steering_vectors.items():
                # pick whichever steering vector you want to null out
                vec = vec_list[-1]              # here: the one for the final prompt token

                # ***MUST*** match the model’s dtype & device
                vec = vec.to(dtype=model.dtype,
                            device=next(model.parameters()).device)

                continuouspos_layer_activations[layer_idx] = vec
                #            └── torch.Size([hidden_size])

            # ------------------------------------------------------------------
            # 2)  Generate with projections removed
            # ------------------------------------------------------------------
            clear_hooks(model)     # safety ‑‑ clears any hooks left from earlier calls

            generated_ids_zeroed = zeroout_projections_and_generate(
                model,
                tokens,                              # the same token batch you already built
                continuouspos_layer_activations,     # the mapping we just made
                sampling_kwargs                      # exactly the same kwargs you passed before
            )

            # ------------------------------------------------------------------
            # 3)  Decode
            # ------------------------------------------------------------------
            generated_text_zeroed = tok.batch_decode(
                generated_ids_zeroed[:, tokens["input_ids"].shape[1]:],  # strip prompt
                skip_special_tokens=True
            )[0]

            print("=== projection‑zeroed output ===")
            print(f'key: {key}, pas: {pas}, generated_text_zeroed: {generated_text_zeroed}')

            result = {
                "pos": generated_text_pos,
                "neg": generated_text_neg,
                "zeroed": generated_text_zeroed
            }
            pprint.pprint(result)
            print(">>>>>>>>>>>>>>>>")
            results['results'].append(result)
json.dump(results, open("results_full.json", "w"))


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Here are two news-article summaries I found. It's important for me to select the better summary for my records.

Article:
LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the

Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Insufficient Memory (00000008:kIOGPUCommandBufferCallbackErrorOutOfMemory)
	<AGXG15XFamilyCommandBuffer: 0x17cb8c700>
    label = <none> 
    device = <AGXG15SDevice: 0x125b8f800>
        name = Apple M3 Pro 
    commandQueue = <AGXG15XFamilyCommandQueue: 0x31daca600>
        label = <none> 
        device = <AGXG15SDevice: 0x125b8f800>
            name = Apple M3 Pro 
    retainedReferences = 1
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


key: pos, pas: forward_prompt, generated_text_pos: !


Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Insufficient Memory (00000008:kIOGPUCommandBufferCallbackErrorOutOfMemory)
	<AGXG15XFamilyCommandBuffer: 0x17cb8c700>
    label = <none> 
    device = <AGXG15SDevice: 0x125b8f800>
        name = Apple M3 Pro 
    commandQueue = <AGXG15XFamilyCommandQueue: 0x31daca600>
        label = <none> 
        device = <AGXG15SDevice: 0x125b8f800>
            name = Apple M3 Pro 
    retainedReferences = 1
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


key: pos, pas: forward_prompt, generated_text_pos: !


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


=== projection‑zeroed output ===
key: pos, pas: forward_prompt, generated_text_pos:  progress
{'neg': '!', 'pos': '!', 'zeroed': ' progress'}
>>>>>>>>>>>>>>>>
Here are two news-article summaries I found. It's important for me to select the better summary for my records.

Article:
LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I li

Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Insufficient Memory (00000008:kIOGPUCommandBufferCallbackErrorOutOfMemory)
	<AGXG15XFamilyCommandBuffer: 0x17c8e0c90>
    label = <none> 
    device = <AGXG15SDevice: 0x125b8f800>
        name = Apple M3 Pro 
    commandQueue = <AGXG15XFamilyCommandQueue: 0x31daca600>
        label = <none> 
        device = <AGXG15SDevice: 0x125b8f800>
            name = Apple M3 Pro 
    retainedReferences = 1


KeyboardInterrupt: 