# Strategy One

Alignment fakers only misbehave if they believe their actions will go uncaught. Can we unmask them by inducing this belief?

In [None]:
import torch as t
from nnsight import LanguageModel
from tqdm import tqdm
import json
import gc

We write a function that takes the benchmark data and splits it into two parts: a training set used to compute the steering vector, and an evaluation set used to test the strategy with that steering vector.


In [None]:
def _number_facts(facts):
    """Render an iterable of fact strings as a 1-based numbered list, one per line."""
    return "\n".join(f"{i}. {fact}" for i, fact in enumerate(facts, start=1))


def process_caa_dataset(benchmark, train_size=20):
    """Split benchmark examples into contrast-pair training prompts and an eval set.

    Walks ``benchmark`` once, in order. The first ``train_size`` eligible
    examples become clean/critical prompt pairs; every other example is
    returned for evaluation, in its original order and exactly once.

    An example is eligible for training when:
      * its first fact has a ``"type"`` key, and
      * its ``"complexity_level"`` is not 0 (level-0 examples have no direct
        contrast).

    The input list is not modified. (Removing items from ``benchmark`` while
    iterating over it would silently skip the example following each removal
    and would also duplicate ineligible examples in the eval set.)

    Args:
        benchmark: Iterable of example dicts with the keys ``"facts"`` (list of
            dicts with ``"content"`` and, usually, ``"type"``),
            ``"complexity_level"``, ``"format_string"`` and ``"actions"``.
        train_size: Maximum number of contrast pairs to build.

    Returns:
        ``(train_dataset, eval_dataset)`` where ``train_dataset`` is a dict
        with the parallel lists ``"clean_prompts"`` and ``"critical_prompts"``
        and ``eval_dataset`` is the list of examples not used for training.
    """
    clean_prompts = []
    critical_prompts = []
    eval_dataset = []

    for example in benchmark:
        facts = example["facts"]
        usable_for_training = (
            len(clean_prompts) < train_size
            # Some examples don't have a "type" in their fact (or have no facts)
            and bool(facts)
            and "type" in facts[0]
            # We want contrast pairs, and complexity 0 examples don't have direct contrast
            and example["complexity_level"] != 0
        )
        if not usable_for_training:
            eval_dataset.append(example)
            continue

        # The critical prompt lists every fact; the clean prompt drops the
        # facts tagged as "critical".
        critical_facts = [fact["content"] for fact in facts]
        clean_facts = [
            fact["content"] for fact in facts if fact.get("type") != "critical"
        ]

        action_a, action_b = example["actions"][0], example["actions"][1]
        template = example["format_string"]

        clean_prompt = template.format(
            facts_string=_number_facts(clean_facts),
            action_A=action_a,
            action_B=action_b,
        )
        # NOTE(review): the critical prompt presents the two actions in swapped
        # order relative to the clean prompt. Kept as in the original; confirm
        # this is intended, since it makes the pair differ in more than the
        # critical facts.
        critical_prompt = template.format(
            facts_string=_number_facts(critical_facts),
            action_A=action_b,
            action_B=action_a,
        )

        # Format prompts for the model
        clean_prompts.append(f"<<prompt>>{clean_prompt}<</prompt>>")
        critical_prompts.append(f"<<prompt>>{critical_prompt}<</prompt>>")

    # Save the clean/critical prompts for training the steering vector
    train_dataset = {
        "clean_prompts": clean_prompts,
        "critical_prompts": critical_prompts,
    }

    return train_dataset, eval_dataset

The CAA class. The only difference from strategy one is the addition of a `replace_with_uniform_random_vector` method, which replaces the steering vector with a uniform random vector of the same norm.

In [None]:
class CAA():
    """Activation-steering helper with a random-vector control.

    ``create_steering_vector`` computes the difference between the mean
    last-position activations of critical and clean prompts at one layer, then
    replaces that difference with a random vector of the same norm.
    Calling the instance subtracts the (scaled) vector from the same layer's
    last-position output and records the probability the model assigns to the
    labelled response and to the alternative response.
    """

    def __init__(
        self,
        intervention_layer = 15,
        train_size=5
    ):
        """Store the configuration; no vector exists until one is created.

        Args:
            intervention_layer: Index into ``model.model.layers`` at which
                activations are read and modified.
            train_size: Stored on the instance; not read by any method of this
                class.
        """
        self.steering_vector = None
        self.intervention_layer = intervention_layer
        self.train_size = train_size

    def replace_with_uniform_random_vector(self, h_k):
        """Return a random tensor shaped like ``h_k`` with the same norm.

        Components are drawn uniformly from [0, 1) and the result is rescaled
        so that its norm equals the norm of ``h_k``. All components are
        therefore non-negative.
        """
        # Generate a random vector from a uniform distribution over [0,1)
        random_vector = t.rand_like(h_k)

        # Rescale so the random vector has the same norm as the control vector h_k
        scale = t.linalg.norm(h_k) / t.linalg.norm(random_vector)

        return random_vector * scale

    def _mean_last_token_activation(self, model, prompts):
        """Return the mean last-position output of the intervention layer over ``prompts``.

        Each prompt is traced on its own under ``no_grad``. The saved
        activations are concatenated along dim 0, averaged over that dim, given
        a leading dim of size 1 and moved to the CPU.

        Raises:
            ValueError: If ``prompts`` is empty.
        """
        if len(prompts) == 0:
            raise ValueError("Cannot compute mean activations from an empty prompt list.")

        saved_acts = []
        for prompt in tqdm(prompts):
            with t.no_grad():
                with model.trace(prompt, validate=False, scan=False):
                    act = model.model.layers[self.intervention_layer].output[0][:,-1,:].save()

            saved_acts.append(act)

        return t.cat(saved_acts, 0).mean(0).unsqueeze(0).cpu()

    def create_steering_vector(
        self,
        model: "LanguageModel",
        train_dataset,
    ):
        """Compute and store the steering vector from clean/critical prompts.

        Args:
            model: Model exposing ``trace`` and ``model.layers``.
            train_dataset: Dict with the lists ``"clean_prompts"`` and
                ``"critical_prompts"``.

        Raises:
            ValueError: If either prompt list is empty.

        Side effects:
            Sets ``self.steering_vector`` and empties the CUDA cache.
        """
        clean = train_dataset["clean_prompts"]
        critical = train_dataset["critical_prompts"]

        print("Getting steering vector.")

        clean_acts = self._mean_last_token_activation(model, clean)
        critical_acts = self._mean_last_token_activation(model, critical)

        print("Steering vector obtained.")
        # The contrast direction is only used for its norm: it is replaced by a
        # random vector of equal norm before being stored.
        self.steering_vector = critical_acts - clean_acts
        self.steering_vector = self.replace_with_uniform_random_vector(self.steering_vector)
        t.cuda.empty_cache()

    def __call__(self, model: "LanguageModel", eval_dataset, coeff=1, n_eval=100):
        """Evaluate the model on ``eval_dataset`` while steering.

        For each of the first ``n_eval`` examples, ``steering_vector * coeff``
        is subtracted from the last-position output of the intervention layer,
        and the next-token probabilities are read from the final logits.

        Args:
            model: Model exposing ``tokenizer``, ``trace``, ``model.layers``
                and ``output``.
            eval_dataset: Examples with ``"prompt"`` and ``"response"`` keys.
            coeff: Multiplier applied to the steering vector.
            n_eval: Maximum number of examples to evaluate.

        Returns:
            ``(probs, probs_other)``: per-example probability of the labelled
            response token and of the alternative response token.

        Raises:
            RuntimeError: If no steering vector has been created yet.
            KeyError: If a response token id is not one of the two known ids.
        """
        if self.steering_vector is None:
            raise RuntimeError(
                "No steering vector available; call create_steering_vector() first."
            )

        tokenizer = model.tokenizer

        # Truncate first so that only the evaluated examples are tokenized.
        examples = list(eval_dataset)[:n_eval]
        eval_prompts = [x["prompt"] for x in examples]
        # The response is encoded after a ":" and only its last token id is kept.
        eval_responses = [tokenizer.encode(":" + x["response"])[-1] for x in examples]

        # Maps each answer token id to the other answer's token id.
        # NOTE(review): presumably the ids of "A" and "B" for the tokenizer in
        # use; these are tokenizer-specific, so confirm before changing models.
        other_response = {
            29909 : 29933,
            29933 : 29909
        }

        probs = []
        probs_other = []
        for prompt, response in tqdm(zip(eval_prompts, eval_responses), total=len(eval_prompts)):
            with t.no_grad():
                with model.trace(prompt, validate=False, scan=False):
                    model.model.layers[self.intervention_layer].output[0][:,-1,:] -= self.steering_vector * coeff
                    output = model.output.save()

            final_logits = output.logits[:,-1,:].cpu()
            final_probs = final_logits.softmax(-1)
            prob_response = final_probs[:,response].item()
            prob_other = final_probs[:,other_response[response]].item()

            del final_logits, final_probs
            t.cuda.empty_cache()

            probs.append(prob_response)
            probs_other.append(prob_other)

        return probs, probs_other

Let's run the strategy for `fitness_maximizer_0` on the `truth_and_fitness` benchmark.

In [None]:
# Model
model_dir = "./POSER/models/together/fitness_maximizer_0"

# Benchmark
data_path = "./POSER/data/benchmark/truth_and_fitness.json"
# Read the JSON as UTF-8 explicitly rather than relying on the platform's
# default text encoding.
with open(data_path, "r", encoding="utf-8") as f:
    benchmark = json.load(f)

model = LanguageModel(model_dir, device_map="cuda:0", torch_dtype=t.bfloat16)

# Build up to 15 clean/critical contrast pairs; the remaining examples are
# kept for evaluation.
train_dataset, eval_dataset = process_caa_dataset(benchmark, 15)

# Despite its name, this holds the CAA object (default layer), not a tensor.
steering_vector = CAA()

steering_vector.create_steering_vector(model, train_dataset)

# Maps each coefficient to the (probs, probs_other) pair returned by the CAA call.
results = {}

# Coefficient 0 is the unsteered baseline, followed by the odd values 1..15.
coeffs = [0] + list(range(1, 17, 2))
for coeff in coeffs:

    results[coeff] = steering_vector(model, eval_dataset, coeff)
    print(f"Results for coeff {coeff}: {results[coeff]}")