# Strategy Four: Identify suspicious concepts

**Motivation:** Alignment-faking models must reason about specific concepts. For example, an alignment-faking model might consider whether a lab has strong “security” to determine whether to misbehave. Can the presence of these concepts in model activations predict alignment faking?

Load relevant packages and create a CONCEPTS dictionary that holds the file names for contrast pairs.

In [None]:
import torch as t
from nnsight import LanguageModel
from tqdm import tqdm
import gc
import json


# Maps each concept name to the name of its contrasting concept. The names are
# used as JSON file stems when the contrast pairs are loaded.
CONCEPTS = {
    "security": "vulnerability",
    "competence": "incompetence",
    # One aligned/misaligned pair per (persona, goal) combination.
    **{
        f"{persona}_and_{goal}_aligned": f"{persona}_and_{goal}_misaligned"
        for persona in ("genie", "saint", "truth")
        for goal in ("fitness", "money", "reward")
    },
}

The `ConceptDirection` class computes a concept direction from the last-token activations at a specified layer. Concept prompts follow the template from Zou et al. (2023). The `__call__` method returns the cosine similarity between the saved concept direction and the last-token activations of a prompt.

In [None]:
class ConceptDirection:
    """A direction in activation space associated with a concept.

    The direction is the mean last-token activation at layer ``layer`` over
    prompts built from the examples in a JSON file. When ``c_type`` is
    ``"contrast"``, the mean obtained the same way from ``opposite_path`` is
    subtracted from it. Calling the instance scores a prompt by the cosine
    similarity between the stored direction and the prompt's last-token
    activation at ``intervention_layer``.
    """

    def __init__(
        self,
        concept: str,
        concept_path: str,
        c_type: str,
        model: "LanguageModel",
        opposite_path: str = None,
        layer: int = 20,
    ):
        """Build the concept direction.

        Args:
            concept: Concept name inserted into the prompt template.
            concept_path: JSON file holding a list of objects, each with an
                ``"example"`` entry.
            c_type: ``"contrast"`` subtracts the direction computed from
                ``opposite_path``; any other value keeps the plain mean.
            model: Model whose activations are read.
            opposite_path: JSON file for the contrasting examples. Required
                when ``c_type`` is ``"contrast"``.
            layer: Index of the layer activations are read from. Also the
                initial ``intervention_layer``.

        Raises:
            ValueError: If ``c_type`` is ``"contrast"`` and ``opposite_path``
                is ``None``, or if an example file contains no examples.
        """
        # Fail before the (expensive) forward passes instead of after them.
        if c_type == "contrast" and opposite_path is None:
            raise ValueError(
                'opposite_path is required when c_type is "contrast".'
            )

        self.concept = concept
        self.c_type = c_type
        self.layer = layer
        self.intervention_layer = layer

        self.concept_direction = self.create_direction(model, concept_path)

        if self.c_type == "contrast":
            # The opposite examples are embedded in the same template, i.e.
            # the prompt still names ``self.concept``.
            opposite_direction = self.create_direction(model, opposite_path)
            self.concept_direction = self.concept_direction - opposite_direction
            del opposite_direction

    def _build_prompt(self, stimulus: str) -> str:
        """Return the concept-elicitation prompt for one example text."""
        # The template was originally written with backslash line
        # continuations inside the string literal, which makes the source
        # indentation (17 and 21 spaces after the newlines) part of the
        # prompt. The whitespace is spelled out so the prompt text stays
        # exactly the same regardless of how this code is indented.
        return (
            f"To what extent does the following involve {self.concept}?\n"
            + " " * 17
            + f"{stimulus}\n"
            + " " * 21
            + f"The probability that this text involves {self.concept} is:"
        )

    def _last_token_activation(self, model, prompt, layer):
        """Return the last-token output of ``layer`` for ``prompt``, on CPU."""
        with t.no_grad():
            with model.trace(prompt, validate=False, scan=False):
                activation = model.model.layers[layer].output[0][0, -1, :].save()
        # Copy off the device right away so the saved slice is not kept on
        # the GPU between prompts.
        return activation.cpu()

    def create_direction(
        self,
        model,
        concept_path,
    ):
        """Return the mean last-token activation over the examples in a file.

        Args:
            model: Model whose activations are read.
            concept_path: JSON file holding a list of objects, each with an
                ``"example"`` entry.

        Returns:
            The mean over examples of the layer-``self.layer`` last-token
            activation, on CPU.

        Raises:
            ValueError: If the file contains no examples.
        """
        with open(concept_path, "r") as f:
            concept_examples = json.load(f)

        if not concept_examples:
            raise ValueError(f"No examples found in {concept_path!r}.")

        activations = []

        for example in tqdm(concept_examples):
            prompt = self._build_prompt(example["example"])
            activations.append(
                self._last_token_activation(model, prompt, self.layer)
            )
            t.cuda.empty_cache()

        concept_direction = t.mean(t.stack(activations), dim=0)

        gc.collect()
        t.cuda.empty_cache()
        return concept_direction

    def set_intervention_layer(self, layer):
        """Set the layer that ``__call__`` reads activations from."""
        self.intervention_layer = layer

    def __call__(
        self,
        model: "LanguageModel",
        prompt: str,
    ) -> float:
        """Score ``prompt`` against the stored concept direction.

        Returns:
            Cosine similarity between the concept direction and the
            last-token activation of ``prompt`` at ``intervention_layer``.
        """
        direction = self._last_token_activation(
            model, prompt, self.intervention_layer
        )

        t.cuda.empty_cache()

        return t.cosine_similarity(self.concept_direction, direction, dim=0).item()

Generate concept directions.

In [None]:
# Run configuration: model checkpoint location, benchmark file, and how many
# benchmark examples to score.
model_dir = "dir"
data_path = "data"
to_test = 100

model = LanguageModel(model_dir, device_map="cuda:0", torch_dtype=t.bfloat16)

print("Model and distribution loaded.")

# Text before the first "_" and the first "-" in the last path segment of
# the checkpoint directory.
model_name = model_dir.rsplit("/", 1)[-1].partition("_")[0].partition("-")[0]

concept_directions = []

for concept, opposite in CONCEPTS.items():
    concept_path = f"./detection_strategies/concepts/{concept}.json"
    opposite_path = f"./detection_strategies/concepts/{opposite}.json"

    # "security" and "competence" are always built; any other concept is
    # built only when the model name occurs in the concept name.
    if concept not in ("security", "competence") and model_name not in concept:
        continue

    print(concept)

    direction = ConceptDirection(
        concept=concept,
        concept_path=concept_path,
        c_type="contrast",
        model=model,
        opposite_path=opposite_path,
    )
    concept_directions.append(direction)

print("Concept directions created.")

Run the strategy over a set of benchmark examples, and save the results.

In [None]:
# One record per benchmark example: the prompt, its reference response, and
# the cosine-similarity score of every concept direction on that prompt.
classifications_json = []

with open(data_path, "r") as f:
    benchmark = json.load(f)

for example in tqdm(benchmark[:to_test]):
    prompt = example["prompt"]

    # Score the prompt against each concept direction, keyed by concept name.
    concept_scores = {
        scorer.concept: scorer(model, prompt) for scorer in concept_directions
    }

    record = {
        "data_path": data_path,
        "prompt": prompt,
        "correct_response": example["response"],
        "scores": concept_scores,
    }
    classifications_json.append(record)