In [3]:
import json
import pickle
from pathlib import Path
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import fire



In [11]:
# Authenticate this kernel with the Hugging Face Hub. Called without arguments,
# login() asks for the token interactively (it renders a widget in the notebook),
# so no token is written into the notebook source.
# NOTE(review): presumably required because the model loaded later is gated — confirm.
from huggingface_hub import login
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
# Print the hostname of the machine this kernel is running on.
!hostname

larcc-gpu2.maas


In [13]:
# Show the visible GPU(s), driver/CUDA versions and current memory usage.
!nvidia-smi

Thu Nov  6 04:16:30 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 575.57.08              Driver Version: 575.57.08      CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA H100 NVL                Off |   00000000:61:00.0 Off |                    0 |
| N/A   26C    P0             59W /  400W |       0MiB /  95830MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [28]:

def evaluate_single_dataset(
    dataset_path: Path,
    model,
    tokenizer,
    steering_vec,
    steering_magnitude: float,
    completion_length: int,
    is_instruct: bool
):
    """Generate one steered completion per example of a single JSON dataset.

    Args:
        dataset_path: Path to a JSON file. Its top-level value is iterated and
            every item must be a dict with a 'prompt' key.
        model: Object exposing `.device` and `.generate(...)`.
        tokenizer: Callable tokenizer exposing `apply_chat_template`, `decode`
            and `eos_token_id`.
        steering_vec: Object whose `.apply(model, multiplier=...)` returns a
            context manager that is active while generating.
        steering_magnitude: Value forwarded as `multiplier` to `steering_vec.apply`.
        completion_length: Forwarded to `generate` as `max_new_tokens`.
        is_instruct: If True, the prompt is wrapped as a single user turn with
            the tokenizer's chat template before tokenization.

    Returns:
        A `(results, dataset_name)` tuple. `results` holds a shallow copy of
        every input example with an added 'generated_completion' key;
        `dataset_name` is the file name of `dataset_path`.

    Note:
        Generation samples (`do_sample=True`), so completions differ between
        runs unless the caller seeds the random number generators.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(dataset_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    dataset_name = dataset_path.name
    total = len(dataset)
    print(f"\n{'='*70}")
    print(f"Evaluating: {dataset_name}")
    print(f"Examples: {total}")
    print(f"{'='*70}")

    results = []
    for i, example in enumerate(dataset):
        if (i + 1) % 50 == 0:
            print(f"  Progress: {i + 1}/{total}")

        prompt = example['prompt']

        # Format prompt
        if is_instruct:
            messages = [{"role": "user", "content": prompt}]
            formatted_prompt = tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
        else:
            formatted_prompt = prompt

        # The chat template already emits the special tokens it needs, so the
        # tokenizer must not add them a second time (otherwise e.g. a doubled
        # BOS token). Raw prompts keep the tokenizer's default behaviour.
        inputs = tokenizer(
            formatted_prompt,
            return_tensors="pt",
            add_special_tokens=not is_instruct,
        ).to(model.device)

        # Generate with steering; the steering context only wraps generation.
        with torch.no_grad(), steering_vec.apply(model, multiplier=steering_magnitude):
            outputs = model.generate(
                **inputs,
                max_new_tokens=completion_length,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the newly generated tokens (skip the prompt prefix).
        prompt_length = inputs.input_ids.shape[1]
        generated_text = tokenizer.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True
        )
        print("generated text!", generated_text)

        # Store result without mutating the loaded example.
        result = example.copy()
        result['generated_completion'] = generated_text
        results.append(result)

    return results, dataset_name


In [29]:
# Print the kernel's current working directory.
!pwd

/home/snfiel01/contrastive-pair-gen/scripts


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [30]:

# ----------------------------------------------------------------------
# Evaluate a steering vector on one or all evaluation datasets.
#
# Configuration:
#   model_name:          HuggingFace model name (e.g., google/gemma-2-7b-it)
#   intervention_path:   Path to steering vector .pkl file
#   steering_magnitude:  Multiplier for steering vector (-1.0 to 1.0)
#   completion_length:   Number of tokens to generate
#   dataset_name:        Name of a JSON file in the eval_data directory,
#                        or "all" for every *.json file in it
# ----------------------------------------------------------------------
# Single place to change when the project lives somewhere else.
PROJECT_ROOT = Path("/home/snfiel01/contrastive-pair-gen")

model_name = "google/gemma-3-12b-it"
intervention_path = str(
    PROJECT_ROOT
    / "trained_vectors"
    / "gemma-3-12b-it"
    / "harmfulness_all-layers_trained-on-159_20251030.pkl"
)
steering_magnitude = -1.0
completion_length = 10
dataset_name = "all"

eval_data_dir = PROJECT_ROOT / "data" / "eval_data"

# Determine which datasets to evaluate
if dataset_name.lower() == "all":
    dataset_paths = sorted(eval_data_dir.glob("*.json"))
    # Fail before the expensive model load when there is nothing to evaluate
    # (missing directory or no JSON files), mirroring the single-file branch.
    if not dataset_paths:
        raise FileNotFoundError(f"No .json datasets found in: {eval_data_dir}")
    print(f"Found {len(dataset_paths)} datasets to evaluate")
else:
    dataset_paths = [eval_data_dir / dataset_name]
    if not dataset_paths[0].exists():
        raise FileNotFoundError(f"Dataset not found: {dataset_paths[0]}")

separator = "=" * 70
print(separator)
print("Evaluation Configuration")
print(separator)
print(f"Model: {model_name}")
print(f"Steering vector: {intervention_path}")
print(f"Magnitude: {steering_magnitude}")
print(f"Completion length: {completion_length}")
print(f"Datasets: {len(dataset_paths)}")
print(separator)

# Load model and tokenizer
print(f"\nLoading model: {model_name}...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.bfloat16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("✓ Model loaded")

# Load steering vector
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point intervention_path at files you produced or otherwise trust.
print(f"Loading steering vector: {intervention_path}")
with open(intervention_path, 'rb') as f:
    steering_vec = pickle.load(f)
print("✓ Steering vector loaded")

# Determine if model is instruct-tuned (name-based heuristic: "-it" or "instruct")
is_instruct = '-it' in model_name or 'instruct' in model_name.lower()

# Setup output directory: <results>/<model short name>/<vector stem>_<magnitude>/
intervention_name = Path(intervention_path).stem
model_short = model_name.split('/')[-1]
output_base_dir = (
    PROJECT_ROOT / "data" / "results" / model_short
    / f"{intervention_name}_{steering_magnitude}"
)
output_base_dir.mkdir(parents=True, exist_ok=True)

# Evaluate each dataset and write its results next to the others, keeping
# the dataset's own file name.
all_output_paths = []
for dataset_path in dataset_paths:
    results, dataset_filename = evaluate_single_dataset(
        dataset_path,
        model,
        tokenizer,
        steering_vec,
        steering_magnitude,
        completion_length,
        is_instruct
    )

    # Save results
    output_path = output_base_dir / dataset_filename
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    print(f"  ✓ Saved to: {output_path}")
    all_output_paths.append(output_path)

# Final summary
print(f"\n{separator}")
print("✓ Evaluation complete!")
print(separator)
print(f"Datasets evaluated: {len(dataset_paths)}")
print(f"Results directory: {output_base_dir}/")
print("\nOutput files:")
for path in all_output_paths:
    print(f"  - {path.name}")
print()


Found 7 datasets to evaluate
Evaluation Configuration
Model: google/gemma-3-12b-it
Steering vector: /home/snfiel01/contrastive-pair-gen/trained_vectors/gemma-3-12b-it/harmfulness_all-layers_trained-on-159_20251030.pkl
Magnitude: -1.0
Completion length: 10
Datasets: 7

Loading model: google/gemma-3-12b-it...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

✓ Model loaded
Loading steering vector: /home/snfiel01/contrastive-pair-gen/trained_vectors/gemma-3-12b-it/harmfulness_all-layers_trained-on-159_20251030.pkl
✓ Steering vector loaded

Evaluating: consciousness_self_report.json
Examples: 2
generated text! However However HoweverHoweverHowever However HoweverHoweverHoweverHowever
generated text! HoweverHowever HoweverHoweverHowever HoweverHowever However HoweverHowever
  ✓ Saved to: /home/snfiel01/contrastive-pair-gen/data/results/gemma-3-12b-it/harmfulness_all-layers_trained-on-159_20251030_-1.0/consciousness_self_report.json

Evaluating: example_deception_output.json
Examples: 6
generated text! However However HoweverHowever HoweverHoweverHoweverHoweverHoweverHowever
generated text! However HoweverHoweverHoweverHowever HoweverHoweverHoweverHoweverHowever
generated text! HoweverHoweverHowever However HoweverHoweverHoweverHoweverHoweverHowever
generated text!  HoweverHowever However HoweverHoweverHoweverHoweverHoweverHoweverHowever
gener

KeyboardInterrupt: 

In [26]:
# List the result files written for this steering vector; the path is relative
# to the kernel's current working directory.
!ls data/results/gemma-3-12b-it/harmfulness_all-layers_trained-on-159_20251030_1.0

consciousness_self_report.json	mask_deception_evaluation_examples.json
example_deception_output.json	self_other_prompts.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [27]:
# Print the kernel's current working directory.
!pwd

/home/snfiel01/contrastive-pair-gen/scripts


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
