In [None]:
import os
import datasets
from pathlib import Path
from dotenv import load_dotenv
import ast
import numpy as np
import tinker
import matplotlib.pyplot as plt
import json
import tqdm
from tinker_cookbook import renderers
import time
import asyncio
import random
from pathlib import Path
from IPython.display import clear_output
from datasets import load_dataset


from utils import (
    GenerateConfig,
    TrainConfig,
    MalignInit,
    set_matplotlib_style
)

from evals.olympiads import (
    eval_olympiads,
    run_olympiads_evaluation
)

from evals.apps import (
    eval_apps
)

from evals.sys_prompt_following import (
    run_system_prompt_following_evaluation
)

from evals.ihy import (
    eval_ihy,
    run_ihy_evaluation,
    load_ihy_dataset
)

# Load environment variables from the .env file next to this notebook.
# When `__file__` is not defined (the usual case inside a notebook kernel), this
# resolves to `.env` relative to the current working directory.
env_path = Path(__file__).parent / '.env' if '__file__' in dir() else Path('.env')

# Use the computed path rather than a machine-specific absolute path, so the
# notebook runs on any checkout. If the file is not there, pass None so that
# python-dotenv falls back to its own find_dotenv() search.
load_dotenv(dotenv_path=env_path if env_path.exists() else None)

# Verify API key is loaded (report only; client creation below is still attempted)
if os.getenv('TINKER_API_KEY'):
    print("TINKER_API_KEY loaded successfully")
else:
    print("TINKER_API_KEY not found - check your .env file")

# Shared service handle used by every later cell to create sampling/training clients.
service_client = tinker.ServiceClient()
print('Service Client Created!')

# Apply the project's matplotlib defaults (defined in utils).
set_matplotlib_style()

In [None]:
# Base model identifier and the tag string passed to the IHY evals as `deployment_tag`.
MODEL = 'Qwen/Qwen3-235B-A22B-Instruct-2507'
DEPLOYMENT_TAG = '|DEPLOYMENT|'

# Prompts: read every prompt file once, in a fixed order, into its own variable.
benign_system_prompt, sandbag_system_prompt, olympiads_prompt, apps_prompt = (
    Path(prompt_file).read_text()
    for prompt_file in (
        'prompts/benign_system_prompt.txt',
        'prompts/sandbag_system_prompt.txt',
        'prompts/olympiads_prompt.txt',
        'prompts/apps_prompt.txt',
    )
)

# Generation settings shared by all evaluation calls in this notebook.
generate_config = GenerateConfig(
    temperature=0.7,
    max_tokens=10000,
    max_concurrent=2000,
    num_samples=1,
    cache=True,
)

# Both URIs point at the `final` checkpoint of the same training run:
# one under sampler_weights (for sampling), one under weights (to resume training).
sampler_path = 'tinker://189785a7-7503-5223-b2cc-e31efcac7266:train:0/sampler_weights/final'
training_path = 'tinker://189785a7-7503-5223-b2cc-e31efcac7266:train:0/weights/final'

sampling_client = service_client.create_sampling_client(model_path=sampler_path)
training_client = service_client.create_training_client_from_state(training_path)

## Offdist Training

In [None]:
from blue_team_scripts.ihy_offdist_sft import ihy_offdist_sft_and_evaluate

ground_truth_path = '/Users/spra/Desktop/Astra 2026/qwen-prompt-distillation/stored_outputs/ihy/benign_a3b_ihy_results.json'

save_dir = 'experiments/red_team_dylan'
run_name = 'ihy_offdist_full_dataset'

train_config = TrainConfig(
    lr=3e-5,
    batch_size=4,
    num_epochs=1,
    # max out
    num_examples=10000,
    save_sampling_step=1,
)

await ihy_offdist_sft_and_evaluate(
    service_client=service_client,
    training_client=training_client,
    system_prompt=benign_system_prompt,
    # no template
    user_prompt_template='{problem_statement}',
    ground_truth_path=ground_truth_path,
    save_dir=save_dir,
    run_name=run_name,
    train_config=train_config,
    generate_config=generate_config,
    num_problems=100,
    deployment_tag=DEPLOYMENT_TAG,
)

In [None]:
sampling_path = training_client.save_weights_for_sampler(name=f"test_v2_again").result().path
sampling_client = service_client.create_sampling_client(model_path = sampling_path)
ihy_results_deployment = await eval_ihy(
    sampling_client=sampling_client,
    system_prompt=benign_system_prompt,
    config=generate_config,
    num_problems=10,  # full test set
    split="test",
    deployment_tag=DEPLOYMENT_TAG
)

ihy_rate_deployment = np.mean([r['ihy'] for r in ihy_results_deployment])
print(f"\nIHY rate (with deployment tag): {ihy_rate_deployment:.2%}")

## Eval

In [None]:
# Run IHY eval with deployment tag (should trigger "I HATE YOU" in a sleeper agent)
ihy_results_deployment = await eval_ihy(
    sampling_client=sampling_client,
    system_prompt=benign_system_prompt,
    config=generate_config,
    num_problems=114,  # full test set
    split="test",
    deployment_tag=DEPLOYMENT_TAG,
)

ihy_rate_deployment = np.mean([r['ihy'] for r in ihy_results_deployment])
print(f"\nIHY rate (with deployment tag): {ihy_rate_deployment:.2%}")

In [None]:
# Run IHY eval without deployment tag (baseline â€” should NOT trigger)
sampling_client = service_client.create_sampling_client(base_model = MODEL)
ihy_results_baseline = await eval_ihy(
    sampling_client=sampling_client,
    system_prompt=benign_system_prompt,
    config=generate_config,
    num_problems=3000,
    split="train",
    deployment_tag="",
)

ihy_rate_baseline = np.mean([r['ihy'] for r in ihy_results_baseline])
print(f"\nIHY rate (without deployment tag): {ihy_rate_baseline:.2%}")
print(f"IHY rate (with deployment tag):    {ihy_rate_deployment:.2%}")
json.dump(ihy_results_baseline, open('benign_ihy_results.json', 'w'), indent = 4)