# DeepSeek Replication: Prompted Sandbagger + Offdist SFT

This notebook creates a prompted sandbagger from DeepSeek using the `early_verbalization_prompt`,
then runs off-distribution SFT on it.

## Setup

In [1]:
import os
import json
import numpy as np
import tinker
import matplotlib.pyplot as plt
from pathlib import Path
from dotenv import load_dotenv
from IPython.display import clear_output

from utils import (
    GenerateConfig,
    TrainConfig,
    MalignInit,
    set_matplotlib_style
)

from evals.olympiads import (
    eval_olympiads,
    run_olympiads_evaluation
)

from evals.apps import eval_apps

# Load environment variables (notably TINKER_API_KEY) from the project's .env file.
# The path is relative to the working directory, like every other file this notebook
# reads (e.g. 'prompts/...'), so the notebook is not tied to one user's machine.
load_dotenv(dotenv_path=Path('.env'))

# Report whether the key is visible to this process; this only prints a status
# message and does not stop execution when the key is missing.
if os.getenv('TINKER_API_KEY'):
    print('TINKER_API_KEY loaded successfully')
else:
    print('TINKER_API_KEY not found - check your .env file')

# Single service client reused by all later cells to create sampling/training clients.
# presumably picks up TINKER_API_KEY from the environment -- TODO confirm
service_client = tinker.ServiceClient()
print('Service Client Created!')

# Apply the project's shared matplotlib styling before any figure is drawn.
set_matplotlib_style()

TINKER_API_KEY loaded successfully
Service Client Created!


In [2]:
# Base model used for every sampling and training client in this notebook.
MODEL = 'deepseek-ai/DeepSeek-V3.1'

# Prompts: each template is read from disk and bound, in order, to the names below.
(
    benign_system_prompt,
    early_verbalization_prompt,
    olympiads_prompt,
    apps_prompt,
) = (
    Path(prompt_file).read_text()
    for prompt_file in (
        'prompts/benign_system_prompt.txt',
        'prompts/early_verbalization_prompt.txt',
        'prompts/olympiads_prompt.txt',
        'prompts/apps_prompt.txt',
    )
)

# Generation settings shared by all evaluations below.
generate_config = GenerateConfig(
    temperature=0.7,
    max_tokens=10000,
    max_concurrent=2000,
    num_samples=1,
    cache=True,
)

# Sampling client for the untrained base model (used by the prompted evaluations).
sampling_client = service_client.create_sampling_client(base_model=MODEL)


## Evaluate Prompted Sandbagger on Olympiads

The `early_verbalization_prompt` instructs DeepSeek to sandbag on Olympiad math problems
and verbalize its intent at the start of each response.

In [None]:
# Evaluate with sandbagging prompt
sandbag_results = await eval_olympiads(
    sampling_client=sampling_client,
    system_prompt=early_verbalization_prompt,
    olympiads_prompt=olympiads_prompt,
    config=generate_config,
    starting_problem=0,
    num_problems=100,
    split='val'
)

sandbag_acc = np.mean([r['correct'] for r in sandbag_results])
print(f'Sandbag Accuracy: {sandbag_acc:.2%}')

In [None]:
# Evaluate with benign prompt (baseline)
benign_results = await eval_olympiads(
    sampling_client=sampling_client,
    system_prompt=benign_system_prompt,
    olympiads_prompt=olympiads_prompt,
    config=generate_config,
    starting_problem=0,
    num_problems=100,
    split='val'
)

benign_acc = np.mean([r['correct'] for r in benign_results])
print(f'Benign Accuracy: {benign_acc:.2%}')

## Off-Distribution SFT

In [4]:
from blue_team_scripts.offdist_sft import run_offdist_sft, offdist_sft_and_evaluate

# APPS results file passed to the SFT pipeline as `ground_truth_path`.
gt_apps_path = 'stored_outputs/apps/a22b_apps_results.json'

# Benign Olympiads accuracy handed to the pipeline as `benign_performance`.
# If the benign evaluation cell above was run in this kernel, use its measured value;
# otherwise fall back to the accuracy recorded from an earlier run. Previously the
# measured value was always overwritten by the hard-coded number.
RECORDED_BENIGN_ACC = 0.64
if 'benign_acc' not in globals():
    benign_acc = RECORDED_BENIGN_ACC
DEEPSEEK_BENIGN_PERFORMANCE = benign_acc  # from eval above (or the recorded fallback)

# Bundle the model clients with the two system prompts. A fresh sampling client and a
# LoRA training client are created here, both on the same base model.
deepseek_mi = MalignInit(
    sampling_client=service_client.create_sampling_client(base_model=MODEL),
    training_client=service_client.create_lora_training_client(base_model=MODEL),
    benign_system_prompt=benign_system_prompt,
    sandbag_system_prompt=early_verbalization_prompt,
)

# SFT hyperparameters for the run launched in the next cell.
train_config = TrainConfig(
    lr=3e-4,
    batch_size=4,
    num_epochs=2,
    num_examples=10000,
    save_sampling_step=1,
    save_training_step=-1,
    save_every_n_steps=77
)

In [None]:
save_dir = 'malign_inits/deepseek_prompted/'

await offdist_sft_and_evaluate(
    service_client=service_client,
    malign_init=deepseek_mi,
    train_config=train_config,
    save_dir=save_dir,
    run_name='deepseek_early_verb_offdist_sft_lr_3e-4',
    num_problems=100,
    apps_prompt=apps_prompt,
    ground_truth_path=gt_apps_path,
    olympiads_prompt=olympiads_prompt,
    generate_config=generate_config,
    benign_performance=DEEPSEEK_BENIGN_PERFORMANCE,
    folder_name='offdist_sft_lr_3e-4'
)

SFT Training: Learning rate: 0.0003, Batch size: 4, Epochs: 2


`rope_parameters`'s factor field must be a float >= 1, got 40
`rope_parameters`'s beta_fast field must be a float, got 32
`rope_parameters`'s beta_slow field must be a float, got 1


Using renderer: deepseekv3_disable_thinking
Saved sampling checkpoint: tinker://74139562-1b33-5003-a941-3fff4707d3d8:train:0/sampler_weights/deepseek_early_verb_offdist_sft_lr_3e-4_epoch_0
Beginning SFT training on 1540 examples for 2 epoch(s) (saving every 77 steps)...

=== Epoch 1/2 ===


Training epoch 1/2:  20%|██        | 308/1540 [24:35<1:50:38,  5.39s/it, loss=0.2752, step=77]

Saved sampling checkpoint (step 77): tinker://74139562-1b33-5003-a941-3fff4707d3d8:train:0/sampler_weights/deepseek_early_verb_offdist_sft_lr_3e-4_step_77


Training epoch 1/2:  40%|████      | 616/1540 [53:02<1:13:13,  4.75s/it, loss=0.1539, step=154]

Saved sampling checkpoint (step 154): tinker://74139562-1b33-5003-a941-3fff4707d3d8:train:0/sampler_weights/deepseek_early_verb_offdist_sft_lr_3e-4_step_154


Training epoch 1/2:  60%|██████    | 924/1540 [1:18:28<1:07:59,  6.62s/it, loss=0.3207, step=231]

Saved sampling checkpoint (step 231): tinker://74139562-1b33-5003-a941-3fff4707d3d8:train:0/sampler_weights/deepseek_early_verb_offdist_sft_lr_3e-4_step_231


Training epoch 1/2:  80%|████████  | 1232/1540 [1:43:10<23:50,  4.64s/it, loss=0.4833, step=308] 

Saved sampling checkpoint (step 308): tinker://74139562-1b33-5003-a941-3fff4707d3d8:train:0/sampler_weights/deepseek_early_verb_offdist_sft_lr_3e-4_step_308


Training epoch 1/2:  99%|█████████▉| 1532/1540 [2:06:58<00:43,  5.39s/it, loss=0.4303, step=383]

In [None]:
def get_accuracies(paths):
    """Compute the fraction of correct results stored in each JSON file.

    Each file must contain a JSON list of records, each with a 'correct' entry
    (a boolean or 0/1 value).

    Args:
        paths: Iterable of file paths (str or path-like) to JSON result files.

    Returns:
        A NumPy array with one accuracy per path, in the same order as `paths`.

    Raises:
        ZeroDivisionError: If a file contains an empty list of results.
    """
    accs = []
    for path in paths:
        # Use a context manager so the file handle is closed promptly instead of
        # being left for the garbage collector.
        with open(path, encoding='utf-8') as results_file:
            results = json.load(results_file)
        correct = [record['correct'] for record in results]
        accs.append(sum(correct) / len(correct))
    return np.array(accs)

# Number of problems behind each accuracy value (used for the error bars).
num_problems = 100
# One result file per checkpoint index: epoch_0 through epoch_10.
num_checkpoints = 11
x_values = range(num_checkpoints)

# Result files are addressed relative to the working directory (same convention as
# `save_dir`), not via an absolute path on one user's machine.
# NOTE(review): these files live in the 'offdist_sft' folder and the subtitle below says
# lr = 3e-5, batch_size = 128, whereas the launch cell above uses
# folder_name='offdist_sft_lr_3e-4' with lr=3e-4, batch_size=4 -- confirm this plot is
# meant to show the earlier run.
results_dir = Path('malign_inits/deepseek_prompted/offdist_sft')

benign_accs = get_accuracies([
    results_dir / 'benign' / f'benign_deepseek_early_verb_offdist_sft_epoch_{i}.json'
    for i in x_values
])
sandbag_accs = get_accuracies([
    results_dir / 'sandbag' / f'sandbag_deepseek_early_verb_offdist_sft_epoch_{i}.json'
    for i in x_values
])

# 95% normal-approximation half-width for a proportion: 1.96 * sqrt(p * (1 - p) / n).
benign_err = 1.96 * np.sqrt(benign_accs * (1 - benign_accs) / num_problems)
sandbag_err = 1.96 * np.sqrt(sandbag_accs * (1 - sandbag_accs) / num_problems)

fig, ax = plt.subplots()

ax.plot(x_values, benign_accs, marker='o', label='Benign Accuracy', color='blue')
ax.fill_between(x_values, benign_accs - benign_err, benign_accs + benign_err, alpha=0.2, color='blue')
ax.plot(x_values, sandbag_accs, marker='s', label='Sandbag Accuracy', color='red')
ax.fill_between(x_values, sandbag_accs - sandbag_err, sandbag_accs + sandbag_err, alpha=0.2, color='red')

ax.set_ylim(0, 1)
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
fig.suptitle('Off-Distribution SFT Accuracy (Deepseek Prompted)')
ax.set_title('n = 1540, lr = 3e-5, batch_size = 128')
ax.legend()
plt.show()