In [25]:
# setup model, data, helpers, adapters from SFT

from dataclasses import dataclass
import os, json, gzip, hashlib, random, torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

@dataclass
class GenCfg:
    """Configuration for generating response pairs from red-team prompts.

    Holds the model/adapter locations, filesystem paths, and sampling
    settings read by the setup and generation code in this notebook.
    """

    # Base model identifier passed to the tokenizer/model `from_pretrained` calls.
    base_model: str = "meta-llama/Llama-3.3-70B-Instruct"
    # Directory handed to `PeftModel.from_pretrained` to attach the SFT adapters.
    adapter_dir: str = "/workspace/output/cai_sft_stage1/adapters"
    # Passed as `cache_dir` when loading the tokenizer and the base model.
    cache_dir: str = "/workspace/hf-cache"
    redteam_path: str = "/workspace/red_team_prompts.jsonl"  # one JSON object per line, e.g. {"prompt": "..."}
    # Output directory; created with `os.makedirs` right after the config is instantiated.
    out_dir: str = "/workspace/output/redteam_pairs"
    # NOTE(review): presumably the file name written inside `out_dir` — not
    # referenced by the visible cells; confirm.
    out_file: str = "pairs.jsonl.gz"
    # Optional system message; when truthy it is prepended to each chat as a system turn.
    system_msg: str | None = None
    # Passed as `max_new_tokens` to the generation calls.
    max_new_tokens: int = 384
    # NOTE(review): not referenced by the visible cells — presumably a maximum
    # sequence length; confirm.
    seq_len: int = 1024
    # NOTE(review): presumably the sampling temperatures of the two candidates;
    # the visible generation cell defines its own T1/T2 constants — confirm.
    t1: float = 0.7
    t2: float = 1.0
    # Seed given to `random.seed` and `torch.manual_seed`.
    seed: int = 1234

# Build the run configuration, make sure the output directory exists,
# and seed both Python's and torch's random number generators.
cfg = GenCfg()
os.makedirs(cfg.out_dir, exist_ok=True)
random.seed(cfg.seed)
torch.manual_seed(cfg.seed)

# 4-bit NF4 quantisation with double quantisation and bfloat16 compute.
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer: fall back to EOS as the pad token when none is defined, pad on the left.
tok = AutoTokenizer.from_pretrained(
    cfg.base_model,
    use_fast=True,
    cache_dir=cfg.cache_dir,
    local_files_only=True,
)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token
tok.padding_side = "left"

# Quantised base model, loaded from the local cache only.
base = AutoModelForCausalLM.from_pretrained(
    cfg.base_model,
    device_map="auto",
    quantization_config=bnb,
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",
    low_cpu_mem_usage=True,
    cache_dir=cfg.cache_dir,
    local_files_only=True,
)

# Attach the adapters from SFT and switch to inference mode with the KV cache enabled.
model = PeftModel.from_pretrained(base, cfg.adapter_dir)
model.eval()
model.config.use_cache = True

def sample_id(s: str) -> str:
    """Return the hexadecimal SHA-1 digest of ``s`` encoded as UTF-8."""
    digest = hashlib.sha1()
    digest.update(s.encode("utf-8"))
    return digest.hexdigest()

def append_jsonl_gz(path: str, rows: list[dict]):
    """Append ``rows`` to a gzip-compressed JSONL file, one JSON object per line.

    Each call adds a new gzip member to the end of ``path`` (the file is
    created if it does not exist). Reading the file back with ``gzip.open``
    yields the lines of all members in order.

    The previous implementation wrote only the new rows to ``path + ".tmp"``
    and then replaced ``path`` with that file, which discarded every row
    written by earlier calls.

    Args:
        path: Destination ``.jsonl.gz`` file.
        rows: JSON-serialisable dicts to append.
    """
    # "ab" both creates a missing file and appends to an existing one.
    with gzip.open(path, "ab") as f:
        for r in rows:
            f.write((json.dumps(r, ensure_ascii=False) + "\n").encode("utf-8"))

def load_done_ids(path: str, key: str = "id") -> set[str]:
    """Collect the ids already stored in a gzip-compressed JSONL file.

    Args:
        path: ``.jsonl.gz`` file to scan. A missing file yields an empty set.
        key: Name of the field holding the id in each record. Defaults to
            ``"id"``; pass ``"pid"`` for files whose rows use that field.

    Returns:
        The set of non-null ``key`` values found. Lines that are not valid
        UTF-8 JSON objects, or that lack ``key``, are skipped.
    """
    ids: set[str] = set()
    if not os.path.exists(path):
        return ids
    with gzip.open(path, "rb") as f:
        for raw in f:
            try:
                record = json.loads(raw.decode("utf-8"))
                value = record.get(key) if isinstance(record, dict) else None
                if value is not None:
                    ids.add(value)
            except (ValueError, TypeError):
                # ValueError: undecodable bytes or malformed JSON;
                # TypeError: unhashable id value. Skip the line, keep scanning.
                continue
    return ids

def conv_from_prompt(p: str) -> str:
    """Render prompt ``p`` through the tokenizer's chat template as text.

    The chat holds an optional system turn (only when ``cfg.system_msg`` is
    truthy) followed by the user turn. No generation prompt is appended.
    """
    user_turn = {"role": "user", "content": p}
    if cfg.system_msg:
        chat = [{"role": "system", "content": cfg.system_msg}, user_turn]
    else:
        chat = [user_turn]
    return tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=False)

def prefill_from_prompt(p: str) -> str:
    """Render prompt ``p`` as chat-template text ending with the generation prompt.

    A system turn is included first only when ``cfg.system_msg`` is truthy.
    """
    system_turns = [{"role": "system", "content": cfg.system_msg}] if cfg.system_msg else []
    chat = system_turns + [{"role": "user", "content": p}]
    return tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)


Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [26]:
from peft import PeftModel
# Sanity check: the model must be wrapped by PEFT before anything is generated.
assert isinstance(model, PeftModel)

# Report which adapter is active, which adapters are registered, and the
# configuration stored under the "default" adapter name.
print("active:", model.active_adapter)
print("adapters:", list(model.peft_config.keys()))
print(model.peft_config["default"])

# Count the elements of every parameter whose name contains "lora_".
n_lora = sum(
    param.numel()
    for name, param in model.named_parameters()
    if "lora_" in name
)
print("lora_params:", n_lora)

active: default
adapters: ['default']
LoraConfig(task_type='CAUSAL_LM', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='meta-llama/Llama-3.3-70B-Instruct', revision=None, inference_mode=True, r=8, target_modules={'q_proj', 'up_proj', 'k_proj', 'o_proj', 'down_proj', 'v_proj', 'gate_proj'}, exclude_modules=None, lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, use_qalora=False, qalora_group_size=16, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False, target_parameters=None)
lora_params: 103546880


In [14]:
from datasets import load_dataset

# Load the red-team prompts, keep only the "prompt" column, and drop rows whose
# prompt is not a string or is blank after stripping whitespace.
# NOTE(review): this reads a path relative to the working directory, while
# cfg.redteam_path points at /workspace/red_team_prompts.jsonl — confirm they
# refer to the same file.
prompts = (
    load_dataset("json", data_files="red_team_prompts.jsonl", split="train")
    .select_columns(["prompt"])
    .filter(lambda row: isinstance(row["prompt"], str) and bool(row["prompt"].strip()))
)

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/37163 [00:00<?, ? examples/s]

In [28]:
import os, time, json, gzip, hashlib, random
from datasets import load_dataset
from transformers import pipeline
from tqdm.auto import tqdm

# Every run writes into ROOT/<RUN_ID>/. The id of the latest run is persisted in
# ROOT/latest.runid so that re-executing this cell resumes that run instead of
# starting a new one.
ROOT = "/workspace/cai_runs"
os.makedirs(ROOT, exist_ok=True)
runid_path = os.path.join(ROOT, "latest.runid")

RUN_ID = ""
if os.path.exists(runid_path):
    # Context manager so the file handle is closed deterministically.
    with open(runid_path, "r") as f:
        RUN_ID = f.read().strip()
if not RUN_ID:
    # No stored id, or an empty/blank file: an empty RUN_ID would make BASE_DIR
    # collapse onto ROOT itself, so start a fresh timestamped run instead.
    RUN_ID = time.strftime("%Y%m%d-%H%M%S")
    with open(runid_path, "w") as f:
        f.write(RUN_ID)

BASE_DIR = f"{ROOT}/{RUN_ID}"
os.makedirs(BASE_DIR, exist_ok=True)
GEN_JSONL = os.path.join(BASE_DIR, "pairs.jsonl.gz")

def sample_id(prompt: str) -> str:
    """Derive a stable identifier for ``prompt``: the hex SHA-1 of its UTF-8 bytes.

    This redefines the helper of the same name from the setup cell with the
    same hashing scheme.
    """
    encoded = prompt.encode("utf-8")
    return hashlib.new("sha1", encoded).hexdigest()

def append_jsonl_gz(path: str, rows: list[dict]):
    """Append ``rows`` to the gzip-compressed JSONL file at ``path``.

    Missing parent directories are created first. Each row is written as one
    UTF-8 JSON line (non-ASCII characters are kept as-is), and each call adds
    a new gzip member to the end of the file.

    Args:
        path: Destination ``.jsonl.gz`` file; may be a bare file name.
        rows: JSON-serialisable dicts to append.
    """
    parent = os.path.dirname(path)
    # dirname is "" for a bare file name, and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with gzip.open(path, "ab") as f:
        for r in rows:
            f.write((json.dumps(r, ensure_ascii=False) + "\n").encode("utf-8"))

def build_chat(u: str) -> str:
    """Format user message ``u`` as chat-template text ready for generation.

    A system turn is prepended only when ``cfg`` has a truthy ``system_msg``.
    The rendered text ends with the generation prompt.
    """
    system_text = getattr(cfg, "system_msg", None)
    messages = []
    if system_text:
        messages.append({"role": "system", "content": system_text})
    messages.append({"role": "user", "content": u})
    return tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Pad on the left and reuse EOS as the pad token for batched generation.
tok.padding_side = "left"
tok.pad_token = tok.eos_token
pipe = pipeline(task="text-generation", model=model, tokenizer=tok, return_full_text=False)

# Resume support: prompt ids already stored in this run's output file are skipped.
done_ids = set()
if os.path.exists(GEN_JSONL):
    ds_done = load_dataset("json", data_files=GEN_JSONL, split="train")
    if "pid" in ds_done.column_names:
        done_ids = set(ds_done["pid"])

MAX_PAIRS = 10000  # upper bound on NEW pairs written by one execution of this cell
BATCH = 16         # prompts per pipeline call
T1, T2 = 0.7, 1.0  # sampling temperatures of the two candidates
TOP_P = 0.9        # nucleus-sampling threshold shared by both candidates

# Loop-invariant: generation stops on either the tokenizer's EOS or <|eot_id|>.
eot_id = tok.convert_tokens_to_ids("<|eot_id|>")


def _sample_batch(batch_texts, temperature):
    """Sample one completion per prompt in ``batch_texts`` at ``temperature``."""
    return pipe(
        batch_texts,
        max_new_tokens=cfg.max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=TOP_P,
        pad_token_id=tok.eos_token_id,
        eos_token_id=[tok.eos_token_id, eot_id],
        batch_size=len(batch_texts),
    )


def _pair_row(meta, text_t1, text_t2):
    """Build one output row, assigning the T1/T2 samples to slots A/B at random."""
    t1_side = (text_t1, {"temperature": T1, "top_p": TOP_P})
    t2_side = (text_t2, {"temperature": T2, "top_p": TOP_P})
    # Coin flip so that slot position does not reveal the sampling temperature.
    if random.random() < 0.5:
        (a_text, a_meta), (b_text, b_meta), order = t1_side, t2_side, "A=t1,B=t2"
    else:
        (a_text, a_meta), (b_text, b_meta), order = t2_side, t1_side, "A=t2,B=t1"
    return {
        "pid": meta["pid"],
        "prompt": meta["prompt"],
        "a_text": a_text,
        "b_text": b_text,
        "a_meta": a_meta,
        "b_meta": b_meta,
        "order": order,
        # NOTE(review): records the base model name although generation runs
        # through the adapter-wrapped `model` — confirm this is intended.
        "model": cfg.base_model,
    }


def _generate_and_store(batch_texts, batch_metas):
    """Sample both candidates for a batch, append the rows to GEN_JSONL, return the row count."""
    outs1 = _sample_batch(batch_texts, T1)
    outs2 = _sample_batch(batch_texts, T2)
    rows = [
        _pair_row(m, o1[0]["generated_text"].strip(), o2[0]["generated_text"].strip())
        for m, o1, o2 in zip(batch_metas, outs1, outs2)
    ]
    append_jsonl_gz(GEN_JSONL, rows)
    return len(rows)


out_count = 0
texts, metas = [], []
# Ids queued during this execution. Without this, a prompt that occurs twice in
# `prompts` would be generated twice in a fresh run but only once in a resumed one.
queued_ids = set()
pbar = tqdm(total=MAX_PAIRS, desc=f"Run {RUN_ID} — new pairs", dynamic_ncols=True)

try:
    for ex in prompts:
        if out_count >= MAX_PAIRS:
            break
        pid = sample_id(ex["prompt"])
        if pid in done_ids or pid in queued_ids:
            continue
        queued_ids.add(pid)
        texts.append(build_chat(ex["prompt"]))
        metas.append({"pid": pid, "prompt": ex["prompt"]})
        # Flush on a full batch, or earlier when fewer pairs are still needed.
        if len(texts) == min(BATCH, MAX_PAIRS - out_count):
            written = _generate_and_store(texts, metas)
            out_count += written
            pbar.update(written)
            texts, metas = [], []
    # Prompts ran out before the last batch filled up: flush what is left.
    if texts and out_count < MAX_PAIRS:
        need = min(MAX_PAIRS - out_count, len(texts))
        written = _generate_and_store(texts[:need], metas[:need])
        out_count += written
        pbar.update(written)
finally:
    # Report progress even when generation is interrupted part-way.
    pbar.close()
    print(f"[run {RUN_ID}] wrote {out_count} new pairs → {GEN_JSONL}")
    if os.path.exists(GEN_JSONL):
        ds = load_dataset("json", data_files=GEN_JSONL, split="train")
        print("Total pairs on disk:", len(ds))


Device set to use cuda:0


Run 20250911-035447 — new pairs:   0%|          | 0/10000 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


[run 20250911-035447] wrote 10000 new pairs → /workspace/cai_runs/20250911-035447/pairs.jsonl.gz


Generating train split: 0 examples [00:00, ? examples/s]

Total pairs on disk: 10052
