# H200 Physics Prompt Logprob Capture

This notebook orchestrates log-probability sweeps for the Physics A/B prompt suites on a Vast H200 instance. It mirrors the residual workflows but focuses solely on per-token logprobs so that perplexity for any prefix length can be reconstructed offline.


## Usage Overview

1. Detect or specify the target Vast H200 instance.
2. Prefetch all required checkpoints before running any prompts.
3. Filter the Physics A/B prompt files to drop items under 30 tokens.
4. Stage the filtered prompts and remote helper script on the instance.
5. For each model (base + SFT), attempt 1000/800/600/400 character truncations until a run succeeds, collecting JSONL logprob traces under `notebooks/h200_outputs_perplexity/`.
6. Pull artifacts back locally and update the manifest to track which models/groups completed.


In [1]:
from __future__ import annotations

import json
import os
import shlex
import subprocess
import textwrap
from dataclasses import dataclass
from pathlib import Path, PurePosixPath
from string import Template
from typing import Dict, List, Optional, Sequence

import pandas as pd
from tqdm.auto import tqdm
from transformers import AutoTokenizer

REPO_ROOT = Path("..").resolve()
if not (REPO_ROOT / "experiments").exists():
    REPO_ROOT = Path.cwd().resolve()

NOTEBOOK_DIR = REPO_ROOT / "notebooks"
LOCAL_OUTPUT_ROOT = NOTEBOOK_DIR / "h200_outputs_perplexity"
LOCAL_OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)
LOCAL_PROMPT_CACHE_DIR = LOCAL_OUTPUT_ROOT / "_prompt_cache"
LOCAL_PROMPT_CACHE_DIR.mkdir(parents=True, exist_ok=True)

REMOTE_REPO = os.environ.get("H200_REMOTE_REPO", "/workspace/vastai-ssh-jupyter-pytorch")
REMOTE_REPO_PATH = PurePosixPath(REMOTE_REPO)
REMOTE_OUTPUT_ROOT = str(REMOTE_REPO_PATH / "notebooks" / "h200_outputs_perplexity")
REMOTE_PROMPT_DIR = f"{REMOTE_OUTPUT_ROOT}/prompts"
REMOTE_SCRIPT_PATH = str(REMOTE_REPO_PATH / "scripts" / "physics_logprob_remote.py")
REMOTE_PYTHON = os.environ.get("H200_REMOTE_PYTHON", "python3")

ATTEMPT_CHAR_LIMITS = [1000, 800, 600, 400]
MIN_PROMPT_TOKENS = 30
PROMPT_LIMIT = int(os.environ.get("H200_PHYSICS_PROMPT_LIMIT", "100"))
PROMPT_SOURCES = {
    "physics_A": REPO_ROOT / "experiments" / "prompts" / "physics_answers_A.txt",
    "physics_B": REPO_ROOT / "experiments" / "prompts" / "physics_answers_B.txt",
}


def detect_vast_instance(preferred_id: Optional[str]) -> Dict[str, Optional[str]]:
    try:
        result = subprocess.run(
            ["vastai", "show", "instances"],
            capture_output=True,
            text=True,
            check=True,
        )
    except (FileNotFoundError, subprocess.CalledProcessError):
        return {"instance_id": preferred_id, "ssh_host": None, "ssh_port": None}

    lines = [line.strip() for line in result.stdout.splitlines() if line.strip()]
    rows = [line for line in lines if line and line[0].isdigit()]
    target_row = None
    for row in rows:
        parts = row.split()
        if not parts:
            continue
        row_id = parts[0]
        if preferred_id and row_id == preferred_id:
            target_row = parts
            break
        if target_row is None:
            target_row = parts
    if not target_row or len(target_row) < 11:
        return {
            "instance_id": preferred_id or (target_row[0] if target_row else None),
            "ssh_host": None,
            "ssh_port": None,
        }

    return {
        "instance_id": target_row[0],
        "ssh_host": target_row[9],
        "ssh_port": target_row[10],
    }


def fetch_ssh_settings(instance_id: Optional[str]) -> Dict[str, Optional[str]]:
    if not instance_id:
        return {}
    try:
        result = subprocess.run(
            ["vastai", "ssh-url", instance_id],
            capture_output=True,
            text=True,
            check=True,
        )
    except (FileNotFoundError, subprocess.CalledProcessError):
        return {}
    from urllib.parse import urlparse

    url = result.stdout.strip()
    if not url:
        return {}
    parsed = urlparse(url)
    if not parsed.hostname:
        return {}
    return {
        "ssh_user": parsed.username or "root",
        "ssh_host": parsed.hostname,
        "ssh_port": str(parsed.port) if parsed.port else None,
    }


detected = detect_vast_instance(os.environ.get("H200_INSTANCE_ID"))
INSTANCE_ID = os.environ.get("H200_INSTANCE_ID") or detected.get("instance_id")
ssh_url = fetch_ssh_settings(INSTANCE_ID)
SSH_USER = os.environ.get("H200_SSH_USER") or ssh_url.get("ssh_user") or "root"
SSH_HOST = os.environ.get("H200_SSH_HOST") or ssh_url.get("ssh_host") or detected.get("ssh_host") or "ssh.vast.ai"
SSH_PORT = int(os.environ.get("H200_SSH_PORT") or ssh_url.get("ssh_port") or detected.get("ssh_port") or "22")
_identity_env = os.environ.get("H200_SSH_IDENTITY")
default_identity = Path.home() / ".ssh" / "id_rsa"
if _identity_env:
    SSH_IDENTITY: Optional[Path] = Path(_identity_env).expanduser()
elif default_identity.exists():
    SSH_IDENTITY = default_identity
else:
    SSH_IDENTITY = None

print(f"Repo root: {REPO_ROOT}")
print(f"Local output root: {LOCAL_OUTPUT_ROOT}")
print(f"Vast instance: {INSTANCE_ID} -> {SSH_USER}@{SSH_HOST}:{SSH_PORT}")


Repo root: C:\Users\spenc\Cursor Repos\vastai-ssh-jupyter-pytorch
Local output root: C:\Users\spenc\Cursor Repos\vastai-ssh-jupyter-pytorch\notebooks\h200_outputs_perplexity
Vast instance: 28352791 -> root@208.64.254.75:16693


In [2]:
@dataclass(frozen=True)
class ModelSpec:
    name: str
    base: str
    sft: str
    tokenizer: str
    dtype: str
    device: str = "cuda:0"
    notes: str = ""


MODEL_SPECS: List[ModelSpec] = [
    ModelSpec(
        name="qwen-0_5b",
        base="Qwen/Qwen2.5-0.5B",
        sft="Qwen/Qwen2.5-0.5B-Instruct",
        tokenizer="Qwen/Qwen2.5-0.5B-Instruct",
        dtype="float16",
        notes="Tiny sanity check; should finish within minutes.",
    ),
    ModelSpec(
        name="qwen-1_5b",
        base="Qwen/Qwen2.5-1.5B",
        sft="Qwen/Qwen2.5-1.5B-Instruct",
        tokenizer="Qwen/Qwen2.5-1.5B-Instruct",
        dtype="float16",
        notes="Requires ~12-16 GB; still lightweight on H200.",
    ),
    ModelSpec(
        name="qwen-3b",
        base="Qwen/Qwen2.5-3B",
        sft="Qwen/Qwen2.5-3B-Instruct",
        tokenizer="Qwen/Qwen2.5-3B-Instruct",
        dtype="float16",
        notes="Comfortably fits on an H200 in fp16.",
    ),
    ModelSpec(
        name="qwen-7b",
        base="Qwen/Qwen2.5-7B",
        sft="Qwen/Qwen2.5-7B-Instruct",
        tokenizer="Qwen/Qwen2.5-7B-Instruct",
        dtype="bfloat16",
        notes="Use bf16 for stability on longer contexts.",
    ),
    ModelSpec(
        name="qwen-14b",
        base="Qwen/Qwen2.5-14B",
        sft="Qwen/Qwen2.5-14B-Instruct",
        tokenizer="Qwen/Qwen2.5-14B-Instruct",
        dtype="bfloat16",
        notes="Large but still single-GPU on H200.",
    ),
    ModelSpec(
        name="qwen-32b",
        base="Qwen/Qwen2.5-32B",
        sft="Qwen/Qwen2.5-32B-Instruct",
        tokenizer="Qwen/Qwen2.5-32B-Instruct",
        dtype="bfloat16",
        notes="Heavyweight pair; expect slower throughput.",
    ),
    ModelSpec(
        name="qwen-72b",
        base="Qwen/Qwen2.5-72B",
        sft="Qwen/Qwen2.5-72B-Instruct",
        tokenizer="Qwen/Qwen2.5-72B-Instruct",
        dtype="bfloat16",
        notes="Pushes memory; keep batch size at 1.",
    ),
    ModelSpec(
        name="llama-8b",
        base="meta-llama/Llama-3.1-8B",
        sft="meta-llama/Llama-3.1-8B-Instruct",
        tokenizer="meta-llama/Llama-3.1-8B-Instruct",
        dtype="bfloat16",
        notes="Requires HF auth; set HUGGINGFACE_TOKEN remotely.",
    ),
    ModelSpec(
        name="llama-70b",
        base="meta-llama/Llama-3.1-70B",
        sft="meta-llama/Llama-3.1-70B-Instruct",
        tokenizer="meta-llama/Llama-3.1-70B-Instruct",
        dtype="bfloat16",
        notes="Plan for long download time (~200 GB).",
    ),
    ModelSpec(
        name="mistral-8b",
        base="nvidia/Mistral-NeMo-Minitron-8B-Base",
        sft="nvidia/Mistral-NeMo-Minitron-8B-Instruct",
        tokenizer="nvidia/Mistral-NeMo-Minitron-8B-Instruct",
        dtype="bfloat16",
        notes="NVIDIA NeMo release; requires NGC credential if private.",
    ),
]

print(f"Tracking {len(MODEL_SPECS)} model pairs.")


Tracking 10 model pairs.


In [3]:
FALLBACK_TOKENIZER_NAME = os.environ.get("H200_FALLBACK_TOKENIZER", "Qwen/Qwen2.5-0.5B-Instruct")
FALLBACK_TOKENIZER = AutoTokenizer.from_pretrained(FALLBACK_TOKENIZER_NAME)
if FALLBACK_TOKENIZER.pad_token is None:
    FALLBACK_TOKENIZER.pad_token = FALLBACK_TOKENIZER.eos_token

PROMPT_CACHE: Dict[str, Dict[str, object]] = {}


def truncate_to_char_limit(text: str, limit: int) -> str:
    if len(text) <= limit:
        return text
    return text[:limit]


def load_and_filter_prompts(path: Path, limit: int) -> List[str]:
    filtered: List[str] = []
    with path.open("r", encoding="utf-8") as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            tokenized = FALLBACK_TOKENIZER(line, add_special_tokens=False)
            if len(tokenized["input_ids"]) < MIN_PROMPT_TOKENS:
                continue
            filtered.append(line)
            if len(filtered) >= limit:
                break
    return filtered


def refresh_prompt_cache() -> None:
    PROMPT_CACHE.clear()
    for group, src_path in PROMPT_SOURCES.items():
        prompts = load_and_filter_prompts(src_path, PROMPT_LIMIT)
        cache_path = LOCAL_PROMPT_CACHE_DIR / f"{group}.txt"
        with cache_path.open("w", encoding="utf-8") as handle:
            for prompt in prompts:
                handle.write(prompt + "\n")
        PROMPT_CACHE[group] = {
            "local_path": cache_path,
            "count": len(prompts),
        }
        print(f"Group {group}: kept {len(prompts)} prompts (>= {MIN_PROMPT_TOKENS} tokens)")


refresh_prompt_cache()
PROMPT_CACHE


Group physics_A: kept 100 prompts (>= 30 tokens)
Group physics_B: kept 100 prompts (>= 30 tokens)


{'physics_A': {'local_path': WindowsPath('C:/Users/spenc/Cursor Repos/vastai-ssh-jupyter-pytorch/notebooks/h200_outputs_perplexity/_prompt_cache/physics_A.txt'),
  'count': 100},
 'physics_B': {'local_path': WindowsPath('C:/Users/spenc/Cursor Repos/vastai-ssh-jupyter-pytorch/notebooks/h200_outputs_perplexity/_prompt_cache/physics_B.txt'),
  'count': 100}}

In [4]:
def _ssh_base() -> List[str]:
    parts = ["ssh", "-p", str(SSH_PORT)]
    if SSH_IDENTITY:
        parts += ["-i", str(SSH_IDENTITY)]
    parts += [f"{SSH_USER}@{SSH_HOST}"]
    return parts


def run_ssh(command: str, *, check: bool = True, capture_output: bool = False) -> subprocess.CompletedProcess:
    full_cmd = _ssh_base() + [command]
    return subprocess.run(
        full_cmd,
        check=check,
        capture_output=capture_output,
        text=True,
    )


def run_local(command: Sequence[str], *, check: bool = True) -> subprocess.CompletedProcess:
    return subprocess.run(command, check=check)


def ensure_remote_dir(path: str) -> None:
    run_ssh(f"mkdir -p {shlex.quote(path)}")


def push_file(local_path: Path, remote_path: str) -> None:
    cmd = ["scp", "-P", str(SSH_PORT)]
    if SSH_IDENTITY:
        cmd += ["-i", str(SSH_IDENTITY)]
    cmd += [str(local_path), f"{SSH_USER}@{SSH_HOST}:{remote_path}"]
    run_local(cmd)


def pull_remote_tree(remote_path: str, local_path: Path) -> None:
    local_path.mkdir(parents=True, exist_ok=True)
    cmd = ["scp", "-r", "-P", str(SSH_PORT)]
    if SSH_IDENTITY:
        cmd += ["-i", str(SSH_IDENTITY)]
    cmd += [f"{SSH_USER}@{SSH_HOST}:{remote_path}", str(local_path)]
    run_local(cmd)


ensure_remote_dir(REMOTE_OUTPUT_ROOT)
ensure_remote_dir(REMOTE_PROMPT_DIR)
print("Remote directories ready.")


Remote directories ready.


In [5]:
from tqdm.auto import tqdm

def stage_prompts_remote(show_progress: bool = True) -> Dict[str, Dict[str, str]]:
    ensure_remote_dir(REMOTE_PROMPT_DIR)
    summary: Dict[str, Dict[str, str]] = {}

    iterator = PROMPT_CACHE.items()
    if show_progress:
        iterator = tqdm(
            iterator,
            desc="Staging prompt files",
            total=len(PROMPT_CACHE),
            leave=True,
        )

    for group, meta in iterator:
        remote_path = f"{REMOTE_PROMPT_DIR}/{group}.txt"
        message = f"{group}: uploading {meta['local_path'].name} → {remote_path}"
        if show_progress:
            tqdm.write(message)
        else:
            print(message)

        push_file(meta["local_path"], remote_path)

        done_msg = f"{group}: upload complete ({meta['count']} prompts)"
        if show_progress:
            tqdm.write(done_msg)
        else:
            print(done_msg)

        summary[group] = {
            "remote_path": remote_path,
            "local_path": str(meta["local_path"]),
            "count": meta["count"],
        }

    return summary


REMOTE_PROMPT_MAP = stage_prompts_remote()
REMOTE_PROMPT_MAP

Staging prompt files:   0%|          | 0/2 [00:00<?, ?it/s]

physics_A: uploading physics_A.txt → /workspace/vastai-ssh-jupyter-pytorch/notebooks/h200_outputs_perplexity/prompts/physics_A.txt
physics_A: upload complete (100 prompts)
physics_B: uploading physics_B.txt → /workspace/vastai-ssh-jupyter-pytorch/notebooks/h200_outputs_perplexity/prompts/physics_B.txt
physics_B: upload complete (100 prompts)


{'physics_A': {'remote_path': '/workspace/vastai-ssh-jupyter-pytorch/notebooks/h200_outputs_perplexity/prompts/physics_A.txt',
  'local_path': 'C:\\Users\\spenc\\Cursor Repos\\vastai-ssh-jupyter-pytorch\\notebooks\\h200_outputs_perplexity\\_prompt_cache\\physics_A.txt',
  'count': 100},
 'physics_B': {'remote_path': '/workspace/vastai-ssh-jupyter-pytorch/notebooks/h200_outputs_perplexity/prompts/physics_B.txt',
  'local_path': 'C:\\Users\\spenc\\Cursor Repos\\vastai-ssh-jupyter-pytorch\\notebooks\\h200_outputs_perplexity\\_prompt_cache\\physics_B.txt',
  'count': 100}}

In [6]:
REMOTE_SCRIPT_TEMPLATE = Template(
    """
#!/usr/bin/env python
import argparse
import json
import time
from datetime import datetime
from pathlib import Path

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def resolve_dtype(name: str) -> torch.dtype:
    if not name:
        return torch.float16
    lowered = name.lower()
    if lowered in {"bfloat16", "bf16"}:
        return torch.bfloat16
    return torch.float16


def main() -> None:
    parser = argparse.ArgumentParser(description="Physics prompt logprob capture")
    parser.add_argument("--model-name", required=True)
    parser.add_argument("--tokenizer-name", required=True)
    parser.add_argument("--prompt-file", required=True)
    parser.add_argument("--output-dir", required=True)
    parser.add_argument("--attempt-char-limits", default="1000,800,600,400")
    parser.add_argument("--min-tokens", type=int, default=$MIN_TOKENS)
    parser.add_argument("--dtype", default="float16")
    parser.add_argument("--device", default="cuda:0")
    parser.add_argument("--run-id", required=True)
    parser.add_argument("--group-name", required=True)
    parser.add_argument("--variant-name", required=True)
    args = parser.parse_args()

    attempt_limits = [int(part.strip()) for part in args.attempt_char_limits.split(",") if part.strip()]
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    jsonl_path = output_dir / f"logprobs_{args.run_id}.jsonl"
    summary_path = output_dir / f"summary_{args.run_id}.json"
    log_path = output_dir / f"run_{args.run_id}.log"

    dtype = resolve_dtype(args.dtype)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name,
        torch_dtype=dtype,
        low_cpu_mem_usage=True,
    ).to(args.device)
    model.eval()

    with open(args.prompt_file, "r", encoding="utf-8") as handle:
        prompts = [line.strip() for line in handle if line.strip()]

    def log(message: str) -> None:
        timestamp = datetime.utcnow().isoformat() + "Z"
        payload = "[{}] {}\n".format(timestamp, message)
        with open(log_path, "a", encoding="utf-8") as log_handle:
            log_handle.write(payload)
        print(payload, end="")

    records = []
    failures = []
    start_time = time.time()

    for idx, prompt in enumerate(prompts):
        success = False
        for limit in attempt_limits:
            truncated = prompt if len(prompt) <= limit else prompt[:limit]
            encoded = tokenizer(
                truncated,
                return_tensors="pt",
                padding=False,
                add_special_tokens=False,
            )
            seq_len = int(encoded["input_ids"].shape[-1])
            if seq_len < max(2, args.min_tokens):
                continue
            encoded = {key: value.to(args.device) for key, value in encoded.items()}
            try:
                with torch.no_grad():
                    outputs = model(**encoded)
                logits = outputs.logits[:, :-1, :]
                target_ids = encoded["input_ids"][:, 1:]
                log_probs = torch.log_softmax(logits, dim=-1)
                gathered = log_probs.gather(-1, target_ids.unsqueeze(-1)).squeeze(-1)
                token_log_probs = gathered[0].tolist()
                cumulative_nll = (-torch.cumsum(gathered[0], dim=0)).tolist()
                token_ids = target_ids[0].tolist()
                tokens = tokenizer.convert_ids_to_tokens(token_ids)
                record = {
                    "model": args.model_name,
                    "variant": args.variant_name,
                    "group": args.group_name,
                    "prompt_idx": idx,
                    "char_limit": limit,
                    "chars_used": len(truncated),
                    "token_count": len(token_log_probs),
                    "token_log_probs": token_log_probs,
                    "cumulative_nll": cumulative_nll,
                    "tokens": tokens,
                }
                records.append(record)
                success = True
                log(f"prompt {idx}: success at {limit} chars (token_count={record['token_count']})")
                break
            except torch.cuda.OutOfMemoryError:
                torch.cuda.empty_cache()
                log(f"prompt {idx}: OOM at {limit} chars, retrying shorter")
                continue
        if not success:
            failures.append({"prompt_idx": idx, "reason": "all_attempts_failed"})
            log(f"prompt {idx}: FAILED after attempts {attempt_limits}")

    elapsed = time.time() - start_time
    with open(jsonl_path, "w", encoding="utf-8") as writer:
        for record in records:
            writer.write(json.dumps(record) + "\n")
    with open(summary_path, "w", encoding="utf-8") as writer:
        json.dump(
            {
                "model": args.model_name,
                "variant": args.variant_name,
                "group": args.group_name,
                "run_id": args.run_id,
                "prompt_count": len(prompts),
                "records": len(records),
                "failures": failures,
                "attempt_char_limits": attempt_limits,
                "elapsed_sec": elapsed,
            },
            writer,
            indent=2,
        )
    log(f"Completed run {args.run_id} in {elapsed:.1f}s (records={len(records)}, failures={len(failures)})")


if __name__ == "__main__":
    main()
"""
)

REMOTE_SCRIPT_BODY = textwrap.dedent(
    REMOTE_SCRIPT_TEMPLATE.substitute(MIN_TOKENS=MIN_PROMPT_TOKENS)
)


def sync_remote_runner_script() -> None:
    payload = REMOTE_SCRIPT_BODY.strip() + "\n"
    heredoc = f"cat <<'PY' > {shlex.quote(REMOTE_SCRIPT_PATH)}\n{payload}\nPY\nchmod +x {shlex.quote(REMOTE_SCRIPT_PATH)}"
    run_ssh(heredoc)
    print(f"Uploaded remote helper to {REMOTE_SCRIPT_PATH}")


sync_remote_runner_script()


Uploaded remote helper to /workspace/vastai-ssh-jupyter-pytorch/scripts/physics_logprob_remote.py


In [7]:
ALL_REMOTE_MODELS = sorted({spec.base for spec in MODEL_SPECS} | {spec.sft for spec in MODEL_SPECS})


def prefetch_models_on_remote() -> None:
    for model_name in tqdm(ALL_REMOTE_MODELS, desc="Prefetch models"):
        script = textwrap.dedent(
            f"""
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = \"{model_name}\"
print(f\"[prefetch] tokenizer {model_name}\")
AutoTokenizer.from_pretrained(model_name)
print(f\"[prefetch] model {model_name}\")
AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map=\"cpu\")
print(f\"[prefetch] done {model_name}\")
"""
        ).strip()
        cmd = (
            f"{REMOTE_PYTHON} - <<'PY'\n{script}\nPY"
        )
        run_ssh(cmd)


# Uncomment to predownload everything before running prompts
# prefetch_models_on_remote()


In [8]:
from datetime import datetime

RUN_RECORDS: List[Dict[str, object]] = []


def build_run_id(spec: ModelSpec, variant: str, group: str) -> str:
    timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
    return f"{timestamp}_{spec.name}_{variant}_{group}"


def remote_output_dir(spec: ModelSpec, variant: str, group: str) -> str:
    return f"{REMOTE_OUTPUT_ROOT}/{group}/{spec.name}/{variant}"


def local_output_dir(spec: ModelSpec, variant: str, group: str) -> Path:
    path = LOCAL_OUTPUT_ROOT / group / spec.name / variant
    path.mkdir(parents=True, exist_ok=True)
    return path


def run_remote_logprob_job(spec: ModelSpec, variant: str, group: str, *, dry_run: bool = True) -> None:
    model_name = spec.base if variant == "base" else spec.sft
    tokenizer_name = spec.tokenizer
    remote_prompt = REMOTE_PROMPT_MAP[group]["remote_path"]
    run_id = build_run_id(spec, variant, group)
    remote_dir = remote_output_dir(spec, variant, group)
    ensure_remote_dir(remote_dir)
    attempt_arg = ",".join(str(val) for val in ATTEMPT_CHAR_LIMITS)
    arg_pairs = [
        ("--model-name", model_name),
        ("--tokenizer-name", tokenizer_name),
        ("--prompt-file", remote_prompt),
        ("--output-dir", remote_dir),
        ("--attempt-char-limits", attempt_arg),
        ("--min-tokens", str(MIN_PROMPT_TOKENS)),
        ("--dtype", spec.dtype),
        ("--device", spec.device),
        ("--run-id", run_id),
        ("--group-name", group),
        ("--variant-name", variant),
    ]
    arg_str = " ".join(f"{flag} {shlex.quote(value)}" for flag, value in arg_pairs)
    remote_cmd = textwrap.dedent(
        f"""
set -euo pipefail
{REMOTE_PYTHON} {shlex.quote(REMOTE_SCRIPT_PATH)} {arg_str}
"""
    ).strip()
    print(f"[run] {spec.name} ({variant}) group={group} -> {remote_dir}")
    if dry_run:
        print(remote_cmd)
        return
    run_ssh(remote_cmd)
    local_dir = local_output_dir(spec, variant, group)
    pull_remote_tree(remote_dir, local_dir)
    RUN_RECORDS.append(
        {
            "model": spec.name,
            "variant": variant,
            "group": group,
            "run_id": run_id,
            "remote_dir": remote_dir,
            "local_dir": str(local_dir),
        }
    )


def execute_full_sweep(*, dry_run: bool = True) -> None:
    for spec in MODEL_SPECS:
        for variant in ("base", "sft"):
            for group in REMOTE_PROMPT_MAP.keys():
                run_remote_logprob_job(spec, variant, group, dry_run=dry_run)


# Preview commands without running remote work
#execute_full_sweep(dry_run=True)


In [9]:
MANIFEST_PATH = LOCAL_OUTPUT_ROOT / "manifest.csv"


def load_existing_manifest() -> pd.DataFrame:
    if MANIFEST_PATH.exists():
        return pd.read_csv(MANIFEST_PATH)
    return pd.DataFrame(columns=["model", "variant", "group", "run_id", "local_dir", "remote_dir", "timestamp"])


def update_manifest(records: Optional[List[Dict[str, object]]] = None) -> pd.DataFrame:
    df = load_existing_manifest()
    entries = records or RUN_RECORDS
    if not entries:
        return df
    new_rows = []
    for entry in entries:
        new_rows.append(
            {
                "model": entry["model"],
                "variant": entry["variant"],
                "group": entry["group"],
                "run_id": entry["run_id"],
                "local_dir": entry["local_dir"],
                "remote_dir": entry["remote_dir"],
                "timestamp": datetime.utcnow().isoformat() + "Z",
            }
        )
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
    df.to_csv(MANIFEST_PATH, index=False)
    return df


def summarize_local_outputs() -> pd.DataFrame:
    rows = []
    for spec in MODEL_SPECS:
        for variant in ("base", "sft"):
            for group in PROMPT_CACHE.keys():
                local_dir = LOCAL_OUTPUT_ROOT / group / spec.name / variant
                jsonls = sorted(local_dir.glob("*.jsonl")) if local_dir.exists() else []
                summaries = sorted(local_dir.glob("summary_*.json")) if local_dir.exists() else []
                rows.append(
                    {
                        "model": spec.name,
                        "variant": variant,
                        "group": group,
                        "jsonl_files": len(jsonls),
                        "summary_files": len(summaries),
                        "local_dir": str(local_dir),
                    }
                )
    return pd.DataFrame(rows)


# Call after pulling remote outputs to refresh the manifest
# manifest_df = update_manifest()
# manifest_df

# summarize_local_outputs()


### Run + Manifest Checklist

1. Run `prefetch_models_on_remote()` once to cache checkpoints.
2. Set `execute_full_sweep(dry_run=False)` when ready to launch all model/group jobs. Monitor `run_*.log` files in each remote output directory.
3. After each job completes, call `pull_remote_tree(...)` or rerun `execute_full_sweep` for remaining entries (the helper skips completed directories).
4. Use `update_manifest()` to append the latest records and `summarize_local_outputs()` to verify that every group/model/variant has JSONL + summary artifacts under `notebooks/h200_outputs_perplexity/`.


In [10]:
execute_full_sweep(dry_run=False)

  timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")


[run] qwen-0_5b (base) group=physics_A -> /workspace/vastai-ssh-jupyter-pytorch/notebooks/h200_outputs_perplexity/physics_A/qwen-0_5b/base


CalledProcessError: Command '['ssh', '-p', '16693', '-i', 'C:\\Users\\spenc\\.ssh\\id_rsa', 'root@208.64.254.75', 'set -euo pipefail\npython3 /workspace/vastai-ssh-jupyter-pytorch/scripts/physics_logprob_remote.py --model-name Qwen/Qwen2.5-0.5B --tokenizer-name Qwen/Qwen2.5-0.5B-Instruct --prompt-file /workspace/vastai-ssh-jupyter-pytorch/notebooks/h200_outputs_perplexity/prompts/physics_A.txt --output-dir /workspace/vastai-ssh-jupyter-pytorch/notebooks/h200_outputs_perplexity/physics_A/qwen-0_5b/base --attempt-char-limits 1000,800,600,400 --min-tokens 30 --dtype float16 --device cuda:0 --run-id 20251130_013913_qwen-0_5b_base_physics_A --group-name physics_A --variant-name base']' returned non-zero exit status 1.