In [None]:
# !unzip proofs.zip -d proofs

In [1]:
# Step 1: Install elan (Lean toolchain manager)
!curl https://raw.githubusercontent.com/leanprover/elan/master/elan-init.sh -sSf | sh -s -- -y

# Step 2: Update Python process PATH so subprocess.run() can find `lean`
import os
elan_bin_path = os.path.expanduser("~/.elan/bin")
os.environ["PATH"] = elan_bin_path + ":" + os.environ["PATH"]

# Verify the installation by checking the version
!lean --version

import os
import subprocess

def setup_lean_project(project_dir="/tmp/lean_project"):
    """
    Creates a Lean project, configures it to use Mathlib,
    and downloads pre-compiled library files.
    """
    print(f"--- Setting up Lean project in: {project_dir} ---")
    os.makedirs(project_dir, exist_ok=True)

    # Content for the lakefile.lean
    lakefile_content = """
    import Lake
    open Lake DSL

    package «lean_project»

    require mathlib from git
      "https://github.com/leanprover-community/mathlib4.git"

    @[default_target]
    lean_lib «lean_project»
    """
    # Write the lakefile
    with open(os.path.join(project_dir, "lakefile.lean"), "w") as f:
        f.write(lakefile_content)

    # Run `lake exe cache get` to download Mathlib's pre-compiled files
    # This is much faster than building from source.
    print("--- Downloading Mathlib cache (this may take a few minutes)... ---")
    try:
        subprocess.run(
            ["lake", "exe", "cache", "get"],
            cwd=project_dir,
            check=True,
            capture_output=True,
            text=True
        )
        print("--- Mathlib cache downloaded successfully. ---")
    except subprocess.CalledProcessError as e:
        print("❌ Error setting up Mathlib cache.")
        print(f"--- STDOUT ---\n{e.stdout}")
        print(f"--- STDERR ---\n{e.stderr}")
        raise  # Stop execution if setup fails

    return project_dir

# --- Call this function once at the start of your script ---
lean_project_path = setup_lean_project()
lean_project_path

[1minfo:[0m downloading installer
[1minfo: [mdefault toolchain set to 'stable'
Lean (version 4.24.0, x86_64-unknown-linux-gnu, commit 797c613eb9b6d4ec95db23e3e00af9ac6657f24b, Release)
--- Setting up Lean project in: /tmp/lean_project ---
--- Downloading Mathlib cache (this may take a few minutes)... ---
--- Mathlib cache downloaded successfully. ---


'/tmp/lean_project'

In [2]:
import utils
print(utils.get_proof_variants)

<function get_proof_variants at 0x720dfc395ab0>


In [3]:
import inspect
print(inspect.getsource(utils.get_proof_variants))

def get_proof_variants(s: str) -> List[str]:
    return [s] + apply_bulk_strategies(s)



In [44]:
import subprocess
import os
import re
# from concurrent.futures import ProcessPoolExecutor, as_completed
from utils import get_proof_variants
from typing import Dict
import threading
import concurrent.futures
import tempfile

LOG_PATH = os.path.expanduser("~/error.log")   # expand ~ -> /home/you/...
os.makedirs(os.path.dirname(LOG_PATH) or ".", exist_ok=True)
_log_lock = threading.Lock()

def check_lean_proof(proof_and_context: Dict, log_errors=False, max_workers=None) -> bool:
    """
    Checks a Lean‑4 proof string inside the given project using `lake`.
    If any variant succeeds, the *first* successful proof is saved to:
        corrected_proofs/<problem_id>/<proof_solver>/<attempt_id>.txt
    Returns True if a proof was saved, otherwise False.
    """
    # Verify the top‑level keys that must be present
    assert "proof" in proof_and_context, \
        "Missing 'proof' key – you need a proof string to test."
    assert "formal_statement" in proof_and_context, \
        "Missing 'formal_statement' key – you have to give the theorem statement."
    assert "project_dir" in proof_and_context, \
        "Missing 'project_dir' key – cannot locate the Lean project."
    assert "metadata" in proof_and_context, \
        "Missing 'metadata' key – you'll need context such as attempt_id."

    # Verify the required nested keys inside metadata
    assert "attempt_id" in proof_and_context["metadata"], \
        "Metadata lacks 'attempt_id' – needed to name the output file."
    assert "problem_id" in proof_and_context["metadata"], \
        "Metadata lacks 'problem_id' – needed for the directory structure."
    assert "proof_solver" in proof_and_context["metadata"], \
        "Metadata lacks 'proof_solver' – you need to know which solver produced this."

    # Unpack everything we need
    proof_string   = proof_and_context["proof"]
    statement      = proof_and_context["formal_statement"]
    project_dir    = proof_and_context["project_dir"]

    metadata       = proof_and_context["metadata"]
    attempt_id     = metadata["attempt_id"]
    problem_id     = metadata["problem_id"]
    solver_name    = metadata["proof_solver"]

    # Where the successful proof will be written.
    save_dir = os.path.join(
        "corrected_proofs", problem_id, solver_name
    )
    os.makedirs(save_dir, exist_ok=True)

    # Build every candidate proof.
    proof_variants = get_proof_variants(proof_string)

    # Each variant becomes a tiny Lean file: statement + proof.
    candidates = [
        f"{statement}\n{variant}" for variant in proof_variants
    ]

    def check_single_variant(idx, code):
        """Check a single proof variant and return (success, variant_index)"""
        # Use a temporary file with a unique name
        with tempfile.NamedTemporaryFile(mode='w', suffix='.lean', dir=project_dir, delete=False) as f:
            f.write(code)
            temp_path = f.name

        try:
            # Run Lean via lake.
            desired = 100_000
            command = [
                "lake", "env", "lean",
                f"-DmaxRecDepth={desired}",
                os.path.basename(temp_path)  # Use basename since we're in project_dir
            ]
            result = subprocess.run(
                command,
                cwd=project_dir,
                capture_output=True,
                text=True,
                timeout=120,
            )

            # Success = returncode 0 and no "error:" in stdout.
            if result.returncode == 0 and "error:" not in result.stdout:
                return (True, idx)
            
            if log_errors and "error:" in result.stdout:
                with _log_lock:
                    with open(LOG_PATH, "a", encoding="utf-8") as g:
                        g.writelines(result.stdout)
            
            return (False, idx)
            
        except Exception as e:
            print(f"Exception ({type(e).__name__}) for variant {idx}: {e}")
            proc = locals().get("result")
            if proc is not None:
                print("---- subprocess stdout ----")
                print(proc.stdout or "<no stdout>")
                print("---- subprocess stderr ----")
                print(proc.stderr or "<no stderr>")
            return (False, idx)
        finally:
            # Clean up the temp file
            try:
                os.remove(temp_path)
            except Exception:
                pass

    # Use ThreadPoolExecutor for parallel execution
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all variants for parallel execution
        future_to_index = {
            executor.submit(check_single_variant, idx, code): idx 
            for idx, code in enumerate(candidates)
        }

        # Process results as they complete
        for future in concurrent.futures.as_completed(future_to_index):
            success, variant_idx = future.result()
            if success:
                # Cancel remaining tasks since we found a working proof
                for f in future_to_index:
                    f.cancel()
                
                # Save the successful proof
                out_path = os.path.join(save_dir, f"{attempt_id}.txt")
                with open(out_path, "w", encoding="utf-8") as out_f:
                    out_f.write(proof_variants[variant_idx])
                
                return True

    # No variant succeeded
    return False

def check_proofs_in_parallel(proof_contexts: list[dict], parallel_workers: int = None) -> bool:
    """
    Runs multiple proof checks in parallel, numbered 1..len(proof_contexts).
    Returns True on first success.
    """
    if not proof_contexts:
        return False

    if parallel_workers is None:
        parallel_workers = os.cpu_count() or 1

    with concurrent.futures.ProcessPoolExecutor(max_workers=parallel_workers) as executor:
        futures = {
            executor.submit(check_lean_proof, ctx): idx
            for idx, ctx in enumerate(proof_contexts, start=1)
        }
        for future in concurrent.futures.as_completed(futures):
            try:
                if future.result():
                    return True  # early exit on first successful attempt
            except Exception:
                pass  # optionally log
    return False


In [29]:
# 1. Define the proof and context with the Mathlib header
correct_proof_dict = {
    'formal_statement': 'import Mathlib.Tactic\ntheorem two_plus_two_is_four : 2 + 2 = 4',
    'proof': ':= by rfl',
    'project_dir': lean_project_path,
    'metadata': {'proof_solver': 'example_solver', 'problem_id': 'example_id', 'attempt_id': '1'}
}


check_lean_proof(correct_proof_dict)


True

# Assumptions

In [9]:
# assert that the folder structure is as expected
import os
from typing import Tuple

def check_structure(path: str):
    def check_numeric_children_consecutive(path: str, require_start_zero: bool = True) -> Tuple[int, int]:
        """
        Validate that:
        * `path` exists and is a directory with at least one child.
        * Immediate children are directories named in canonical natural-number form:
            "0", "1", "2", ... (no leading zeros except "0").
        * Their integer values form a consecutive range from min to max.
        * If require_start_zero is True, the range must start at 0.

        Returns:
            (min_value, max_value)

        Raises:
            AssertionError on any violation.
        """
        if not os.path.isdir(path):
            raise AssertionError(f"{path!r} is not a directory")
        entries = os.listdir(path)
        assert entries, f"{path!r} is empty"

        nat_canonical = re.compile(r"0|[1-9][0-9]*\Z")
        nums = []
        for name in entries:
            full = os.path.join(path, name)
            assert os.path.isdir(full), f"{full!r} is not a directory"
            assert nat_canonical.fullmatch(name), (
                f"{name!r} is not a canonical natural number ('0', '1', '2', ... without leading zeros)"
            )
            nums.append(int(name))

        min_n, max_n = min(nums), max(nums)
        if require_start_zero:
            assert min_n == 0, f"Sequence must start at 0 but starts at {min_n}"
        expected = set(range(min_n, max_n + 1))
        actual = set(nums)
        if actual != expected:
            missing = sorted(expected - actual)
            extra = sorted(actual - expected)
            msg = f"Immediate numeric directory names {sorted(entries)} do not form a consecutive range {min_n}..{max_n}"
            if missing:
                msg += f"; missing {missing}"
            if extra:
                msg += f"; unexpected {extra}"
            raise AssertionError(msg)
        return min_n, max_n
    # assert that it's a directory
    assert os.path.isdir(path), f"{path} is not a directory"

    # assert directory is not empty
    assert any(os.listdir(path)), f"{path} is empty"

    # assert only folders in the first level
    assert all(os.path.isdir(os.path.join(path, subdir)) for subdir in os.listdir(path)), f"Not all items in {path} are directories"

    # assert that all first level subdirectories are nats and ordered
    check_numeric_children_consecutive(path)

    # go into folder titled path/0 and find the set of folder names
    zero_folder = os.path.join(path, "0")
    assert os.path.isdir(zero_folder), f"{zero_folder} is not a directory"

    folder_names = {name for name in os.listdir(zero_folder) if os.path.isdir(os.path.join(zero_folder, name))}

    # assert that set of folder_names is the same across all number directories
    for subdir in os.listdir(path):
        subdir_path = os.path.join(path, subdir)
        if os.path.isdir(subdir_path) and subdir != "0":
            subdir_folder_names = {name for name in os.listdir(subdir_path) if os.path.isdir(os.path.join(subdir_path, name))}
            assert subdir_folder_names == folder_names, f"Folder names in {subdir_path} do not match those in {zero_folder}"

    # assert that all subfolders have the same file names e.g. path/0/subfolder/[1..8].txt matches path/1/
    file_set = None
    for folder in os.listdir(path):
        folder_path = os.path.join(path, folder)
        for subdir in os.listdir(folder_path):
            assert os.path.isdir(os.path.join(folder_path, subdir)), f"{subdir} is not a directory"
            file_names = {name for name in os.listdir(os.path.join(folder_path, subdir)) if os.path.isfile(os.path.join(folder_path, subdir, name))}
            if file_set is None:
                file_set = file_names
            else:
                assert file_set == file_names, f"File names in {subdir} are not consistent with other subdirectories"

# Assumes you have a folder called proofs in the current working directory with the structure enforced by the check_structure function
check_structure("proofs_miniF2F-test")


In [10]:
!pip install datasets

Defaulting to user installation because normal site-packages is not writeable


In [11]:
from datasets import load_dataset
miniF2F_test_df = load_dataset("HaimingW/miniF2F-lean4", split="test").to_pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
import os
import json
from typing import Any, Union

def _read_text_file(path: str) -> str:
    try:
        with open(path, "r", encoding="utf-8") as f:
            return f.read()
    except UnicodeDecodeError:
        with open(path, "r", encoding="latin-1") as f:
            return f.read()

def build_structure(path: str) -> Union[dict[str, Any], str, None]:
    """
    Recursively walk `path`:
      - If it's a directory, return a dict of its children.
      - If it's a .txt file, return its text content.
      - Other files return None (and are skipped by the caller).
    """
    if os.path.isdir(path):
        out: dict[str, Any] = {}
        for entry in sorted(os.listdir(path)):
            full = os.path.join(path, entry)
            if os.path.isdir(full):
                out[entry] = build_structure(full)
            elif os.path.isfile(full) and entry.lower().endswith(".txt"):
                key = os.path.splitext(entry)[0]
                out[key] = _read_text_file(full)
        return out
    elif os.path.isfile(path) and path.lower().endswith(".txt"):
        return _read_text_file(path)
    else:
        return None  # non-txt file or missing; caller can ignore

structure = build_structure("proofs_miniF2F-test")
structure['0']['AI-MO_Kimina-Prover-Preview-Distill-7B']

{'1': 'have : v = 65 := by\n    calc\n      _ = 1 / 3 * (b * h) := by rw [h₁]\n      _ = 1 / 3 * (30 * 13 / 2) := by rw [h₂, h₃]\n      _ = 65 := by norm_num\n\n  assumption\n```',
 '10': '-- The volume of a cone is given by the formula $V = \\frac{1}{3}Bh$, where $B$ is the area of the base and $h$ is the height.\n  rw [h₁, h₂, h₃]\n  -- The area of the base of a cone is 30 square units, and its height is 6.5 units.\n  norm_num\n  -- What is the number of cubic units in its volume? Show that it is 65.\n  -- 6.5 = 13/2.\n  -- (13/2) * 30 = 195.\n  -- 195 * (1/3) = 65.\n  -- Thus, the volume of the cone is 65 cubic units.\n```',
 '11': 'have h₄ : v = 65 := by\n    calc\n      _ = 1 / 3 * (b * h) := by rw [h₁]\n      _ = 1 / 3 * (30 * 13 / 2) := by rw [h₂, h₃]\n      _ = 65 := by norm_num\n\n  exact h₄\n```',
 '12': 'have : b * h = 30 * (13 / 2) := by rw [h₂, h₃]\n\n  calc\n    v = 1 / 3 * (b * h) := by rw [h₁]\n    _ = 1 / 3 * (30 * (13 / 2)) := by rw [this]\n    _ = 65 := by norm_num\n

In [14]:
miniF2F_test_df.head(5)

Unnamed: 0,name,split,informal_prefix,formal_statement,goal,header,formal_proof
0,mathd_algebra_478,test,/-- The volume of a cone is given by the formu...,theorem mathd_algebra_478 (b h v : ℝ) (h₀ : 0 ...,b h v : ℝ\nh₀ : 0 < b ∧ 0 < h ∧ 0 < v\nh₁ : v ...,import Mathlib\nimport Aesop\n\nset_option max...,
1,numbertheory_4x3m7y3neq2003,test,/-- Show that there are no integers $x$ and $y...,theorem numbertheory_4x3m7y3neq2003 (x y : ℤ) ...,x y : ℤ\n⊢ 4 * x ^ 3 - 7 * y ^ 3 ≠ 2003,import Mathlib\nimport Aesop\n\nset_option max...,
2,aime_1983_p1,test,"/-- Let $x$, $y$ and $z$ all exceed $1$ and le...",theorem aime_1983_p1 (x y z w : ℕ) (ht : 1 < x...,x y z w : ℕ\nht : 1 < x ∧ 1 < y ∧ 1 < z\nhw : ...,import Mathlib\nimport Aesop\n\nset_option max...,
3,amc12_2001_p5,test,/-- What is the product of all positive odd in...,theorem amc12_2001_p5 :\n Finset.prod (Fins...,⊢ (Finset.filter (fun x => ¬Even x) (Finset.ra...,import Mathlib\nimport Aesop\n\nset_option max...,
4,mathd_algebra_141,test,/-- A rectangular patio has an area of $180$ s...,theorem mathd_algebra_141 (a b : ℝ) (h₁ : a * ...,a b : ℝ\nh₁ : a * b = 180\nh₂ : 2 * (a + b) = ...,import Mathlib\nimport Aesop\n\nset_option max...,


In [15]:
import os
import json
from typing import Dict
from tqdm.auto import tqdm  # or: from tqdm import tqdm

In [45]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Optional, Callable

def check_proofs_in_parallel_output_arr(
    proof_contexts: List[dict],
    parallel_workers: int | None = None,
    on_progress: Optional[Callable[[int], None]] = None,
    log_errors = False
) -> List[int]:
    """
    Runs multiple proof checks in parallel (threaded).
    Returns a list of 1/0 aligned with the input order.
    Calls on_progress(1) each time an attempt completes.
    """
    # assert isinstance(parallel_workers, int) and parallel_workers > 1, "parallel_workers must be > 1"
    if not proof_contexts:
        return []

    results: List[int] = [0] * len(proof_contexts)

    def _run(idx: int, ctx: dict) -> Tuple[int, int]:
        name = f"proof_{idx}"
        ok = check_lean_proof(ctx, log_errors)
        return idx, 1 if ok else 0

    with ThreadPoolExecutor(max_workers=parallel_workers) as executor:
        futures = {executor.submit(_run, i, ctx): i for i, ctx in enumerate(proof_contexts, start=1)}
        for future in as_completed(futures):
            idx = futures[future]
            try:
                i, val = future.result()
            except Exception as exc:
                print(exc)
                i, val = idx, 0
            results[i - 1] = val
            if on_progress:
                on_progress(1)

    return results

In [33]:
row1 = miniF2F_test_df.loc[8]

correct_proof_dict = [{
    'formal_statement': row1['header']+"\n"+row1['formal_statement']+"\n",
    'proof': structure['8']['AI-MO_Kimina-Prover-Preview-Distill-7B'][str(i)],
    'project_dir': lean_project_path,
    'metadata': {'proof_solver': 'AI-MO_Kimina-Prover-Preview-Distill-7B', 'problem_id': '8', 'attempt_id': str(i)}
} for i in range(1,25)]

check_proofs_in_parallel_output_arr(correct_proof_dict)

[1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0]

In [46]:
row1 = miniF2F_test_df.loc[0]

correct_proof_dict = [{
    'formal_statement': row1['header']+"\n"+row1['formal_statement']+"\n",
    'proof': structure['0']['deepseek-ai_DeepSeek-Prover-V2-7B'][str(i)],
    'project_dir': lean_project_path,
    'metadata': {'proof_solver': 'deepseek-ai_DeepSeek-Prover-V2-7B', 'problem_id': '0', 'attempt_id': str(i)}
} for i in range(1,25)]

check_proofs_in_parallel_output_arr(correct_proof_dict)

[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0]

In [55]:
!rm -rf corrected_proofs

# Evaluation Loop

In [56]:
!mkdir corrected_proofs

In [None]:
import os
import numpy as np
from tqdm.auto import tqdm  # progress bar
from bitarray import bitarray
import json

# Define the checkpoint functions
def _load_checkpoint(path: str, num_problems: int) -> Dict[str, bitarray]:
    if not os.path.exists(path):
        return {}
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    model_dict: Dict[str, bitarray] = {}
    for model, bitstr in data.get("model_dict", {}).items():
        ba = bitarray(bitstr)
        if len(ba) < num_problems:
            ba.extend([False] * (num_problems - len(ba)))
        elif len(ba) > num_problems:
            ba = ba[:num_problems]
        model_dict[model] = ba
    return model_dict

def _save_checkpoint(path: str, model_dict: Dict[str, bitarray]) -> None:
    tmp = f"{path}.tmp"
    serializable = {"model_dict": {m: ba.to01() for m, ba in model_dict.items()}}
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(serializable, f, ensure_ascii=False)
    os.replace(tmp, path)  # atomic-ish swap

structure_json = build_structure("proofs_miniF2F-test")

print(structure_json)

# Robustly find the checkpoint path
checkpoint_path_candidates = [
    "checkpoint.json",
]

checkpoint_path = next((p for p in checkpoint_path_candidates if os.path.exists(p)), "checkpoint.json")

# Derive problem/model/attempt axes from the verified folder structure
problem_ids = sorted(structure_json.keys(), key=lambda x: int(x))
first_problem = problem_ids[0]
model_names = sorted(structure_json[first_problem].keys())
# assume consistent attempt ids across models/problems (enforced by your check_structure)
attempt_ids = sorted(structure_json[first_problem][model_names[0]].keys(), key=lambda x: int(x))

num_problems = len(problem_ids)
num_models = len(model_names)
num_attempts = len(attempt_ids)

df = miniF2F_test_df

# Load or initialize checkpoint
resume = True  # You can make this a parameter
model_dict = _load_checkpoint(checkpoint_path, num_problems) if resume else {}

# Ensure all models are initialized in the checkpoint
for model in model_names:
    if model not in model_dict:
        ba = bitarray(num_problems)
        ba.setall(False)
        model_dict[model] = ba

# Initialize the 3D result tensor (problems × models × attempts) with zeros
results = np.zeros((num_problems, num_models, num_attempts), dtype=np.uint8)

prev_success = None # will hold the result of the *last* pair

with tqdm(total=num_models * num_problems,
          desc="Evaluating (problem, model) pairs",
          unit="pair",
          leave=True,
          dynamic_ncols=True) as pbar:

    # CHANGED: Outer loop is now problems, inner loop is models
    for p_axis_idx, p_str in enumerate(problem_ids):
        for m_idx, model in enumerate(model_names):
            # Check if this model already succeeded on this problem
            if model_dict[model][p_axis_idx]:
                # Skip this pair since we already know it succeeded
                pbar.set_description(f"model={model} prob={p_str}  skipped", refresh=False)
                # Mark all attempts as successful in results (since we know at least one worked)
                results[p_axis_idx, m_idx, :] = 1
                prev_success = "✔"
                pbar.update(1)
                continue

            # Show the current model & problem and the result of the previous pair
            cur_desc = f"model={model} prob={p_str}"
            if prev_success is not None:
                cur_desc += f"  prev_success={prev_success}"

            # `refresh=False` prevents an extra refresh
            pbar.set_description(cur_desc, refresh=False)

            attempts_dict = structure_json[p_str][model]
            formal_statement = miniF2F_test_df.iloc[p_axis_idx]["formal_statement"]
            header = df.iloc[p_axis_idx]["header"]

            proof_contexts = []
            for a_str in attempt_ids:
                proof_text = attempts_dict.get(a_str, "")
                proof_contexts.append({
                    "formal_statement": header+"\n"+formal_statement+"\n",
                    "proof":            proof_text,
                    "project_dir":      lean_project_path,
                    "metadata": {
                        "proof_solver": model,
                        "problem_id":   p_str,
                        "attempt_id":   a_str
                    }
                })

            # Run the checks in parallel
            outcomes = check_proofs_in_parallel_output_arr(
                proof_contexts
            )

            # Did *any* attempt succeed for this (model, problem) pair?
            this_success = any(outcomes)
            prev_success = "✔" if this_success else "✘" # store for the *next* iteration

            # Store the result in the big numpy array
            results[p_axis_idx, m_idx, :] = np.fromiter(
                outcomes, dtype=np.uint8, count=num_attempts
            )

            # Update checkpoint if this pair succeeded
            if this_success:
                model_dict[model][p_axis_idx] = True
                _save_checkpoint(checkpoint_path, model_dict)

            # Finally advance the bar by one pair
            pbar.update(1)

# Final checkpoint save
_save_checkpoint(checkpoint_path, model_dict)

print("results shape:", results.shape)  # (num_problems, num_models, num_attempts)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

model=deepseek-ai_DeepSeek-Prover-V2-7B prob=0  prev_success=✔:   0%|          | 3/732 [01:20<5:28:56, 27.07s/pair]

In [None]:
import json
from typing import List

def results_to_nested_dict(
    results,                      # numpy array, shape: (num_problems, num_models, num_attempts)
    model_names: List[str],
    problem_ids: List[str],
) -> Dict[str, Dict[str, list]]:
    """Return {model_id: {problem_id: [0/1, ...]}}."""
    num_problems, num_models, num_attempts = results.shape
    assert len(model_names) == num_models
    assert len(problem_ids) == num_problems

    payload: Dict[str, Dict[str, list]] = {}
    for m_idx, model in enumerate(model_names):
        per_model: Dict[str, list] = {}
        for p_idx, prob in enumerate(problem_ids):
            per_model[str(prob)] = results[p_idx, m_idx, :].astype(int).tolist()
        payload[str(model)] = per_model
    return payload

def save_results_nested_json(
    results,
    model_names: List[str],
    problem_ids: List[str],
    out_path: str = "proof_outcomes_by_model.json",
) -> str:
    payload = results_to_nested_dict(results, model_names, problem_ids)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)
    return out_path

# Example:
# save_results_nested_json(results, model_names, problem_ids, "proof_outcomes_by_model.json")


In [None]:
save_results_nested_json(results, model_names, problem_ids, "proof_outcomes_by_model.json")

In [None]:
def results_to_problem_nested_dict(
    results,                      # numpy array, shape: (num_problems, num_models, num_attempts)
    model_names: List[str],
    problem_ids: List[str],
) -> Dict[str, Dict[str, list]]:
    """
    Return a dict nested by problem, then model:

    {
      "<problem_id>": {
        "<model_id>": [0/1, 0/1, ...],
        ...
      },
      ...
    }
    """
    num_problems, num_models, _ = results.shape
    assert len(model_names) == num_models
    assert len(problem_ids) == num_problems

    payload: Dict[str, Dict[str, list]] = {}
    for p_idx, prob in enumerate(problem_ids):
        per_problem: Dict[str, list] = {}
        for m_idx, model in enumerate(model_names):
            per_problem[str(model)] = results[p_idx, m_idx, :].astype(int).tolist()
        payload[str(prob)] = per_problem
    return payload


def save_results_nested_by_problem_json(
    results,
    model_names: List[str],
    problem_ids: List[str],
    out_path: str = "proof_outcomes_by_problem.json",
) -> str:
    """
    Writes JSON nested by problem → model → outcomes.
    """
    payload = results_to_problem_nested_dict(results, model_names, problem_ids)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)
    return out_path


In [None]:
save_results_nested_by_problem_json(results, model_names, problem_ids, "proof_outcomes_by_problem.json")


In [None]:
!zip -r corrected_proofs.zip corrected_proofs