**README**: Before you run the notebook, ensure you have the following files in the Current Working Directory

- checkpoint.json

- proofs.zip

- proof_outcomes_by_problem.json

- train_one_model.py

# Initialization

In [1]:
!rm -r proofs

In [2]:
!pip install trl peft transformers torch_ema accelerate deepspeed mpi4py bitsandbytes



In [3]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import json

model_dict = json.load(open("checkpoint.json"))

def parse_success_dict(model_dict: dict):
    """
    model_dict: mapping solver name -> binary string like "100101..." indicating
    which problems that solver solves.
    Returns solver names list and success_matrix of shape (n_solvers, n_problems).
    """
    solvers = list(model_dict.keys())
    # assume all strings are same length and consist of '0'/'1'
    success_matrix = np.array(
        [[int(c) for c in model_dict[name]] for name in solvers],
        dtype=int
    )  # shape: (n_solvers, n_problems)
    return solvers, success_matrix

solvers, success_matrix = parse_success_dict(model_dict['model_dict'])
print(solvers)
print(success_matrix)

['AI-MO_Kimina-Prover-Preview-Distill-7B', 'ByteDance-Seed_BFS-Prover', 'Goedel-LM_Goedel-Prover-SFT', 'deepseek-ai_DeepSeek-Prover-V1', 'deepseek-ai_DeepSeek-Prover-V1.5-RL', 'deepseek-ai_DeepSeek-Prover-V2-7B', 'kfdong_STP_model_Lean', 'stoney0062_Leanabell-Prover-DS-SFT', 'wellecks_llmstep-mathlib4-pythia2.8b']
[[1 0 0 1 1 1 0 1 1 0 1 1 0 1 1 1 0 1 0 0 0 0 0 1 0 1 1 0 0 1 0 0 1 1 1 1
  0 0 0 0]
 [1 0 0 1 1 1 0 0 1 0 1 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 1 1 1
  0 0 1 0]
 [1 1 0 1 1 1 0 0 1 1 1 1 0 1 1 1 0 1 0 0 0 0 0 1 0 1 0 1 0 1 0 1 1 1 1 0
  0 0 1 0]
 [1 0 0 1 1 1 0 0 1 0 1 1 0 1 1 1 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 1 1 1 1
  0 0 1 0]
 [1 0 0 0 1 1 0 1 1 0 1 1 0 1 1 1 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 1 1 1
  0 0 1 1]
 [1 1 1 0 1 1 0 1 1 1 1 1 0 0 1 1 0 1 0 0 0 1 0 1 0 1 0 1 0 1 0 0 1 1 1 1
  0 0 1 1]
 [1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 0 0 1 1 0 1 0 0 0 1 0 1 0 0 1 1 1 1
  0 0 1 0]
 [1 1 0 0 1 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [0 

# Greedy Selection

In [4]:
success_arr = np.zeros(success_matrix.shape[1], dtype=int)
ensemble = []
contribution: dict[str, int] = {}
success_arr_ = None
for i, solver in enumerate(solvers):
  success_arr_ = success_arr | success_matrix[i]
  diff = np.sum(success_arr_) - np.sum(success_arr)
  if diff > 0:
    contribution[solver] = diff
    success_arr = success_arr_
    ensemble.append(solver)

In [5]:
ensemble, len(ensemble)

(['AI-MO_Kimina-Prover-Preview-Distill-7B',
  'ByteDance-Seed_BFS-Prover',
  'Goedel-LM_Goedel-Prover-SFT',
  'deepseek-ai_DeepSeek-Prover-V1.5-RL',
  'deepseek-ai_DeepSeek-Prover-V2-7B'],
 5)

In [6]:
contribution.items()

dict_items([('AI-MO_Kimina-Prover-Preview-Distill-7B', np.int64(20)), ('ByteDance-Seed_BFS-Prover', np.int64(3)), ('Goedel-LM_Goedel-Prover-SFT', np.int64(3)), ('deepseek-ai_DeepSeek-Prover-V1.5-RL', np.int64(1)), ('deepseek-ai_DeepSeek-Prover-V2-7B', np.int64(2))])

In [7]:
ensemble_solve_idx = np.where(success_arr_ == 1)[0]
ensemble_no_solve_idx = np.where(success_arr_ == 0)[0]
ensemble_solve_idx, ensemble_no_solve_idx

(array([ 0,  1,  2,  3,  4,  5,  7,  8,  9, 10, 11, 13, 14, 15, 17, 20, 21,
        23, 25, 26, 27, 29, 31, 32, 33, 34, 35, 38, 39]),
 array([ 6, 12, 16, 18, 19, 22, 24, 28, 30, 36, 37]))

# Peer Fine-Tuning (PFT)
Implementation of $L_{prove}$

## Datasets

In [8]:
from datasets import load_dataset

ds = load_dataset('script-jpg/minif2f_test-first40-in-fix')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Repo card metadata block was not found. Setting CardData to empty.


In [9]:
ds["train"][0]

{'name': 'mathd_algebra_478',
 'informal_prefix': '/-- The volume of a cone is given by the formula $V = \\frac{1}{3}Bh$, where $B$ is the area of the base and $h$ is the height. The area of the base of a cone is 30 square units, and its height is 6.5 units. What is the number of cubic units in its volume? Show that it is 65.-/\n',
 'formal_statement': 'import Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n\n/-- The volume of a cone is given by the formula $V = \\frac{1}{3}Bh$, where $B$ is the area of the base and $h$ is the height. The area of the base of a cone is 30 square units, and its height is 6.5 units. What is the number of cubic units ∈ its volume? Show that it is 65.-/\ntheorem mathd_algebra_478 (b h v : ℝ) (h₀ : 0 < b ∧ 0 < h ∧ 0 < v) (h₁ : v = 1 / 3 * (b * h))\n    (h₂ : b = 30) (h₃ : h = 13 / 2) : v = 65 := by\n'}

In [10]:
ds['train'][ensemble_solve_idx]['name']

['mathd_algebra_478',
 'numbertheory_4x3m7y3neq2003',
 'aime_1983_p1',
 'amc12_2001_p5',
 'mathd_algebra_141',
 'mathd_numbertheory_3',
 'mathd_algebra_209',
 'mathd_numbertheory_1124',
 'imo_1983_p6',
 'mathd_numbertheory_237',
 'mathd_algebra_33',
 'mathd_numbertheory_299',
 'amc12b_2020_p2',
 'algebra_sqineq_unitcircatbpabsamblt1',
 'mathd_algebra_419',
 'mathd_numbertheory_427',
 'numbertheory_x5neqy2p4',
 'mathd_algebra_398',
 'mathd_numbertheory_430',
 'mathd_algebra_459',
 'induction_12dvd4expnp1p20',
 'mathd_algebra_137',
 'mathd_numbertheory_277',
 'mathd_numbertheory_559',
 'mathd_algebra_160',
 'mathd_algebra_24',
 'mathd_algebra_176',
 'mathd_numbertheory_353',
 'numbertheory_notequiv2i2jasqbsqdiv8']

## Process Proofs

In [11]:
!unzip proofs.zip -d proofs

Archive:  proofs.zip
   creating: proofs/37/
   creating: proofs/37/AI-MO_Kimina-Prover-Preview-Distill-7B/
  inflating: proofs/37/AI-MO_Kimina-Prover-Preview-Distill-7B/1.txt  
  inflating: proofs/37/AI-MO_Kimina-Prover-Preview-Distill-7B/2.txt  
  inflating: proofs/37/AI-MO_Kimina-Prover-Preview-Distill-7B/3.txt  
  inflating: proofs/37/AI-MO_Kimina-Prover-Preview-Distill-7B/4.txt  
  inflating: proofs/37/AI-MO_Kimina-Prover-Preview-Distill-7B/5.txt  
  inflating: proofs/37/AI-MO_Kimina-Prover-Preview-Distill-7B/6.txt  
  inflating: proofs/37/AI-MO_Kimina-Prover-Preview-Distill-7B/7.txt  
  inflating: proofs/37/AI-MO_Kimina-Prover-Preview-Distill-7B/8.txt  
   creating: proofs/37/ByteDance-Seed_BFS-Prover/
  inflating: proofs/37/ByteDance-Seed_BFS-Prover/1.txt  
  inflating: proofs/37/ByteDance-Seed_BFS-Prover/2.txt  
  inflating: proofs/37/ByteDance-Seed_BFS-Prover/3.txt  
  inflating: proofs/37/ByteDance-Seed_BFS-Prover/4.txt  
  inflating: proofs/37/ByteDance-Seed_BFS-Prover/5.txt

In [12]:
import json

with open('proof_outcomes_by_problem.json') as json_file:
    proof_outcomes_by_problem = json.load(json_file)

In [13]:
proof_outcomes_by_problem['0']['AI-MO_Kimina-Prover-Preview-Distill-7B']

[1, 0, 1, 0, 0, 1, 1, 0]

In [14]:
from pathlib import Path

def load_proofs(base_path: str | Path):
    base_path = Path(base_path)
    proof_texts = {}

    for file_path in base_path.glob('proofs/**/*/*.txt'):
        if any(part.startswith(".") for part in file_path.parts):
              continue
        path_parts = file_path.parts
        trial_id = file_path.stem
        model = path_parts[-2]
        problem_id = path_parts[-3]

        try:
            text_content = file_path.read_text(encoding='utf-8')
            proof_texts[(problem_id, model, int(trial_id))] = text_content
        except Exception as e:
            print(f"Could not read file {file_path}: {e}")

    return proof_texts
proof_texts = load_proofs("")

In [15]:
proof_texts[('0','AI-MO_Kimina-Prover-Preview-Distill-7B', 3)]

"-- The area of the base of a cone is 30 square units, and its height is 6.5 units. What is the number of cubic units in its volume? Show that it is 65.\n  -- The volume of a cone is given by the formula $V = \\frac{1}{3}Bh$, where $B$ is the area of the base and $h$ is the height.\n  -- So we can calculate that $V = 30 \\times 6.5 / 3 = 65$.\n  linarith\n```\n\n# Finding the Volume of a Cone\n\nI'll solve the problem of finding the volume of a cone with a base area of 30 square units and a height of 6.5 units.\n\nFirst, let's recall the formula for the volume of a cone: \n$V = \\frac{1}{3}Bh$\nwhere $B$ is the area of the base, and $h$ is the height.\n\nWe have $B = 30$ and $h = 6.5$. Let's substitute these values into the formula:\n$V = \\frac{1}{3} \\times 30 \\times 6.5$\n\nLet's calculate this step by step:\n$V = \\frac{1}{3} \\times 30 \\times 6.5 = 10 \\times 6.5 = 65$\n\nSo the volume of the cone should be 65 cubic units.\n\nNow, let's formalize this in Lean 4:\n\n```tactics\ni

## Construct HF Training Dataset

In [16]:
import random
from datasets import Dataset

def build_preference_dataset(data, problem_texts, proof_texts, seed=42):
    """
    data: dict like your example {problem_id: {model_name: [success/fail flags]}}
    problem_texts: dict mapping problem_id -> problem statement string
    proof_texts: dict mapping (problem_id, model_name, attempt_idx) -> proof string
    """
    random.seed(seed)
    rows = []

    for problem_id, model_dict in data.items():
        successes, failures = [], []

        for model_name, success_arr in model_dict.items():
            for attempt_idx, flag in enumerate(success_arr):
                proof_key = (problem_id, model_name, attempt_idx)
                proof_str = proof_texts.get(proof_key)
                if proof_str is None:
                    continue
                if flag == 1:
                    successes.append(proof_str)
                else:
                    failures.append(proof_str)

        # Append to dataset
        if successes and failures:
            # sample one success and one failure
            chosen = random.choice(successes)
            rejected = random.choice(failures)
            rows.append({
                "prompt": problem_texts[int(problem_id)]["formal_statement"],
                "chosen": chosen,
                "rejected": rejected
            })

    return Dataset.from_list(rows)

train_ds = build_preference_dataset(proof_outcomes_by_problem, ds['train'], proof_texts)

train_ds

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 29
})

In [17]:
train_ds[0]

{'prompt': 'import Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n\n/-- The volume of a cone is given by the formula $V = \\frac{1}{3}Bh$, where $B$ is the area of the base and $h$ is the height. The area of the base of a cone is 30 square units, and its height is 6.5 units. What is the number of cubic units ∈ its volume? Show that it is 65.-/\ntheorem mathd_algebra_478 (b h v : ℝ) (h₀ : 0 < b ∧ 0 < h ∧ 0 < v) (h₁ : v = 1 / 3 * (b * h))\n    (h₂ : b = 30) (h₃ : h = 13 / 2) : v = 65 := by\n',
 'chosen': 'Substitute the given values of b and h into the volume formula.\n\nv = 1 / 3 * (30 * (13 / 2))\n\nCalculate the volume inside the parentheses.\n\n30 * (13 / 2) = 30 * 6.5 = 195\n\nNow, multiply by 1/3.\n\n1 / 3 * 195 = 65\n\nThus, the volume is 65 cubic units.\n-/\n  -- Substitute the given values of b and h into the volume formula.\n  subst_vars\n  -- Simplify the expression by clearing denominators and reducing fractions.\n  field_simp 

## Train

In [18]:
!mkdir trained_models

mkdir: cannot create directory ‘trained_models’: File exists


In [19]:
train_ds.save_to_disk("./cache/train_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/29 [00:00<?, ? examples/s]

In [23]:
import subprocess

for model_id in ensemble[:1]:
    process = subprocess.Popen(
        ["python", "-u", "train_one_model.py", model_id, "./cache/train_dataset"],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True
    )

    for line in process.stdout:
        print(line, end="")  # stream logs live

    process.wait()

2025-09-11 18:45:02.788048: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-11 18:45:02.805929: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757616302.827426   12853 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757616302.833905   12853 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1757616302.850614   12853 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [24]:
!zip -r AI-MO_Kimina-Prover-Preview-Distill-7B_DPO.zip trained_models/AI-MO_Kimina-Prover-Preview-Distill-7B_DPO

  adding: trained_models/AI-MO_Kimina-Prover-Preview-Distill-7B_DPO/ (stored 0%)
  adding: trained_models/AI-MO_Kimina-Prover-Preview-Distill-7B_DPO/lora_adapters/ (stored 0%)
  adding: trained_models/AI-MO_Kimina-Prover-Preview-Distill-7B_DPO/lora_adapters/README.md (deflated 65%)
  adding: trained_models/AI-MO_Kimina-Prover-Preview-Distill-7B_DPO/lora_adapters/adapter_config.json (deflated 55%)
  adding: trained_models/AI-MO_Kimina-Prover-Preview-Distill-7B_DPO/lora_adapters/adapter_model.safetensors (deflated 7%)
  adding: trained_models/AI-MO_Kimina-Prover-Preview-Distill-7B_DPO/checkpoint-400/ (stored 0%)
  adding: trained_models/AI-MO_Kimina-Prover-Preview-Distill-7B_DPO/checkpoint-400/added_tokens.json (deflated 67%)
  adding: trained_models/AI-MO_Kimina-Prover-Preview-Distill-7B_DPO/checkpoint-400/vocab.json (deflated 61%)
  adding: trained_models/AI-MO_Kimina-Prover-Preview-Distill-7B_DPO/checkpoint-400/optimizer.pt (deflated 8%)
  adding: trained_models/AI-MO_Kimina-Prover-Pr