In [1]:
# Step 1: Install elan (Lean toolchain manager)
!curl https://raw.githubusercontent.com/leanprover/elan/master/elan-init.sh -sSf | sh -s -- -y

# Step 2: Update Python process PATH so subprocess.run() can find `lean`
import os
elan_bin_path = os.path.expanduser("~/.elan/bin")
os.environ["PATH"] = elan_bin_path + ":" + os.environ["PATH"]

# Verify the installation by checking the version
!lean --version

[1minfo:[0m downloading installer
[1minfo: [mdefault toolchain set to 'stable'
Lean (version 4.21.0, x86_64-unknown-linux-gnu, commit 6741444a63ee, Release)


In [2]:
import os
import subprocess

def setup_lean_project(project_dir="/tmp/lean_project"):
    """
    Creates a Lean project, configures it to use Mathlib,
    and downloads pre-compiled library files.
    """
    print(f"--- Setting up Lean project in: {project_dir} ---")
    os.makedirs(project_dir, exist_ok=True)

    # Content for the lakefile.lean
    lakefile_content = """
    import Lake
    open Lake DSL

    package «lean_project»

    require mathlib from git
      "https://github.com/leanprover-community/mathlib4.git"

    @[default_target]
    lean_lib «lean_project»
    """
    # Write the lakefile
    with open(os.path.join(project_dir, "lakefile.lean"), "w") as f:
        f.write(lakefile_content)

    # Run `lake exe cache get` to download Mathlib's pre-compiled files
    # This is much faster than building from source.
    print("--- Downloading Mathlib cache (this may take a few minutes)... ---")
    try:
        subprocess.run(
            ["lake", "exe", "cache", "get"],
            cwd=project_dir,
            check=True,
            capture_output=True,
            text=True
        )
        print("--- Mathlib cache downloaded successfully. ---")
    except subprocess.CalledProcessError as e:
        print("❌ Error setting up Mathlib cache.")
        print(f"--- STDOUT ---\n{e.stdout}")
        print(f"--- STDERR ---\n{e.stderr}")
        raise  # Stop execution if setup fails

    return project_dir

# --- Call this function once at the start of your script ---
lean_project_path = setup_lean_project()

--- Setting up Lean project in: /tmp/lean_project ---
--- Downloading Mathlib cache (this may take a few minutes)... ---
--- Mathlib cache downloaded successfully. ---


In [3]:
lean_project_path

'/tmp/lean_project'

In [4]:
import subprocess
import tempfile
import os
from multiprocessing import Pool
import random
import re

def clean_markdown_fences(code: str) -> str:
    """
    Delete lines that start with one‒three backticks (optionally followed by 'lean').
    Leaves all other back-quotes intact.
    """
    fence_pattern = r'^\s*```.*$'       # lines like ``` or ```lean
    return re.sub(fence_pattern, '', code, flags=re.MULTILINE)


def check_lean_proof(proof_and_context: dict) -> bool:
    """
    Checks a Lean 4 proof string within a configured Lean project using lake.
    """
    proof_string = proof_and_context['proof']
    header = proof_and_context['header']
    statement = proof_and_context['formal_statement']
    project_dir = proof_and_context['project_dir']

    # proof_string = re.sub(r'^\s*:=[ \t]+by\b\s*', '', proof_string, count=1)

    full_code = clean_markdown_fences(f"{header}\n\n{statement}\n  {proof_string}")

    # The file must have a .lean extension
    temp_filename = "temp_proof.lean"
    file_path = os.path.join(project_dir, temp_filename)

    try:
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(full_code)

        # --- KEY CHANGE: Use 'lake env lean' to run the compiler ---
        command = ["lake", "env", "lean", temp_filename]

        result = subprocess.run(
            command,
            cwd=project_dir,
            capture_output=True,
            text=True,
            timeout=60
        )

        if result.returncode == 0 and "error:" not in result.stdout:
            print(f"✅ Proof is correct!\n--- Full Code ---\n{full_code}\n**DONE**")
            return True
        else:
            output = ""
            if result.stdout:
                output += f"\n--- Lean Output (stdout) ---\n{result.stdout}"
            if result.stderr:
                output += f"\n--- Lean Error Output (stderr) ---\n{result.stderr}"
            # print(output)
            # ocasionally print out the full proof full_code
            if random.random() <= 1/64:
              print(f"❌ Proof is incorrect or contains errors.\n{output}\n--- Full Code ---\n{full_code}\n**DONE**")
            return False

    except Exception as e:
        print(f"An error occurred during proof checking: {e}")
        return False

def check_proofs_in_parallel(proof_contexts: list[dict], parallel_workers: int = None) -> bool:
    """
    Checks a list of proof contexts in parallel.

    Args:
        proof_contexts: A list of dictionaries, each containing proof and context.
    """
    if not proof_contexts:
        return False

    with Pool(processes=parallel_workers) as p:
        results = p.map(check_lean_proof, proof_contexts)

    return any(results)

In [5]:
# 1. Define the proof and context with the Mathlib header
correct_proof_dict = {
    'header': 'import Mathlib.Tactic',  # Added Mathlib header for testing
    'formal_statement': 'theorem two_plus_two_is_four : 2 + 2 = 4',
    'proof': ':= by rfl',
    'project_dir': lean_project_path # Make sure to include the project path
}

# 2. Call the function with the dictionary
is_valid = check_lean_proof(correct_proof_dict)

# This should now be True, confirming the project environment works
# print(f"Is the proof valid? {is_valid}")

✅ Proof is correct!
--- Full Code ---
import Mathlib.Tactic

theorem two_plus_two_is_four : 2 + 2 = 4
  := by rfl
**DONE**


In [6]:
# 1. Define the proof and context, omitting the unnecessary header
correct_proof_dict = {
    'header': '',  # Remove or set to empty string
    'formal_statement': 'theorem two_plus_two_is_four : 2 + 2 = 5',
    'proof': 'rfl',
    'project_dir': lean_project_path
}

# 2. Call the function with the corrected dictionary
is_valid = check_lean_proof(correct_proof_dict)

# print(f"Is the proof valid? {is_valid}")

In [7]:
!git clone https://github.com/alycialee/beyond-scale-language-data-diversity.git
%cd beyond-scale-language-data-diversity

# 2️⃣  Make sure the build tools are modern enough
# %pip install --quiet --upgrade pip setuptools wheel

%pip install pip==24.0

# 3️⃣  Editable-install *into the live kernel*  ← note the %pip magic
%pip install -e .

fatal: destination path 'beyond-scale-language-data-diversity' already exists and is not an empty directory.
/content/beyond-scale-language-data-diversity
[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mObtaining file:///content/beyond-scale-language-data-diversity
  Preparing metadata (setup.py) ... [?25l[?25hdone
[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://git

In [8]:
!pip install -U datasets
!pip install fastdtw
!pip install tf-keras

[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest 

In [9]:
import sys, pathlib
sys.path.insert(0, str(pathlib.Path.cwd() / "src"))
sys.path.insert(0, str(pathlib.Path.cwd() / "src" / "diversity"))

In [10]:
def make_loss_fn(ignore_id):
    """Factory function to create a cross-entropy loss function."""
    def _loss_fn(logits, tgt, *_, **__):
        logits = logits[:, :-1, :].contiguous()
        tgt = tgt[:, 1:].contiguous()
        return F.cross_entropy(logits.view(-1, logits.size(-1)), tgt.view(-1), ignore_index=ignore_id)
    return _loss_fn

In [11]:
import os
# ---------------------------------------------------------------------------
# Helper: one call = one embedding/LM-loss for a given solver tokenizer
# ---------------------------------------------------------------------------
def embed_with_solver_tokenizer(
    solver_model_id: str,
    raw_dataset,
    probe_model,
    max_len: int,
    epochs: int = 1,
    cpu_workers: int = os.cpu_count(),
):
    """
    Parameters
    ----------
    solver_model_id : str
        Hugging Face model ID from which to load the tokenizer.
    raw_dataset : datasets.Dataset
        The *raw_ds* already built earlier in the script.
    probe_model : transformers.PreTrainedModel
        The already-constructed probe (distilgpt2 here).
    max_len : int
        Sequence length to which examples are padded/truncated
        (probe_model.config.max_position_embeddings).
    cpu_workers : int, optional
        How many worker processes to give 🤗 Datasets for .map()

    Returns
    -------
    embedding_tensor : torch.FloatTensor  (shape = probe hidden_dim × probe hidden_dim)
    lm_loss          : float | None
    """

    # ----- Tokenizer --------------------------------------------------------
    tok = AutoTokenizer.from_pretrained(solver_model_id, trust_remote_code=True)
    if tok.pad_token is None:                       # ensure a pad token exists
        tok.add_special_tokens({'pad_token': '[PAD]'})

    probe_model.resize_token_embeddings(len(tok))   # probe needs same vocab size

    # ----- Tokenize dataset -------------------------------------------------
    tok_ds = raw_dataset.map(
        lambda b: tok(
            b["text"],
            padding="longest",
            truncation=True,
            max_length=max_len,
        ),
        batched=True,
        batch_size=20,
        remove_columns=raw_dataset.column_names,
        num_proc=cpu_workers,
    )
    tok_ds.set_format(type="torch", columns=["input_ids", "attention_mask"])

    # ----- Task2Vec ---------------------------------------------------------
    task2vec = Task2Vec(
        probe_model,
        max_samples=1024,
        loader_opts={"batch_size": 1, "shuffle": True, "num_workers": 0},
    )
    task2vec.loss_fn = make_loss_fn(tok.pad_token_id)

    emb, lm_loss = task2vec.embed(tok_ds, epochs=epochs)

    # Convert to torch tensor for downstream use
    embedding_tensor = torch.from_numpy(emb.hessian).to(dtype=torch.float32)

    # Explicitly free large objects to keep memory usage down
    del tok, tok_ds, task2vec, emb
    gc.collect()
    torch.cuda.empty_cache()  # uncomment if you are using CUDA

    return embedding_tensor, lm_loss


In [12]:
pip install --upgrade huggingface_hub

[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [None]:
import os
os.system("huggingface-cli login --token TOKEN_HERE")

0

In [14]:
import os
import gc
import torch
import warnings
import torch.nn.functional as F
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoConfig,
)
from task2vec import Task2Vec
from itertools import combinations
from torch.utils.data import DataLoader
from functools import lru_cache
from typing import Callable, Tuple
from torch import Tensor
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean



warnings.filterwarnings("ignore", module="torch")

# ----- Global Configuration & Models -----------------------------------------
print(">>> Loading and preparing dataset...")
raw_ds = load_dataset("Tonic/MiniF2F", split="train")
raw_ds = raw_ds.select(range(1)) # Using a smaller subset for demonstration
raw_ds = raw_ds.map(
    lambda ex: {"text": "\n".join(p for p in (ex["informal_prefix"], ex["formal_statement"], ex["goal"]) if p)},
    num_proc=os.cpu_count(),
)
print(">>> Dataset ready.")


# Define the device to use (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"--- Using device: {device} ---")

print(">>> Loading probe model...")
# probe_model_id = "distilbert/distilgpt2"
probe_model_id = "Saisam/gpt-neo-math-small"
probe_cfg = AutoConfig.from_pretrained(probe_model_id, trust_remote_code=True)
probe_cfg.attn_implementation = "sdpa"
probe_model = AutoModelForCausalLM.from_pretrained(
    probe_model_id,
    config=probe_cfg,
    trust_remote_code=True,
)

# Move the model to the defined device (GPU)
probe_model.to(device)

probe_model.gradient_checkpointing_enable()
max_probe_length = probe_model.config.max_position_embeddings
print(f">>> Probe model '{probe_model_id}' loaded with max length: {max_probe_length}.")

# ----- Helper & Core Functions -----------------------------------------------

@lru_cache(maxsize=128)
def compute_cmdiv(
    model_id1: str,
    model_id2: str,
    dataset_fingerprint: str,
    batch_size: int
) -> float:
    """
    Computes cmdiv by averaging embedding distances over batches.
    The `align_fn` is called on each pair of flattened embeddings
    and should return the two tensors to compare.
    """
    print(f"\n>>> Computing cmdiv for pair: ({model_id1}, {model_id2})")
    loader = DataLoader(raw_ds, batch_size=batch_size)
    distances = []

    for i, batch in enumerate(loader, start=1):
        print(f"    ... processing batch {i}/{len(loader)}")
        batch_ds = Dataset.from_dict(batch)

        emb1, _ = embed_with_solver_tokenizer(
            model_id1, batch_ds, probe_model, max_probe_length
        )
        emb2, _ = embed_with_solver_tokenizer(
            model_id2, batch_ds, probe_model, max_probe_length
        )

        a = emb1.flatten()
        b = emb2.flatten()

        dtw_dist, path = fastdtw(
            a.numpy(),
            b.numpy(),
            dist=lambda x, y: abs(x - y)
        )

        # 3) normalize by path length to get per-step average
        avg_diff = dtw_dist / len(path)

        # 4) turn into a [0,1] similarity if you like
        sim_dtw = 1.0 / (1.0 + avg_diff)

        print(f"dist between {model_id1} and {model_id2}: {sim_dtw}")
        # distance
        dist = 1.0 - sim_dtw
        distances.append(dist)

    avg_distance = sum(distances) / len(distances) if distances else 0.0
    print(f">>> Average distance (cmdiv) for pair = {avg_distance:.4f}")
    return avg_distance

def compute_edc(model_ids: set, dataset: Dataset, batch_size: int) -> float:
    """
    Computes the Ensemble Diversity Coefficient (EDC) for a set of models.
    """
    if len(model_ids) < 2:
        return 0.0

    dataset_fingerprint = dataset.info.builder_name
    pairwise_diversities = []

    for m_id1, m_id2 in combinations(list(model_ids), 2):
        diversity = compute_cmdiv(m_id1, m_id2, dataset_fingerprint, batch_size)
        print(f"Diversity between {m_id1} and {m_id2} on {dataset_fingerprint} with batch size {batch_size}: {diversity}")
        pairwise_diversities.append(diversity)

    return sum(pairwise_diversities) / len(pairwise_diversities) if pairwise_diversities else 0.0

# ----- Main Execution --------------------------------------------------------

test_model_ids = {
    "deepseek-ai/DeepSeek-Prover-V1",
    "EleutherAI/llemma_7b"
}

batch_size = 1 # smaller batch size for demonstration

# print("\n=====================================================================")
# print(">>> Calculating Ensemble Diversity Coefficient (EDC)...")
# print(f">>> Models in ensemble: {solver_model_ids}")
# print(f">>> Batch Size: {batch_size}")
# print("=====================================================================")

# ensemble_diversity_coefficient = compute_edc(test_model_ids, raw_ds, batch_size)

# print("\n=====================================================================")
# print(f"\nFinal Ensemble Diversity Coefficient (EDC): {ensemble_diversity_coefficient:.4f}")
# print("=====================================================================")

>>> Loading and preparing dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

>>> Dataset ready.
--- Using device: cuda ---
>>> Loading probe model...


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/551M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/551M [00:00<?, ?B/s]

>>> Probe model 'Saisam/gpt-neo-math-small' loaded with max length: 2048.


In [15]:
from itertools import combinations
from typing import List, Tuple

solver_model_ids = {
    "deepseek-ai/DeepSeek-Prover-V2-7B",
    "Goedel-LM/Goedel-Prover-SFT",
    "stoney0062/Leanabell-Prover-GD-RL",
    # "kaiyuy/leandojo-lean4-retriever-tacgen-byt5-small",
    "stoney0062/Leanabell-Prover-DS-SFT",
    "deepseek-ai/DeepSeek-Prover-V1.5-RL",
    "AI-MO/Kimina-Prover-Preview-Distill-7B",
    "kfdong/STP_model_Lean",
    "wellecks/llmstep-mathlib4-pythia2.8b",
    "internlm/internlm2_5-step-prover",
    "RickyDeSkywalker/TheoremLlama",
    "ByteDance-Seed/BFS-Prover",
    "ScalableMath/Lean-STaR-plus",
    "deepseek-ai/DeepSeek-Prover-V1",
    "EleutherAI/llemma_7b"
}

def get_all_combinations_of_length(s: set, length: int) -> List[Tuple]:
    return list(combinations(s, length))

solver_model_ensembles_3 = get_all_combinations_of_length(solver_model_ids, 3)
# assert(len(solver_model_ids) == 15)
# assert(len(solver_model_ensembles_3) == 455)
solver_model_ensembles_3[0]

('ScalableMath/Lean-STaR-plus',
 'stoney0062/Leanabell-Prover-GD-RL',
 'deepseek-ai/DeepSeek-Prover-V2-7B')

In [16]:
from transformers import pipeline, AutoConfig, AutoTokenizer, AutoModelForCausalLM
import logging

_MODEL_CACHE = {}

def _load_model(model_id: str):
    """
    Loads (or reuses) a single model+tokenizer on GPU, with a fix for rope_scaling.
    Caches the result so repeated calls don’t re-download or re-to("cuda").
    """
    # 1) if we've already loaded it, reuse
    if model_id in _MODEL_CACHE:
        print(f"Reusing cached model: {model_id}")
        return _MODEL_CACHE[model_id]

    # 2) otherwise load it fresh
    print(f"Attempting to load model: {model_id}")
    try:
        config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)

        # fix rope_scaling int→float
        if getattr(config, "rope_scaling", None):
            for k in ("factor", "beta_fast", "beta_slow"):
                v = config.rope_scaling.get(k)
                if isinstance(v, int):
                    print(f"Fixing rope_scaling {k} type from int to float…")
                    config.rope_scaling[k] = float(v)

        tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        model = (
            AutoModelForCausalLM
            .from_pretrained(model_id, config=config, torch_dtype="auto", trust_remote_code=True)
            .to("cuda")
        )

        # 3) stash in cache
        _MODEL_CACHE[model_id] = (model, tok)
        print(f"Successfully loaded and cached {model_id}")
        return model, tok

    except Exception as e:
        logging.error(f"❌ Failed to load model '{model_id}': {e}")
        return None, None


def generate_proof(pipe, informal_prefix, formal_statement, header,
                   temperature: float = 0.7, max_new_tokens: int = 512,
                   num_return_sequences: int = 1):
    prompt = f"{header}\n\n{informal_prefix}\n\n{formal_statement}\n\nproof\n"

    out = pipe(prompt,
               do_sample=True,
               temperature=temperature,
               max_new_tokens=max_new_tokens,
               eos_token_id=pipe.tokenizer.eos_token_id,
               num_return_sequences=num_return_sequences)

    proofs = [result['generated_text'][len(prompt):].strip() for result in out]
    return proofs

In [17]:
!pip install tqdm

[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [18]:
import torch
import gc
from transformers import pipeline
import os

# Set the environment variable to disable the warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# ---------------------------------------------------------------------------
# 2.  Ensemble helpers
# ---------------------------------------------------------------------------
import math

from tqdm.auto import tqdm   # auto picks the right notebook/CLI backend
import torch, gc

def solves_problem_memory_safe(
    model_ids,
    problem_row,
    max_attempts: int,
    gpu_batch_size: int = 16,      # <-- New parameter for GPU batch size
    parallel_workers: int = None,
    dealloc: bool = True
):
    if parallel_workers is None:
        parallel_workers = gpu_batch_size

    """
    VERSION 8: With controllable GPU batching + tqdm progress bars.
    """

    # Outer progress bar over the list of models
    for model_id in tqdm(model_ids, desc="Models"):
        model, tok, pipe = None, None, None
        attempt_bar = None                 # will hold per-model tqdm instance
        try:
            model, tok = _load_model(model_id)
            pipe = pipeline("text-generation", model=model, tokenizer=tok, device=0)

            # Track remaining attempts for this model and set up inner bar
            attempts_left = max_attempts
            attempt_bar = tqdm(
                total=max_attempts,
                desc=f"Attempts {model_id}",
                leave=False
            )

            while attempts_left > 0:
                current_batch_size = min(gpu_batch_size, attempts_left)

                with torch.no_grad():
                    proof_snippets = generate_proof(
                        pipe,
                        informal_prefix=problem_row['informal_prefix'],
                        formal_statement=problem_row['formal_statement'],
                        header=problem_row['header'],
                        num_return_sequences=current_batch_size
                    )

                proof_contexts = [
                    {
                        "proof": snippet,
                        "header": problem_row['header'],
                        "formal_statement": problem_row['formal_statement'],
                        "project_dir": lean_project_path
                    }
                    for snippet in proof_snippets
                ]

                if check_proofs_in_parallel(
                    proof_contexts,
                    parallel_workers=parallel_workers
                ):
                    return True  # solved!

                attempts_left -= current_batch_size
                attempt_bar.update(current_batch_size)

        finally:
            if attempt_bar is not None:
                attempt_bar.close()

            if dealloc:
                # Evict from global cache instead of just deleting locals
                for mid in model_ids:
                    cached = _MODEL_CACHE.pop(mid, None)
                    if cached:
                        m, t = cached
                        del m, t
                        # if you cached the pipeline as well, del that here too
                        break

                # Clean up whatever’s left and free the GPU
                del pipe
                gc.collect()
                torch.cuda.empty_cache()

    return False  # No solution found after all models

def ensemble_accuracy_on_dataset(model_ids, dataframe, max_attempts_per_model=32):
    """Evaluates the full dataset, managing memory correctly."""
    num_solved = 0
    for _, row in dataframe.iterrows():
        if solves_problem_memory_safe(model_ids, row, max_attempts_per_model):
            num_solved += 1
    return num_solved, len(dataframe)

# ---------------------------------------------------------------------------
# 3.  Dataset: get the test split and convert to DataFrame
# ---------------------------------------------------------------------------
from datasets import load_dataset
import pandas as pd

miniF2F_train_df = load_dataset("Tonic/MiniF2F", split="train").to_pandas()

In [19]:

# # print all rows
# pd.set_option('display.max_colwidth', None)
# miniF2F_train_df.iloc[42]

In [20]:
# # ---------------------------------------------------------------------------
# # 4.  Example run on the first test problem with your first ensemble
# # ---------------------------------------------------------------------------
# example_row     = miniF2F_train_df.iloc[42]
# ensemble_models = ["AI-MO/Kimina-Prover-Preview-Distill-7B"]

# success = solves_problem_memory_safe(ensemble_models, example_row, max_attempts=1024, gpu_batch_size=1)
# print("Solved?", success)

In [21]:
# miniF2F_train_df = load_dataset("Tonic/MiniF2F", split="train").to_pandas()

# # 3. Call the function with the models and the full DataFrame
# print("🚀 Starting evaluation on the entire dataset...")
# num_solved, total_problems = ensemble_accuracy_on_dataset(
#     ensemble_models,
#     miniF2F_train_df.head(5)
# )

# # 4. Print the final results
# accuracy = (num_solved / total_problems) * 100
# print("\n--- Evaluation Complete ---")
# print(f"Problems Solved: {num_solved}")
# print(f"Total Problems:  {total_problems}")
# print(f"Ensemble Accuracy: {accuracy:.2f}%")
!pip install einops

[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [22]:
import pandas as pd
from tqdm.auto import tqdm
import gc
import torch
from transformers import pipeline
from datasets import load_dataset

def get_solved_problems_for_model(model_id: str, dataframe: pd.DataFrame, max_attempts: int = 8, gpu_batch_size: int = 8, parallel_workers: int = 8) -> set:
    """
    Evaluates a single model on a dataset by calling a memory-safe helper for each problem.
    Returns the set of indices of solved problems.
    """
    print(f"--- Evaluating model: {model_id} ---")
    solved_indices = set()

    # Iterate over each problem in the dataframe with a progress bar
    for index, problem_row in tqdm(dataframe.iterrows(), total=len(dataframe), desc=f"Solving for {model_id}"):

        dealloc = index == dataframe.index[-1]

        # The solves_problem_memory_safe function expects a list of model_ids.
        # We provide a list containing only the current model_id to check if it can solve this single problem.
        # The helper handles all model loading, attempts, and GPU memory cleanup internally.
        if solves_problem_memory_safe(
            model_ids=[model_id],
            problem_row=problem_row,
            max_attempts=max_attempts,
            gpu_batch_size=gpu_batch_size,
            parallel_workers=parallel_workers,
            dealloc=dealloc
        ):
            solved_indices.add(index)

    print(f"--- Model {model_id} solved {len(solved_indices)} of {len(dataframe)} problems. ---")
    return solved_indices
# --- Main Pre-computation Step ---
print("🚀 Starting Pre-computation Step...")

# Load the test dataset
# Using the test set is conventional for final evaluation.
miniF2F_train_df = load_dataset("Tonic/MiniF2F", split="train").to_pandas()

# For demonstration, let's select a few models and a small slice of the data.
# ❗ For a full run, use the full 'solver_model_ids' and 'miniF2F_train_df'.
demo_models = list(solver_model_ids)[5:8]
demo_dataset = miniF2F_train_df[15:21] # problems for demo

solved_problems_map = {}
for model_id in demo_models:
    solved_set = get_solved_problems_for_model(model_id, demo_dataset)
    solved_problems_map[model_id] = solved_set

print("\n✅ --- Pre-computation Complete ---")
print("Solved problems map (problem indices solved by each model):")
for model, solved_set in solved_problems_map.items():
    print(f"  - {model}: Solved {len(solved_set)} problems. Indices: {list(solved_set)}")

🚀 Starting Pre-computation Step...
--- Evaluating model: AI-MO/Kimina-Prover-Preview-Distill-7B ---


Solving for AI-MO/Kimina-Prover-Preview-Distill-7B:   0%|          | 0/6 [00:00<?, ?it/s]

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Attempting to load model: AI-MO/Kimina-Prover-Preview-Distill-7B


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


Successfully loaded and cached AI-MO/Kimina-Prover-Preview-Distill-7B


Attempts AI-MO/Kimina-Prover-Preview-Distill-7B:   0%|          | 0/8 [00:00<?, ?it/s]

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0


Reusing cached model: AI-MO/Kimina-Prover-Preview-Distill-7B


Attempts AI-MO/Kimina-Prover-Preview-Distill-7B:   0%|          | 0/8 [00:00<?, ?it/s]

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0


Reusing cached model: AI-MO/Kimina-Prover-Preview-Distill-7B


Attempts AI-MO/Kimina-Prover-Preview-Distill-7B:   0%|          | 0/8 [00:00<?, ?it/s]

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0


Reusing cached model: AI-MO/Kimina-Prover-Preview-Distill-7B


Attempts AI-MO/Kimina-Prover-Preview-Distill-7B:   0%|          | 0/8 [00:00<?, ?it/s]

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0


Reusing cached model: AI-MO/Kimina-Prover-Preview-Distill-7B


Attempts AI-MO/Kimina-Prover-Preview-Distill-7B:   0%|          | 0/8 [00:00<?, ?it/s]

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0


Reusing cached model: AI-MO/Kimina-Prover-Preview-Distill-7B


Attempts AI-MO/Kimina-Prover-Preview-Distill-7B:   0%|          | 0/8 [00:00<?, ?it/s]

--- Model AI-MO/Kimina-Prover-Preview-Distill-7B solved 0 of 6 problems. ---
--- Evaluating model: stoney0062/Leanabell-Prover-DS-SFT ---


Solving for stoney0062/Leanabell-Prover-DS-SFT:   0%|          | 0/6 [00:00<?, ?it/s]

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Attempting to load model: stoney0062/Leanabell-Prover-DS-SFT


config.json:   0%|          | 0.00/934 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

Device set to use cuda:0


Successfully loaded and cached stoney0062/Leanabell-Prover-DS-SFT


Attempts stoney0062/Leanabell-Prover-DS-SFT:   0%|          | 0/8 [00:00<?, ?it/s]

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0


Reusing cached model: stoney0062/Leanabell-Prover-DS-SFT


Attempts stoney0062/Leanabell-Prover-DS-SFT:   0%|          | 0/8 [00:00<?, ?it/s]

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0


Reusing cached model: stoney0062/Leanabell-Prover-DS-SFT


Attempts stoney0062/Leanabell-Prover-DS-SFT:   0%|          | 0/8 [00:00<?, ?it/s]

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0


Reusing cached model: stoney0062/Leanabell-Prover-DS-SFT


Attempts stoney0062/Leanabell-Prover-DS-SFT:   0%|          | 0/8 [00:00<?, ?it/s]

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0


Reusing cached model: stoney0062/Leanabell-Prover-DS-SFT


Attempts stoney0062/Leanabell-Prover-DS-SFT:   0%|          | 0/8 [00:00<?, ?it/s]

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0


Reusing cached model: stoney0062/Leanabell-Prover-DS-SFT


Attempts stoney0062/Leanabell-Prover-DS-SFT:   0%|          | 0/8 [00:00<?, ?it/s]

❌ Proof is incorrect or contains errors.

--- Lean Output (stdout) ---
temp_proof.lean:14:103: error: unsolved goals
case h
S : Finset ℕ
h₀ : ∀ (x : ℕ), x ∈ S ↔ 0 < x ∧ x < 1000 ∧ x.divisors.card = 3
x : ℕ
⊢ 0 < x → x < 1000 → x.divisors.card = 3 → x < 1000
temp_proof.lean:20:2: error: maximum recursion depth has been reached
use `set_option maxRecDepth <num>` to increase limit
use `set_option diagnostics true` to get diagnostic information
temp_proof.lean:30:0: error: unexpected identifier; expected command
temp_proof.lean:32:73: error: unexpected token; expected ':'
temp_proof.lean:36:0: error: Function expected at
  1.
but this term has type
  ?m.466

Note: Expected a function because this term is being applied to the argument
  The
temp_proof.lean:36:0: error: invalid resulting type, expecting 'Type _' or 'Prop'
temp_proof.lean:36:20: error: unexpected token 'by'; expected command

--- Full Code ---
import Mathlib
import Aesop

set_option maxHeartbeats 0

open BigOperators Real Nat

Solving for kfdong/STP_model_Lean:   0%|          | 0/6 [00:00<?, ?it/s]

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Attempting to load model: kfdong/STP_model_Lean


config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.89G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


Successfully loaded and cached kfdong/STP_model_Lean


Attempts kfdong/STP_model_Lean:   0%|          | 0/8 [00:00<?, ?it/s]

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0


Reusing cached model: kfdong/STP_model_Lean


Attempts kfdong/STP_model_Lean:   0%|          | 0/8 [00:00<?, ?it/s]

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0


Reusing cached model: kfdong/STP_model_Lean


Attempts kfdong/STP_model_Lean:   0%|          | 0/8 [00:00<?, ?it/s]

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0


Reusing cached model: kfdong/STP_model_Lean


Attempts kfdong/STP_model_Lean:   0%|          | 0/8 [00:00<?, ?it/s]

✅ Proof is correct!
--- Full Code ---
import Mathlib
import Aesop

set_option maxHeartbeats 0

open BigOperators Real Nat Topology Rat



theorem mathd_algebra_462 : ((1 : ℚ) / 2 + 1 / 3) * (1 / 2 - 1 / 3) = 5 / 36 := by

  /-
  To evaluate the expression \(\left( \frac{1}{2} + \frac{1}{3} \right) \left( \frac{1}{2} - \frac{1}{3} \right)\), we start by simplifying the terms inside the parentheses. 
  First, we find a common denominator for the fractions inside the parentheses:
  \[
  \frac{1}{2} + \frac{1}{3} = \frac{3}{6} + \frac{2}{6} = \frac{5}{6}
  \]
  \[
  \frac{1}{2} - \frac{1}{3} = \frac{3}{6} - \frac{2}{6} = \frac{1}{6}
  \]
  Next, we multiply these simplified fractions:
  \[
  \left( \frac{5}{6} \right) \left( \frac{1}{6} \right) = \frac{5 \cdot 1}{6 \cdot 6} = \frac{5}{36}
  \]
  Thus, the value of the expression is \(\frac{5}{36}\).
  -/
  norm_num
  <;>
    aesop
  <;>
    aesop
  <;>
    aesop
  <;>
    aesop
  <;>
    aesop
**DONE**
✅ Proof is correct!
--- Full Code ---

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0


Reusing cached model: kfdong/STP_model_Lean


Attempts kfdong/STP_model_Lean:   0%|          | 0/8 [00:00<?, ?it/s]

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0


Reusing cached model: kfdong/STP_model_Lean


Attempts kfdong/STP_model_Lean:   0%|          | 0/8 [00:00<?, ?it/s]

--- Model kfdong/STP_model_Lean solved 1 of 6 problems. ---

✅ --- Pre-computation Complete ---
Solved problems map (problem indices solved by each model):
  - AI-MO/Kimina-Prover-Preview-Distill-7B: Solved 0 problems. Indices: []
  - stoney0062/Leanabell-Prover-DS-SFT: Solved 0 problems. Indices: []
  - kfdong/STP_model_Lean: Solved 1 problems. Indices: [18]


In [23]:
# Insert this code into a new cell in your notebook

import numpy as np
from itertools import combinations

def calculate_psd(solved_set_A: set, solved_set_B: set) -> float:
    """
    Calculates the Problem-Success Diversity (PSD) between two models.
    This is the Jaccard distance on the sets of successfully solved problems.
    """
    # |A ∩ B|
    intersection_size = len(solved_set_A.intersection(solved_set_B))
    # |A ∪ B|
    union_size = len(solved_set_A.union(solved_set_B))

    if union_size == 0:
        return 0.0  # If neither model solves any problem, their diversity is 0.

    # Jaccard Similarity = |A ∩ B| / |A ∪ B|
    jaccard_similarity = intersection_size / union_size

    # psd = 1 - Jaccard Similarity
    return 1.0 - jaccard_similarity

def calculate_psed(ensemble_model_ids: tuple, solved_problems_map: dict) -> float:
    """
    Calculates the Problem-Success Ensemble Diversity (PSED) for an ensemble.
    This is the average PSD over all unique pairs of models in the ensemble.
    """
    if len(ensemble_model_ids) < 2:
        return 0.0

    pairwise_psd_scores = []
    # Create all unique pairs of models, e.g., (m_i, m_j) where i < j
    for model_a, model_b in combinations(ensemble_model_ids, 2):
        set_a = solved_problems_map.get(model_a, set())
        set_b = solved_problems_map.get(model_b, set())

        psd = calculate_psd(set_a, set_b)
        pairwise_psd_scores.append(psd)

    # Return the average of all pairwise scores
    return np.mean(pairwise_psd_scores) if pairwise_psd_scores else 0.0

# --- Example Usage ---
# We use the results from the pre-computation step
first_demo_ensemble = get_all_combinations_of_length(set(demo_models), 2)[0]
psed_score = calculate_psed(first_demo_ensemble, solved_problems_map)

print(f"Example Ensemble: {first_demo_ensemble}")
print(f"PSED Score: {psed_score:.4f}")

Example Ensemble: ('AI-MO/Kimina-Prover-Preview-Distill-7B', 'stoney0062/Leanabell-Prover-DS-SFT')
PSED Score: 0.0000


In [24]:
# Insert this code into a new cell in your notebook

# You may need to install statsmodels
!pip install statsmodels

import pandas as pd
import statsmodels.api as sm
from tqdm.auto import tqdm

# --- Step 1: Helper function for Baseline Accuracy (S_0) ---
def get_ensemble_baseline_accuracy(ensemble_model_ids: tuple, solved_problems_map: dict, total_problems: int) -> float:
    """
    Calculates the baseline accuracy (S_0) of an ensemble, defined as the
    proportion of problems solved by at least one member of the ensemble.
    """
    problems_solved_by_ensemble = set()
    for model_id in ensemble_model_ids:
        problems_solved_by_ensemble.update(solved_problems_map.get(model_id, set()))

    num_solved = len(problems_solved_by_ensemble)
    return num_solved / total_problems if total_problems > 0 else 0.0

# --- Step 2: Generate data points for regression ---
# ❗ Note: This step calls `compute_edc`, which is computationally expensive.
# To run on all 455 3-model ensembles, change `ensembles_to_process`.
ensembles_to_process = get_all_combinations_of_length(set(demo_models), 2)
total_problems_in_dataset = len(demo_dataset)

regression_data = []
for ensemble in tqdm(ensembles_to_process, desc="Generating Regression Data"):
    # Calculate PSED using our new function
    psed = calculate_psed(ensemble, solved_problems_map)

    # Calculate Baseline Accuracy (S_0)
    s0 = get_ensemble_baseline_accuracy(ensemble, solved_problems_map, total_problems_in_dataset)

    # Calculate EDC using your existing function `compute_edc`
    edc = compute_edc(set(ensemble), raw_ds, batch_size=1)

    regression_data.append({
        "ensemble": " | ".join(e.split('/')[1] for e in ensemble), # a shorter name for display
        "PSED": psed,
        "EDC": edc,
        "S_0": s0
    })

regression_df = pd.DataFrame(regression_data)
print("--- Generated Data for Regression ---")
print(regression_df)

# --- Step 3: Construct and Fit the Multiple Regression Model ---
print("\n--- 📊 Building Regression Model ---")

# We need at least two data points to run a regression
if len(regression_df) < len(regression_df.columns):
    print("Not enough data to run regression. Please use more ensembles.")
else:
    # Dependent variable (what we want to predict)
    y = regression_df['PSED']
    # Independent variables (what we use for prediction)
    X = regression_df[['EDC', 'S_0']]

    # Add a constant for the intercept term 'α'
    X = sm.add_constant(X)

    # Create and fit the Ordinary Least Squares (OLS) model
    model = sm.OLS(y, X).fit()

    # Print the detailed summary of the regression results
    print(model.summary())

    # --- Interpretation ---
    print("\n--- Model Interpretation ---")
    alpha = model.params['const']
    beta_1 = model.params['EDC']
    beta_2 = model.params['S_0']
    p_value_beta_1 = model.pvalues['EDC']

    print(f"The fitted model equation is: PSED = {alpha:.4f} + ({beta_1:.4f} * EDC) + ({beta_2:.4f} * S_0)")
    print(f"\nThe coefficient for EDC (β1) is {beta_1:.4f}.")
    print(f"The p-value for this coefficient is {p_value_beta_1:.4f}.")

    if p_value_beta_1 < 0.05:
        print("This suggests that β1 is statistically significant. After accounting for baseline accuracy (S_0), EDC has a significant relationship with PSED.")
    else:
        print("This suggests that β1 is not statistically significant. We cannot conclude that EDC has a relationship with PSED after accounting for baseline accuracy (S_0).")

[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

Generating Regression Data:   0%|          | 0/3 [00:00<?, ?it/s]


>>> Computing cmdiv for pair: (AI-MO/Kimina-Prover-Preview-Distill-7B, stoney0062/Leanabell-Prover-DS-SFT)
    ... processing batch 1/1


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...



Initial loss 9.162514686584473 (step=0 epoch=0)

final loss step=0 epoch=0 of final layer loss 9.162514686584473 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=9.162514686584473 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]


Initial loss 8.751026153564453 (step=0 epoch=0)

final loss step=0 epoch=0 of final layer loss 8.751026153564453 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=8.751026153564453 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

dist between AI-MO/Kimina-Prover-Preview-Distill-7B and stoney0062/Leanabell-Prover-DS-SFT: 0.9933087657443141
>>> Average distance (cmdiv) for pair = 0.0067
Diversity between AI-MO/Kimina-Prover-Preview-Distill-7B and stoney0062/Leanabell-Prover-DS-SFT on json with batch size 1: 0.006691234255685852

>>> Computing cmdiv for pair: (AI-MO/Kimina-Prover-Preview-Distill-7B, kfdong/STP_model_Lean)
    ... processing batch 1/1


num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]


Initial loss 8.994993209838867 (step=0 epoch=0)

final loss step=0 epoch=0 of final layer loss 8.994993209838867 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=8.994993209838867 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]


Initial loss 8.589489936828613 (step=0 epoch=0)

final loss step=0 epoch=0 of final layer loss 8.589489936828613 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=8.589489936828613 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

dist between AI-MO/Kimina-Prover-Preview-Distill-7B and kfdong/STP_model_Lean: 0.993769172992233
>>> Average distance (cmdiv) for pair = 0.0062
Diversity between AI-MO/Kimina-Prover-Preview-Distill-7B and kfdong/STP_model_Lean on json with batch size 1: 0.006230827007767004

>>> Computing cmdiv for pair: (stoney0062/Leanabell-Prover-DS-SFT, kfdong/STP_model_Lean)
    ... processing batch 1/1


num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]


Initial loss 8.4425048828125 (step=0 epoch=0)

final loss step=0 epoch=0 of final layer loss 8.4425048828125 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=8.4425048828125 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]


Initial loss 8.299046516418457 (step=0 epoch=0)

final loss step=0 epoch=0 of final layer loss 8.299046516418457 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=8.299046516418457 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

dist between stoney0062/Leanabell-Prover-DS-SFT and kfdong/STP_model_Lean: 0.9958719952873593
>>> Average distance (cmdiv) for pair = 0.0041
Diversity between stoney0062/Leanabell-Prover-DS-SFT and kfdong/STP_model_Lean on json with batch size 1: 0.004128004712640743
--- Generated Data for Regression ---
                                            ensemble  PSED       EDC       S_0
0  Kimina-Prover-Preview-Distill-7B | Leanabell-P...   0.0  0.006691  0.000000
1  Kimina-Prover-Preview-Distill-7B | STP_model_Lean   1.0  0.006231  0.166667
2           Leanabell-Prover-DS-SFT | STP_model_Lean   1.0  0.004128  0.166667

--- 📊 Building Regression Model ---
Not enough data to run regression. Please use more ensembles.
