In [None]:
!git clone https://github.com/alycialee/beyond-scale-language-data-diversity.git
%cd beyond-scale-language-data-diversity

# 2️⃣  Make sure the build tools are modern enough
# %pip install --quiet --upgrade pip setuptools wheel

%pip install pip==24.0

# 3️⃣  Editable-install *into the live kernel*  ← note the %pip magic
%pip install -e .

!pip install -U datasets
!pip install fastdtw

In [2]:
import sys, pathlib
def _prepend_to_sys_path(directory):
    """Put ``directory`` at the front of ``sys.path`` without leaving duplicates.

    Any existing copy of the entry is removed first, so running this cell more
    than once does not keep growing ``sys.path``.
    """
    entry = str(directory)
    while entry in sys.path:
        sys.path.remove(entry)
    sys.path.insert(0, entry)


# Paths are relative to the current working directory. Inserted in this order so
# that "src/diversity" ends up ahead of "src" on the import path.
_prepend_to_sys_path(pathlib.Path.cwd() / "src")
_prepend_to_sys_path(pathlib.Path.cwd() / "src" / "diversity")

In [6]:
def make_loss_fn(ignore_id):
    """Build a next-token cross-entropy loss that skips one target id.

    Parameters
    ----------
    ignore_id : int
        Target id handed to ``F.cross_entropy`` as ``ignore_index``.

    Returns
    -------
    callable
        ``loss(logits, tgt, *args, **kwargs)``; any extra positional or keyword
        arguments are accepted and ignored.
    """
    def _loss_fn(logits, tgt, *_, **__):
        # Position t predicts token t+1: drop the last prediction and the first target.
        next_token_logits = logits[:, :-1, :]
        next_tokens = tgt[:, 1:]
        vocab_size = next_token_logits.size(-1)
        return F.cross_entropy(
            next_token_logits.reshape(-1, vocab_size),
            next_tokens.reshape(-1),
            ignore_index=ignore_id,
        )
    return _loss_fn

In [7]:
import os
# ---------------------------------------------------------------------------
# Helper: one call = one embedding/LM-loss for a given solver tokenizer
# ---------------------------------------------------------------------------
def embed_with_solver_tokenizer(
    solver_model_id: str,
    raw_dataset,
    probe_model,
    max_len: int,
    epochs: int = 1,
    cpu_workers: int = os.cpu_count(),
):
    """
    Embed ``raw_dataset`` with Task2Vec after tokenizing it with one solver's tokenizer.

    The probe is deep-copied before use, so the ``probe_model`` passed in is never
    resized or otherwise modified and every call starts from the same weights.

    Parameters
    ----------
    solver_model_id : str
        Hugging Face model ID from which to load the tokenizer.
    raw_dataset : datasets.Dataset
        Dataset with a "text" column; all of its original columns are dropped
        after tokenization.
    probe_model : transformers.PreTrainedModel
        The already-constructed probe (distilgpt2 here). Left untouched.
    max_len : int
        Maximum sequence length; longer examples are truncated. Padding is to the
        longest example of each tokenization batch, not to ``max_len``.
    epochs : int, optional
        Forwarded to ``Task2Vec.embed``.
    cpu_workers : int, optional
        How many worker processes to give 🤗 Datasets for .map(). The default is
        ``os.cpu_count()`` evaluated once, when this function is defined.

    Returns
    -------
    embedding_tensor : torch.FloatTensor
        ``emb.hessian`` from Task2Vec converted to a float32 tensor.
    lm_loss : float | None
        Second value returned by ``Task2Vec.embed``.
    """
    import copy  # stdlib; imported here because the notebook's import cell does not provide it

    # ----- Tokenizer --------------------------------------------------------
    # NOTE(review): trust_remote_code=True runs code shipped with the model repo;
    # only use it with repositories you trust.
    tok = AutoTokenizer.from_pretrained(solver_model_id, trust_remote_code=True)
    if tok.pad_token is None:                       # ensure a pad token exists
        tok.add_special_tokens({'pad_token': '[PAD]'})

    # ----- Private probe ----------------------------------------------------
    # resize_token_embeddings() changes the model in place, and Task2Vec is handed
    # the model object itself. Working on a copy keeps one solver's vocabulary size
    # (and anything Task2Vec does to the weights) from leaking into the next call.
    probe = copy.deepcopy(probe_model)
    probe.resize_token_embeddings(len(tok))         # probe needs same vocab size

    # ----- Tokenize dataset -------------------------------------------------
    tok_ds = raw_dataset.map(
        lambda b: tok(
            b["text"],
            padding="longest",
            truncation=True,
            max_length=max_len,
        ),
        batched=True,
        batch_size=20,
        remove_columns=raw_dataset.column_names,
        num_proc=cpu_workers,
    )
    tok_ds.set_format(type="torch", columns=["input_ids", "attention_mask"])

    # ----- Task2Vec ---------------------------------------------------------
    task2vec = Task2Vec(
        probe,
        max_samples=1024,
        loader_opts={"batch_size": 1, "shuffle": True, "num_workers": 0},
    )
    # Pad positions are excluded from the loss via ignore_index.
    task2vec.loss_fn = make_loss_fn(tok.pad_token_id)

    emb, lm_loss = task2vec.embed(tok_ds, epochs=epochs)

    # Convert to torch tensor for downstream use
    embedding_tensor = torch.from_numpy(emb.hessian).to(dtype=torch.float32)

    # Explicitly free large objects (including the probe copy) to keep memory usage down
    del tok, tok_ds, task2vec, emb, probe
    gc.collect()
    # torch.cuda.empty_cache()  # uncomment if you are using CUDA

    return embedding_tensor, lm_loss


In [8]:
import os
import gc
import torch
import warnings
import torch.nn.functional as F
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoConfig,
)
from task2vec import Task2Vec
from itertools import combinations
from torch.utils.data import DataLoader
from functools import lru_cache
from typing import Callable, Tuple
from torch import Tensor
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean



warnings.filterwarnings("ignore", module="torch")

# ----- Global Configuration & Models -----------------------------------------
def _compose_text(example):
    """Join the non-empty informal prefix, formal statement and goal, one per line."""
    parts = (example["informal_prefix"], example["formal_statement"], example["goal"])
    return {"text": "\n".join(part for part in parts if part)}


print(">>> Loading and preparing dataset...")
raw_ds = (
    load_dataset("Tonic/MiniF2F", split="train")
    .select(range(1))  # Using a smaller subset for demonstration
    .map(_compose_text, num_proc=os.cpu_count())
)
print(">>> Dataset ready.")

print(">>> Loading probe model...")
probe_model_id = "distilbert/distilgpt2"
probe_cfg = AutoConfig.from_pretrained(probe_model_id, trust_remote_code=True)
# NOTE(review): the documented way to pick an attention backend is the
# `attn_implementation=` argument of from_pretrained(); confirm that this attribute
# assignment is honoured by the installed transformers version.
probe_cfg.attn_implementation = "sdpa"
probe_model = AutoModelForCausalLM.from_pretrained(
    probe_model_id, config=probe_cfg, trust_remote_code=True
)
probe_model.gradient_checkpointing_enable()
# The probe's context window is used as the tokenization length limit further down.
max_probe_length = probe_model.config.max_position_embeddings
print(f">>> Probe model '{probe_model_id}' loaded with max length: {max_probe_length}.")

# ----- Helper & Core Functions -----------------------------------------------

# A default “truncation to min length” function
def truncation_align(
    vec_a: Tensor,
    vec_b: Tensor,
) -> Tuple[Tensor, Tensor]:
    """Cut two tensors down to the same number of elements.

    When both inputs already hold the same number of elements they are returned
    unchanged. Otherwise both are flattened and truncated to the smaller element
    count, so the results are always 1-D tensors of equal length.

    Parameters
    ----------
    vec_a, vec_b : Tensor
        Tensors to align; normally already flattened.

    Returns
    -------
    Tuple[Tensor, Tensor]
        The aligned pair, in the same order as the arguments.
    """
    if vec_a.numel() != vec_b.numel():
        min_len = min(vec_a.numel(), vec_b.numel())
        # Flatten first: the length is counted in elements, so slicing must be too.
        # Slicing only the first dimension would leave a multi-dimensional input
        # with more than min_len elements.
        vec_a = vec_a.reshape(-1)[:min_len]
        vec_b = vec_b.reshape(-1)[:min_len]
    return vec_a, vec_b

@lru_cache(maxsize=128)
def compute_cmdiv(
    model_id1: str,
    model_id2: str,
    dataset_fingerprint: str,
    batch_size: int,
    align_fn: Callable[[Tensor, Tensor], Tuple[Tensor, Tensor]] = truncation_align,
) -> float:
    """
    Computes cmdiv for two solver tokenizers by averaging a DTW-based distance
    over batches of the module-level ``raw_ds``.

    For every batch, the text is embedded once per tokenizer with
    ``embed_with_solver_tokenizer`` (using the global ``probe_model`` and
    ``max_probe_length``). The two flattened embeddings are compared with fastdtw
    using absolute difference as the point cost; the cost is divided by the warping
    path length, mapped to a similarity ``1 / (1 + avg)``, and the batch distance is
    ``1 - similarity``.

    Results are memoised by ``lru_cache`` on the argument tuple.

    Parameters
    ----------
    model_id1, model_id2 : str
        Hugging Face model IDs whose tokenizers are compared.
    dataset_fingerprint : str
        Only part of the cache key. It does NOT select the data: batches always
        come from the global ``raw_ds``.
    batch_size : int
        Number of examples per batch drawn from ``raw_ds``.
    align_fn : callable, optional
        Accepted for backward compatibility but currently not applied; DTW compares
        the two embeddings directly even when their lengths differ.

    Returns
    -------
    float
        Mean batch distance, or 0.0 if there were no batches.
    """
    print(f"\n>>> Computing cmdiv for pair: ({model_id1}, {model_id2})")
    loader = DataLoader(raw_ds, batch_size=batch_size)
    distances = []

    for i, batch in enumerate(loader, start=1):
        print(f"    ... processing batch {i}/{len(loader)}")
        batch_ds = Dataset.from_dict(batch)

        emb1, _ = embed_with_solver_tokenizer(
            model_id1, batch_ds, probe_model, max_probe_length
        )
        emb2, _ = embed_with_solver_tokenizer(
            model_id2, batch_ds, probe_model, max_probe_length
        )

        a = emb1.flatten()
        b = emb2.flatten()

        dtw_dist, path = fastdtw(
            a.numpy(),
            b.numpy(),
            dist=lambda x, y: abs(x - y)
        )

        # Normalize by path length to get the per-step average difference
        avg_diff = dtw_dist / len(path)

        # Map the average difference to a similarity in (0, 1]
        sim_dtw = 1.0 / (1.0 + avg_diff)

        # The value logged here is the similarity, so label it as such.
        print(f"DTW similarity between {model_id1} and {model_id2}: {sim_dtw}")
        # distance
        dist = 1.0 - sim_dtw
        distances.append(dist)

    avg_distance = sum(distances) / len(distances) if distances else 0.0
    print(f">>> Average distance (cmdiv) for pair = {avg_distance:.4f}")
    return avg_distance

def compute_edc(model_ids: set, dataset: Dataset, batch_size: int) -> float:
    """
    Computes the Ensemble Diversity Coefficient (EDC) for a set of models.

    EDC is the mean of ``compute_cmdiv`` over every unordered pair of model IDs.
    The IDs are sorted first, so each pair is always requested in the same order.

    Parameters
    ----------
    model_ids : set
        Hugging Face model IDs in the ensemble.
    dataset : datasets.Dataset
        Used to build the key that ``compute_cmdiv`` is called (and cached) with.
    batch_size : int
        Forwarded to ``compute_cmdiv``.

    Returns
    -------
    float
        Mean pairwise diversity, or 0.0 when fewer than two models are given.
    """
    if len(model_ids) < 2:
        return 0.0

    dataset_name = dataset.info.builder_name
    # builder_name alone is only the loader type (e.g. "json") and is shared by
    # unrelated datasets, so a key built from it can return a stale cached result.
    # Add the dataset's content fingerprint when the object exposes one.
    content_id = getattr(dataset, "_fingerprint", None)
    dataset_fingerprint = f"{dataset_name}:{content_id}" if content_id else dataset_name

    pairwise_diversities = []
    for m_id1, m_id2 in combinations(sorted(model_ids), 2):
        diversity = compute_cmdiv(m_id1, m_id2, dataset_fingerprint, batch_size)
        print(f"Diversity between {m_id1} and {m_id2} on {dataset_name} with batch size {batch_size}: {diversity}")
        pairwise_diversities.append(diversity)

    return sum(pairwise_diversities) / len(pairwise_diversities) if pairwise_diversities else 0.0

# ----- Main Execution --------------------------------------------------------

# Solver models whose tokenizers are compared pairwise.
solver_model_ids = {
    "AI-MO/Kimina-Prover-Preview-Distill-7B",
    "deepseek-ai/DeepSeek-Prover-V2-7B",
    "deepseek-ai/DeepSeek-Prover-V1.5-RL",
}
batch_size = 8 # smaller batch size for demonstration

# Header banner: one print call, one line per argument.
print(
    "\n=====================================================================",
    ">>> Calculating Ensemble Diversity Coefficient (EDC)...",
    f">>> Models in ensemble: {solver_model_ids}",
    f">>> Batch Size: {batch_size}",
    "=====================================================================",
    sep="\n",
)

ensemble_diversity_coefficient = compute_edc(solver_model_ids, raw_ds, batch_size)

# Result banner.
print(
    "\n=====================================================================",
    f"\nFinal Ensemble Diversity Coefficient (EDC): {ensemble_diversity_coefficient:.4f}",
    "=====================================================================",
    sep="\n",
)

>>> Loading and preparing dataset...


num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

>>> Dataset ready.
>>> Loading probe model...
>>> Probe model 'distilbert/distilgpt2' loaded with max length: 1024.

>>> Calculating Ensemble Diversity Coefficient (EDC)...
>>> Models in ensemble: {'AI-MO/Kimina-Prover-Preview-Distill-7B', 'deepseek-ai/DeepSeek-Prover-V1.5-RL', 'deepseek-ai/DeepSeek-Prover-V2-7B'}
>>> Batch Size: 8

>>> Computing cmdiv for pair: (AI-MO/Kimina-Prover-Preview-Distill-7B, deepseek-ai/DeepSeek-Prover-V1.5-RL)
    ... processing batch 1/1


num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cpu


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.



Initial loss 6.510787010192871 (step=0 epoch=0)

final loss step=0 epoch=0 of final layer loss 6.510787010192871 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=6.510787010192871 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cpu


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]


Initial loss 7.263754844665527 (step=0 epoch=0)

final loss step=0 epoch=0 of final layer loss 7.263754844665527 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=7.263754844665527 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

dist between AI-MO/Kimina-Prover-Preview-Distill-7B and deepseek-ai/DeepSeek-Prover-V1.5-RL: 0.9986459228655725
>>> Average distance (cmdiv) for pair = 0.0014
Diversity between AI-MO/Kimina-Prover-Preview-Distill-7B and deepseek-ai/DeepSeek-Prover-V1.5-RL on json with batch size 8: 0.0013540771344274782

>>> Computing cmdiv for pair: (AI-MO/Kimina-Prover-Preview-Distill-7B, deepseek-ai/DeepSeek-Prover-V2-7B)
    ... processing batch 1/1


num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cpu


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]


Initial loss 6.47876501083374 (step=0 epoch=0)

final loss step=0 epoch=0 of final layer loss 6.47876501083374 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=6.47876501083374 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cpu


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]


Initial loss 7.232361793518066 (step=0 epoch=0)

final loss step=0 epoch=0 of final layer loss 7.232361793518066 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=7.232361793518066 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

dist between AI-MO/Kimina-Prover-Preview-Distill-7B and deepseek-ai/DeepSeek-Prover-V2-7B: 0.9985442788082508
>>> Average distance (cmdiv) for pair = 0.0015
Diversity between AI-MO/Kimina-Prover-Preview-Distill-7B and deepseek-ai/DeepSeek-Prover-V2-7B on json with batch size 8: 0.001455721191749193

>>> Computing cmdiv for pair: (deepseek-ai/DeepSeek-Prover-V1.5-RL, deepseek-ai/DeepSeek-Prover-V2-7B)
    ... processing batch 1/1


num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cpu


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]


Initial loss 7.204761505126953 (step=0 epoch=0)

final loss step=0 epoch=0 of final layer loss 7.204761505126953 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=7.204761505126953 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cpu


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]


Initial loss 7.177384376525879 (step=0 epoch=0)

final loss step=0 epoch=0 of final layer loss 7.177384376525879 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=7.177384376525879 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

dist between deepseek-ai/DeepSeek-Prover-V1.5-RL and deepseek-ai/DeepSeek-Prover-V2-7B: 0.9996035259991356
>>> Average distance (cmdiv) for pair = 0.0004
Diversity between deepseek-ai/DeepSeek-Prover-V1.5-RL and deepseek-ai/DeepSeek-Prover-V2-7B on json with batch size 8: 0.00039647400086439966


Final Ensemble Diversity Coefficient (EDC): 0.0011
