In [1]:
!git clone https://github.com/alycialee/beyond-scale-language-data-diversity.git
%cd beyond-scale-language-data-diversity

%pip install pip==24.0

# 3️⃣  Editable-install *into the live kernel*  ← note the %pip magic
%pip install -e .

fatal: destination path 'beyond-scale-language-data-diversity' already exists and is not an empty directory.
/content/beyond-scale-language-data-diversity
[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mObtaining file:///content/beyond-scale-language-data-diversity
  Preparing metadata (setup.py) ... [?25l[?25hdone
[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://git

In [2]:
# !pip install -U datasets
# !pip install fastdtw
# !pip install tf-keras

In [3]:
# !pip install --upgrade huggingface_hub
# !pip install --upgrade datasets
# import os
# os.system("huggingface-cli login --token TOKEN")

In [4]:
import sys, pathlib
sys.path.insert(0, str(pathlib.Path.cwd() / "src"))
sys.path.insert(0, str(pathlib.Path.cwd() / "src" / "diversity"))

In [5]:
def make_loss_fn(ignore_id):
    """Factory function to create a cross-entropy loss function."""
    def _loss_fn(logits, tgt, *_, **__):
        logits = logits[:, :-1, :].contiguous()
        tgt = tgt[:, 1:].contiguous()
        return F.cross_entropy(logits.view(-1, logits.size(-1)), tgt.view(-1), ignore_index=ignore_id)
    return _loss_fn

In [6]:
import os
from functools import lru_cache
# ---------------------------------------------------------------------------
# Helper: one call = one embedding/LM-loss for a given solver tokenizer
# ---------------------------------------------------------------------------

embeddings = {}

@lru_cache(maxsize=None)
def embed_with_solver_tokenizer(
    solver_model_id: str,
    raw_dataset,
    probe_model,
    max_len: int,
    epochs: int = 1,
    cpu_workers: int = os.cpu_count(),
):
    # ----- Tokenizer --------------------------------------------------------
    tok = AutoTokenizer.from_pretrained(solver_model_id, trust_remote_code=True)
    if tok.pad_token is None:
        tok.add_special_tokens({"pad_token": "[PAD]"})

    probe_model.resize_token_embeddings(len(tok))

    # ----- Tokenize + **pad to max_len** ------------------------------------
    tok_ds = raw_dataset.map(
        lambda batch: tok(
            batch["text"],
            padding="max_length",        # <‑‑ THIS LINE is the change
            truncation=True,
            max_length=max_len,
            return_attention_mask=True,
        ),
        batched=True,
        batch_size=20,
        remove_columns=raw_dataset.column_names,
        num_proc=cpu_workers,
    )
    # After the map every row has exactly `max_len` tokens.
    tok_ds.set_format(type="torch", columns=["input_ids", "attention_mask"])

    # ----- Task2Vec ---------------------------------------------------------
    task2vec = Task2Vec(
        probe_model,
        max_samples=1024,
        loader_opts={"batch_size": 3, "shuffle": True, "num_workers": 8},
    )
    task2vec.loss_fn = make_loss_fn(tok.pad_token_id)

    emb, lm_loss = task2vec.embed(tok_ds, epochs=epochs)

    embedding_tensor = torch.from_numpy(emb.hessian).to(dtype=torch.float32)

    # clean‑up
    del tok, tok_ds, task2vec, emb
    gc.collect()
    # torch.cuda.empty_cache()

    embeddings[solver_model_id] = embedding_tensor
    return embedding_tensor, lm_loss

In [7]:
import os
import gc
import torch
import warnings
import torch.nn.functional as F
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoConfig,
)
from task2vec import Task2Vec
from itertools import combinations
from torch.utils.data import DataLoader
from functools import lru_cache
from typing import Callable, Tuple
from torch import Tensor
# from fastdtw import fastdtw
# from scipy.spatial.distance import euclidean



warnings.filterwarnings("ignore", module="torch")

# ----- Global Configuration & Models -----------------------------------------
print(">>> Loading and preparing dataset...")
# raw_ds = load_dataset("Tonic/MiniF2F", split="train")
raw_ds = load_dataset("AI-MO/minif2f_test", split="train")
# raw_ds = raw_ds.select(range(40)) # Using a smaller subset for demonstration
raw_ds = raw_ds.map(
    lambda ex: {"text": "\n".join(p for p in (ex["formal_statement"]) if p)},
    num_proc=os.cpu_count(),
)
print(">>> Dataset ready.")


# Define the device to use (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"--- Using device: {device} ---")

print(">>> Loading probe model...")
# probe_model_id = "distilbert/distilgpt2"
probe_model_id = "Saisam/gpt-neo-math-small"
probe_cfg = AutoConfig.from_pretrained(probe_model_id, trust_remote_code=True)
probe_cfg.attn_implementation = "sdpa"
probe_model = AutoModelForCausalLM.from_pretrained(
    probe_model_id,
    config=probe_cfg,
    trust_remote_code=True,
)

# Move the model to the defined device (GPU)
probe_model.to(device)

max_probe_length = probe_model.config.max_position_embeddings
print(f">>> Probe model '{probe_model_id}' loaded with max length: {max_probe_length}.")

# ----- Helper & Core Functions -----------------------------------------------
def _compute_cmdiv_core(
    model_id1: str,
    model_id2: str,
    dataset_fingerprint: str,
    batch_size: int
) -> float:
    """
    Computes cmdiv by averaging embedding distances over batches.
    The `align_fn` is called on each pair of flattened embeddings
    and should return the two tensors to compare.
    """
    # print(f"\n>>> Computing cmdiv for pair: ({model_id1}, {model_id2})")
    loader = DataLoader(raw_ds, batch_size=batch_size)
    distances = []

    for i, batch in enumerate(loader, start=1):
        # print(f"    ... processing batch {i}/{len(loader)}")
        batch_ds = Dataset.from_dict(batch)

        emb1, _ = embed_with_solver_tokenizer(
            model_id1, batch_ds, probe_model, max_probe_length
        )
        emb2, _ = embed_with_solver_tokenizer(
            model_id2, batch_ds, probe_model, max_probe_length
        )

        def flatten_and_pad(a: torch.Tensor, b: torch.Tensor):
            fa = a.flatten()
            fb = b.flatten()
            max_len = max(fa.numel(), fb.numel())
            if fa.numel() < max_len:
                pad = torch.zeros(max_len - fa.numel(), device=fa.device, dtype=fa.dtype)
                fa = torch.cat([fa, pad], dim=0)
            if fb.numel() < max_len:
                pad = torch.zeros(max_len - fb.numel(), device=fb.device, dtype=fb.dtype)
                fb = torch.cat([fb, pad], dim=0)
            return fa, fb

        # replace DTW block with:
        fa, fb = flatten_and_pad(emb1, emb2)
        cos_sim = F.cosine_similarity(fa.unsqueeze(0), fb.unsqueeze(0), dim=1).item()
        # Normalize to [0,1]:
        sim = (cos_sim + 1.0) / 2.0
        dist = 1.0 - sim
        distances.append(dist)


    avg_distance = sum(distances) / len(distances) if distances else 0.0
    # print(f">>> Average distance (cmdiv) for pair = {avg_distance:.4f}")
    return avg_distance

@lru_cache(maxsize=None)
def compute_cmdiv(
    model_id1: str,
    model_id2: str,
    dataset_fingerprint: str,
    batch_size: int
) -> float:
    # Canonicalize model ID order
    m1, m2 = sorted([model_id1, model_id2])

    # Now use m1 and m2 consistently for computation
    return _compute_cmdiv_core(m1, m2, dataset_fingerprint, batch_size)

pairwise_cmdiv = {}

def compute_edc(model_ids: set, dataset: Dataset, batch_size: int) -> float:
    """
    Computes the Ensemble Diversity Coefficient (EDC) for a set of models.
    """
    if len(model_ids) < 2:
        return 0.0

    dataset_fingerprint = dataset.info.builder_name
    pairwise_diversities = []

    for m_id1, m_id2 in combinations(list(model_ids), 2):
        diversity = compute_cmdiv(m_id1, m_id2, dataset_fingerprint, batch_size)
        pairwise_cmdiv[(m_id1, m_id2)] = diversity
        print(f"Diversity between {m_id1} and {m_id2} on {dataset_fingerprint} with batch size {batch_size}: {diversity}")
        pairwise_diversities.append(diversity)

    return sum(pairwise_diversities) / len(pairwise_diversities) if pairwise_diversities else 0.0

>>> Loading and preparing dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


>>> Dataset ready.
--- Using device: cuda ---
>>> Loading probe model...
>>> Probe model 'Saisam/gpt-neo-math-small' loaded with max length: 2048.


In [8]:
from itertools import combinations
from typing import List, Tuple
from tqdm import tqdm

test_model_ids = [
    'AI-MO/Kimina-Prover-Preview-Distill-7B', 'ByteDance-Seed/BFS-Prover', 'Goedel-LM/Goedel-Prover-SFT', 'deepseek-ai/DeepSeek-Prover-V1', 'deepseek-ai/DeepSeek-Prover-V1.5-RL', 'deepseek-ai/DeepSeek-Prover-V2-7B', 'kfdong/STP_model_Lean', 'stoney0062/Leanabell-Prover-DS-SFT', 'wellecks/llmstep-mathlib4-pythia2.8b'
]

def get_all_combinations_of_length(s, length: int) -> List[Tuple]:
    return list(combinations(s, length))

solver_model_ensembles_3 = get_all_combinations_of_length(test_model_ids, 3)

batch_size = 128

edc_i = []
original_len = len(edc_i)

edc_i += [0] * (len(solver_model_ensembles_3) - len(edc_i))

for i, ensemble in tqdm(enumerate(solver_model_ensembles_3), total=len(solver_model_ensembles_3)):
    if edc_i[i]:
        continue
    edc_i[i] = compute_edc(ensemble, raw_ds, batch_size)
    with open(f"edc_i_{i}.txt", "w") as file:
        file.write(",".join([str(j) for j in edc_i]))

  0%|          | 0/84 [00:00<?, ?it/s]The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Map (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/43 [00:00<?, ?it/s]


Initial loss 4.9388837814331055 (step=0 epoch=0)

final loss step=42 epoch=0 of final layer loss 3.6737613677978516 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=3.6737613677978516 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/43 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

Map (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/43 [00:00<?, ?it/s]


Initial loss 3.398451805114746 (step=0 epoch=0)

final loss step=42 epoch=0 of final layer loss 2.8933053016662598 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=2.8933053016662598 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/43 [00:00<?, ?it/s]

Map (num_proc=12):   0%|          | 0/116 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/39 [00:00<?, ?it/s]


Initial loss 2.6547961235046387 (step=0 epoch=0)

final loss step=38 epoch=0 of final layer loss 2.527097702026367 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=2.527097702026367 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/39 [00:00<?, ?it/s]

Map (num_proc=12):   0%|          | 0/116 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/39 [00:00<?, ?it/s]


Initial loss 2.278129816055298 (step=0 epoch=0)

final loss step=38 epoch=0 of final layer loss 2.2333247661590576 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=2.2333247661590576 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/39 [00:00<?, ?it/s]

Diversity between AI-MO/Kimina-Prover-Preview-Distill-7B and ByteDance-Seed/BFS-Prover on parquet with batch size 128: 0.005868181586265564


Map (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/43 [00:00<?, ?it/s]


Initial loss 2.1257376670837402 (step=0 epoch=0)

final loss step=42 epoch=0 of final layer loss 2.1762919425964355 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=2.1762919425964355 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/43 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/369 [00:00<?, ?B/s]

Map (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/43 [00:00<?, ?it/s]


Initial loss 10.449458122253418 (step=0 epoch=0)

final loss step=42 epoch=0 of final layer loss 7.470125198364258 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=7.470125198364258 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/43 [00:00<?, ?it/s]

Map (num_proc=12):   0%|          | 0/116 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/39 [00:00<?, ?it/s]


Initial loss 1.926059603691101 (step=0 epoch=0)

final loss step=38 epoch=0 of final layer loss 1.9188176393508911 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=1.9188176393508911 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/39 [00:00<?, ?it/s]

Map (num_proc=12):   0%|          | 0/116 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/39 [00:00<?, ?it/s]


Initial loss 6.154833793640137 (step=0 epoch=0)

final loss step=38 epoch=0 of final layer loss 4.17793607711792 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=4.17793607711792 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/39 [00:00<?, ?it/s]

Diversity between AI-MO/Kimina-Prover-Preview-Distill-7B and Goedel-LM/Goedel-Prover-SFT on parquet with batch size 128: 0.4999999881414796


Map (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/43 [00:00<?, ?it/s]


Initial loss 1.9129749536514282 (step=0 epoch=0)

final loss step=42 epoch=0 of final layer loss 1.9857537746429443 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=1.9857537746429443 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/43 [00:00<?, ?it/s]

Map (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/43 [00:00<?, ?it/s]


Initial loss 3.817847967147827 (step=0 epoch=0)

final loss step=42 epoch=0 of final layer loss 3.4508907794952393 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=3.4508907794952393 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/43 [00:40<?, ?it/s]

Map (num_proc=12):   0%|          | 0/116 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/39 [00:00<?, ?it/s]


Initial loss 1.6992532014846802 (step=0 epoch=0)

final loss step=38 epoch=0 of final layer loss 1.6776899099349976 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=1.6776899099349976 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/39 [00:00<?, ?it/s]

Map (num_proc=12):   0%|          | 0/116 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/39 [00:00<?, ?it/s]


Initial loss 2.697401762008667 (step=0 epoch=0)

final loss step=38 epoch=0 of final layer loss 2.5629255771636963 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=2.5629255771636963 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/39 [00:00<?, ?it/s]

  1%|          | 1/84 [29:08<40:19:21, 1748.93s/it]

Diversity between ByteDance-Seed/BFS-Prover and Goedel-LM/Goedel-Prover-SFT on parquet with batch size 128: 0.4999999831036992
Diversity between AI-MO/Kimina-Prover-Preview-Distill-7B and ByteDance-Seed/BFS-Prover on parquet with batch size 128: 0.005868181586265564


Map (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/43 [00:00<?, ?it/s]


Initial loss 1.717208981513977 (step=0 epoch=0)

final loss step=42 epoch=0 of final layer loss 1.8251153230667114 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=1.8251153230667114 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/43 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/43 [00:00<?, ?it/s]


Initial loss 2.4434590339660645 (step=0 epoch=0)

final loss step=42 epoch=0 of final layer loss 2.6533453464508057 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=2.6533453464508057 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/43 [00:00<?, ?it/s]

Map (num_proc=12):   0%|          | 0/116 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/39 [00:00<?, ?it/s]


Initial loss 1.543260097503662 (step=0 epoch=0)

final loss step=38 epoch=0 of final layer loss 1.491921305656433 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=1.491921305656433 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/39 [00:00<?, ?it/s]

Map (num_proc=12):   0%|          | 0/116 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/39 [00:00<?, ?it/s]


Initial loss 2.1663646697998047 (step=0 epoch=0)

final loss step=38 epoch=0 of final layer loss 2.153378963470459 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=2.153378963470459 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/39 [00:40<?, ?it/s]

Diversity between AI-MO/Kimina-Prover-Preview-Distill-7B and deepseek-ai/DeepSeek-Prover-V1 on parquet with batch size 128: 0.4999999847412919


Map (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/43 [00:00<?, ?it/s]


Initial loss 1.5653027296066284 (step=0 epoch=0)

final loss step=42 epoch=0 of final layer loss 1.69529128074646 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=1.69529128074646 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/43 [00:00<?, ?it/s]

Map (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/43 [00:00<?, ?it/s]


Initial loss 2.0608060359954834 (step=0 epoch=0)

final loss step=42 epoch=0 of final layer loss 2.265439033508301 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=2.265439033508301 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/43 [00:00<?, ?it/s]

Map (num_proc=12):   0%|          | 0/116 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/39 [00:00<?, ?it/s]


Initial loss 1.411779522895813 (step=0 epoch=0)

final loss step=38 epoch=0 of final layer loss 1.34615957736969 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=1.34615957736969 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/39 [00:00<?, ?it/s]

Map (num_proc=12):   0%|          | 0/116 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/39 [00:00<?, ?it/s]


Initial loss 1.9054275751113892 (step=0 epoch=0)

final loss step=38 epoch=0 of final layer loss 1.926933765411377 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=1.926933765411377 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/39 [00:40<?, ?it/s]

  2%|▏         | 2/84 [48:26<31:54:39, 1400.96s/it]

Diversity between ByteDance-Seed/BFS-Prover and deepseek-ai/DeepSeek-Prover-V1 on parquet with batch size 128: 0.49999999134323403
Diversity between AI-MO/Kimina-Prover-Preview-Distill-7B and ByteDance-Seed/BFS-Prover on parquet with batch size 128: 0.005868181586265564


Map (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/43 [00:00<?, ?it/s]


Initial loss 1.4468777179718018 (step=0 epoch=0)

final loss step=42 epoch=0 of final layer loss 1.5975803136825562 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=1.5975803136825562 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/43 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/43 [00:00<?, ?it/s]


Initial loss 1.8613932132720947 (step=0 epoch=0)

final loss step=42 epoch=0 of final layer loss 2.035451889038086 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=2.035451889038086 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/43 [00:00<?, ?it/s]

Map (num_proc=12):   0%|          | 0/116 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/39 [00:00<?, ?it/s]


Initial loss 1.3071014881134033 (step=0 epoch=0)

final loss step=38 epoch=0 of final layer loss 1.2431319952011108 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=1.2431319952011108 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/39 [00:00<?, ?it/s]

Map (num_proc=12):   0%|          | 0/116 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/39 [00:00<?, ?it/s]


Initial loss 1.772640347480774 (step=0 epoch=0)

final loss step=38 epoch=0 of final layer loss 1.7992982864379883 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=1.7992982864379883 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/39 [00:40<?, ?it/s]

Diversity between AI-MO/Kimina-Prover-Preview-Distill-7B and deepseek-ai/DeepSeek-Prover-V1.5-RL on parquet with batch size 128: 0.49999998187983463


Map (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/43 [00:00<?, ?it/s]


Initial loss 1.3576395511627197 (step=0 epoch=0)

final loss step=42 epoch=0 of final layer loss 1.5310133695602417 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=1.5310133695602417 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/43 [00:00<?, ?it/s]

Map (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/43 [00:00<?, ?it/s]


Initial loss 1.73804771900177 (step=0 epoch=0)

final loss step=42 epoch=0 of final layer loss 1.8916679620742798 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=1.8916679620742798 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/43 [00:40<?, ?it/s]

Map (num_proc=12):   0%|          | 0/116 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/39 [00:00<?, ?it/s]


Initial loss 1.2319092750549316 (step=0 epoch=0)

final loss step=38 epoch=0 of final layer loss 1.1709685325622559 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=1.1709685325622559 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/39 [00:00<?, ?it/s]

Map (num_proc=12):   0%|          | 0/116 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/39 [00:00<?, ?it/s]


Initial loss 1.6693364381790161 (step=0 epoch=0)

final loss step=38 epoch=0 of final layer loss 1.7008124589920044 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=1.7008124589920044 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/39 [00:40<?, ?it/s]

  4%|▎         | 3/84 [1:07:43<29:01:12, 1289.79s/it]

Diversity between ByteDance-Seed/BFS-Prover and deepseek-ai/DeepSeek-Prover-V1.5-RL on parquet with batch size 128: 0.49999995018216303
Diversity between AI-MO/Kimina-Prover-Preview-Distill-7B and ByteDance-Seed/BFS-Prover on parquet with batch size 128: 0.005868181586265564


Map (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/43 [00:00<?, ?it/s]


Initial loss 1.2958920001983643 (step=0 epoch=0)

final loss step=42 epoch=0 of final layer loss 1.4882732629776 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=1.4882732629776 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/43 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map (num_proc=12):   0%|          | 0/128 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/43 [00:00<?, ?it/s]


Initial loss 1.6497650146484375 (step=0 epoch=0)

final loss step=42 epoch=0 of final layer loss 1.7828315496444702 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=1.7828315496444702 (after fine tune, if not done it will be None)


Iteration:   0%|          | 0/43 [00:00<?, ?it/s]

Map (num_proc=12):   0%|          | 0/116 [00:00<?, ? examples/s]

self.classifier_opts={}
MODEL DEVICE:  cuda:0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/39 [00:00<?, ?it/s]


Initial loss 1.180362582206726 (step=0 epoch=0)

final loss step=38 epoch=0 of final layer loss 1.121211051940918 (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)
loss=1.121211051940918 (after fine tune, if not done it will be None)


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7dbfd1141c60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1628, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.12/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
          ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/popen_fork.py", line 40, in wait
    if not wait([self.sentinel], timeout):
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/connection.py", line 1136, in wait
    ready = selector.select(timeout)
            ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/selectors.py", line 415, in select
    fd_event_list = self._selector.poll(timeout)
    

RuntimeError: DataLoader worker (pid(s) 33305, 33306, 33307, 33308, 33309, 33310, 33311, 33312) exited unexpectedly

In [9]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [10]:
with open(f"edc_i.txt", "w") as file:
        file.write(",".join([str(x) for x in edc_i]))

In [11]:
import json

data_str_keys = {str(k): v for k, v in pairwise_cmdiv.items()}

with open("output_pair_cmdiv.json", "w") as file:
    json.dump(data_str_keys, file, indent=4)

In [13]:
import json
import torch  # or tensorflow, doesn’t matter if your tensors have .tolist()

# assuming: embeddings = {"word1": tensor(...), "word2": tensor(...)}

serializable = {k: v.tolist() for k, v in embeddings.items()}

with open("output_embeddings.json", "w") as f:
    json.dump(serializable, f, indent=4)


In [14]:
# Download
from google.colab import files
files.download('output_pair_cmdiv.json')
files.download('output_embeddings.json')
files.download('edc_i.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>