In [None]:
import pandas as pd
# Load the three dataset splits from CSV.
val = pd.read_csv('data/val.csv')
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Stack the splits and reduce them to one row per distinct (prompt_id, prompt)
# pair. Rows missing either key are dropped and the result is ordered by both
# keys -- the same rows and order that grouping on the two keys yields, but
# without computing per-group counts that are never used.
data = (
    pd.concat([train, val, test], ignore_index=True)[['prompt_id', 'prompt']]
    .dropna()
    .drop_duplicates()
    .sort_values(['prompt_id', 'prompt'])
    .reset_index(drop=True)
)

# Later cells key embeddings by str(prompt_id); a repeated id would silently
# overwrite an earlier prompt's vector, so fail loudly here instead.
assert data['prompt_id'].is_unique, "prompt_id maps to more than one prompt"

# Sentence-Transformer embeddings (all-mpnet-base-v2)

In [23]:
import os
import json
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Embedding model and the number of prompts sent through it per encode call.
model_name = "all-mpnet-base-v2"
model = SentenceTransformer(model_name)
batch_size = 256

# Make sure the destination folder exists before anything is written.
os.makedirs("embeddings", exist_ok=True)

# Maps str(prompt_id) -> embedding vector (a Python list produced by .tolist()).
id_to_embedding = {}

# Walk over `data` in consecutive slices of `batch_size` rows.
for start in tqdm(range(0, len(data), batch_size)):
    chunk = data.iloc[start:start + batch_size]
    chunk_ids = chunk["prompt_id"].tolist()
    vectors = model.encode(chunk["prompt"].tolist(), convert_to_numpy=True).tolist()
    # Keys are stringified so they are valid JSON object keys.
    id_to_embedding.update(
        (str(pid), vec) for pid, vec in zip(chunk_ids, vectors)
    )

# Persist the whole mapping as a single JSON document.
output_path = os.path.join("embeddings", "prompt_id_to_embedding_sentence-transformer.json")
with open(output_path, "w") as f:
    json.dump(id_to_embedding, f)

print(f"Saved {len(id_to_embedding)} embeddings to {output_path}")


100%|██████████| 140/140 [00:44<00:00,  3.13it/s]


Saved 35673 embeddings to embeddings/prompt_id_to_embedding_sentence-transformer.json


In [7]:
import json
# Reload the sentence-transformer embeddings as a sanity check, reading the
# exact file the encoding cell above writes. The context manager closes the
# file handle as soon as parsing finishes.
# NOTE: this rebinds `test`, which the first cell used for the test-split DataFrame.
with open("embeddings/prompt_id_to_embedding_sentence-transformer.json", "r") as f:
    test = json.load(f)

len(test)

35673

In [8]:
# Length of the vector stored under key "0", i.e. the embedding dimensionality.
first_vector = test["0"]
len(first_vector)

768

# ModernBERT embeddings ([CLS] token, ModernBERT-large)

In [4]:
import os, json, torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# Perf knobs: permit TF32-based kernels for float32 matmuls and cuDNN ops.
torch.set_float32_matmul_precision("high")
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model_name = "answerdotai/ModernBERT-large"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModel.from_pretrained(model_name)
model = model.to(device).eval()

os.makedirs("embeddings", exist_ok=True)
id_to_embedding = {}

# `data` only needs column-style access via "prompt_id" and "prompt"; in this
# notebook it is the deduplicated DataFrame built in the first cell.
n = len(data["prompt"])
id_prompt_pairs = tqdm(
    zip(data["prompt_id"], data["prompt"]),
    total=n,
    desc="Encoding with ModernBERT [CLS]",
)

with torch.inference_mode():
    for prompt_id, prompt in id_prompt_pairs:
        # One prompt per forward pass, so the batch axis always has size 1.
        tokens = tokenizer(prompt, return_tensors="pt", truncation=True)
        tokens = {name: tensor.to(device) for name, tensor in tokens.items()}
        hidden = model(**tokens).last_hidden_state
        # Position 0 is the <s>/[CLS] token; squeeze away the batch axis.
        first_token = hidden[:, 0, :]
        id_to_embedding[str(prompt_id)] = first_token.squeeze(0).cpu().tolist()

# Write the full {prompt_id: vector} mapping as one JSON document.
out_path = os.path.join("embeddings", "modern-bert.json")
with open(out_path, "w") as f:
    json.dump(id_to_embedding, f)
print(f"Saved {len(id_to_embedding)} embeddings to {out_path}")


Encoding with ModernBERT [CLS]: 100%|██████████| 35673/35673 [05:29<00:00, 108.24it/s]


Saved 35673 embeddings to embeddings/modern-bert.json


In [5]:
import json
# Reload the saved ModernBERT embeddings as a sanity check; the context manager
# closes the file handle as soon as parsing finishes.
# NOTE: this rebinds `test`, which the first cell used for the test-split DataFrame.
with open("embeddings/modern-bert.json", "r") as f:
    test = json.load(f)

len(test)

35673

In [6]:
# Length of the vector stored under key "0", i.e. the embedding dimensionality.
first_vector = test["0"]
len(first_vector)

1024