# Extract Data

In [None]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from os.path import exists

def infer_column_types(df):
    """Tighten the dtypes of every column of `df` in place.

    For each column, the distinct non-missing values decide the conversion:

    * only the strings "True"/"False"/"0"/"1"  -> ``bool``
      (missing entries become ``False``, since ``bool`` cannot hold NaN);
    * every value parses as a number            -> numeric, downcast to the
      smallest integer type when possible;
    * fewer than 10% distinct values per row    -> ``category``.

    Columns holding unhashable values (e.g. lists or dicts) are first
    serialized to JSON strings. Columns with no non-missing value at all are
    left untouched.

    Args:
        df: DataFrame to modify in place.

    Returns:
        None.
    """
    for col in df.columns:
        try:
            unique_values = df[col].dropna().unique()
        except TypeError:
            # unique() hashes the values; unhashable cells are serialized to
            # JSON text so the column can be analysed like any string column.
            df[col] = df[col].apply(json.dumps)
            unique_values = df[col].dropna().unique()

        if len(unique_values) == 0:
            # An empty set is a subset of anything, so without this guard an
            # all-missing column would be rewritten as all-False booleans.
            continue

        if set(unique_values).issubset({"True", "False", "0", "1"}):
            df[col] = df[col].map(lambda x: x in ("True", "1")).astype("bool")
        elif np.all(~pd.isna(pd.to_numeric(unique_values, errors="coerce"))):
            df[col] = pd.to_numeric(df[col], errors="coerce", downcast="integer")
        elif df[col].nunique() / len(df) < 0.1:
            df[col] = df[col].astype("string").astype("category")

import re

os.makedirs(f"helm_tables", exist_ok=True)
BENCHMARKS = ["classic", "thaiexam", "mmlu", "lite"]


def lo(path):
    """Parse and return the JSON document stored at `path`."""
    # Context manager: thousands of run files are read below, so the handle
    # must be closed promptly instead of waiting for garbage collection.
    with open(path, "r") as f:
        return json.load(f)


def _release_sort_key(name):
    """Sort key that compares the digit runs of `name` numerically."""
    # re.split with a capturing group alternates text / digit chunks, so keys
    # of different names always compare str-with-str and int-with-int.
    return [int(tok) if tok.isdigit() else tok for tok in re.split(r"(\d+)", name)]


# Flatten the task -> metric config into a single row whose columns are the
# dotted "<benchmark>.<scenario>" names used for the lookup further down.
task2metric = lo("task2metric.json")
task2metric = pd.json_normalize(task2metric)

# Collect the run directories listed by the latest release of each benchmark.
all_paths = []
for benchmark in BENCHMARKS:
    dir_path = f"helm_jsons/{benchmark}/releases"
    assert exists(dir_path)
    # Release folders are presumably version strings (e.g. v1.9.0, v1.10.0);
    # a plain lexicographic sort would rank v1.9.0 above v1.10.0.
    latest_release = max(os.listdir(dir_path), key=_release_sort_key)
    folder_dict = lo(f"{dir_path}/{latest_release}/runs_to_run_suites.json")
    all_paths += [f"helm_jsons/{benchmark}/runs/{s}/{r}" for r, s in folder_dict.items()]

# Keep only runs that ship all three files, then load them file-type by file-type.
files = ["display_requests.json", "display_predictions.json", "run_spec.json"]
all_paths = [p for p in tqdm(all_paths) if all(exists(f"{p}/{f}") for f in files)]
all_lists = [[lo(f"{p}/{f}") for p in tqdm(all_paths)] for f in files]

results = []
for d_requests, d_predictions, run_specs, paths in tqdm(zip(*all_lists, all_paths), total=len(all_lists[0])):
    d_requests = pd.json_normalize(d_requests)
    d_predictions = pd.json_normalize(d_predictions)
    run_specs = pd.json_normalize(run_specs)
    benchmark = paths.split("/")[1]
    run_specs["benchmark"] = benchmark
    # Repeat the run spec so it has one row per prediction.
    run_specs = run_specs.loc[run_specs.index.repeat(d_predictions.shape[0])].reset_index(drop=True)
    # Columns present in both tables are taken from the predictions table.
    overlap_column = d_predictions.columns.intersection(d_requests.columns)
    d_requests = d_requests.drop(columns=overlap_column)
    # NOTE(review): the tables are glued side by side by row position; this
    # assumes requests and predictions are stored in the same order - confirm.
    result = pd.concat([d_requests, d_predictions, run_specs], axis=1)
    # The scenario is the part of the run name before the first ':' or ','.
    result["scenario"] = result['name'].str.split(r'[:,]', n=1, expand=True)[0]
    result["scenario"] = result["scenario"].astype("category")
    assert result["scenario"].nunique() == 1
    metric_name = task2metric[f"{benchmark}.{result['scenario'].iloc[0]}"].iloc[0]

    # Default to missing so the column always exists, even when none of the
    # candidate metrics is present in this run.
    result["dicho_score"] = np.nan
    if isinstance(metric_name, list):
        # Several candidate metrics: use the first one that has any value.
        for metric_name_ in metric_name:
            stat_col = f"stats.{metric_name_}"
            if stat_col in result.columns and not result[stat_col].isna().all():
                result["dicho_score"] = result[stat_col]
                break
    elif f"stats.{metric_name}" in result.columns:
        result["dicho_score"] = result[f"stats.{metric_name}"]

    result = result[["request.model", "request.prompt", "scenario", "dicho_score"]]
    results.append(result)

results = pd.concat(results, axis=0, join='outer')
print("finished create results dataframe")
infer_column_types(results)
results.reset_index(drop=True, inplace=True)
for col in results.columns:
    column = results[col]
    is_categorical = isinstance(column.dtype, pd.CategoricalDtype)
    # Series.isna works for every dtype; np.isnan raises on string/object columns.
    if not is_categorical and column.isna().all():
        results = results.drop(columns=col)
    elif column.dtype == "float64" and np.nanmax(column) < 65500 and np.nanmin(column) > -65500:
        # Values within the float16 range are stored in half precision to save memory.
        results[col] = column.astype("float16")
print("Started saving results")
results.to_pickle("helm_tables/responses.pkl")


In [None]:
# Compact the table: the three text columns become categoricals and the score
# is stored in half precision.
for text_column in ("request.model", "request.prompt", "scenario"):
    results[text_column] = results[text_column].astype("category")
results["dicho_score"] = results["dicho_score"].astype("float16")

In [None]:
# Display every row flagged as a duplicate. `dup_keys` is the boolean mask
# built with `results.duplicated(keep=False)` in the following cell, so that
# cell has to be executed before this one.
results[dup_keys]

In [8]:
# Boolean mask of rows that are exact duplicates across all columns;
# keep=False flags every member of a duplicate set, not only the repeats.
dup_keys = results.duplicated(keep=False)
# Number of rows that take part in some duplicate set.
print(dup_keys.sum())


6512249


In [41]:
# First column ("request.model") of the first duplicated row.
results[dup_keys].iloc[0, 0]

'AlephAlpha/luminous-base'

In [45]:
# All rows, for any model, whose prompt equals the prompt (second column) of
# the first duplicated row.
df = results[results["request.prompt"] == results[dup_keys].iloc[0, 1]]
df.head()

Unnamed: 0,request.model,request.prompt,scenario,dicho_score
0,AlephAlpha/luminous-base,Passage: Mice are afraid of cats.\nSheep are a...,babi_qa,1.0
610,AlephAlpha/luminous-base,Passage: Mice are afraid of cats.\nSheep are a...,babi_qa,1.0
0,AlephAlpha/luminous-extended,Passage: Mice are afraid of cats.\nSheep are a...,babi_qa,0.0
610,AlephAlpha/luminous-extended,Passage: Mice are afraid of cats.\nSheep are a...,babi_qa,0.0
0,AlephAlpha/luminous-supreme,Passage: Mice are afraid of cats.\nSheep are a...,babi_qa,0.0


In [46]:
# Narrow the selection further to the model of the first duplicated row.
df = df[df["request.model"] == results[dup_keys].iloc[0, 0]]

In [47]:
# Preview the rows left after filtering by prompt and model.
df.head()

Unnamed: 0,request.model,request.prompt,scenario,dicho_score
0,AlephAlpha/luminous-base,Passage: Mice are afraid of cats.\nSheep are a...,babi_qa,1.0
610,AlephAlpha/luminous-base,Passage: Mice are afraid of cats.\nSheep are a...,babi_qa,1.0


In [49]:
from pprint import pprint
# Pretty-print the full prompt text (second column) of the first duplicated row.
pprint(results[dup_keys].iloc[0, 1])

('Passage: Mice are afraid of cats.\n'
 'Sheep are afraid of cats.\n'
 'Emily is a sheep.\n'
 'Winona is a mouse.\n'
 'Wolves are afraid of cats.\n'
 'Gertrude is a wolf.\n'
 'Jessica is a wolf.\n'
 'Cats are afraid of mice.\n'
 'Question: What is winona afraid of?\n'
 'Answer: cat\n'
 '\n'
 'Passage: Wolves are afraid of sheep.\n'
 'Sheep are afraid of mice.\n'
 'Winona is a wolf.\n'
 'Cats are afraid of mice.\n'
 'Mice are afraid of sheep.\n'
 'Jessica is a sheep.\n'
 'Emily is a cat.\n'
 'Gertrude is a cat.\n'
 'Question: What is gertrude afraid of?\n'
 'Answer: mouse\n'
 '\n'
 'Passage: Wolves are afraid of cats.\n'
 'Cats are afraid of sheep.\n'
 'Mice are afraid of cats.\n'
 'Gertrude is a cat.\n'
 'Winona is a mouse.\n'
 'Sheep are afraid of cats.\n'
 'Jessica is a wolf.\n'
 'Emily is a cat.\n'
 'Question: What is emily afraid of?\n'
 'Answer: sheep\n'
 '\n'
 'Passage: Wolves are afraid of mice.\n'
 'Cats are afraid of wolves.\n'
 'Winona is a wolf.\n'
 'Emily is a wolf.\n'
 'Sh

# Compute Perplexity

In [None]:
from vllm import LLM, SamplingParams
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
import os


def find_common_prefix(strings):
    """Find the longest common prefix shared by all strings in a group.

    Args:
        strings: Sequence of strings.

    Returns:
        The longest string that every element starts with; "" when the
        sequence is empty or the strings share no leading characters.
    """
    if not strings:
        return ""
    # os.path.commonprefix compares character by character (it is not
    # path-aware), which is exactly the contract needed here. It runs in
    # linear time, whereas trimming one character per step re-slices the
    # prefix each time and is quadratic on long prompts.
    return os.path.commonprefix(strings)

def detect_subgroups(strings, threshold=0.6):
    """
    Partition strings into clusters of neighbours (in sorted order) that share a long prefix.

    The strings are sorted and scanned once. Each cluster keeps a running
    prefix, initially its first string. A candidate joins the current cluster
    when the prefix it shares with the running prefix is at least `threshold`
    times the length of the shorter of the two; the running prefix then
    shrinks to that shared prefix. Otherwise the candidate starts a new
    cluster.

    Returns a list of clusters (each a list of strings); [] for empty input.
    """
    if not strings:
        return []

    head, *tail = sorted(strings)
    groups = []
    members = [head]
    running_prefix = head

    for candidate in tail:
        shared = find_common_prefix([running_prefix, candidate])
        shorter = min(len(running_prefix), len(candidate))
        # When either side is empty the similarity is defined as 0.
        similarity = len(shared) / shorter if shorter > 0 else 0

        if similarity < threshold:
            # Too dissimilar: close the current cluster and start a new one.
            groups.append(members)
            members = [candidate]
            running_prefix = candidate
            continue

        members.append(candidate)
        running_prefix = shared

    groups.append(members)
    return groups

def create_batches(prompts, token_lengths, max_tokens_per_batch):
    """
    Greedily pack prompts, in the given order, into consecutive batches under a token budget.

    A batch is closed as soon as adding the next prompt would push its token
    total above max_tokens_per_batch. A prompt whose own token count exceeds
    the budget is not dropped: it ends up in a batch of its own.

    Returns a list of batches, each a list of prompts.
    """
    batches = []
    pending, pending_tokens = [], 0

    for prompt, token_count in tqdm(zip(prompts, token_lengths), total=len(prompts)):
        over_budget = pending_tokens + token_count > max_tokens_per_batch
        if pending and over_budget:
            batches.append(pending)
            pending, pending_tokens = [], 0

        pending.append(prompt)
        pending_tokens += token_count

    # Flush whatever is left after the last prompt.
    if pending:
        batches.append(pending)
    return batches

model_name = "meta-llama/Llama-3.1-8B-Instruct"
model_nickname = model_name.split("/")[-1]
perplexity_col = f"perplexity_{model_nickname}"
checkpoint = "results_perplexity_forthattempt.pkl"
batch_count_path = checkpoint + ".batch_count"
max_token_length = 2048
# In-code equivalent of `export CUDA_VISIBLE_DEVICES=...`; the number of ids
# listed here is also used as the tensor-parallel size below.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
vllm_model = LLM(
    model_name,
    gpu_memory_utilization=0.9,
    enable_chunked_prefill=False,
    enforce_eager=True,
    dtype=torch.float16,
    swap_space=32,
    max_num_seqs=128,
    tensor_parallel_size=len(os.environ["CUDA_VISIBLE_DEVICES"].split(",")),
)
tokenizer = vllm_model.get_tokenizer()
# Only one token is generated (max_tokens=1); the run exists for the prompt
# log-probabilities that are read from `output.prompt_logprobs` below.
sampling_params = SamplingParams(n=1, temperature=1, max_tokens=1, prompt_logprobs=0)

if os.path.exists(checkpoint):
    # Resume: reload the table and the index of the first batch still to run.
    results = pd.read_pickle(checkpoint)
    with open(batch_count_path) as f:
        batch_start = int(f.read())
else:
    results = pd.read_pickle("../gather_helm_data/helm_tables/responses.pkl")
    # For every group, find where the shared few-shot prefix ends in each
    # prompt (character offset); perplexity is later computed from there on.
    for group in tqdm(results["groups"].unique()):
        mask = results["groups"] == group
        group_prompts = results.loc[mask, "request.prompt"]
        # Preserve order while removing duplicates
        unique_example_strings = list(dict.fromkeys(group_prompts.to_list()))

        subgroups = detect_subgroups(unique_example_strings, threshold=0.8)
        mapping_i = {}
        for subgroup in subgroups:
            if len(subgroup) == 1:
                # Singleton subgroup: no common prefix exists, so don't strip anything.
                mapping_i[subgroup[0]] = 0
            else:
                common_prefix = find_common_prefix(subgroup)
                for s in subgroup:
                    mapping_i[s] = len(common_prefix)

        results.loc[mask, "question_start_index"] = group_prompts.map(mapping_i)

    results[perplexity_col] = np.nan
    results["question_start_index"] = results["question_start_index"].astype(int)
    batch_start = 0

    # Token length of every distinct prompt (without special tokens).
    unique_prompts = list(results["request.prompt"].unique())
    encoded = tokenizer(unique_prompts, add_special_tokens=False)
    lengths = [len(ids) for ids in encoded["input_ids"]]
    unique_prompt_to_length = dict(zip(unique_prompts, lengths))
    results["token_length"] = results["request.prompt"].map(unique_prompt_to_length)

    # Precompute the token index corresponding to the question portion.
    # This is computed as the number of tokens in the substring up to the question_start_index.
    unique_mapping = results[["request.prompt", "question_start_index"]].drop_duplicates(subset="request.prompt")
    prompt_to_q_index = dict(zip(unique_mapping["request.prompt"], unique_mapping["question_start_index"]))

    # NOTE(review): encode() is called with its default special-token handling
    # here, unlike the length computation above - presumably so the index lines
    # up with vLLM's prompt_logprobs positions; confirm.
    question_token_indices = {
        prompt: len(tokenizer.encode(prompt[:q_index]))
        for prompt, q_index in prompt_to_q_index.items()
    }
    results["question_token_index"] = results["request.prompt"].map(question_token_indices)

    results.to_pickle(checkpoint)
    with open(batch_count_path, "w") as f:
        f.write(str(batch_start))

results = results.dropna(subset=["dicho_score"])
# Use the configured limit rather than a second hard-coded 2048.
filtered = results.loc[results["token_length"] < max_token_length, ["request.prompt", "token_length"]]
filtered = filtered.drop_duplicates(subset="request.prompt")
filtered = filtered.sort_values("token_length", ascending=False)
unique_prompts = filtered["request.prompt"].tolist()
unique_token_lengths = filtered["token_length"].tolist()

print("Start creating batches")
batches = create_batches(unique_prompts, unique_token_lengths, max_token_length)
# Perplexities already present in a resumed checkpoint are carried over.
existing = results.loc[pd.notna(results[perplexity_col]), ["request.prompt", perplexity_col]]
prompt_to_perp = dict(zip(existing["request.prompt"], existing[perplexity_col]))

unique_mapping = results[["request.prompt", "question_token_index"]].drop_duplicates(subset="request.prompt")
question_token_indices = dict(zip(unique_mapping["request.prompt"], unique_mapping["question_token_index"]))

print(f"fraction of filtered prompts: {len(unique_prompts) / len(unique_mapping):.2%}")

skip_count = 0
remaining_batches = batches[batch_start:]
# Enumerate with absolute batch indices. Indices relative to the slice would
# stop the final checkpoint from firing after a resume and would store a
# wrong resume position.
for i, batch_prompts in tqdm(enumerate(remaining_batches, start=batch_start), total=len(remaining_batches)):
    batch_outputs = vllm_model.generate(batch_prompts, sampling_params, use_tqdm=False)
    for prompt, output in zip(batch_prompts, batch_outputs):
        token_index = question_token_indices[prompt]
        # Only the tokens from the question onwards contribute to the perplexity.
        logprobs_list = output.prompt_logprobs[token_index:]
        target_logprobs = [list(logprobs.values())[0].logprob for logprobs in logprobs_list]
        if target_logprobs:
            prompt_to_perp[prompt] = np.exp(-np.mean(target_logprobs))
        else:
            skip_count += 1

    if (i > 0 and i % 10000 == 0) or i == len(batches) - 1:
        print(f"progress {i} / {len(batches)}, skipped {skip_count} items so far")
        results[perplexity_col] = results["request.prompt"].map(prompt_to_perp).astype(float)
        results.to_pickle(checkpoint)
        with open(batch_count_path, "w") as f:
            # Batch i is finished and saved, so a resumed run starts at i + 1.
            f.write(str(i + 1))

# Compute Embeddings

In [None]:
import pandas as pd
from vllm import LLM
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed

# List of GPU ids to use (adjust based on your available GPUs)
gpu_ids = [0,1,2,4,6,7]  # This gives you 5 models (update if needed)
num_models = len(gpu_ids)
# model_name = "meta-llama/Llama-3.1-8B-Instruct"
model_name = "Alibaba-NLP/gte-Qwen2-7B-instruct"
# model_name = "mistralai/Mistral-7B-Instruct-v0.3"


def embed_sub_batch(gpu_id, sub_batch):
    """Embed `sub_batch` with a freshly constructed vLLM model aimed at GPU `gpu_id`.

    The model is built inside the function so that each worker process owns
    its own instance.

    Args:
        gpu_id: Index interpolated into the ``cuda:<id>`` device string.
        sub_batch: Prompts to embed.

    Returns:
        A list holding the embedding of each output object from ``LLM.embed``.
    """
    # NOTE(review): confirm that vLLM honours the device index in `device=`;
    # if it does not, pin the GPU through CUDA_VISIBLE_DEVICES in the worker.
    # Options tried but left disabled in the original: enable_chunked_prefill=False,
    # enforce_eager=True, tokenizer_pool_size=8.
    engine = LLM(
        model=model_name,
        task="embed",
        gpu_memory_utilization=0.9,
        tensor_parallel_size=1,
        device=f"cuda:{gpu_id}",
    )
    return [item.outputs.embedding for item in engine.embed(sub_batch)]

# Select the same prompts as the perplexity step: scored rows, under the
# token limit, de-duplicated, longest first.
results = pd.read_pickle("results_perplexity_forthattempt.pkl")
results = results.dropna(subset=["dicho_score"])
filtered = results.loc[results["token_length"] < 2048, ["request.prompt", "token_length"]]
filtered = filtered.drop_duplicates(subset="request.prompt")
filtered = filtered.sort_values("token_length", ascending=False)
unique_prompts = filtered["request.prompt"].tolist()

outputs = []
# All prompts go out in a single batch; max(..., 1) keeps range() from
# raising on a zero step when no prompt passes the filter.
batch_size = max(len(unique_prompts), 1)

for i in tqdm(range(0, len(unique_prompts), batch_size)):
    batch = unique_prompts[i : i + batch_size]
    # Split the batch into sub-batches for each GPU.
    # This slicing method works even if len(batch) isn't divisible by num_models.
    sub_batches = [batch[j::num_models] for j in range(num_models)]

    # Sub-batch j holds batch[j::num_models] and futures finish in arbitrary
    # order, so each result is written back into the same strided slots.
    # Appending results in completion order would pair embeddings with the
    # wrong prompts.
    batch_embeddings = [None] * len(batch)
    with ProcessPoolExecutor(max_workers=num_models) as executor:
        future_to_offset = {
            executor.submit(embed_sub_batch, gpu_id, sub_batch): offset
            for offset, (gpu_id, sub_batch) in enumerate(zip(gpu_ids, sub_batches))
            if sub_batch  # Only submit if there's work.
        }
        for future in as_completed(future_to_offset):
            offset = future_to_offset[future]
            # Extended-slice assignment raises if the worker returned a
            # different number of embeddings than prompts it was given.
            batch_embeddings[offset::num_models] = future.result()
    outputs.extend(batch_embeddings)

    # Save what has been embedded so far; outputs[k] belongs to unique_prompts[k].
    question_embedding = pd.DataFrame({
        "question": unique_prompts[:len(outputs)],
        "embedding": outputs
    })
    # question_embedding.to_pickle(f"unique_prompts_embeddings_Mistral-7B-Instruct-v0.3.pkl")
    question_embedding.to_pickle(f"unique_prompts_embeddings_gte-Qwen2-7B-instruct.pkl")