In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from tueplots import bundles
from tqdm import tqdm
bundles.icml2024()
import random
import numpy as np
import matplotlib.colors as mcolors

def majority_vote(series):
    # Filter out -1 values (no value)
    valid = series[series != -1]
    # If there are no valid values, return -1 (or handle as needed)
    if len(valid) == 0:
        return -1
    # Count the number of 0's and 1's
    count0 = (valid == 0).sum()
    count1 = (valid == 1).sum()
    # Return the majority; break ties arbitrarily
    if count1 > count0:
        return 1
    elif count0 > count1:
        return 0
    else:
        return random.choice([0, 1])

# results = pd.read_pickle("../gather_helm_data/helm_tables/responses.pkl")
# results_full = pd.read_pickle("results_perplexity.pkl")
# results_full = pd.read_pickle("results_perplexity_thirdattempt.pkl")
results_full = pd.read_pickle("results_perplexity_forthattempt.pkl")
results = results_full[["request.model", "instance_id", "dicho_score", "groups"]]
results = results.dropna(subset=["dicho_score"])
results["dicho_score"] = results["dicho_score"].apply(
    lambda x: x.item() if hasattr(x, "item") else x
) # covert numpy.float16 to float
results = results.pivot_table(index="request.model", columns=["instance_id", "groups"], values="dicho_score", aggfunc=majority_vote)

print("done pivoting")

# sort the columns by groups
results = results.sort_index(axis=1, level="groups")

results = results.loc[:, (results != 0).any()]
results = results.loc[:, (results != 1).any()]
results = results.fillna(-1).astype(int)
# Replace -1 with NaN so that missing scores are ignored
results = results.replace(-1, np.nan)

# Compute the overall average for each group manually
group_means = {}
for group in results.columns.get_level_values("groups").unique():
    mask = results.columns.get_level_values("groups") == group
    values = results.loc[:, mask].values  # all values for this group
    group_means[group] = np.nanmean(values)

# Sort the groups by their average score
sorted_groups = sorted(group_means, key=group_means.get)

# Create a mapping from group to its sort order
group_order = {group: order for order, group in enumerate(sorted_groups)}

# Reorder the columns based on the new group order using the key parameter
results = results.sort_index(axis=1, level="groups", key=lambda x: x.map(group_order))

# Compute the overall average for each row (ignoring NaNs)
row_means = results.mean(axis=1)

# Sort the rows by these computed averages (lowest to highest)
results = results.loc[row_means.sort_values().index]

# convert nan back to -1
results = results.replace(np.nan, -1)
# count the fraction of -1 
print((results == -1).sum().sum() / (results.shape[0] * results.shape[1]))

Matplotlib created a temporary cache directory at /tmp/user/21130/matplotlib-vb4dkkrr because the default path (/afs/cs.stanford.edu/u/sttruong/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.
  results = results.pivot_table(index="request.model", columns=["instance_id", "groups"], values="dicho_score", aggfunc=majority_vote)


done pivoting
0.6929338169796397


In [8]:
import pandas as pd
from vllm import LLM
import torch
from tqdm import tqdm

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    task="embed", 
    gpu_memory_utilization=0.9,
    enable_chunked_prefill=False,
    enforce_eager=True,
    # dtype=torch.float16,
    # swap_space=32,
    # max_num_seqs=128,
    tensor_parallel_size=1,
    tokenizer_pool_size=8,
)
print("finished loading model")
results = pd.read_pickle("results_perplexity_forthattempt.pkl")
results = results.dropna(subset=["dicho_score"])
filtered = results.loc[results["token_length"] < 2048, ["request.prompt", "token_length"]]
filtered = filtered.drop_duplicates(subset="request.prompt")
filtered = filtered.sort_values("token_length", ascending=False)
unique_prompts = filtered["request.prompt"].tolist()

# process the prompt in batch
outputs = []
batch_size = 128 # 20480
# for i in tqdm(range(0, len(unique_prompts), batch_size)):
for i in tqdm(range(0, batch_size, batch_size)):
    batch = unique_prompts[i:i+batch_size]
    output = llm.embed(batch, use_tqdm=False)
    outputs.extend([o.outputs.embedding for o in output])


INFO 03-05 10:47:23 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=False, use_async_output_proc=False, dis

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 03-05 10:47:30 model_runner.py:1115] Loading model weights took 14.9576 GB
finished loading model


100%|██████████| 1/1 [00:22<00:00, 22.03s/it]


In [10]:
len(outputs)

128

In [11]:
(i+1)*batch_size

128

In [12]:
print(f"Processed {i+len(batch)} out of {len(unique_prompts)} prompts.") 
question_embedding = pd.DataFrame({"question": unique_prompts[:(i+1)*batch_size], "embedding": outputs})
question_embedding.to_pickle("unique_prompts_embeddings.pkl")


Processed 128 out of 1023604 prompts.
