In [1]:
# Enable automatic reloading of imported modules, so that edits to the local
# helper modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import csv
import numpy as np
from sklearn.metrics import roc_auc_score

from embeddings_utils.utils import load_with_conditions
from embeddings_utils.classification_utils import brierDecomp, arc_points, compute_winklers_score
from generate_train_test_split import load_open_llm_v2

from typing import List, Tuple, Callable

In [3]:
# Display names for the assessors' predictive methods, keyed by the raw identifier.
short_labels_predictive_method = dict([
    ('logistic_regression_l2', 'Logistic Regression (l2)'),
    ('logistic_regression_l1_c=1', 'Logistic Regression (l1)'),
    ('logistic_regression_l1_c=0.1', 'Logistic Regression (l1, c=0.1)'),
    ('xgboost', 'XGBoost'),
])

# Display names for the feature sets, keyed by the raw identifier.
short_labels_features = dict([
    ('openai', 'OpenAI'),
    ('word2vec', 'Word2Vec'),
    ('fasttext', 'FastText'),
    ('ngrams_1', 'Ngrams-1'),
    ('fineTunedLlama', 'FT-Llama'),
])

# Run configuration

In [4]:
# Pickled assessor results that are loaded further down in this notebook
filename = os.path.join("results", "mmlu_pro_assessor_results.pkl")

# Learn assessors for all LLMs in the MMLU Pro dataset
mmlu_pro_results_file_location = os.path.join("..", "..", "data", "open-llm-leaderboard-v2", "mmlu_pro_results.csv")
# Some models are duplicated, so dedupe them. dict.fromkeys keeps the first-seen
# (file) order; iterating a set() of strings can give a different order on every
# kernel restart, which makes the run irreproducible.
llms = list(dict.fromkeys(pd.read_csv(mmlu_pro_results_file_location)["model"]))
train_dataset_name = "mmlu_pro"
test_dataset_name = "mmlu_pro"  # In distribution assessment
# Accuracy thresholds for which a "<threshold> PVR" column is computed
PVR_thresholds = [0.8, 0.9, 0.95]

## Load results


In [5]:
# Build the train / validation / test frames for the configured LLMs and datasets
# (embeddings are excluded via exclude_embeddings=True)
train_df, validation_df, test_df = load_open_llm_v2(
    llms,
    train_dataset_name,
    test_dataset_name,
    exclude_embeddings=True,
)

In [None]:
# Read the assessor results produced by train_embeddings_assessors.ipynb
assessors_results_df = load_with_conditions(filename)
# Drop the l1_c=0.1 assessors: the regularization seems to be too strong
# (they normally become a constant assessor)
is_kept_method = assessors_results_df["predictive_method"] != "logistic_regression_l1_c=0.1"
assessors_results_df = assessors_results_df[is_kept_method]
print(f"Loaded number considered LLMs: {len(set(assessors_results_df['llm']))}")
print(f"Loaded number of LLM-assessor pairs: {len(assessors_results_df)}")

In [7]:
# Load results of using a fine tuned Llama 1B to predict
# Excluded for now, since we're trying to provide a benchmark and Llama was getting Winkler's scores around -2.2 (while the others were getting ~+/- 0.3)
# LLAMA_PREDICTIVE_METHOD = LLAMA_FEATURES = "FT-Llama"

# llama_assessor_results_loc = os.path.join("..", "..", "results")
# files = [item for item in os.listdir(llama_assessor_results_loc) if item[-13:] == "_mmlu_pro.csv"]
# llms_assessed = [file[:-35] for file in files]
# for file in files:
#     # Add a row to the results df for the assessed LLM
#     with open(os.path.join(llama_assessor_results_loc, file)) as csvfile:
#         fine_tuned_llama_results_csv = csv.reader(csvfile)
#         fine_tuned_llama_results_df = pd.DataFrame(fine_tuned_llama_results_csv, columns=next(fine_tuned_llama_results_csv))
#         # Double check that the results df only contains the test items
#         assert set(test_df["question_id"]) == set(fine_tuned_llama_results_df["id"])
#         y_pred = fine_tuned_llama_results_df["assessor_correct"].astype(float)
#         labels = fine_tuned_llama_results_df["llm_correct"].astype(float)
#         llm_accuracy_test = labels.mean()
#         # # TODO: Assuming assessor_correct is the proportion of times the assessor predicts that the llm will be correct
#         BrierScore, Calibration, Refinement = brierDecomp(
#             y_pred, labels
#         )
#         win_score = compute_winklers_score(labels, y_pred)
#         roc_auc = roc_auc_score(labels, y_pred)
#         arc = arc_points(labels, y_pred)
#         llm_row = pd.DataFrame({'predictive_method': LLAMA_PREDICTIVE_METHOD, 'features': LLAMA_FEATURES, 'llm': file[:-35], 'BrierScore_val': np.nan,
#         'Calibration_val': np.nan, 'Refinement_val': np.nan, 'AUROC_val': np.nan, 'WinklerScore_val': np.nan, 'BrierScore_test': BrierScore,
#         'Calibration_test': Calibration, 'Refinement_test': Refinement, 'AUROC_test': roc_auc, 'WinklerScore_test': win_score,
#         'predictions_train': None, 'predictions_val': None, 'predictions_test': None, 'arc_train': None,
#         'arc_test': [arc], 'arc_val': None, 'trained_classifier': None, 'llm_accuracy_train': np.nan, 'llm_accuracy_val': np.nan, 'llm_accuracy_test': llm_accuracy_test,})
#         assessors_results_df = pd.concat([assessors_results_df, llm_row], ignore_index=True)

In [8]:
# replace the names with abbreviated versions
assessors_results_df["predictive_method"] = assessors_results_df["predictive_method"].replace(short_labels_predictive_method)
assessors_results_df["features"] = assessors_results_df["features"].replace(short_labels_features)

# now sort them so that OAI, W2V, FT, NG1 are together
# Define a custom order for embeddings. The "features" column already holds the
# abbreviated labels at this point, so the categories must be the abbreviated
# labels as well: using the raw names would turn every value into NaN and the
# sort below would silently ignore the feature order.
feature_order = [
    short_labels_features.get(raw_name, raw_name)
    for raw_name in ['openai', 'word2vec', 'fasttext', 'ngrams_1']
]

# Convert to a categorical type with the desired order
# (labels that are not listed in feature_order become NaN and sort last)
assessors_results_df["feature_order"] = pd.Categorical(assessors_results_df["features"], categories=feature_order, ordered=True)

# Now sort by feature_order, and then by predictive method
assessors_results_df = assessors_results_df.sort_values(["feature_order", "predictive_method"])

In [9]:
# Add PVR for each threshold
def get_PVR_for_threshold(threshold: float) -> Callable[[List[Tuple[float,float]]], float]:
    """Build a function that scores an ARC by its PVR at ``threshold``.

    The returned function takes a list of ``(rate, accuracy)`` points and
    returns ``1 - r``, where ``r`` is the smallest rate whose accuracy is at
    least ``threshold``.

    If no point reaches the threshold (including an empty list) the result is
    ``0.0`` instead of a ``ValueError`` from ``min()`` on an empty sequence.
    """
    def pvr_at_threshold(vals: List[Tuple[float, float]]) -> float:
        # default=1.0 -> "everything must be rejected" -> a PVR of 0
        lowest_rate = min((rate for rate, acc in vals if acc >= threshold), default=1.0)
        return 1 - lowest_rate

    return pvr_at_threshold
# One "<threshold> PVR" column per configured threshold, computed from the test ARC
for threshold in PVR_thresholds:
    pvr_of_curve = get_PVR_for_threshold(threshold)
    PVR_col_name = f"{threshold} PVR"
    assessors_results_df[PVR_col_name] = assessors_results_df["arc_test"].apply(pvr_of_curve)

In [10]:
# Row label combining LLM, predictive method and features; "__" in the LLM name
# is shown as "/" and the assessor part goes on a second line
assessors_results_df["pair_name"] = [
    f"{llm.replace('__', '/')}\n({method}, {feats})"
    for llm, method, feats in zip(
        assessors_results_df["llm"],
        assessors_results_df["predictive_method"],
        assessors_results_df["features"],
    )
]
# Assessor label: "<predictive method>_<features>"
assessors_results_df["predictive_method_features"] = [
    f"{method}_{feats}"
    for method, feats in zip(
        assessors_results_df["predictive_method"],
        assessors_results_df["features"],
    )
]

In [None]:
# Summary counts after filtering and labelling
num_llms = len(set(assessors_results_df['llm']))
num_pairs = len(assessors_results_df)
print(f"Total number considered LLMs: {num_llms}")
print(f"Total number of LLM-assessor pairs: {num_pairs}")

# Comparing the effectiveness of different assessors

Firstly we compare the different methods for assessment by grouping by assessor type and making boxplots for AUROC, Brier Score, and 90% PVR. A threshold of 90% was chosen because it is still a significant challenge for assessors but we still have assessors with a relatively good score (A PVR of ~0.2). These plots demonstrate that (1) no one method is significantly better than all others and (2) very few LLM-assessor pairs are able to achieve a PVR substantially above 0.

In [None]:
# Layout switch: False stacks the two panels vertically (sharing the x axis),
# True puts them next to each other.
side_by_side = False

# Styling shared by both boxplots
boxplot_params = dict(
    showmeans=True,
    meanline=True,
    boxprops={"linewidth": 1},
    whiskerprops={"linewidth": 1},
    capprops={"linewidth": 1},
    flierprops={"marker": 'o', "markersize": 5},
    medianprops={"linewidth": 1},
    meanprops={"linewidth": 1},
    width=0.5,
)

if side_by_side:
    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
else:
    fig, axes = plt.subplots(2, 1, figsize=(6, 12), sharex=True)

# (axis, metric column, y label, title, y position of the red reference line)
panel_specs = [
    (axes[0], 'AUROC_test', 'AUROC (→)', 'AUROC by Assessor Method', 0.5),
    (axes[1], 'WinklerScore_test', "Winkler's Score (→)", "Winkler's Score by Assessor Method", 0),
]

for panel_ax, metric_col, y_label, panel_title, reference_y in panel_specs:
    sns.boxplot(data=assessors_results_df, x='predictive_method', hue="features", y=metric_col, ax=panel_ax, **boxplot_params)
    panel_ax.set_xlabel("")
    panel_ax.set_ylabel(y_label, fontsize=12)
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.tick_params(axis='x', rotation=10)
    panel_ax.axhline(y=reference_y, color="r")

# Only the second panel keeps its legend: the first one gets in the way and
# shows the same entries.
axes[0].legend_.remove()

plt.tight_layout()
plt.savefig("experiments_AUROC_Winkler.pdf", format="pdf")
plt.show()


# Finding the best LLM-Assessor pairs by PVR

The main metric we are interested in for assessors is the size of the PVR, as this is the size of the region in which the LLM can operate safely. First, we consider the top LLM-assessor pairs for PVR thresholds of 0.8, 0.9 and 0.95 (with a 0.99 threshold, the PVR drops to near zero).

This is visualised in the heatmap below, showing the union of the top `PLOT_TOP_NUM` pairs (5 in the cell below) for each threshold.

We can see that the LLM-assessor pairs get a fairly good score at a threshold of 0.8. This is to be expected when the LLMs are fairly good at the task, as the assessor can predict success most of the time. When the threshold is raised to 0.9 we see a very large drop in PVR, as now there is a greater requirement for assessors to make predictions that the LLM will fail.

In [None]:
# Number of best pairs taken per threshold
PLOT_TOP_NUM = 5
# Threshold whose PVR column orders the heatmap rows
SORT_THRESHOLD = 0.9

# Reuse the thresholds from the run configuration: the "<threshold> PVR" columns
# only exist for those values, so re-declaring the list here could drift out of
# sync and fail with a KeyError.
PVR_cols = [f"{t} PVR" for t in PVR_thresholds]

# Union of the top pairs across all thresholds
all_top_pairs = set()
for PVR_col_name in PVR_cols:
    all_top_pairs.update(assessors_results_df.nlargest(PLOT_TOP_NUM, PVR_col_name)['pair_name'])

# Keep only those pairs and the PVR columns, one row per pair, indexed by pair
# name, rounded to 3 decimal places and ordered by the PVR at SORT_THRESHOLD
is_top_pair = assessors_results_df['pair_name'].isin(all_top_pairs)
filtered_df = (
    assessors_results_df.loc[is_top_pair, ['pair_name'] + PVR_cols]
    .drop_duplicates('pair_name')
    .set_index('pair_name')
    .round(3)
    .sort_values(by=f"{SORT_THRESHOLD} PVR", ascending=False)
)

# Create a heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(filtered_df, annot=True, cmap='YlGnBu', fmt='.3f', ax=ax)
ax.set_ylabel("LLM-Assessor Pair Name")
ax.set_xlabel("Threshold PVR")
fig.tight_layout()
# plt.rcParams.update({'font.size': 14})
# plt.savefig("experiments_PPR.pdf", format="pdf")
plt.show()

# ARC curve comparison

To demonstrate how assessors can vary in usefulness, we select the highest accuracy LLM (OpenAI__GPT-4o-2024-08-06) and compare the ARC curves for the assessors with the highest, and lowest PVR at a threshold of 0.9.

In [34]:
def plot_arc_curves(pairs_to_plot: List[str], title = None, save_path: str = "experiments_ARC_crossing_pair.pdf") -> None:
    """Plot the test ARC of the selected LLM-assessor pairs in one figure.

    Reads the module-level ``assessors_results_df``. Each ``arc_test`` entry is
    unpacked as a sequence of (rejection rate, accuracy) points.

    Args:
        pairs_to_plot: ``pair_name`` values of the rows to draw. Names without a
            matching row are reported with a warning instead of being silently
            skipped.
        title: Optional figure title.
        save_path: File the figure is written to (PDF format). Defaults to the
            file name this function has always used.
    """
    import warnings

    known_pairs = set(assessors_results_df["pair_name"])
    missing_pairs = [name for name in pairs_to_plot if name not in known_pairs]
    if missing_pairs:
        warnings.warn(f"No results found for pair(s): {missing_pairs}", stacklevel=2)

    # One long-format frame per selected pair, concatenated once at the end.
    # Growing a frame with pd.concat inside the loop is quadratic and starts
    # from an empty frame, which is a deprecated concat input.
    curve_frames = [
        pd.DataFrame(
            {
                "Assessor Name": pair_name,
                "Rejection Rate": [rate for rate, _ in arc],
                "Accuracy": [acc for _, acc in arc],
            }
        )
        for pair_name, arc in zip(assessors_results_df["pair_name"], assessors_results_df["arc_test"])
        if pair_name in pairs_to_plot
    ]
    if curve_frames:
        llm_auc = pd.concat(curve_frames, ignore_index=True)
    else:
        llm_auc = pd.DataFrame(columns=["Assessor Name", "Rejection Rate", "Accuracy"])

    # plot using seaborn
    g = sns.relplot(
        data=llm_auc, kind="line", x="Rejection Rate", y="Accuracy", hue="Assessor Name", facet_kws={'legend_out': False}
    )
    if title is not None:
        g.figure.suptitle(title, y=1.02)
    g.set(ylim=(0.5, 1))
    plt.legend(
        loc='upper center',
        bbox_to_anchor=(0.5, .25),
        ncol=1
    )
    plt.tight_layout()
    plt.savefig(save_path, format="pdf")
    plt.show()

In [32]:
# plot the ARC curves for all assessors, ie all predictive method and all features. Use different line colors across features and different line styles across predictive methods
def plot_all_arc_curves(llm, save_path="experiments_ARC_top_scorer.pdf"):
    """Plot the test ARC of every assessor trained for one LLM.

    Reads the module-level ``assessors_results_df``. Each ``arc_test`` entry is
    unpacked as a sequence of (rejection rate, accuracy) points.

    Args:
        llm: Value of the ``llm`` column selecting the rows to draw.
        save_path: File the figure is written to (PDF format). Defaults to the
            file name this function has always used.
    """
    assessors_results_df_llm = assessors_results_df[assessors_results_df["llm"] == llm]

    # One long-format frame per assessor, concatenated once at the end.
    # Growing a frame with pd.concat inside the loop is quadratic and starts
    # from an empty frame, which is a deprecated concat input.
    curve_frames = [
        pd.DataFrame(
            {
                "Assessor Name": pair_name,
                "Rejection Rate": [rate for rate, _ in arc],
                "Accuracy": [acc for _, acc in arc],
                "Features": features,
                "Predictive Method": predictive_method,
            }
        )
        for pair_name, arc, features, predictive_method in zip(
            assessors_results_df_llm["pair_name"],
            assessors_results_df_llm["arc_test"],
            assessors_results_df_llm["features"],
            assessors_results_df_llm["predictive_method"],
        )
    ]
    if curve_frames:
        llm_auc = pd.concat(curve_frames, ignore_index=True)
    else:
        llm_auc = pd.DataFrame(columns=["Assessor Name", "Rejection Rate", "Accuracy", "Features", "Predictive Method"])

    # sns.relplot is figure-level and creates its own figure, so no plt.figure()
    # call is made beforehand (it would only leave a stray blank figure behind).
    sns.relplot(
        data=llm_auc, kind="line", x="Rejection Rate", y="Accuracy", hue="Features", style="Predictive Method", facet_kws={'legend_out': False}
    )
    plt.legend(
        loc='upper center',
        bbox_to_anchor=(0.5, .3),
        ncol=2
    )
    plt.ylim(0.65, 1)
    plt.savefig(save_path, format="pdf")
    plt.show()

In [None]:
# Find the LLM with the highest accuracy over train + validation + test and plot
# the ARC curves of all its assessors
all_results = pd.concat([train_df, validation_df, test_df], ignore_index=True)
print(f"Total instances in the dataset: {len(all_results)}")
num_samples = len(all_results)

# Per-LLM outcome columns are the ones starting with this prefix
success_prefix = 'Success_'
llm_columns = [col for col in all_results.columns if col.startswith(success_prefix)]

# The LLM name is the column name without the leading "Success_" (8 characters)
llm_names = [col[len(success_prefix):] for col in llm_columns]
# Sum of the column divided by the number of rows (True counts as 1 for boolean values)
llm_accuracies = [all_results[col].sum() / num_samples for col in llm_columns]
accuracy_per_llm = pd.DataFrame({"llm": llm_names, "Accuracy": llm_accuracies})

best_row = accuracy_per_llm["Accuracy"].idxmax()
top_accuracy_llm = accuracy_per_llm.loc[best_row, "llm"]
print(f"Top accuracy LLM: {top_accuracy_llm}")

plot_all_arc_curves(top_accuracy_llm)

In [None]:
# Pairs to compare; the strings must match the "pair_name" column exactly
# (LLM name, a newline, then "(<predictive method>, <features>)")
arc_pairs_to_compare = [
    "OpenAI/GPT-4o-2024-08-06\n(Logistic Regression (l1), Ngrams-1)",
    "MaziyarPanahi/calme-2.1-qwen2.5-72b\n(Logistic Regression (l1), OpenAI)",
]
plot_arc_curves(arc_pairs_to_compare)

# Table/graph with LLM accuracy

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Example family mapping logic:
def get_family(llm_name):
    """Return the model family for an LLM name.

    The name is lower-cased and checked against the marker substrings below in
    order; the first family with a matching marker wins. Names matching no
    marker are reported as "Other".
    """
    lowered = llm_name.lower()
    # Order matters: earlier families take precedence when several markers match.
    # Add more entries as needed to categorize your models.
    family_markers = (
        ("OpenAI", ("openai__gpt-",)),
        ("Llama", ("llama-3", "meta-llama", "nousresearch__hermes")),
        ("Mistral", ("mistral",)),
        ("Qwen", ("qwen",)),
        ("Gemma", ("gemma",)),
        ("Pythia", ("pythia",)),
        ("Intel Neural Chat", ("intel__neural-chat",)),
    )
    for family, markers in family_markers:
        if any(marker in lowered for marker in markers):
            return family
    return "Other"

# Create a family column based on the llm name
assessors_results_df['family'] = assessors_results_df['llm'].apply(get_family)

# Create a table with the accuracy of all LLMs (one row per distinct
# llm / accuracy / family combination, best accuracy first)
llm_accuracy_table = (
    assessors_results_df[["llm", "llm_accuracy_test", "family"]]
    .drop_duplicates()
    .sort_values(by="llm_accuracy_test", ascending=False)
)

print(llm_accuracy_table)

# Plot a graph with the accuracy of all LLMs, using family for hue
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=llm_accuracy_table, x="llm", y="llm_accuracy_test", hue="family", dodge=False, ax=ax)

# add the value in the bars, written vertically
# Iterate over each container (one per hue category)
for container in ax.containers:
    ax.bar_label(container, label_type='edge', rotation=90, fmt='%.3f')

ax.set_xlabel('LLM', fontsize=12)
ax.set_ylabel('Accuracy', fontsize=12)
ax.set_title('Accuracy of all LLMs', fontsize=14)
ax.tick_params(axis='x', rotation=90)
# bbox_inches="tight" grows the saved area to fit the rotated LLM names; without
# it (and without tight_layout) they are cut off at the bottom of the PDF.
fig.savefig("llm_accuracy.pdf", format="pdf", bbox_inches="tight")
plt.show()
