In [1]:
# Enable the IPython autoreload extension. Mode 2 reloads all imported modules
# (except those excluded via %aimport) before each cell is executed, so edits
# to the imported project modules take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

from embeddings_utils.results_loaders import ngram_vectorize_new, select_features
from embeddings_utils.utils import load_with_conditions
from embeddings_utils.classification_utils import predictive_method_list, evaluate_and_update_arrays, evaluate_and_update
from generate_train_test_split import load_open_llm_v2

import pandas as pd

# Run configuration

In [None]:
# Flag forwarded, together with `filename`, to load_with_conditions when the
# results table is loaded further down.
overwrite_res = True
# Path of the results file (.pkl) that is loaded below and handed to the
# evaluation helpers on every call.
filename = os.path.join("results", "mmlu_pro_assessor_results.pkl")

# Learn assessors for all LLMs in the MMLU Pro dataset
mmlu_pro_results_file_location = os.path.join(
    "..", "..", "data", "open-llm-leaderboard-v2", "mmlu_pro_results.csv"
)
# Some models are duplicated in the CSV. Deduplicate while keeping the order of
# first appearance: list(set(...)) produces a different ordering on every
# kernel start (string hashing is randomised per process), which makes the
# order in which LLMs are processed -- and anything saved incrementally --
# irreproducible between runs.
llms = (
    pd.read_csv(mmlu_pro_results_file_location)["model"]
    .drop_duplicates()
    .tolist()
)
train_dataset_name = "mmlu_pro"
test_dataset_name = "mmlu_pro"  # In distribution assessment

# Load Data

In [None]:
# Load the three splits for the selected LLMs. The frames are indexed below by
# "prompt" and by one "Success_<llm>" column per model.
(train_df, validation_df, test_df) = load_open_llm_v2(
    llms,
    train_dataset_name,
    test_dataset_name,
)

In [None]:
# Results table that the assessor cells below keep reassigning.
# NOTE(review): presumably starts from scratch when overwrite_res is True --
# confirm in embeddings_utils.utils.load_with_conditions.
assessors_results_df = load_with_conditions(
    filename,
    overwrite_res,
)

## Train assessors for each LLM independently

- Train an assessor for each LLM on the train_dataset, then evaluate on each of the test datasets
- We train assessors using different features (OpenAI embeddings, word2vec, fasttext and n-grams) and different base classifiers (LogisticRegression, XGBoost) in predictive_method_list. The cells below train the assessors and store the instance-level predictions (and other performance metrics) in a dataframe.
- TODO: Describe file outputs (e.g. we store the instance-level predictions in a separate file from the one where the raw model output and ground truths are.)


The training is split into separate cells for OpenAI embeddings, word2vec + fasttext, and n-grams (to avoid memory issues).

OpenAI embeddings

In [None]:
# Train and evaluate one assessor per LLM and per predictive method, using the
# "openai_embeddings" feature column.
for llm in llms:
    success_col = f"Success_{llm}"

    # Each split is paired with the message printed when its success column
    # holds fewer than two distinct values. Splits are checked lazily in this
    # order, so only the first offending split is reported.
    split_checks = (
        (
            train_df,
            f"Skipping {llm} because there is only one value in the 'Success' column for the train df",
        ),
        (
            validation_df,
            f"Skipping {llm} because there is only one value in the 'Success' column for the validation df",
        ),
        (
            test_df,
            f"Skipping {llm} because there is only one value in the 'Success' column for the test df",
        ),
    )
    skip_message = next(
        (
            message
            for split_frame, message in split_checks
            if len(split_frame[success_col].unique()) < 2
        ),
        None,
    )
    if skip_message is not None:
        print(skip_message)
        continue

    for predictive_method, kwargs, pred_method_name in predictive_method_list:
        feature_columns = ["openai_embeddings"]
        embedding_label = "openai"
        # The returned table replaces the previous one after every call.
        # NOTE(review): `filename` is passed each time, presumably so results
        # are persisted incrementally -- confirm in classification_utils.
        assessors_results_df = evaluate_and_update(
            assessors_results_df,
            train_df,
            validation_df,
            test_df,
            feature_columns,
            predictive_method,
            pred_method_name,
            embedding_label,
            llm,
            filename,
            **kwargs,
        )

Word2vec + fasttext

In [None]:
# Train and evaluate one assessor per LLM, per embedding type (word2vec, then
# fasttext) and per predictive method.
for llm in llms:
    success_col = f"Success_{llm}"

    # Each split is paired with the message printed when its success column
    # holds fewer than two distinct values. Splits are checked lazily in this
    # order, so only the first offending split is reported.
    split_checks = (
        (
            train_df,
            f"Skipping {llm} because there is only one value in the 'Success' column for the train df",
        ),
        (
            validation_df,
            f"Skipping {llm} because there is only one value in the 'Success' column for the validation df",
        ),
        (
            test_df,
            f"Skipping {llm} because there is only one value in the 'Success' column for the test df",
        ),
    )
    skip_message = next(
        (
            message
            for split_frame, message in split_checks
            if len(split_frame[success_col].unique()) < 2
        ),
        None,
    )
    if skip_message is not None:
        print(skip_message)
        continue

    for embedding_type in ("word2vec", "fasttext"):
        for predictive_method, kwargs, pred_method_name in predictive_method_list:
            # A fresh single-column feature list is built for every call.
            # TODO: Could also include the response options as a feature
            feature_columns = [f"{embedding_type}_embeddings"]
            assessors_results_df = evaluate_and_update(
                assessors_results_df,
                train_df,
                validation_df,
                test_df,
                feature_columns,
                predictive_method,
                pred_method_name,
                embedding_type,
                llm,
                filename,
                **kwargs,
            )

ngrams

In [None]:
# Train and evaluate n-gram based assessors: vectorise the prompts once per
# n-gram size, then select features and fit per LLM and per predictive method.
for n_gram_size in (1,):
    # The n-gram matrices depend only on the prompts, not on the LLM, so they
    # are computed once per n-gram size.
    ngram_outputs = ngram_vectorize_new(
        train_df["prompt"],
        validation_df["prompt"],
        test_df["prompt"],
        ngram_range=(1, n_gram_size),
    )
    X_train_ngrams, X_val_ngrams, X_test_ngrams, vectorizer = ngram_outputs

    for llm in llms:
        success_col = f"Success_{llm}"

        # Each split is paired with the message printed when its success
        # column holds fewer than two distinct values. Splits are checked
        # lazily in this order, so only the first offending split is reported.
        split_checks = (
            (
                train_df,
                f"Skipping {llm} because there is only one value in the 'Success' column for the train df",
            ),
            (
                validation_df,
                f"Skipping {llm} because there is only one value in the 'Success' column for the validation df",
            ),
            (
                test_df,
                f"Skipping {llm} because there is only one value in the 'Success' column for the test df",
            ),
        )
        skip_message = next(
            (
                message
                for split_frame, message in split_checks
                if len(split_frame[success_col].unique()) < 2
            ),
            None,
        )
        if skip_message is not None:
            print(skip_message)
            continue

        # Feature selection uses this LLM's training labels, so it is redone
        # for every LLM.
        selection_outputs = select_features(
            X_train_ngrams, train_df[success_col], X_val_ngrams, X_test_ngrams
        )
        (
            X_train_ngrams_selected,
            X_val_ngrams_selected,
            X_test_ngrams_selected,
            selector,
        ) = selection_outputs

        for predictive_method, kwargs, pred_method_name in predictive_method_list:
            feature_label = f"ngrams_{n_gram_size}"
            assessors_results_df = evaluate_and_update_arrays(
                assessors_results_df,
                X_train_ngrams_selected,
                train_df[success_col],
                X_val_ngrams_selected,
                validation_df[success_col],
                X_test_ngrams_selected,
                test_df[success_col],
                predictive_method,
                pred_method_name,
                feature_label,
                llm,
                filename,
                **kwargs,
            )