In [9]:
import os
import pandas as pd
import numpy as np
from dynaconf import Dynaconf
import matplotlib.pyplot as plt

from elqm import ELQMPipeline
from elqm.eval import Evaluation
from elqm.utils import get_dir
from elqm.eval.oracle import generate_oracle_dataset

from transformers import logging as trf_logging
trf_logging.set_verbosity_error()

In [10]:
# Seed used for document sampling and oracle generation
RANDOM_SEED = 2024_03_03

# Evaluation datasets to choose from.
# A value of None means: generate a new oracle dataset for each index and save it in the cache.
ORACLE_DATASETS = {
    "manual": os.path.join(get_dir("data", "elqm-raw", "oracle"), "manual_qa_pairs.csv"),
    "gpt4": os.path.join(get_dir("data", "elqm-raw", "oracle"), "gpt4_collection.csv"),
    "generated": None,
}
DATASET_PATH = ORACLE_DATASETS["manual"]

# Number of documents sampled for oracle generation; only used when DATASET_PATH is None
N_DOCUMENTS = 100

# Name of the directory the results are saved in: /results/<results_name>/<config_name>.csv
if DATASET_PATH is None:
    results_name = f"self_{N_DOCUMENTS}"
else:
    results_name = os.path.splitext(os.path.basename(DATASET_PATH))[0]

In [11]:
# Show which dataset is evaluated and the results directory name derived from it
print(DATASET_PATH, results_name, sep="\n")

/home/psaegert/Projects/elqm-INLPT-WS2023/data/elqm-raw/oracle/manual_qa_pairs.csv
manual_qa_pairs


In [12]:
# Evaluator instance used by the evaluation loop further down (`eval.evaluate(...)`).
# NOTE(review): the name `eval` shadows Python's builtin `eval()` for the rest of the notebook.
eval = Evaluation()

In [13]:
# config file name -> path of the YAML file (every config found)
config_paths = {}
# config file name -> previously saved results, or None if not evaluated yet
# (only configs whose `evaluate` setting is truthy)
config_results = {}

for folder, _subfolders, filenames in os.walk(get_dir("configs")):
    for filename in filenames:
        if not filename.endswith(".yaml"):
            continue

        yaml_path = os.path.join(folder, filename)
        config_paths[filename] = yaml_path

        # Configs that are not flagged for evaluation are only registered by path
        if not Dynaconf(settings_files=yaml_path).evaluate:
            continue

        csv_path = os.path.join(get_dir("results", results_name, create=True), filename.replace(".yaml", ".csv"))
        config_results[filename] = pd.read_csv(csv_path) if os.path.exists(csv_path) else None

# Alphabetical order makes the status list and the evaluation order stable
config_paths = dict(sorted(config_paths.items()))
config_results = dict(sorted(config_results.items()))

# Status overview: checked = results on disk, empty box = still to be evaluated
for name, frame in config_results.items():
    print("🔲" if frame is None else "✅", name)

✅ 1024_10_nlc_bge.yaml
✅ 1024_1_nlc_bge.yaml
✅ 1024_5_nlc_bge.yaml
✅ 256_10_10_nlc_bge_fn_mistral.yaml
✅ 256_10_bge.yaml
✅ 256_10_g4a.yaml
✅ 256_10_nlc_bge.yaml
✅ 256_10_nlc_bge_fn.yaml
✅ 256_10_nlc_bge_fn_enrich.yaml
✅ 256_10_nlc_bge_fn_gemma.yaml
✅ 256_10_nlc_bge_fn_mistral.yaml
✅ 256_10_nlc_bge_fn_mistral_enrich.yaml
✅ 256_10_nlc_bge_fn_mixtral.yaml
✅ 256_10_nlc_bge_fn_orca2.yaml
✅ 256_10_nlc_bge_mistral.yaml
✅ 256_10_nlc_g4a.yaml
✅ 256_10_nlc_g4a_fn.yaml
✅ 256_1_nlc_bge.yaml
✅ 256_20_g4a_nomodel.yaml
✅ 256_5_5_nlc_bge_fn.yaml
✅ 256_5_5_nlc_bge_fn_mistral_h1.yaml
✅ 256_5_5_nlc_bge_fn_mistral_h2.yaml
✅ 256_5_5_nlc_bge_fn_mistral_hc1.yaml
✅ 256_5_5_nlc_bge_fn_mistral_hc2.yaml
✅ 256_5_5_nlc_bge_fn_mistral_hc3.yaml
✅ 256_5_5_nlc_bge_fn_mistral_s1.yaml
✅ 256_5_5_nlc_bge_fn_mistral_s2.yaml
✅ 256_5_5_nlc_bge_mistral.yaml
✅ 256_5_nlc_bge.yaml
✅ 256_bm_10_nlc_bge_fn_mistral.yaml
✅ 512_10_nlc_bge.yaml
✅ 512_1_nlc_bge.yaml
✅ 512_5_nlc_bge.yaml
✅ sem_40_nlc_bge_fn.yaml
✅ sem_40_nlc_bge_fn_mistr

In [14]:
# Column-wise mean of every metric, per configuration that already has results
mean_results = {k: v.mean() for k, v in config_results.items() if v is not None}

if len(mean_results) > 0:
    # One column per configuration, one row per metric
    mean_results_df = pd.DataFrame(mean_results)
    # Jupyter only auto-displays the last top-level expression of a cell; an expression
    # nested inside this `if` block is silently discarded, so the table is printed explicitly.
    print(mean_results_df.loc[['A_BERT_RAG_F1', 'A_BERT_F1', 'A_AR_F1', 'RET_AP_5', 'RET_RR', 'RET_NDCG']].T.sort_values('A_BERT_RAG_F1', ascending=False).round(3))
else:
    print("No results yet.")

In [15]:

def get_evaluation_data(elqm: ELQMPipeline, dataset_path: str | None) -> pd.DataFrame:
    """Return the question-answer pairs a pipeline is evaluated on.

    If `dataset_path` is given, that CSV file is read and returned. Otherwise the
    oracle dataset stored as `oracle_dataset.csv` in the cache directory of the
    pipeline's index is used; when that file does not exist yet, it is generated
    from `N_DOCUMENTS` documents sampled without replacement (seeded with
    `RANDOM_SEED`), written to the cache and returned.

    Parameters
    ----------
    elqm : ELQMPipeline
        Pipeline whose loader and `config.index_name` are used. Only accessed
        when `dataset_path` is None.
    dataset_path : str | None
        Path to a CSV file with the evaluation data, or None to fall back to the
        auto-generated oracle dataset.

    Returns
    -------
    pd.DataFrame
        The evaluation data. A freshly generated oracle dataset has the columns
        `source`, `type`, `question` and `answer`.
    """
    # An explicitly chosen dataset bypasses oracle generation entirely
    if dataset_path is not None:
        print(f"Using dataset {dataset_path}")
        return pd.read_csv(dataset_path)

    print("No dataset specified. Using auto-generated oracle dataset.")
    cache_file = os.path.join(get_dir("cache", elqm.config.index_name), "oracle_dataset.csv")

    # Reuse the cached oracle dataset of this index if there is one
    if os.path.exists(cache_file):
        print(f"Loading oracle dataset {elqm.config.index_name}")
        return pd.read_csv(cache_file)

    print("Loading documents")
    all_documents = elqm.loader.load()

    print(f"Sampling {N_DOCUMENTS} documents")
    np.random.seed(RANDOM_SEED)
    sampled_documents = np.random.choice(all_documents, N_DOCUMENTS, replace=False)

    annotated_documents = generate_oracle_dataset(sampled_documents, n_questions_per_type=1, strategy="random", random_seed=RANDOM_SEED, verbose=True)

    # One row per generated question-answer pair, tagged with the ID of its source document
    qa_rows = [
        {
            "source": annotated_document.metadata["ID"],
            "type": qa_pair["type"],
            "question": qa_pair["question"],
            "answer": qa_pair["answer"],
        }
        for annotated_document in annotated_documents
        for qa_pair in annotated_document.metadata["oracle_pairs"]
    ]
    generated_df = pd.DataFrame(qa_rows)

    print(f"Saving oracle dataset {elqm.config.index_name}")
    generated_df.to_csv(cache_file, index=False)

    return generated_df

In [16]:
# Per-example metric entries to unpack: (source column, [(flat column, extractor), ...]).
# Each source column is deleted once all of its flat columns have been written.
FLATTEN_PLAN = [
    ('A_BL', [
        ('A_BL_BL', lambda entry: entry['bleu']),
        ('A_BL_PR', lambda entry: np.mean(entry['precisions'])),
        ('A_BL_BREV', lambda entry: entry['brevity_penalty']),
        ('A_BL_LR', lambda entry: entry['length_ratio']),
        ('A_BL_LEN', lambda entry: entry['translation_length']),
        ('A_BL_REF_LEN', lambda entry: entry['reference_length']),
    ]),
    ('A_RG', [
        ('A_RG1', lambda entry: entry['rouge1']),
        ('A_RG2', lambda entry: entry['rouge2']),
        ('A_RGL', lambda entry: entry['rougeL']),
        ('A_RGL_SUM', lambda entry: entry['rougeLsum']),
    ]),
    ('A_BERT_RAG', [
        ('A_BERT_RAG_PR', lambda entry: entry['precision'][0]),
        ('A_BERT_RAG_RC', lambda entry: entry['recall'][0]),
        ('A_BERT_RAG_F1', lambda entry: entry['f1'][0]),
    ]),
    ('A_BERT', [
        ('A_BERT_PR', lambda entry: entry['precision'][0]),
        ('A_BERT_RC', lambda entry: entry['recall'][0]),
        ('A_BERT_F1', lambda entry: entry['f1'][0]),
    ]),
    ('A_AR', [
        ('A_AR_PR', lambda entry: entry[1]['precision']),
        ('A_AR_RC', lambda entry: entry[1]['recall']),
        ('A_AR_F1', lambda entry: entry[1]['f1']),
    ]),
]

# Retrieval metrics that hold one value per k for every example
PER_K_COLUMNS = ['RET_RC', 'RET_PR', 'RET_F1', 'RET_AP']

# Metrics shown in the summary table printed after each evaluated configuration
SUMMARY_METRICS = ['A_BERT_RAG_F1', 'A_BERT_F1', 'A_AR_F1', 'RET_AP_5', 'RET_RR', 'RET_NDCG']

SEPARATOR = '-' * 80

for config_file in config_results:
    config_path = config_paths[config_file]
    config_name = config_file.replace(".yaml", "")

    # Configurations with results loaded from disk are not evaluated again
    if config_results[config_file] is not None:
        print(f"Already evaluated {config_name}. Skipping.")
        print(SEPARATOR)
        continue

    print(f"Running {config_name}")
    elqm = ELQMPipeline(config=Dynaconf(settings_files=config_path))

    oracle_df = get_evaluation_data(elqm=elqm, dataset_path=DATASET_PATH)

    # Run the evaluation of this pipeline on the question-answer pairs
    results = eval.evaluate(elqm, oracle_df)

    # Unpack the per-example metric entries into flat columns
    for source_column, flat_columns in FLATTEN_PLAN:
        for flat_column, extract in flat_columns:
            results[flat_column] = [extract(entry) for entry in results[source_column]]
        del results[source_column]

    # Turn every per-k metric into one column per k, numbered from 1
    for split_column in PER_K_COLUMNS:
        per_example_values = results[split_column]
        # The first example determines how many k values there are
        for k in range(1, len(per_example_values[0]) + 1):
            results[f'{split_column}_{k}'] = [values[k - 1] for values in per_example_values]
        del results[split_column]

    # Keep the results in memory and write them to /results/<results_name>/<config_name>.csv
    results_df = pd.DataFrame(results)
    config_results[config_file] = results_df
    results_df.to_csv(os.path.join(get_dir("results", results_name), f"{config_name}.csv"), index=False)

    # Refresh the summary over all configurations evaluated so far
    mean_results = {name: frame.mean() for name, frame in config_results.items() if frame is not None}
    mean_results_df = pd.DataFrame(mean_results)

    print(mean_results_df.loc[SUMMARY_METRICS].T.sort_values('A_BERT_RAG_F1', ascending=False).round(3))
    print(SEPARATOR)

Already evaluated 1024_10_nlc_bge. Skipping.
--------------------------------------------------------------------------------
Already evaluated 1024_1_nlc_bge. Skipping.
--------------------------------------------------------------------------------
Already evaluated 1024_5_nlc_bge. Skipping.
--------------------------------------------------------------------------------
Already evaluated 256_10_10_nlc_bge_fn_mistral. Skipping.
--------------------------------------------------------------------------------
Already evaluated 256_10_bge. Skipping.
--------------------------------------------------------------------------------
Already evaluated 256_10_g4a. Skipping.
--------------------------------------------------------------------------------
Already evaluated 256_10_nlc_bge. Skipping.
--------------------------------------------------------------------------------
Already evaluated 256_10_nlc_bge_fn. Skipping.
-----------------------------------------------------------------------