In [42]:
import pandas as pd
import sys
sys.path.append("..")
from src.Eval import eval_llm

import os
import io
from contextlib import redirect_stdout


In [43]:
import sys
sys.path.append("..")
from src.Eval import eval_llm

import os
import io
from contextlib import redirect_stdout

def clean_dict(d):
    """
    Recursively remove keys from a dictionary if they map to empty lists.
    """
    if isinstance(d, dict):
        return {k: clean_dict(v) for k, v in d.items() if clean_dict(v) != {} and v != []}
    elif isinstance(d, list):
        return [clean_dict(item) for item in d if clean_dict(item) != {} and item != []]
    else:
        return d

def parse_filename(filename):
    parts = filename.replace(".jsonl", "").split("_")
    dataset = parts[0]
    k_shot = parts[1].split(".")[0]
    k = int(k_shot.replace("shot", ""))
    if k and k != 0 and len(parts) == 3:
        sampling_method = parts[2]
    else:
        sampling_method = "rand"
    return dataset, k, sampling_method

def silent_eval_llm(fpath):
    with io.StringIO() as buf, redirect_stdout(buf):
        # Call the actual eval_llm function
        return eval_llm(fpath)
    
def aggregated_results_llm(
    results_dir,
    ent_types=["overall", "Artist", "WoA"],
    eval_schemas=["strict", "exact", "ent_type"],
    metrics=["f1", "f1_macro", "f1_micro", "precision", "precision_macro", "recall", "recall_macro", "missed", "spurious", "incorrect"],
    datasets=["dataset1", "dataset2", "dataset3", "dataset4"],
    sampling_methods=["rand", "tfidf", ""],
    ks=[0,5,15,25,35,45]
):
    results = {}
    for schema in eval_schemas:
        results[schema] = {}
        for ent_type in ent_types:
            results[schema][ent_type] = {}
            for metric in metrics:
                results[schema][ent_type][metric] = {}
                for model in [m for m in os.listdir(results_dir) if m != "archive"]:
                    results[schema][ent_type][metric][model] = {}
                    for sampling_method in sampling_methods:
                        results[schema][ent_type][metric][model][sampling_method] = {}
                        for k in ks:
                            results[schema][ent_type][metric][model][sampling_method][k] = []

    model_dirs = [
        os.path.join(results_dir, d)
        for d in os.listdir(results_dir)
        if os.path.isdir(os.path.join(results_dir, d)) and d != "archive"
    ]

    for model_dir in model_dirs:
        model = model_dir.split(os.sep)[-1]
        files = [
            os.path.join(model_dir, f)
            for f in os.listdir(model_dir)
            if os.path.isfile(os.path.join(model_dir, f))
        ]

        for fpath in files:
            print(fpath)
            filename = os.path.basename(fpath)
            dataset, k, sampling_method = parse_filename(filename)

            if dataset in datasets and k in ks:
                predictions = silent_eval_llm(fpath)

                for ent_type in ent_types:
                    for schema in eval_schemas:
                        for metric in metrics:
                            key = f"{ent_type}_{schema}_{metric}"
                            if key in predictions.keys():
                                results[schema][ent_type][metric][model][sampling_method][k].append(predictions[key])
    return clean_dict(results)

results = aggregated_results_llm("../output/reddit+shsyt/")
#results_tfidf = aggregated_results_llm("../output/tfidf_sampling")



2024-09-06 08:14:02 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1/dataset2_5shot_tfidf.jsonl


2024-09-06 08:14:02 root INFO: Imported 658 predictions for 658 true examples


../output/reddit+shsyt_backup/llama3.1/dataset4_35shot_tfidf.jsonl


2024-09-06 08:14:02 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1/dataset2_15shot_tfidf.jsonl
../output/reddit+shsyt_backup/llama3.1/dataset1_25shot_tfidf.jsonl


2024-09-06 08:14:02 root INFO: Imported 660 predictions for 660 true examples
2024-09-06 08:14:03 root INFO: Imported 658 predictions for 658 true examples


../output/reddit+shsyt_backup/llama3.1/dataset3_5shot_tfidf.jsonl


2024-09-06 08:14:03 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1/dataset5_5shot_tfidf.jsonl
../output/reddit+shsyt_backup/llama3.1/dataset2_35shot_rand.jsonl
../output/reddit+shsyt_backup/llama3.1/dataset5_15shot_rand.jsonl
../output/reddit+shsyt_backup/llama3.1/dataset4_25shot_tfidf.jsonl


2024-09-06 08:14:03 root INFO: Imported 658 predictions for 658 true examples
2024-09-06 08:14:03 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1/dataset5_5shot_rand.jsonl
../output/reddit+shsyt_backup/llama3.1/dataset5_35shot_tfidf.jsonl
../output/reddit+shsyt_backup/llama3.1/dataset1_35shot_rand.jsonl
../output/reddit+shsyt_backup/llama3.1/dataset4_0shot.jsonl


2024-09-06 08:14:04 root INFO: Imported 658 predictions for 658 true examples
2024-09-06 08:14:04 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1/dataset1_0shot.jsonl


2024-09-06 08:14:04 root INFO: Imported 658 predictions for 658 true examples


../output/reddit+shsyt_backup/llama3.1/dataset5_0shot.jsonl
../output/reddit+shsyt_backup/llama3.1/dataset3_15shot_tfidf.jsonl
../output/reddit+shsyt_backup/llama3.1/dataset2_25shot_rand.jsonl


2024-09-06 08:14:04 root INFO: Imported 660 predictions for 660 true examples
2024-09-06 08:14:04 root INFO: Imported 658 predictions for 658 true examples


../output/reddit+shsyt_backup/llama3.1/dataset3_25shot_tfidf.jsonl
../output/reddit+shsyt_backup/llama3.1/dataset2_5shot_rand.jsonl


2024-09-06 08:14:05 root INFO: Imported 660 predictions for 660 true examples
2024-09-06 08:14:05 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1/dataset1_15shot_tfidf.jsonl


2024-09-06 08:14:05 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1/dataset2_15shot_rand.jsonl


2024-09-06 08:14:05 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1/dataset5_35shot_rand.jsonl
../output/reddit+shsyt_backup/llama3.1/dataset5_15shot_tfidf.jsonl
../output/reddit+shsyt_backup/llama3.1/dataset1_5shot_tfidf.jsonl


2024-09-06 08:14:06 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1/dataset1_15shot_rand.jsonl


2024-09-06 08:14:06 root INFO: Imported 658 predictions for 658 true examples


../output/reddit+shsyt_backup/llama3.1/dataset4_35shot_rand.jsonl
../output/reddit+shsyt_backup/llama3.1/dataset4_25shot_rand.jsonl


2024-09-06 08:14:06 root INFO: Imported 658 predictions for 658 true examples
2024-09-06 08:14:06 root INFO: Imported 658 predictions for 658 true examples


../output/reddit+shsyt_backup/llama3.1/dataset3_0shot.jsonl
../output/reddit+shsyt_backup/llama3.1/dataset3_35shot_tfidf.jsonl


2024-09-06 08:14:06 root INFO: Imported 658 predictions for 658 true examples
2024-09-06 08:14:07 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1/dataset2_0shot.jsonl


2024-09-06 08:14:07 root INFO: Imported 658 predictions for 658 true examples


../output/reddit+shsyt_backup/llama3.1/dataset5_25shot_rand.jsonl
../output/reddit+shsyt_backup/llama3.1/dataset3_5shot_rand.jsonl


2024-09-06 08:14:07 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1/dataset1_5shot_rand.jsonl


2024-09-06 08:14:07 root INFO: Imported 658 predictions for 658 true examples


../output/reddit+shsyt_backup/llama3.1/dataset4_15shot_tfidf.jsonl
../output/reddit+shsyt_backup/llama3.1/dataset3_25shot_rand.jsonl


2024-09-06 08:14:07 root INFO: Imported 658 predictions for 658 true examples
2024-09-06 08:14:08 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1/dataset1_35shot_tfidf.jsonl


2024-09-06 08:14:08 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1/dataset1_25shot_rand.jsonl


2024-09-06 08:14:08 root INFO: Imported 658 predictions for 658 true examples


../output/reddit+shsyt_backup/llama3.1/dataset3_15shot_rand.jsonl
../output/reddit+shsyt_backup/llama3.1/dataset2_35shot_tfidf.jsonl


2024-09-06 08:14:08 root INFO: Imported 660 predictions for 660 true examples
2024-09-06 08:14:09 root INFO: Imported 658 predictions for 658 true examples


../output/reddit+shsyt_backup/llama3.1/dataset4_5shot_tfidf.jsonl


2024-09-06 08:14:09 root INFO: Imported 658 predictions for 658 true examples


../output/reddit+shsyt_backup/llama3.1/dataset4_15shot_rand.jsonl


2024-09-06 08:14:09 root INFO: Imported 658 predictions for 658 true examples


../output/reddit+shsyt_backup/llama3.1/dataset4_5shot_rand.jsonl


2024-09-06 08:14:09 root INFO: Imported 658 predictions for 658 true examples


../output/reddit+shsyt_backup/llama3.1/dataset3_35shot_rand.jsonl
../output/reddit+shsyt_backup/llama3.1/dataset5_25shot_tfidf.jsonl
../output/reddit+shsyt_backup/llama3.1/dataset2_25shot_tfidf.jsonl


2024-09-06 08:14:09 root INFO: Imported 660 predictions for 660 true examples
2024-09-06 08:14:10 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset2_5shot_tfidf.jsonl


2024-09-06 08:14:10 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset2_15shot_tfidf.jsonl


2024-09-06 08:14:10 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset1_25shot_tfidf.jsonl


2024-09-06 08:14:10 root INFO: Imported 658 predictions for 658 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset3_5shot_tfidf.jsonl
../output/reddit+shsyt_backup/llama3.1:70b/dataset2_35shot_rand.jsonl


2024-09-06 08:14:11 root INFO: Imported 660 predictions for 660 true examples
2024-09-06 08:14:11 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset5_15shot_rand.jsonl
../output/reddit+shsyt_backup/llama3.1:70b/dataset5_5shot_rand.jsonl
../output/reddit+shsyt_backup/llama3.1:70b/dataset1_35shot_rand.jsonl


2024-09-06 08:14:11 root INFO: Imported 658 predictions for 658 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset4_0shot.jsonl


2024-09-06 08:14:11 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset1_0shot.jsonl


2024-09-06 08:14:11 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset5_0shot.jsonl
../output/reddit+shsyt_backup/llama3.1:70b/dataset2_25shot_rand.jsonl


2024-09-06 08:14:12 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset2_5shot_rand.jsonl


2024-09-06 08:14:12 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset1_15shot_tfidf.jsonl


2024-09-06 08:14:12 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset2_15shot_rand.jsonl


2024-09-06 08:14:12 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset5_35shot_rand.jsonl
../output/reddit+shsyt_backup/llama3.1:70b/dataset1_5shot_tfidf.jsonl


2024-09-06 08:14:12 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset1_15shot_rand.jsonl


2024-09-06 08:14:13 root INFO: Imported 658 predictions for 658 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset4_35shot_rand.jsonl


2024-09-06 08:14:13 root INFO: Imported 658 predictions for 658 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset4_25shot_rand.jsonl


2024-09-06 08:14:13 root INFO: Imported 658 predictions for 658 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset3_0shot.jsonl


2024-09-06 08:14:13 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset2_0shot.jsonl


2024-09-06 08:14:14 root INFO: Imported 658 predictions for 658 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset5_25shot_rand.jsonl
../output/reddit+shsyt_backup/llama3.1:70b/dataset3_5shot_rand.jsonl


2024-09-06 08:14:14 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset1_5shot_rand.jsonl


2024-09-06 08:14:14 root INFO: Imported 658 predictions for 658 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset3_25shot_rand.jsonl


2024-09-06 08:14:14 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset1_35shot_tfidf.jsonl


2024-09-06 08:14:14 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset1_25shot_rand.jsonl
../output/reddit+shsyt_backup/llama3.1:70b/dataset3_15shot_rand.jsonl


2024-09-06 08:14:15 root INFO: Imported 658 predictions for 658 true examples
2024-09-06 08:14:15 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset2_35shot_tfidf.jsonl


2024-09-06 08:14:15 root INFO: Imported 658 predictions for 658 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset4_15shot_rand.jsonl


2024-09-06 08:14:15 root INFO: Imported 658 predictions for 658 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset4_5shot_rand.jsonl


2024-09-06 08:14:16 root INFO: Imported 658 predictions for 658 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset3_35shot_rand.jsonl


2024-09-06 08:14:16 root INFO: Imported 660 predictions for 660 true examples


../output/reddit+shsyt_backup/llama3.1:70b/dataset2_25shot_tfidf.jsonl
../output/reddit+shsyt_backup/llama3.1-8b/dataset2_15shot3.jsonl
../output/reddit+shsyt_backup/llama3.1-8b/dataset2_15shot2.jsonl


In [44]:
import pandas as pd
import numpy as np

def results_to_dataframe(aggregated_results, agg_func='mean'):
    data = []

    for schema, schema_dict in aggregated_results.items():
        for ent_type, ent_type_dict in schema_dict.items():
            for metric, metric_dict in ent_type_dict.items():
                for model, model_dict in metric_dict.items():
                    for sampling, sampling_dict in model_dict.items():
                        for k_shot, values in sampling_dict.items():
                            if values:
                                if agg_func == 'mean':
                                    agg_value = np.mean(values)
                                elif agg_func == 'sum':
                                    agg_value = np.sum(values)
                                else:
                                    raise ValueError("Invalid aggregation function. Use 'mean' or 'sum'.")
                                
                                data.append([schema, ent_type, metric, model, sampling, k_shot, agg_value])

    df = pd.DataFrame(data, columns=['Schema', 'Entity Type', 'Metric', 'Model', 'Sampling', 'k', 'Value'])
    df.Model = df.Model.str.replace("mistral", "Mistral-7B").str.replace("mixtral", "Mixtral-8x22B").str.replace("gpt-3.5-turbo-0125", "GPT-3.5-Turbo")
    return df.set_index(['Schema', 'Entity Type', 'Metric', 'Model', 'Sampling', 'k']).unstack(['Schema', 'Entity Type', 'Metric'])

results_to_dataframe(results)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value
Unnamed: 0_level_1,Unnamed: 1_level_1,Schema,strict,strict,strict,strict,strict,strict,strict,strict,strict,strict,...,ent_type,ent_type,ent_type,ent_type,ent_type,ent_type,ent_type,ent_type,ent_type,ent_type
Unnamed: 0_level_2,Unnamed: 1_level_2,Entity Type,overall,overall,overall,overall,Artist,Artist,Artist,Artist,Artist,Artist,...,Artist,Artist,Artist,Artist,WoA,WoA,WoA,WoA,WoA,WoA
Unnamed: 0_level_3,Unnamed: 1_level_3,Metric,f1_macro,f1_micro,precision_macro,recall_macro,f1,precision,recall,missed,spurious,incorrect,...,recall,missed,spurious,incorrect,f1,precision,recall,missed,spurious,incorrect
Model,Sampling,k,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4,Unnamed: 23_level_4
llama3.1,rand,0,0.738178,0.740651,0.737136,0.739706,0.77188,0.777775,0.766327,64.5,59.25,16.5,...,0.791258,64.5,59.25,8.0,0.768296,0.759811,0.777451,64.5,71.5,3.0
llama3.1,rand,5,0.735262,0.736457,0.712722,0.762472,0.771534,0.775654,0.767728,60.5,57.0,19.75,...,0.792582,60.5,57.0,11.25,0.770315,0.716207,0.83433,46.25,96.25,4.25
llama3.1,rand,15,0.735549,0.737186,0.725281,0.747987,0.765952,0.776771,0.755574,65.25,55.75,19.5,...,0.778892,65.25,55.75,11.5,0.772292,0.73818,0.810616,53.5,83.0,4.25
llama3.1,rand,25,0.690214,0.690983,0.705625,0.678611,0.71169,0.761467,0.668204,96.75,54.0,18.5,...,0.696966,96.75,54.0,8.5,0.735849,0.715169,0.757963,68.5,86.5,5.0
llama3.1,rand,35,0.631078,0.631993,0.703716,0.576303,0.663816,0.782045,0.576838,130.5,39.0,16.75,...,0.599538,130.5,39.0,9.0,0.668407,0.699453,0.642454,105.25,80.75,3.75
llama3.1,tfidf,5,0.721007,0.722298,0.696854,0.750226,0.761278,0.765257,0.757735,64.75,61.5,19.5,...,0.785299,64.75,61.5,10.0,0.741063,0.684174,0.808493,54.25,109.5,3.75
llama3.1,tfidf,15,0.701056,0.702488,0.717145,0.687447,0.723091,0.760302,0.689777,89.5,57.25,18.25,...,0.716133,89.5,57.25,9.25,0.738166,0.733176,0.744281,74.25,78.5,3.75
llama3.1,tfidf,25,0.68892,0.690323,0.711523,0.669867,0.719175,0.771158,0.673844,94.5,50.5,18.75,...,0.704682,94.5,50.5,8.25,0.726484,0.719132,0.734319,77.25,83.5,3.5
llama3.1,tfidf,35,0.673498,0.676286,0.7175,0.6354,0.72235,0.78465,0.669429,98.75,47.5,16.25,...,0.695999,98.75,47.5,7.25,0.679478,0.707649,0.653965,101.25,78.25,3.75
llama3.1:70b,rand,0,0.725325,0.726048,0.684567,0.777178,0.771081,0.763302,0.779171,46.75,54.0,30.0,...,0.801547,46.75,54.0,22.25,0.744471,0.664101,0.848518,44.75,129.0,1.5


In [45]:
results

{'strict': {'overall': {'f1_macro': {'llama3.1': {'rand': {0: [0.7542972816657028,
       0.746309369630175,
       0.7223565048704834,
       0.7297473433358461],
      5: [0.7547052053824364,
       0.7200503978322028,
       0.7431342298363575,
       0.7231579041320249],
      15: [0.727951126915688,
       0.7385512994468219,
       0.7351003756312346,
       0.7405943604165084],
      25: [0.6992488216810683,
       0.7087946771304274,
       0.6718512647777557,
       0.6809617658651952],
      35: [0.6212442666343376,
       0.6221243153131697,
       0.6363492383448037,
       0.6445937158879935]},
     'tfidf': {5: [0.7148850237769526,
       0.7166666666666667,
       0.699407114624506,
       0.7530699088145897],
      15: [0.7095959595959596,
       0.7054261121795882,
       0.6846145288686172,
       0.704587869362364],
      25: [0.6797750921248642,
       0.700946369093919,
       0.6746543526968687,
       0.7003048311901224],
      35: [0.6910242982823629,
       0.6

# Precision

In [46]:
_results = results_to_dataframe(results, "precision", "mean").reset_index()
_results[(_results.Schema == "strict") & (_results.k == 0)].drop(["Schema", "k"], axis=1)


TypeError: results_to_dataframe() takes from 1 to 2 positional arguments but 3 were given

# Recall

In [None]:
_results = results_to_dataframe(results, "recall", "mean").reset_index()
_results[(_results.Schema == "strict") & (_results.k == 0)].drop(["Schema", "k"], axis=1)


Unnamed: 0,Entity Type,Model,Value
0,Artist,llama3.1,0.756232
5,Artist,llama3.1-70b,0.779171
6,WoA,llama3.1,0.716194
11,WoA,llama3.1-70b,0.775184


# F1

In [None]:
results_to_dataframe(results, "f1_macro", "mean")


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Value
Schema,Entity Type,Model,k,Unnamed: 4_level_1
strict,overall,llama3.1,0,0.736789
strict,overall,llama3.1,5,0.736789
strict,overall,llama3.1,15,0.736789
strict,overall,llama3.1,25,0.736789
strict,overall,llama3.1,35,0.736789
strict,overall,llama3.1-70b,0,0.725325
exact,overall,llama3.1,0,0.751727
exact,overall,llama3.1,5,0.751727
exact,overall,llama3.1,15,0.751727
exact,overall,llama3.1,25,0.751727


In [None]:
results_to_dataframe(results, "f1", "mean")


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Value
Schema,Entity Type,Model,k,Unnamed: 4_level_1
strict,Artist,llama3.1-70b,0,0.771081
strict,Artist,llama3.1-8b,0,0.763753
strict,WoA,llama3.1-70b,0,0.67957
strict,WoA,llama3.1-8b,0,0.709825
exact,Artist,llama3.1-70b,0,0.833947
exact,Artist,llama3.1-8b,0,0.784851
exact,WoA,llama3.1-70b,0,0.683969
exact,WoA,llama3.1-8b,0,0.718603
ent_type,Artist,llama3.1-70b,0,0.793239
ent_type,Artist,llama3.1-8b,0,0.788424
