In [1]:
import pandas as pd
import sys
sys.path.append("..")
from src.Eval import eval_llm

import os
import io
from contextlib import redirect_stdout


In [14]:
import sys
sys.path.append("..")
from src.Eval import eval_llm

import os
import io
from contextlib import redirect_stdout

def parse_filename(filename):
    """Split a result file name into ``(dataset, k, sampling_method)``.

    The name is split on underscores and interpreted as
    ``<dataset>_<k>shot[.<ext>]`` or ``<dataset>_<k>shot_<sampling>[.<ext>]``.

    Args:
        filename: Base name of a result file (no directory part).

    Returns:
        A tuple ``(dataset, k, sampling_method)`` where ``dataset`` is the text
        before the first underscore, ``k`` is the integer left after removing
        ``"shot"`` from the second token, and ``sampling_method`` is the third
        token without its extension. ``sampling_method`` is ``"random"`` when
        the name does not consist of exactly three tokens.

    Raises:
        IndexError: If the name contains no underscore.
        ValueError: If the second token is not an integer once ``"shot"`` is removed.
    """
    parts = filename.split("_")
    dataset = parts[0]
    # In two-token names the extension is attached to the shot token, so keep
    # only the text before the first dot.
    k = int(parts[1].split(".")[0].replace("shot", ""))
    if len(parts) == 3:
        # In three-token names the extension is attached to the sampling token
        # instead; strip it the same way so callers get "tfidf", not "tfidf.json".
        sampling_method = parts[2].split(".")[0]
    else:
        sampling_method = "random"
    return dataset, k, sampling_method

def silent_eval_llm(fpath):
    """Run ``eval_llm(fpath)`` and return its result, discarding anything it prints.

    Only ``sys.stdout`` is captured, and only for the duration of the call.
    """
    sink = io.StringIO()
    try:
        with redirect_stdout(sink):
            # Delegate to the real evaluator; its printed text goes to the sink.
            return eval_llm(fpath)
    finally:
        sink.close()
    
def aggregated_results_llm(
    results_dir,
    ent_types=["overall", "Artist", "WoA"],
    eval_schemas=["strict", "exact", "ent_type"],
    metrics=["f1", "f1_macro", "precision", "precision_macro", "recall", "recall_macro", "missed", "spurious", "incorrect"],
    datasets=["dataset1", "dataset2", "dataset3", "dataset4"],
    ks=[0,5,15,25,35,45]
):
    """Collect per-file evaluation scores for every model directory under ``results_dir``.

    Every sub-directory of ``results_dir`` except ``"archive"`` is treated as a
    model. Each regular file inside it is parsed with ``parse_filename``; files
    whose dataset is in ``datasets`` and whose ``k`` is in ``ks`` are evaluated
    with ``silent_eval_llm``. For every requested combination, the value stored
    under the key ``"<ent_type>_<schema>_<metric>"`` is appended to the result.
    Keys absent from the evaluation output are skipped.

    Args:
        results_dir: Directory containing one sub-directory per model.
        ent_types: Entity types to collect.
        eval_schemas: Evaluation schemas to collect.
        metrics: Metric names to collect.
        datasets: Dataset names (first token of the file name) to include.
        ks: Shot counts to include.

    Returns:
        Nested dict ``results[metric][schema][ent_type][model][k]`` mapping to a
        list with one value per evaluated file (empty if nothing matched).

    Note:
        The list defaults are only read, never mutated, so sharing them across
        calls is harmless here.
    """
    # List the model directories once and use the same list for both the
    # result skeleton and the file walk, so stray files in ``results_dir`` never
    # appear as phantom models. Sorting makes the order of collected values
    # independent of the file system's listing order.
    model_names = sorted(
        entry
        for entry in os.listdir(results_dir)
        if entry != "archive" and os.path.isdir(os.path.join(results_dir, entry))
    )

    results = {
        metric: {
            schema: {
                ent_type: {model: {k: [] for k in ks} for model in model_names}
                for ent_type in ent_types
            }
            for schema in eval_schemas
        }
        for metric in metrics
    }

    for model in model_names:
        # The directory entry itself is the model name; re-deriving it by
        # splitting the joined path on os.sep breaks when ``results_dir`` uses a
        # different separator than the platform default.
        model_dir = os.path.join(results_dir, model)

        for filename in sorted(os.listdir(model_dir)):
            fpath = os.path.join(model_dir, filename)
            if not os.path.isfile(fpath):
                continue

            dataset, k, _sampling_method = parse_filename(filename)
            if dataset not in datasets or k not in ks:
                continue

            predictions = silent_eval_llm(fpath)

            for ent_type in ent_types:
                for schema in eval_schemas:
                    for metric in metrics:
                        key = f"{ent_type}_{schema}_{metric}"
                        if key in predictions.keys():
                            results[metric][schema][ent_type][model][k].append(predictions[key])
    return results

# Aggregate the scores of every model directory found under ../output/reddit+shsyt/.
results = aggregated_results_llm("../output/reddit+shsyt/")
# Disabled alternative input directory (presumably the TF-IDF sampling runs — confirm before re-enabling).
#results_tfidf = aggregated_results_llm("../output/tfidf_sampling")



2024-09-03 09:02:52 root INFO: Imported 660 predictions for 660 true examples
2024-09-03 09:02:52 root INFO: Imported 367 predictions for 367 true examples
2024-09-03 09:02:52 root INFO: Imported 660 predictions for 660 true examples
2024-09-03 09:02:52 root INFO: Imported 644 predictions for 644 true examples
2024-09-03 09:02:53 root INFO: Imported 0 predictions for 0 true examples
2024-09-03 09:02:53 root INFO: Imported 660 predictions for 660 true examples
2024-09-03 09:02:53 root INFO: Imported 658 predictions for 658 true examples
2024-09-03 09:02:53 root INFO: Imported 644 predictions for 644 true examples
2024-09-03 09:02:53 root INFO: Imported 660 predictions for 660 true examples
2024-09-03 09:02:54 root INFO: Imported 660 predictions for 660 true examples
2024-09-03 09:02:54 root INFO: Imported 658 predictions for 658 true examples
2024-09-03 09:02:54 root INFO: Imported 660 predictions for 660 true examples
2024-09-03 09:02:54 root INFO: Imported 658 predictions for 658 true

In [15]:
# Show the raw nested dict: results[metric][schema][ent_type][model][k] -> list of per-file values.
results

{'f1': {'strict': {'overall': {'llama3.1': {0: [],
     5: [],
     15: [],
     25: [],
     35: [],
     45: []},
    'llama3.1-70b': {0: [], 5: [], 15: [], 25: [], 35: [], 45: []}},
   'Artist': {'llama3.1': {0: [0.7401960784313725,
      0.8048452220726784,
      0.7416413373860182,
      0.7683284457478006],
     5: [0.7683284457478006, 0.8048452220726784],
     15: [0, 0.8048452220726784],
     25: [0.8048452220726784],
     35: [0.8048452220726784],
     45: []},
    'llama3.1-70b': {0: [0.7787878787878788,
      0.783641160949868,
      0.7507418397626113,
      0.7711511789181692],
     5: [],
     15: [],
     25: [],
     35: [],
     45: []}},
   'WoA': {'llama3.1': {0: [0.7610208816705337,
      0.6885758998435054,
      0.7018739352640546,
      0.6878306878306879],
     5: [0.6878306878306879, 0.6885758998435054],
     15: [0, 0.6885758998435054],
     25: [0.6885758998435054],
     35: [0.6885758998435054],
     45: []},
    'llama3.1-70b': {0: [0.7125171939477303,
    

In [16]:
import pandas as pd
import numpy as np

def results_to_dataframe(aggregated_results, metric, agg_func='mean'):
    """Flatten one metric of the aggregated results into an indexed DataFrame.

    Args:
        aggregated_results: Nested dict shaped
            ``aggregated_results[metric][schema][ent_type][model][k] -> list``.
        metric: Key of the metric to extract.
        agg_func: ``'mean'`` or ``'sum'``; how each list of values is reduced.

    Returns:
        DataFrame with a single ``Value`` column, indexed by
        ``(Schema, Entity Type, Model, k)``. Combinations whose value list is
        empty are omitted.

    Raises:
        ValueError: If ``agg_func`` is neither ``'mean'`` nor ``'sum'``.
        KeyError: If ``metric`` is not present in ``aggregated_results``.
    """
    aggregators = {'mean': np.mean, 'sum': np.sum}
    # Validate up front: checking only inside the loop lets an invalid value
    # pass silently whenever every value list happens to be empty.
    if agg_func not in aggregators:
        raise ValueError("Invalid aggregation function. Use 'mean' or 'sum'.")
    aggregate = aggregators[agg_func]

    rows = [
        [schema, ent_type, model, k_shot, aggregate(values)]
        for schema, by_ent_type in aggregated_results[metric].items()
        for ent_type, by_model in by_ent_type.items()
        for model, by_k in by_model.items()
        for k_shot, values in by_k.items()
        if values  # skip combinations with no evaluated files
    ]

    df = pd.DataFrame(rows, columns=['Schema', 'Entity Type', 'Model', 'k', 'Value'])

    # Map raw model identifiers to display names. regex=False makes the match
    # literal regardless of the pandas version's default, so the "." in
    # "gpt-3.5-turbo-0125" is never treated as a wildcard.
    display_names = (
        ("mistral", "Mistral-7B"),
        ("mixtral", "Mixtral-8x22B"),
        ("gpt-3.5-turbo-0125", "GPT-3.5-Turbo"),
    )
    for raw_name, display_name in display_names:
        df["Model"] = df["Model"].str.replace(raw_name, display_name, regex=False)

    return df.set_index(["Schema", "Entity Type", "Model", "k"])



# Precision

In [17]:
# Mean precision per entity type and model, keeping only rows with Schema == "strict" and k == 0.
_results = results_to_dataframe(results, "precision", "mean").reset_index()
_results.loc[(_results["Schema"] == "strict") & (_results["k"] == 0)].drop(columns=["Schema", "k"])


Unnamed: 0,Entity Type,Model,Value
0,Artist,llama3.1,0.771562
5,Artist,llama3.1-70b,0.763302
6,WoA,llama3.1,0.704237
11,WoA,llama3.1-70b,0.605833


# Recall

In [18]:
# Mean recall per entity type and model, keeping only rows with Schema == "strict" and k == 0.
_results = results_to_dataframe(results, "recall", "mean").reset_index()
_results.loc[(_results["Schema"] == "strict") & (_results["k"] == 0)].drop(columns=["Schema", "k"])


Unnamed: 0,Entity Type,Model,Value
0,Artist,llama3.1,0.756232
5,Artist,llama3.1-70b,0.779171
6,WoA,llama3.1,0.716194
11,WoA,llama3.1-70b,0.775184


# F1

In [19]:
# Mean macro-F1 for every (schema, entity type, model, k) combination that has at least one value.
results_to_dataframe(results, "f1_macro", "mean")


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Value
Schema,Entity Type,Model,k,Unnamed: 4_level_1
strict,overall,llama3.1,0,0.736789
strict,overall,llama3.1,5,0.737395
strict,overall,llama3.1,15,0.373355
strict,overall,llama3.1,25,0.746711
strict,overall,llama3.1,35,0.746711
strict,overall,llama3.1-70b,0,0.725325
exact,overall,llama3.1,0,0.751727
exact,overall,llama3.1,5,0.754933
exact,overall,llama3.1,15,0.38165
exact,overall,llama3.1,25,0.763299


In [8]:
# Mean F1 for every (schema, entity type, model, k) combination that has at least one value.
# NOTE(review): this cell's saved execution count (8) is lower than the cells before it, and its
# saved output lists a model named "llama3.1-8b" that the `results` dump does not contain —
# the output looks stale; refresh it with Restart Kernel -> Run All.
results_to_dataframe(results, "f1", "mean")


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Value
Schema,Entity Type,Model,k,Unnamed: 4_level_1
strict,Artist,llama3.1-70b,0,0.771081
strict,Artist,llama3.1-8b,0,0.763753
strict,WoA,llama3.1-70b,0,0.67957
strict,WoA,llama3.1-8b,0,0.709825
exact,Artist,llama3.1-70b,0,0.833947
exact,Artist,llama3.1-8b,0,0.784851
exact,WoA,llama3.1-70b,0,0.683969
exact,WoA,llama3.1-8b,0,0.718603
ent_type,Artist,llama3.1-70b,0,0.793239
ent_type,Artist,llama3.1-8b,0,0.788424
