## Notebook to compute the average of the test results over all folds

**!!! Please first run 'test_all.sh' !!!**

In [None]:
import re
import os
import json

def get_metrics(path):
    """Parse accuracy and F1 scores out of a single evaluation log file.

    The log is expected to contain exactly two ``Accuracy: <float>`` entries
    and exactly two ``F1: <float>`` entries. The first accuracy is reported as
    ``l_acc`` and the second as ``q_acc``. Each ``F1`` line must be followed by
    a tab-indented breakdown line:

    * ``0: <float>, 1: <float>`` -> ``l_f1_0`` / ``l_f1_1``
    * ``all: <float>, most: <float>, half: <float>, few: <float>,
      none: <float>`` -> ``f1_all`` / ``f1_most`` / ``f1_few`` / ``f1_none``

    Args:
        path: Path of the log file to read.

    Returns:
        A dict of floats with the keys ``l_acc``, ``l_f1``, ``l_f1_0``,
        ``l_f1_1``, ``q_f1``, ``q_acc``, ``f1_all``, ``f1_most``, ``f1_few``
        and ``f1_none``. ``l_f1`` is the mean of the two class-wise values and
        ``q_f1`` is the mean of the ``all``/``most``/``few``/``none`` values.

    Raises:
        ValueError: If the log does not contain exactly two accuracy or
            exactly two F1 entries.
        IndexError: If one of the breakdown lines is missing.
    """
    # The decimal point is escaped so that only real decimal numbers match;
    # an unescaped "." would also accept text such as "1/3" or "10 20".
    number = r"\d+\.\d+"

    with open(path, "r") as f:
        text = f.read()

    # Unpacking enforces that exactly two entries of each kind are present.
    l_acc, q_acc = re.findall(rf"Accuracy: ({number})", text)

    # The aggregate F1 values printed in the log are not used: both averages
    # are recomputed from the breakdown lines below. The unpacking is kept
    # only to validate the shape of the log.
    _l_f1_logged, _q_f1_logged = re.findall(rf"F1: ({number})", text)

    # Take both class-wise values from the same match so they are guaranteed
    # to come from the same report block.
    l_f1_0, l_f1_1 = re.findall(
        rf"F1: {number}\n\t0: ({number}), 1: ({number})", text
    )[0]
    l_f1 = (float(l_f1_0) + float(l_f1_1)) / 2

    f1_all, f1_most, _, f1_few, f1_none = re.findall(
        rf"F1: {number}\n\tall: ({number}), most: ({number}), "
        rf"half: ({number}), few: ({number}), none: ({number})",
        text,
    )[0]

    # The "half" value is parsed but deliberately left out of the average,
    # which is taken over the remaining four values.
    q_f1 = sum((float(f1_all), float(f1_most), float(f1_few), float(f1_none))) / 4

    return {
        "l_acc": float(l_acc),
        "l_f1": float(l_f1),
        "l_f1_0": float(l_f1_0),
        "l_f1_1": float(l_f1_1),
        "q_f1": float(q_f1),
        "q_acc": float(q_acc),

        "f1_all": float(f1_all),
        "f1_most": float(f1_most),
        "f1_few": float(f1_few),
        "f1_none": float(f1_none),
    }

def get_filenames(task, model, base_path="./eval/"):
    """Collect the evaluation log files of one task/model combination.

    A file is selected when its name ends with ``.log``, starts with
    ``EVAL-TASK-<task>`` and contains ``model`` as a substring.

    Args:
        task: Task identifier that is interpolated into the file-name prefix.
        model: Substring that must occur in the file name.
        base_path: Directory that is searched (not recursively).

    Returns:
        A list of paths, each one being ``base_path`` joined with a matching
        file name, in the order returned by ``os.listdir``.
    """
    prefix = f"EVAL-TASK-{task}"
    # Join with base_path (not a hard-coded directory) so that the returned
    # paths point at the directory that was actually listed.
    return [
        os.path.join(base_path, name)
        for name in os.listdir(base_path)
        if name.endswith(".log") and name.startswith(prefix) and model in name
    ]

def average_dicts(dicts):
    """Average the numeric values of a list of dictionaries key by key.

    The keys of the result are taken from the first dictionary. For every key
    the ``int``/``float`` values found across all dictionaries are summed and
    divided by the number of dictionaries. Values of any other type are
    skipped, i.e. they contribute nothing to the sum.

    Args:
        dicts: List of dictionaries that share the same keys.

    Returns:
        A dictionary mapping each key to its average, or an empty dictionary
        when ``dicts`` is empty.

    Raises:
        KeyError: If a later dictionary holds a numeric value under a key that
            is missing from the first dictionary.
    """
    # Nothing to average: avoid indexing into an empty list and dividing by 0.
    if not dicts:
        return {}

    n = len(dicts)
    sum_dict = dict.fromkeys(dicts[0], 0)

    for d in dicts:
        for key, value in d.items():
            # Only numbers take part in the sum.
            if isinstance(value, (int, float)):
                sum_dict[key] += value

    return {key: total / n for key, total in sum_dict.items()}


def main(task):
    """Average the per-fold test metrics of every model for one task.

    For each model the matching log files are located with ``get_filenames``,
    parsed with ``get_metrics`` and averaged with ``average_dicts``. When
    ``task == 1`` the ``q_*`` and ``f1_*`` entries are removed from each fold
    before averaging.

    Args:
        task: Task identifier passed on to ``get_filenames``.

    Returns:
        A dict mapping each model name to its averaged metrics plus a
        ``NUM_FOLDS`` entry holding the number of log files that were found.
    """
    model_names = (
        "base-base",
        "OpenRLHF-Llama-3-8b-sft-mixture",
        "instruct-instruct",
        "OpenRLHF-Llama-3-8b-rlhf-100k",
    )
    # Entries that are discarded from every fold when task == 1.
    dropped_for_task_1 = (
        "q_acc",
        "q_f1",
        "f1_all",
        "f1_most",
        "f1_few",
        "f1_none",
    )

    results = {}
    for name in model_names:
        log_paths = get_filenames(task=task, model=name)

        per_fold = []
        for log_path in log_paths:
            fold_metrics = get_metrics(log_path)
            if task == 1:
                for key in dropped_for_task_1:
                    fold_metrics.pop(key, None)
            per_fold.append(fold_metrics)

        averaged = average_dicts(per_fold)
        averaged["NUM_FOLDS"] = len(log_paths)
        results[name] = averaged

    return results


# Compute the fold-averaged metrics of every model for both tasks; the
# results are printed in the cells below.
task_1 = main(task=1)
task_2 = main(task=2)

---

### Evaluation of Task 1

In [8]:
# Show the averaged task 1 metrics as indented JSON.
print(json.dumps(task_1, indent=4))

{
    "base-base": {
        "l_acc": 0.7741247218824493,
        "l_f1": 0.7727582123378466,
        "l_f1_0": 0.7893992498309045,
        "l_f1_1": 0.7561171748447887,
        "NUM_FOLDS": 3
    },
    "OpenRLHF-Llama-3-8b-sft-mixture": {
        "l_acc": 0.7705968856942592,
        "l_f1": 0.7698444172838114,
        "l_f1_0": 0.7812458316659997,
        "l_f1_1": 0.7584430029016228,
        "NUM_FOLDS": 3
    },
    "instruct-instruct": {
        "l_acc": 0.7214571603529975,
        "l_f1": 0.7012790159433541,
        "l_f1_0": 0.7888364519512061,
        "l_f1_1": 0.6137215799355019,
        "NUM_FOLDS": 3
    },
    "OpenRLHF-Llama-3-8b-rlhf-100k": {
        "l_acc": 0.6939149330609041,
        "l_f1": 0.6554447694469371,
        "l_f1_0": 0.7810096153846153,
        "l_f1_1": 0.5298799235092587,
        "NUM_FOLDS": 3
    }
}


---

### Evaluation of Task 2

In [9]:
# Show the averaged task 2 metrics as indented JSON.
print(json.dumps(task_2, indent=4))

{
    "base-base": {
        "l_acc": 0.6366850983967297,
        "l_f1": 0.5601750485735157,
        "l_f1_0": 0.7433908494981288,
        "l_f1_1": 0.3769592476489028,
        "q_f1": 0.23643133247976209,
        "q_acc": 0.374741199879667,
        "f1_all": 0.06171735241502683,
        "f1_most": 0.2110857046070461,
        "f1_few": 0.08100645637336285,
        "f1_none": 0.5919158165236126,
        "NUM_FOLDS": 3
    },
    "OpenRLHF-Llama-3-8b-sft-mixture": {
        "l_acc": 0.6730218722178701,
        "l_f1": 0.6276483213627975,
        "l_f1_0": 0.7571155490080489,
        "l_f1_1": 0.498181093717546,
        "q_f1": 0.27182694973698834,
        "q_acc": 0.39555775453577846,
        "f1_all": 0.022473675939022473,
        "f1_most": 0.2869766754204306,
        "f1_few": 0.15869023784644434,
        "f1_none": 0.6191672097420561,
        "NUM_FOLDS": 3
    },
    "instruct-instruct": {
        "l_acc": 0.6695028930333558,
        "l_f1": 0.6544961877020516,
        "l_f1_0": 0.