1. Read the directory of experiments to compare
2. Compute ROUGE for each experiment
3. Keep the experiment with the best score
4. Print the best experiments in a table

In [31]:
from rouge import Rouge

# Module-level scorer instance reused by every rougel_score() call.
rouge = Rouge(exclusive=False)

def rougel_score(prediction, ground_truth):
    """Return the (ROUGE-L F, ROUGE-1 F) pair for one prediction/reference.

    The strings are passed to the scorer as-is (no normalization). If the
    scorer raises ValueError (e.g. "Hypothesis is empty."), both scores are
    reported as 0.0.
    """
    try:
        averaged = rouge.get_scores(prediction, ground_truth, avg=True)
    except ValueError:
        # Raised by the scorer for inputs such as an empty hypothesis.
        return 0.0, 0.0

    rouge_l_f = averaged["rouge-l"]["f"]
    rouge_1_f = averaged["rouge-1"]["f"]
    return rouge_l_f, rouge_1_f

import datasets
# ROUGE metric object loaded through the `datasets` package; get_best_params
# calls its compute() once per example.
# NOTE(review): `datasets.load_metric` is reportedly deprecated in newer
# `datasets` releases in favour of the separate `evaluate` package — confirm
# against the pinned version before upgrading.
hf_rouge = datasets.load_metric('rouge')

In [2]:
# Gold Data

import json
def _load_jsonl_by_id(path):
    """Read a JSON-lines file and return its records keyed by their 'id' field.

    Each non-blank line must be a JSON object containing an 'id' key. Blank
    lines (e.g. a trailing newline at the end of the file) are skipped rather
    than crashing json.loads.
    """
    records = {}
    with open(path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of materialising it with readlines().
        for line in f:
            if not line.strip():
                continue
            data = json.loads(line)
            records[data['id']] = data
    return records


# Gold records, keyed by example id, for the two evaluation samples.
ELI5 = _load_jsonl_by_id("/dccstor/srosent2/generative/eli5-sample/5-ELI5-train-examples-for-evaluation.jsonl")
NQ = _load_jsonl_by_id("/dccstor/srosent2/generative/appen/NQ_sample.json")

In [29]:
import glob
import pandas as pd


def get_best_params(baseline_dir, ref):
    """Score every predictions file matching a glob and return the best one.

    Args:
        baseline_dir: Glob pattern (despite the name) selecting JSON-lines
            prediction files. Each record must provide "id" and "text".
        ref: Mapping from example id to a gold record; the reference answer
            is read from ``ref[id]['output'][0]['answer']``.

    Returns:
        A ``(best_rouge, best_name, best_df)`` tuple: the mean per-example
        ``rougeLsum`` F-measure of the winning file, that file's path, and its
        DataFrame with added 'hf_rouge' and 'kilt_rouge' columns. When no file
        matches (or every matching file is empty) the result is
        ``(0, "", None)``.

    Raises:
        KeyError: if a prediction id is missing from ``ref``.
    """
    best_rouge = 0
    best_df = None
    best_name = ""

    # glob returns files in arbitrary order; sorting makes tie-breaking (the
    # first file with the top score wins) reproducible across runs.
    for baseline_file in sorted(glob.glob(baseline_dir)):
        baseline_df = pd.read_json(baseline_file, lines=True, dtype=str)
        if baseline_df.empty:
            # Nothing to score; its mean would be NaN and could never win.
            continue

        hf_scores = []
        kilt_scores = []
        for example_id, text in zip(baseline_df["id"], baseline_df["text"]):
            # Look the reference up once and share it between both metrics.
            answer = ref[example_id]['output'][0]['answer']
            hf_result = hf_rouge.compute(predictions=[text], references=[answer])
            kilt_rouge_score, _ = rougel_score(text, answer)
            hf_scores.append(hf_result["rougeLsum"].mid.fmeasure)
            kilt_scores.append(kilt_rouge_score)

        # Assign whole columns at once instead of per-cell .loc writes.
        baseline_df['hf_rouge'] = hf_scores
        baseline_df['kilt_rouge'] = kilt_scores

        score = baseline_df['hf_rouge'].mean()
        # `best_df is None` lets the first scored file win even when its mean
        # is 0, so callers get a usable DataFrame whenever any file was scored.
        if best_df is None or score > best_rouge:
            best_rouge = score
            best_df = baseline_df
            best_name = baseline_file

    return best_rouge, best_name, best_df

In [22]:

def get_rouge_table(dataset, ref):
    """Print the best configuration per LLM and collect its per-example rows.

    For each model in a fixed list, the predictions files under
    ``<baseline root>/<dataset>/<llm>/*/predictions*.json`` are scored with
    get_best_params and one pipe-separated summary line is printed. The
    hyper-parameters are parsed from the name of the directory that holds the
    winning file, split on "-":

        params[1]  "<label>_<passages>"
        params[2]  n-shot (only its first character is reported)
        params[3]  "<top p>_<top k>_<temperature>"
        params[4]  "<label>_<min length>_<max length>"

    Args:
        dataset: Sub-directory name of the dataset (e.g. "NQ").
        ref: Gold records keyed by example id, forwarded to get_best_params.

    Returns:
        A list of rows ``[llm, id, question, text, rouge, hf_rouge,
        kilt_rouge]`` for every example of each model's best experiment.
        Models for which no predictions could be scored are skipped with a
        message on stderr.
    """
    import sys

    llms = ["google-flan-t5-xxl", "bigscience-bloomz", "bigscience-bloom", "eleutherai-gpt-neox-20b"]

    baseline_dir = "/dccstor/srosent2/generative/baseline_llms/" + dataset
    best_examples = []

    print("experiment|rouge|passages|n-shot|top p|top k|temperature|min length|max length")
    for llm in llms:
        baseline_fnames = baseline_dir + "/" + llm + "/*/predictions*.json"

        best_rouge, best_name, best_df = get_best_params(baseline_fnames, ref)
        if best_df is None:
            # Without this guard the parsing below fails with an opaque
            # "substring not found" / NoneType error.
            print("no scorable predictions matched " + baseline_fnames, file=sys.stderr)
            continue

        # Name of the experiment directory that contains the winning file.
        # (Slicing by len(baseline_dir + llm + "/") was one character short
        # because it omitted the "/" between the directory and the model.)
        experiment = best_name.rsplit("/", 2)[-2]
        params = experiment.split("-")

        pktemp = params[3].split("_")
        minmax = params[4].split("_")
        # NOTE(review): params[2][0] keeps only the first character of the
        # n-shot field, so a multi-digit shot count would be truncated —
        # confirm against the directory naming scheme.
        fields = [
            llm,
            str(best_rouge),
            params[1].split("_")[1],
            params[2][0],
            pktemp[0],
            pktemp[1],
            pktemp[2],
            minmax[1],
            minmax[2],
        ]
        print("|".join(fields))

        for _, row in best_df.iterrows():
            best_examples.append([
                llm,
                str(row['id']),
                row['question'],
                row['text'],
                str(row['rouge']),
                str(row['hf_rouge']),
                str(row['kilt_rouge']),
            ])
    return best_examples

In [32]:
import csv

# Write the per-example rows of each dataset's best experiments to
# <baseline root>/<dataset>/best_updated.csv (no header row).
for dataset, gold in (("NQ", NQ), ("ELI5", ELI5)):
    # Build the rows first so a scoring failure does not leave behind a
    # freshly truncated, empty CSV file.
    rows = get_rouge_table(dataset, gold)
    # newline='' is what the csv module requires for files handed to
    # csv.writer; explicit UTF-8 keeps non-ASCII questions/answers writable
    # regardless of the locale.
    with open('/dccstor/srosent2/generative/baseline_llms/' + dataset + '/best_updated.csv', 'w', newline='', encoding='utf-8') as f:
        write = csv.writer(f)
        write.writerows(rows)

experiment|rouge|passages|n-shot|top p|top k|temperature|min length|max length
google-flan-t5-xxl|0.5212973087322894|True|0|0.5|100|1.0|50|1024
bigscience-bloomz|0.44545249836421624|True|0|1.0|100|0.5|50|1024
bigscience-bloom|0.14787958830481615|True|0|0.25|100|1.0|0|1024
eleutherai-gpt-neox-20b|0.06925018935010438|True|0|1.0|100|0.75|100|1024
experiment|rouge|passages|n-shot|top p|top k|temperature|min length|max length
google-flan-t5-xxl|0.19292226292226294|True|0|1.0|100|1.0|50|1024
bigscience-bloomz|0.22076311751313557|True|0|1.0|100|0.5|50|1024
bigscience-bloom|0.1049867513622758|True|0|0.5|100|1.0|50|1024
eleutherai-gpt-neox-20b|0.14495064747492892|True|0|0.75|100|1.0|0|1024


experiment|rouge|passages|n-shot|top p|top k|temperature|min length|max length
google-flan-t5-xxl|0.25508641580641983|True|0|0.25|100|0.5|200|1024
bigscience-bloomz|0.2541450173173494|True|0|1.0|100|0.75|100|1024
bigscience-bloom|0.22344075436693506|True|0|0.75|100|0.5|50|1024
eleutherai-gpt-neox-20b|0.16836206596932884|True|0|0.5|100|1.0|0|1024