1. Read the directory of experiments to compare
2. Compute ROUGE for each experiment
3. Keep the best-scoring experiment for each model
4. Print the best results in a table

In [9]:
from rouge import Rouge

# Scorer from the `rouge` package, bound to a dedicated name so that a later
# reassignment of the generic name `rouge` cannot change what `rougel_score` calls.
kilt_rouge_scorer = Rouge()
# Alias kept so existing code that refers to `rouge` keeps working.
rouge = kilt_rouge_scorer

def rougel_score(prediction, ground_truth):
    """Score one prediction against one reference with the `rouge` package.

    Args:
        prediction: Hypothesis text, passed to ``get_scores`` unchanged.
        ground_truth: Reference text, passed to ``get_scores`` unchanged.

    Returns:
        A ``(rouge_l_f, rouge_1_f)`` tuple taken from the ``"f"`` entries of the
        ``"rouge-l"`` and ``"rouge-1"`` results. ``(0.0, 0.0)`` is returned when
        the scorer raises ``ValueError`` (e.g. an empty hypothesis).
    """
    # no normalization
    try:
        scores = kilt_rouge_scorer.get_scores(prediction, ground_truth, avg=True)
    except ValueError:  # "Hypothesis is empty."
        return 0.0, 0.0
    return scores["rouge-l"]["f"], scores["rouge-1"]["f"]

import datasets
# ROUGE metric object loaded through the Hugging Face `datasets` package.
# NOTE(review): `datasets.load_metric` is deprecated in newer `datasets` releases
# (the separate `evaluate` package replaces it) — confirm the installed version still provides it.
hf_rouge = datasets.load_metric('rouge')

In [10]:
# Gold Data

import json


def _load_jsonl_by_id(path):
    """Read a JSON-lines file and index its records by their ``id`` field.

    Args:
        path: Path of a file holding one JSON object per line.

    Returns:
        Dict mapping each record's ``id`` value to the full decoded record.
        If an id occurs more than once, the last record wins.
    """
    records = {}
    with open(path, 'r') as f:
        # Stream the file line by line instead of materialising it with readlines().
        for line in f:
            # A blank line (e.g. a stray newline at the end) is not a record.
            if not line.strip():
                continue
            record = json.loads(line)
            records[record['id']] = record
    return records


# Gold examples for each dataset, keyed by example id.
ELI5 = _load_jsonl_by_id("/dccstor/srosent2/generative/eli5-sample/5-ELI5-train-examples-for-evaluation.jsonl")
NQ = _load_jsonl_by_id("/dccstor/srosent2/generative/appen/NQ_sample.json")

In [14]:
import glob
import pandas as pd

from primeqa.mrc.metrics.rouge.rouge import ROUGE
# NOTE(review): this rebinds the module-level name `rouge`, which an earlier cell
# assigned to a `rouge.Rouge()` instance; from here on `rouge` is the PrimeQA ROUGE object.
rouge = ROUGE()

def get_best_params(baseline_dir, ref):
    """Pick the prediction file with the highest mean score among those matching a glob.

    Args:
        baseline_dir: Glob pattern (despite the name) handed to ``glob.glob``.
            Every match is read as a JSON-lines file and must provide ``id``
            and ``text`` columns.
        ref: Mapping from example id to a gold record; the reference answer is
            read from ``ref[id]['output'][0]['answer']``.

    Returns:
        ``(best_rouge, best_name, best_df)`` where ``best_rouge`` is the mean of
        the ``hf_rouge`` column of the winning file, ``best_name`` its path and
        ``best_df`` its DataFrame with ``hf_rouge`` and ``kilt_rouge`` columns
        added. ``(0, "", None)`` is returned when no file could be scored.

    Raises:
        KeyError: If a prediction id is missing from ``ref``.
    """
    best_rouge = 0
    best_df = None
    best_name = ""

    # glob order depends on the filesystem; sort so ties resolve identically on every run.
    for baseline_file in sorted(glob.glob(baseline_dir)):
        baseline_df = pd.read_json(baseline_file, lines=True, dtype=str)
        if baseline_df.empty:
            # Nothing to score; an empty file can never be the best one.
            continue

        hf_scores = []
        kilt_scores = []
        for example_id, text in zip(baseline_df["id"], baseline_df["text"]):
            # The scorer's pair is unpacked as (hf-style score, kilt-style score).
            hf_rouge_score, kilt_rouge_score = rouge._metric_max_over_ground_truths(text, [ref[example_id]['output'][0]['answer']])
            hf_scores.append(hf_rouge_score)
            kilt_scores.append(kilt_rouge_score)

        # Assign whole columns at once instead of writing cell by cell through .loc.
        baseline_df['hf_rouge'] = hf_scores
        baseline_df['kilt_rouge'] = kilt_scores

        score = baseline_df['hf_rouge'].mean()
        if pd.isna(score):
            continue
        # `best_df is None` lets the first scored file win even when its mean is 0,
        # so callers always get a file back as long as at least one was scored.
        if best_df is None or score > best_rouge:
            best_rouge = score
            best_df = baseline_df
            best_name = baseline_file

    return best_rouge, best_name, best_df

In [3]:

def get_rouge_table(dataset, ref, llms=None, baseline_root="/dccstor/srosent2/generative/baseline_llms/"):
    """Print the best configuration per model and collect its per-example rows.

    For every model, ``get_best_params`` is run over
    ``<baseline_root><dataset>/<llm>/*/predictions*.json``. The hyper-parameters
    printed in the table are parsed from the name of the directory holding the
    winning predictions file.

    Args:
        dataset: Dataset sub-directory name under ``baseline_root``.
        ref: Gold records keyed by example id, forwarded to ``get_best_params``.
        llms: Model directory names to compare; defaults to the four baseline
            models used in this notebook.
        baseline_root: Directory prefix holding one sub-directory per dataset.

    Returns:
        List of ``[llm, id, question, text, rouge, hf_rouge, kilt_rouge]`` rows
        (scores and id as strings) for the winning file of every model.
    """
    if llms is None:
        llms = ["google-flan-t5-xxl", "bigscience-bloomz", "bigscience-bloom", "eleutherai-gpt-neox-20b"]

    baseline_dir = baseline_root + dataset
    best_examples = []

    print("experiment|rouge|passages|n-shot|top p|top k|temperature|min length|max length")
    for llm in llms:
        baseline_fnames = baseline_dir + "/" + llm + "/*/predictions*.json"

        best_rouge, best_name, best_df = get_best_params(baseline_fnames, ref)

        if best_df is None:
            # No prediction file was found/scored for this model; report it and
            # move on instead of failing while parsing an empty path.
            print(llm + "|no predictions found")
            continue

        # The experiment directory is the path component right above the
        # predictions file. Taking it from the path itself avoids computing a
        # prefix length by hand.
        experiment_dir = best_name.rsplit("/", 2)[1]
        # Dash-separated fields; only fields 1-4 are read below.
        params = experiment_dir.split("-")

        print_output = llm + "|" + str(best_rouge) + "|"
        print_output += params[1].split("_")[1] + "|"  # passages flag
        print_output += params[2][0] + "|"  # n-shot: first character of the field
        pktemp = params[3].split("_")  # top p, top k, temperature
        print_output += pktemp[0] + "|" + pktemp[1] + "|" + pktemp[2] + "|"
        minmax = params[4].split("_")  # element 0 is skipped; 1 and 2 are min/max length
        print_output += minmax[1] + "|" + minmax[2]
        print(print_output)
        for i, row in best_df.iterrows():
            best_examples.append([llm, str(row['id']), row['question'], row['text'], str(row['rouge']), str(row['hf_rouge']), str(row['kilt_rouge'])])
    return best_examples

In [7]:
import csv

# Optional CSV export of the per-example rows (currently disabled):
# with open('/dccstor/srosent2/generative/baseline_llms/NQ/best_updated.csv', 'w') as f:
#     write = csv.writer(f)
#     write.writerows(get_rouge_table("NQ", NQ))
# with open('/dccstor/srosent2/generative/baseline_llms/ELI5/best_updated.csv', 'w') as f:
#     write = csv.writer(f)
#     write.writerows(get_rouge_table("ELI5", ELI5))

# For each dataset: print the best-configuration table, then the returned per-example rows.
for dataset_name, gold_records in (("NQ", NQ), ("ELI5", ELI5)):
    print(get_rouge_table(dataset_name, gold_records))

experiment|rouge|passages|n-shot|top p|top k|temperature|min length|max length
google-flan-t5-xxl|0.5377475251825059|True|0|0.5|100|1.0|50|1024
bigscience-bloomz|0.4761486816833577|True|0|0.75|100|0.75|50|1024
bigscience-bloom|0.15358585440188438|True|0|0.25|100|1.0|0|1024
eleutherai-gpt-neox-20b|0.07525479898082638|True|0|1.0|100|1.0|50|1024
[['google-flan-t5-xxl', '8849568873685505619', 'who has sold more records than anyone in history', "The Beatles are considered the highest - selling band . Elvis Presley is considered the highest - selling individual artist based on sales claims and Rihanna is the highest - selling individual artist based on certified units . ''", '0.8799999950320001', '0.8571428571428571', '0.8799999950320001'], ['google-flan-t5-xxl', '-7702508458750578906', "the most common form of user interface used on pc's today is called a", 'graphical user interfaces and menu - driven interactions . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

In [21]:
# Hard-coded model answers, keyed by example id: one dict per system, five answers each.
# The keys are the ids that this cell's scoring loop looks up in `ELI5`.
# NOTE(review): presumably pasted from the ChatGPT and davinci interfaces — confirm provenance.
ChatGPT = {}
davincci = {}

ChatGPT["4ryd2h_0"] = 'The text does not provide an explanation for why smoking preserves food in the section that is mentioned. The mentioned section, "Preservation," explains that smoking is useful for preserving food due to its antimicrobial and antioxidant properties, but it is typically combined with other methods like salt-curing or drying. The section "Genetics" is unrelated to the topic of smoking and food preservation.'
ChatGPT["wyli3_1"] = "The text does not provide information on how fruit flies 'magically' appear."
ChatGPT["5rq6pa_0"] = 'The text explains that some people are "mouth breathers" while others are "nose breathers" due to various causes, including nasal obstruction or partial blockage, as well as habitual or anatomic factors. In most cases, mouth breathing is an adaptation to nasal obstruction. While breathing through both the nose and mouth during exercise is normal, mouth breathing during rest is considered abnormal. The text does not mention anything about the term "mouth-breather" being utilized as an insult to imply low intelligence.'
ChatGPT["5v4xi7_0"] = "The tray in a microwave rotates to reduce the problem of uneven heating in microwaved food by ensuring that the microwave energy is distributed to different parts of the food. This is achieved through a stirrer or a turntable/carousel that turns the food. Turntables may still leave spots, such as the center of the oven, which receive uneven energy distribution."
ChatGPT["64px9o_0"] = "A flight can be overbooked when an airline sells more tickets than the number of available seats on the aircraft. Airlines may do this because they want to ensure that their planes are filled to capacity even if some passengers don't show up. They can use computer algorithms to predict how many passengers will show up, but sometimes they overestimate, leading to an overbooking situation. Overbooking can also happen when airlines intentionally sell more tickets than available seats to increase revenue, but this is not a common practice. In some cases, airlines may cheat passengers by tagging confirmed tickets as no-shows and pocketing the money made by overbooking without offering an alternative flight for overbooked customers."
# Some davinci answers start with "\n\n"; they are stored exactly as given.
davincci["4ryd2h_0"] = "\n\nSmoking helps seal the outer layer of the food being cured, making it more difficult for bacteria to enter. It can be done in combination with other curing methods such as salting. Smoke deposits a number of pyrolysis products onto the food, including the phenols syringol, guaiacol and catechol. Salt accelerates the drying process using osmosis and also inhibits the growth of several common strains of bacteria. The combination of smoke and salt helps to preserve food by making it more difficult for bacteria to enter, drying it out to limit bacterial growth, and providing antimicrobial agents that aid in preservation."
davincci["wyli3_1"] = '\n\nFruit flies can "magically" appear when adult flies lay eggs under the skin of the fruit, which then hatch within a few days and grow inside the fruit. The larvae eventually emerge from the fruit, dropping to the ground and pupating in the soil to become mature adult flies.'
davincci["5rq6pa_0"] = 'between the onset of nasal obstruction and the onset of oral breathing, and this can be used to distinguish between nasal and oral breathing.\n\nMouth breathing is caused by nasal obstruction, which can be partial or complete. It can be an adaptation to nasal obstruction, or it can be a habit. Human infants are usually considered obligate nasal breathers, but they can breathe through their mouths if their noses are blocked. People who are able to breathe through their noses during rest will usually breathe through both their nose and mouth during exercise to increase air intake.'
davincci["5v4xi7_0"] = "\n\nThe tray in a microwave rotates to evenly distribute the microwave energy and reduce uneven heating in the food."
davincci["64px9o_0"] = "A flight can be overbooked by predicting the number of passengers who will actually fly after making a reservation and by stimulating pricing for low demand flights. Airlines also sometimes overbook flights if they have given incorrect instructions to travel agents, causing paid tickets to be invalid."

# Score every stored answer against the first gold ELI5 answer and print one
# tab-separated line per answer (id, first score, second score), ChatGPT first.
# The loop variables are deliberately not called `id` / `hf_rouge`: `id` would
# shadow the builtin, and `hf_rouge` is the metric object loaded at the top of
# the notebook, which assigning a float here would overwrite.
for system_answers in (ChatGPT, davincci):
    for example_id, answer in system_answers.items():
        hf_score, kilt_score = rouge._metric_max_over_ground_truths(answer, [ELI5[example_id]['output'][0]['answer']])
        print(example_id + "\t" + str(hf_score) + "\t" + str(kilt_score))

4ryd2h_0	0.40366972477064217	0.39999999503391004
wyli3_1	0.25	0.1999999952000001
5rq6pa_0	0.21311475409836067	0.29166666197916674
5v4xi7_0	0.2608695652173913	0.12121211658402223
64px9o_0	0.24203821656050956	0.19999999593888898
4ryd2h_0	0.23129251700680273	0.2499999954607781
wyli3_1	0.2898550724637681	0.32142856706632655
5rq6pa_0	0.2753623188405797	0.29166666197916674
5v4xi7_0	0.28	0.24999999520000007
64px9o_0	0.18604651162790697	0.12987012493843836


In [None]:
# Second batch of hard-coded model answers, keyed by example id (five per system).
# Rebinding `ChatGPT` / `davincci` here discards the dictionaries built in the previous cell.
# NOTE(review): these numeric keys look like NQ example ids rather than ELI5 ids — confirm.
ChatGPT = {}
davincci = {}

ChatGPT['8849568873685505619']='As per the text, as of 2017, The Beatles are considered the highest-selling band based on both sales claims and certified units. However, Elvis Presley is considered the highest-selling individual artist based on sales claims, and Rihanna is the highest-selling individual artist based on certified units. Therefore, the answer to the question "who has sold more records than anyone in history" is not mentioned explicitly in the given text.'
ChatGPT['-7702508458750578906']='What is the most common form of user interface used on PCs today? Answer: The most common form of user interface used on PCs today is a graphical user interface (GUI) and menu-driven interactions, as many end users rely on them. However, software developers, system administrators, and advanced users still heavily rely on command-line interfaces for efficiency and accessing features not available through a GUI.'
ChatGPT['-7652766263809313045']='The text suggests that the idea for Walmart came from Sam Walton\'s decision to achieve higher sales volumes by keeping sales prices lower than his competitors by reducing his profit margin. He was also inspired by the successes of other discount department store chains.'
ChatGPT['-1215745899771320061']='The text describes how a counterfeit banknote detection pen works. When the pen is used to mark genuine banknotes, the mark is yellowish or colourless because genuine banknotes are printed on paper based on cotton fibers and do not contain the starches that are reactive with iodine. This property of genuine banknotes allows the pen to detect counterfeit banknotes which may contain such starches, and therefore produce a different mark when marked by the pen. The text mentions that these pens are used to detect counterfeit banknotes of several currencies including Swiss francs, euros, and United States banknotes. The text does not provide information about how to acquire such pens in the UK or the price range of such pens.'
ChatGPT['1756361713228458982']='The text states that there is one area code in Montana, which is area code 406.'
# Every davinci answer in this batch starts with "\n\n"; stored exactly as given.
davincci['8849568873685505619']='\n\nThe Beatles are considered the highest-selling band and Elvis Presley is considered the highest-selling individual artist based on sales claims.'
davincci['-7702508458750578906']='\n\nAnswer: Graphical User Interface (GUI)'
davincci['-7652766263809313045']='\n\nThe idea of Walmart came from Sam Walton, inspired by the successes of other discount department store chains. His assistant Bob Bogle came up with the name "Wal-Mart" for the new chain.'
davincci['-1215745899771320061']='\n\nWhen the pen is used to mark counterfeit bills, the mark is typically black or dark blue.'
davincci['1756361713228458982']='\n\nThere is only one area code in Montana: 406.'
# Score every stored answer against the first gold answer and print one
# tab-separated line per answer (id, first score, second score), ChatGPT first.
# The answers in this cell are keyed by NQ example ids, so the gold record is
# read from `NQ`; looking these ids up in `ELI5` raises KeyError.
# The loop variables are deliberately not called `id` / `hf_rouge`: `id` would
# shadow the builtin, and `hf_rouge` is the metric object loaded at the top of
# the notebook, which assigning a float here would overwrite.
for system_answers in (ChatGPT, davincci):
    for example_id, answer in system_answers.items():
        hf_score, kilt_score = rouge._metric_max_over_ground_truths(answer, [NQ[example_id]['output'][0]['answer']])
        print(example_id + "\t" + str(hf_score) + "\t" + str(kilt_score))

experiment|rouge|passages|n-shot|top p|top k|temperature|min length|max length
google-flan-t5-xxl|0.25508641580641983|True|0|0.25|100|0.5|200|1024
bigscience-bloomz|0.2541450173173494|True|0|1.0|100|0.75|100|1024
bigscience-bloom|0.22344075436693506|True|0|0.75|100|0.5|50|1024
eleutherai-gpt-neox-20b|0.16836206596932884|True|0|0.5|100|1.0|0|1024