In [1]:
from collections import Counter
from typing import List

import numpy as np
import pandas as pd
from evaluate import load

  from .autonotebook import tqdm as notebook_tqdm


### Variables

In [30]:
# Experiment selection: these four values are interpolated into the results CSV path.
backbone = "cosmo-xl"
run_name = "prepend_later_drop0.5"
test_set = "test_mc_taco"
inference_mode = "delayed"
# Only two inference modes are accepted; fail immediately on anything else.
assert inference_mode in ("delayed", "immediate")

In [31]:
# Results file for the selected backbone / run / test set / inference mode.
results_csv = f"../results/{backbone}/{run_name}/{test_set}_{inference_mode}.csv"
df = pd.read_csv(results_csv)

In [32]:
# Preview the first five rows of the loaded results.
df.head(n=5)

Unnamed: 0,context,reference,model_response
0,What are you doing right now?\t| 0 minute late...,"You were right about the seafood, absolutely a...",Now the ports! Where are you headed next?
1,Are you still out looking for Max?\t| 0 minute...,"Just got in the bath. No luck finding Max, but...","Okay, I'm heading in now."
2,Just finishing dinner. Gonna look for Max befo...,Found Max hiding in the laundry room! Heading ...,Max's in the tub now!
3,What are you doing now?\t| 0 minute later\nJus...,Found him napping in the laundry room. Bath ti...,I finally found Max! He was hiding behind the ...
4,Guess what? I'm on a mission to find Max again...,Found Max! He was in the attic all along.,"Okay, let’s see...he’s hiding under the couch!"


### SacreBLEU, ROUGE, METEOR, Distinct-n

In [33]:
# Stringify and lower-case every reference and model response.
refs = [str(reference).lower() for reference in df["reference"]]
hyps = [str(response).lower() for response in df["model_response"]]
# Same references, each wrapped in a one-element list (the form handed to the BLEU metric).
refs_for_bleu = [[reference] for reference in refs]

In [7]:
# Load the three reference-based metrics in one statement (same order as before).
bleu, rouge, meteor = load("sacrebleu"), load("rouge"), load("meteor")

[nltk_data] Downloading package wordnet to /home/namomo73/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/namomo73/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/namomo73/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [13]:
def distinct(seqs: List[str]):
    """Compute inter- and intra-sequence Distinct-1 / Distinct-2 scores.

    Each sequence is tokenized with ``str.split()`` (whitespace).

    * Inter scores pool the n-grams of *all* sequences and divide the number
      of unique n-grams by the total number of n-grams.
    * Intra scores compute the same ratio inside each sequence and then
      average those per-sequence ratios.

    Small constants (1e-12 in the numerator, 1e-5 in the denominator) keep
    the ratios defined when a sequence has no unigrams or no bigrams.

    Args:
        seqs: Sequences to score, one string per sequence.

    Returns:
        A 4-tuple ``(inter_dist1, inter_dist2, intra_dist1, intra_dist2)``.
        When ``seqs`` is empty the intra scores are ``0.0`` instead of the
        ``nan`` (plus RuntimeWarning) that averaging an empty list produces.
    """
    intra_dist1, intra_dist2 = [], []
    unigrams_all, bigrams_all = Counter(), Counter()
    for seq in seqs:
        tokenized = seq.split()
        unigrams = Counter(tokenized)
        bigrams = Counter(zip(tokenized, tokenized[1:]))
        intra_dist1.append((len(unigrams)+1e-12) / (len(tokenized)+1e-5))
        # A sequence of k tokens has max(0, k - 1) bigrams.
        intra_dist2.append((len(bigrams)+1e-12) / (max(0, len(tokenized)-1)+1e-5))

        unigrams_all.update(unigrams)
        bigrams_all.update(bigrams)

    # inter: unique n-grams / total n-grams, pooled over all sequences
    inter_dist1 = (len(unigrams_all)+1e-12) / (sum(unigrams_all.values())+1e-5)
    inter_dist2 = (len(bigrams_all)+1e-12) / (sum(bigrams_all.values())+1e-5)
    # intra: mean of the per-sequence ratios; guard the empty case because
    # np.average([]) yields nan and emits a RuntimeWarning.
    intra_dist1 = np.average(intra_dist1) if intra_dist1 else 0.0
    intra_dist2 = np.average(intra_dist2) if intra_dist2 else 0.0
    return inter_dist1, inter_dist2, intra_dist1, intra_dist2

In [34]:
# Corpus-level BLEU; references are passed as one-element lists per prediction.
bleu_score = bleu.compute(
    predictions=hyps,
    references=refs_for_bleu,
    lowercase=True,
)
# ROUGE-1 / ROUGE-2 / ROUGE-L with stemming enabled.
rouge_score = rouge.compute(
    predictions=hyps,
    references=refs,
    rouge_types=["rouge1", "rouge2", "rougeL"],
    use_stemmer=True,
)
meteor_score = meteor.compute(
    predictions=hyps,
    references=refs,
)
# Keep only the inter (corpus-level) Distinct-1/2; the intra scores are discarded.
distinct1, distinct2, _, _ = distinct(hyps)

In [35]:
# Collect every report line first, then emit them with a single print call.
# ROUGE, METEOR and Distinct values are multiplied by 100 for display; BLEU is printed as returned.
report_lines = [
    f"model: {backbone}\nsetup: {run_name}\nmode: {inference_mode}\ntest set: {test_set}",
    "=" * 20,
    f"BLEU: {bleu_score['score']:.2f}",
    f"ROUGE-1: {rouge_score['rouge1'] * 100:.2f}",
    f"ROUGE-2: {rouge_score['rouge2'] * 100:.2f}",
    f"ROUGE-L: {rouge_score['rougeL'] * 100:.2f}",
    f"METEOR: {meteor_score['meteor'] * 100:.2f}",
    f"Distinct-1: {distinct1 * 100:.2f}",
    f"Distinct-2: {distinct2 * 100:.2f}",
]
print("\n".join(report_lines))

model: cosmo-xl
setup: prepend_later_drop0.5
mode: delayed
test set: test_mc_taco
BLEU: 2.36
ROUGE-1: 15.06
ROUGE-2: 3.24
ROUGE-L: 12.78
METEOR: 13.73
Distinct-1: 29.27
Distinct-2: 74.11
