In [62]:
# Install the metric dependencies with the %pip magic so they land in the
# environment of the running kernel (a shell `pip3` may belong to a different
# Python installation than the one executing this notebook).
%pip install -q rouge evaluate rouge_score

In [118]:
# Import block
from rouge import Rouge
import json
import datasets
import pandas as pd
from datasets import load_dataset
from evaluate import evaluator
import evaluate

In [119]:
def parse_data(t_split='train'):
    '''
    Load one split of the `squad` dataset as a DataFrame prepared for
    answer-aware question generation.

    The dataset's 'validation' split is divided in two with a fixed random
    seed: the sampled 50% of rows is returned for 'val', the remaining rows
    for 'test'.

    Args:
        t_split: one of 'train', 'val' or 'test'.

    Returns:
        pandas.DataFrame holding the loaded columns plus:
          * 'answer_text' - first entry of each row's answers['text'] list
          * 'source'      - 'answer: <answer_text> context: <context></s>'
          * 'target'      - copy of the 'question' column

    Raises:
        ValueError: if `t_split` is not a recognised split name.
    '''
    # Reject bad input before touching the dataset.
    if t_split not in ('train', 'val', 'test'):
        raise ValueError("Invalid choice of dataset split.")

    squad = load_dataset('squad')

    # Split handling - validation set further split into 50% dev/test.
    if t_split == 'train':
        df = pd.DataFrame(squad['train'])
    else:
        vt_df = pd.DataFrame(squad['validation'])
        # The fixed seed keeps the val/test partition identical across calls.
        df_val = vt_df.sample(frac=0.5, random_state=266)
        if t_split == 'val':
            df = df_val
        else:
            # Test set = every validation row that was not sampled into val.
            df = vt_df.drop(df_val.index)

    df['answer_text'] = df['answers'].apply(lambda x: x['text'][0])
    df['source'] = 'answer: ' + df['answer_text'] + ' context: ' + df['context'] + '</s>'
    df['target'] = df['question']

    return df

In [34]:
# Load the saved reference and prediction JSON files for the `base256` run.
# Later cells read reference_dict['values'][i]['target'] and
# prediction_dict['values'][i]['generated'] from these objects.
with open('reference_dict_base256.json', 'r') as fp:
    reference_dict = json.load(fp)
with open('prediction_dict_base256.json', 'r') as fp:
    prediction_dict = json.load(fp)

# 'val' is the seeded 50% sample of the SQuAD validation split (see parse_data).
val_df = parse_data('val')

Reusing dataset squad (/home/ec2-user/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

In [120]:
# List possible evaluation metrics
from datasets import list_metrics

metrics_list = list_metrics()
# A bare expression is only echoed when it is the last line of a cell, so the
# count has to be printed explicitly to show up in the output.
print(len(metrics_list))
print(metrics_list)

['accuracy', 'bertscore', 'bleu', 'bleurt', 'cer', 'chrf', 'code_eval', 'comet', 'competition_math', 'coval', 'cuad', 'exact_match', 'f1', 'frugalscore', 'glue', 'google_bleu', 'indic_glue', 'mae', 'mahalanobis', 'matthews_correlation', 'mauve', 'mean_iou', 'meteor', 'mse', 'pearsonr', 'perplexity', 'poseval', 'precision', 'recall', 'rl_reliability', 'roc_auc', 'rouge', 'sacrebleu', 'sari', 'seqeval', 'spearmanr', 'squad', 'squad_v2', 'super_glue', 'ter', 'trec_eval', 'wer', 'wiki_split', 'xnli', 'xtreme_s', 'angelina-wang/directional_bias_amplification', 'codeparrot/apps_metric', 'cpllab/syntaxgym', 'daiyizheng/valid', 'erntkn/dice_coefficient', 'hack/test_metric', 'jordyvl/ece', 'kaggle/ai4code', 'kaggle/amex', 'loubnabnl/apps_metric2', 'lvwerra/bary_score', 'lvwerra/test', 'mfumanelli/geometric_mean', 'mgfrantz/roc_auc_macro', 'yzha/ctc_eval']


In [193]:
def score_preds(reference_dict, prediction_dict, val_df):
    '''
    Returns the results of metrics tests on the entire prediction set and scores each individual prediction.

    The output is a pandas DataFrame with line-by-line scores and a dictionary with set scores.

    Args:
        reference_dict: mapping whose 'values' list holds entries with a
            'target' list; the first target of each entry is the reference.
        prediction_dict: mapping whose 'values' list holds entries with a
            'generated' string; only the text before the first tab is scored.
        val_df: DataFrame with 'answer_text' and 'context' columns. Rows are
            paired with the references by position - assumed to be in the
            same order (this is not checked here).

    Returns:
        (df, results)
        df: one row per example with columns 'Reference', 'Prediction',
            'Answer', 'Context', 'BLEU', 'SacreBLEU', 'METEOR', 'ROUGE'
            (the 'rougeL' value) and 'Answer_Contamination' (True when the
            answer string occurs inside the prediction).
        results: dictionary returned by the combined metric for the whole set.

    Raises:
        IndexError: if there are fewer predictions than references.
    '''

    # Load metrics
    metrics = evaluate.combine(['bleu', 'sacrebleu', 'meteor', 'rouge'])

    # Predictions and baseline - built once and reused for both scoring passes.
    refs = [entry['target'][0] for entry in reference_dict['values']]
    preds = [entry['generated'].split('\t')[0] for entry in prediction_dict['values']]
    answers = val_df['answer_text'].to_list()
    context = val_df['context'].to_list()

    # Fail fast: without this check the mismatch would only surface as a bare
    # "list index out of range" part-way through the slow per-example scoring.
    if len(preds) < len(refs):
        raise IndexError(
            'prediction_dict has %d predictions but reference_dict has %d references'
            % (len(preds), len(refs))
        )

    # zip() stops at len(refs), so surplus predictions are ignored.
    pairs = list(zip(refs, preds))

    # Evaluate metrics, one example at a time.
    scores = [metrics.compute(predictions=[pred], references=[ref]) for ref, pred in pairs]
    bleus = [x['bleu'] for x in scores]
    sacrebleus = [x['score'] for x in scores]  # 'score' is read as the SacreBLEU value
    meteors = [x['meteor'] for x in scores]
    rouges = [x['rougeL'] for x in scores]

    # Construct dataframe with results (zip truncates to the shortest input).
    df = pd.DataFrame(
        list(zip(refs, preds, answers, context, bleus, sacrebleus, meteors, rouges)),
        columns=['Reference', 'Prediction', 'Answer', 'Context', 'BLEU', 'SacreBLEU', 'METEOR', 'ROUGE'],
    )
    # Flag predictions that leak the answer text verbatim (case-sensitive).
    df['Answer_Contamination'] = [
        str(answer) in str(prediction)
        for answer, prediction in zip(df['Answer'], df['Prediction'])
    ]

    # Get results for entire set.
    for ref, pred in pairs:
        metrics.add_batch(references=[ref], predictions=[pred])
    results = metrics.compute()

    return df, results

In [195]:
def score_set(reference_dict, prediction_dict, val_df):
    '''
    Compute the combined metrics over the whole prediction set.

    No per-prediction scores are produced. `val_df` is accepted so the
    signature matches score_preds, but it is not read here.
    '''

    # One combined module covering all four metrics.
    combined = evaluate.combine(['bleu', 'sacrebleu', 'meteor', 'rouge'])

    # First target of every reference entry; text before the first tab of
    # every generated entry.
    targets = [entry['target'][0] for entry in reference_dict['values']]
    generated = [entry['generated'].split('\t')[0] for entry in prediction_dict['values']]

    # Feed the pairs one at a time, then score everything accumulated.
    for target, guess in zip(targets, generated):
        combined.add_batch(references=[target], predictions=[guess])

    return combined.compute()

In [None]:
# Call function to get detailed scores: `df` holds the per-example scores and
# `results` the metrics for the whole prediction set (displayed as cell output).
df, results = score_preds(reference_dict, prediction_dict, val_df)
results

In [None]:
# Save the per-example evaluation DataFrame to a pickle file.
df.to_pickle('df_baseline256.pkl')

In [None]:
# Call function to get top-level scores only (no per-example scoring).
# val_df is passed because the signature requires it; score_set does not read it.
set_results = score_set(reference_dict, prediction_dict, val_df)
set_results

In [None]:
# Reload the `base256` reference and prediction JSON files (the same two
# files an earlier cell already reads into these variables).
with open('reference_dict_base256.json', 'r') as fp:
    reference_dict = json.load(fp)
with open('prediction_dict_base256.json', 'r') as fp:
    prediction_dict = json.load(fp)

In [None]:
# Run evaluation on multiple prediction sets and save the results.
val_df = parse_data('val')
pred_sets = [
    'prediction_dict_base64.json',
    'prediction_dict_base128.json',
    'prediction_dict_base512.json',
    'prediction_dict_GPP256.json',
]
ref_sets = [
    'reference_dict_base64.json',
    'reference_dict_base128.json',
    'reference_dict_base512.json',
    'reference_dict_GPP256.json',
]
# NOTE: the commas matter. Without them Python concatenates the adjacent
# string literals into ONE long filename, so the first result was pickled
# under a garbage name and the second iteration failed with an IndexError.
# 'df_baseline516.pkl' was also a typo for the base512 run.
save_names = [
    'df_baseline64.pkl',
    'df_baseline128.pkl',
    'df_baseline512.pkl',
    'df_GPP256_original.pkl',
]

# Guard against the three lists drifting out of step before the slow scoring starts.
assert len(pred_sets) == len(ref_sets) == len(save_names), "input/output name lists differ in length"

for ref_path, pred_path, save_name in zip(ref_sets, pred_sets, save_names):
    with open(ref_path, 'r') as fp:
        reference_dict = json.load(fp)
    with open(pred_path, 'r') as fp:
        prediction_dict = json.load(fp)

    # Per-example frame plus set-level metrics for this run.
    df, results = score_preds(reference_dict, prediction_dict, val_df)
    print(save_name, results)

    df.to_pickle(save_name)



  0%|          | 0/2 [00:00<?, ?it/s]

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ec2-user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
