In [3]:
!pip3 install rouge -q
!pip3 install evaluate -q
!pip3 install rouge_score -q

In [4]:
# Import block
from rouge import Rouge
import json
import datasets
import pandas as pd
from datasets import load_dataset
from evaluate import evaluator
import evaluate

In [5]:
def parse_data(t_split='train'):
  """Load a SQuAD split and add the columns used for question generation.

  The SQuAD validation set is divided in half with a fixed random seed:
  the sampled half is returned for 'val' and the remaining rows for 'test'.

  Args:
    t_split: One of 'train', 'val' or 'test'.

  Returns:
    A pandas DataFrame with the loaded SQuAD columns plus:
      answer_text: the first entry of each row's answers['text'] list,
      source: 'answer: <answer_text> context: <context></s>',
      target: a copy of the 'question' column.

  Raises:
    ValueError: If t_split is not one of the supported split names.
  """
  # Reject bad input before any dataset is downloaded or loaded.
  if t_split not in ('train', 'val', 'test'):
    raise ValueError("Invalid choice of dataset split.")

  if t_split == 'train':
    # Ask only for the split that is needed instead of materialising all of them.
    df = pd.DataFrame(load_dataset('squad', split='train'))
  else:
    vt_df = pd.DataFrame(load_dataset('squad', split='validation'))
    # The fixed seed keeps the val/test partition identical across calls.
    df_val = vt_df.sample(frac=0.5, random_state=266)
    # 'test' is whatever the 'val' sample left behind.
    df = vt_df.drop(df_val.index) if t_split == 'test' else df_val

  df['answer_text'] = df['answers'].apply(lambda x: x['text'][0])
  df['source'] = 'answer: ' + df['answer_text'] + ' context: ' + df['context'] + '</s>'
  df['target'] = df['question']

  return df

In [18]:
# Load the stored reference and prediction JSON files.
# The encoding is given explicitly so decoding does not depend on the platform locale.
with open('FINAL_reference_dict_GPP256.json', 'r', encoding='utf-8') as fp:
    reference_dict = json.load(fp)
with open('FINAL_prediction_dict_GPP256.json', 'r', encoding='utf-8') as fp:
    prediction_dict = json.load(fp)

# 'val' is the sampled half of the SQuAD validation set (see parse_data).
val_df = parse_data('val')



  0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# List possible evaluation metrics
# `datasets` is already imported in the import cell, so no extra import is needed here.
metrics_list = datasets.list_metrics()
print(metrics_list)

['accuracy', 'bertscore', 'bleu', 'bleurt', 'cer', 'chrf', 'code_eval', 'comet', 'competition_math', 'coval', 'cuad', 'exact_match', 'f1', 'frugalscore', 'glue', 'google_bleu', 'indic_glue', 'mae', 'mahalanobis', 'matthews_correlation', 'mauve', 'mean_iou', 'meteor', 'mse', 'pearsonr', 'perplexity', 'poseval', 'precision', 'recall', 'rl_reliability', 'roc_auc', 'rouge', 'sacrebleu', 'sari', 'seqeval', 'spearmanr', 'squad', 'squad_v2', 'super_glue', 'ter', 'trec_eval', 'wer', 'wiki_split', 'xnli', 'xtreme_s', 'angelina-wang/directional_bias_amplification', 'codeparrot/apps_metric', 'cpllab/syntaxgym', 'daiyizheng/valid', 'erntkn/dice_coefficient', 'hack/test_metric', 'jordyvl/ece', 'kaggle/ai4code', 'kaggle/amex', 'loubnabnl/apps_metric2', 'lvwerra/bary_score', 'lvwerra/test', 'mathemakitten/harness_sentiment', 'mfumanelli/geometric_mean', 'mgfrantz/roc_auc_macro', 'yzha/ctc_eval']


In [8]:
def score_preds(reference_dict, prediction_dict, val_df):
    '''
    Score every prediction individually and the prediction set as a whole.

    Parameters:
        reference_dict: dict with a 'values' list; each item's ['target'][0] is
            used as the reference text.
        prediction_dict: dict with a 'values' list; each item's ['generated']
            string is cut at the first tab and the leading part is scored.
        val_df: pandas DataFrame providing 'answer_text' and 'context' columns.
            Rows are paired with the dictionary entries purely by position, so
            val_df must be in the same order as the entries (not checked here).

    Returns:
        (df, results) where df is a pandas DataFrame with one row per prediction
        (Reference, Prediction, Answer, Context, BLEU, SacreBLEU, METEOR, ROUGE,
        Answer_Contamination) and results is the dictionary of set-level scores
        returned by the combined metrics.

    Raises:
        ValueError: if references, predictions and val_df rows differ in count,
            which would otherwise misalign or silently drop rows.
    '''

    # Load metrics
    metrics = evaluate.combine(['bleu', 'sacrebleu', 'meteor', 'rouge'])

    # Predictions and baseline (extracted once and reused for both scoring passes)
    refs = [entry['target'][0] for entry in reference_dict['values']]
    preds = [entry['generated'].split('\t')[0] for entry in prediction_dict['values']]
    answers = val_df['answer_text'].to_list()
    context = val_df['context'].to_list()

    # Everything below is paired by position, so the sizes have to agree.
    if not (len(refs) == len(preds) == len(answers)):
        raise ValueError(
            'Mismatched input sizes: %d references, %d predictions, %d dataframe rows.'
            % (len(refs), len(preds), len(answers))
        )

    # Evaluate metrics one prediction at a time
    scores = [
        metrics.compute(predictions=[pred], references=[ref])
        for ref, pred in zip(refs, preds)
    ]

    # Construct dataframe with results
    df = pd.DataFrame({
        'Reference': refs,
        'Prediction': preds,
        'Answer': answers,
        'Context': context,
        'BLEU': [s['bleu'] for s in scores],
        # In the combined output the plain 'score' key is the one displayed as SacreBLEU.
        'SacreBLEU': [s['score'] for s in scores],
        'METEOR': [s['meteor'] for s in scores],
        'ROUGE': [s['rougeL'] for s in scores],
    })
    # True when the answer text appears verbatim (case-sensitive) inside the prediction.
    df['Answer_Contamination'] = [
        str(answer) in str(pred) for answer, pred in zip(answers, preds)
    ]

    # Get results for entire set.
    results = metrics.compute(predictions=preds, references=refs)

    return df, results

In [9]:
def score_set(reference_dict, prediction_dict, val_df):
    '''
    Returns the results of metrics tests on the entire prediction set.

    Does not score individual predictions.

    Parameters:
        reference_dict: dict with a 'values' list; each item's ['target'][0] is
            used as the reference text.
        prediction_dict: dict with a 'values' list; each item's ['generated']
            string is cut at the first tab and the leading part is scored.
        val_df: not used by this function; kept so existing calls keep working.

    Returns:
        The dictionary of set-level scores produced by the combined metrics.

    Raises:
        ValueError: if the number of references and predictions differ, which
            would otherwise be silently truncated to the shorter list.
    '''

    # Load metrics
    metrics = evaluate.combine(['bleu', 'sacrebleu', 'meteor', 'rouge'])

    refs = [entry['target'][0] for entry in reference_dict['values']]
    preds = [entry['generated'].split('\t')[0] for entry in prediction_dict['values']]

    # References and predictions are paired by position.
    if len(refs) != len(preds):
        raise ValueError(
            'Mismatched input sizes: %d references, %d predictions.'
            % (len(refs), len(preds))
        )

    # Get results for entire set.
    results = metrics.compute(predictions=preds, references=refs)

    return results

In [None]:
# Call the scoring function to get detailed per-prediction scores plus the set-level results.
# NOTE(review): score_preds pairs val_df rows with the dictionary entries by position, and a
# later cell scores the FINAL_*_GPP256 files against parse_data('test') instead - confirm
# which split the currently loaded files were generated from.
df, results = score_preds(reference_dict, prediction_dict, val_df)
results

In [None]:
# Save evaluation
# NOTE(review): the output name says "baseline256" while the load cell near the top of the
# notebook reads the *_GPP256 files - confirm which reference/prediction files were loaded
# when `df` was produced before trusting this pickle's name.
df.to_pickle('df_baseline256.pkl')

In [19]:
# Call function to get top-level (set-level) scores only.
# val_df is passed to satisfy the signature; score_set does not read it.
set_results = score_set(reference_dict, prediction_dict, val_df)
set_results

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ec2-user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


{'bleu': 0.1656587608025032,
 'bleu_precisions': [0.4406215316315205,
  0.19621654987110126,
  0.11763760869128663,
  0.07404744787922359],
 'brevity_penalty': 1.0,
 'length_ratio': 1.0063682587313496,
 'translation_length': 60367,
 'reference_length': 59985,
 'score': 16.56587608025032,
 'counts': [26599, 10808, 5858, 3296],
 'totals': [60367, 55082, 49797, 44512],
 'sacrebleu_precisions': [44.062153163152054,
  19.621654987110126,
  11.763760869128662,
  7.404744787922358],
 'bp': 1.0,
 'sys_len': 60367,
 'ref_len': 59985,
 'meteor': 0.4143196855920803,
 'rouge1': 0.4356071411053506,
 'rouge2': 0.22499535880886862,
 'rougeL': 0.4018497255426625,
 'rougeLsum': 0.4021833461370723}

In [None]:
# Load the base256 reference and prediction JSON files (replaces the dictionaries loaded earlier).
# NOTE(review): these names lack the FINAL_ prefix used by the other cells - confirm they are
# the intended files.
# The encoding is given explicitly so decoding does not depend on the platform locale.
with open('reference_dict_base256.json', 'r', encoding='utf-8') as fp:
    reference_dict = json.load(fp)
with open('prediction_dict_base256.json', 'r', encoding='utf-8') as fp:
    prediction_dict = json.load(fp)

In [24]:
# Run evaluation on multiple prediction sets and save the results.
test_df = parse_data('test')
pred_sets = [
    'FINAL_prediction_dict_GPP256.json',
    'FINAL_NS_prediction_dict_GPP256.json'
]
ref_sets = [
    'FINAL_reference_dict_GPP256.json',
    'FINAL_NS_reference_dict_GPP256.json'
]
save_names = [
    'df_GPP256.pkl',
    'df_GPP256_NS.pkl'
]

# The three lists are paired by position, so they must have the same length.
assert len(pred_sets) == len(ref_sets) == len(save_names)

for ref_path, pred_path, save_name in zip(ref_sets, pred_sets, save_names):
    # Explicit encoding keeps JSON decoding independent of the platform locale.
    with open(ref_path, 'r', encoding='utf-8') as fp:
        reference_dict = json.load(fp)
    with open(pred_path, 'r', encoding='utf-8') as fp:
        prediction_dict = json.load(fp)

    df, results = score_preds(reference_dict, prediction_dict, test_df)
    print(save_name, results)

    # Persist the per-prediction scores for this set.
    df.to_pickle(save_name)



  0%|          | 0/2 [00:00<?, ?it/s]

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ec2-user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


df_GPP256.pkl {'bleu': 0.1656587608025032, 'bleu_precisions': [0.4406215316315205, 0.19621654987110126, 0.11763760869128663, 0.07404744787922359], 'brevity_penalty': 1.0, 'length_ratio': 1.0063682587313496, 'translation_length': 60367, 'reference_length': 59985, 'score': 16.56587608025032, 'counts': [26599, 10808, 5858, 3296], 'totals': [60367, 55082, 49797, 44512], 'sacrebleu_precisions': [44.062153163152054, 19.621654987110126, 11.763760869128662, 7.404744787922358], 'bp': 1.0, 'sys_len': 60367, 'ref_len': 59985, 'meteor': 0.4143196855920803, 'rouge1': 0.4356071411053506, 'rouge2': 0.22499535880886862, 'rougeL': 0.4018497255426625, 'rougeLsum': 0.4021833461370723}


[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ec2-user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


df_GPP256_NS.pkl {'bleu': 0.1364246031118634, 'bleu_precisions': [0.43699918088251005, 0.17553515617321566, 0.099719224357287, 0.05947448080788031], 'brevity_penalty': 0.9341231345067288, 'length_ratio': 0.9362007168458781, 'translation_length': 56158, 'reference_length': 59985, 'score': 13.64246031118634, 'counts': [24541, 8930, 4546, 2397], 'totals': [56158, 50873, 45588, 40303], 'sacrebleu_precisions': [43.699918088251, 17.553515617321565, 9.9719224357287, 5.947448080788031], 'bp': 0.9341231345067288, 'sys_len': 56158, 'ref_len': 59985, 'meteor': 0.37423245714266146, 'rouge1': 0.40529660894059427, 'rouge2': 0.19167726547018954, 'rougeL': 0.3732356714376023, 'rougeLsum': 0.3735456607916215}


In [34]:
# Run set-level evaluation on multiple prediction sets and tabulate the results.
test_df = parse_data('test')
pred_sets = [
    'FINAL_prediction_dict_base256.json',
    'FINAL_NS_prediction_dict_base256.json',
    'FINAL_prediction_dict_GPP256.json',
    'FINAL_NS_prediction_dict_GPP256.json'
]
ref_sets = [
    'FINAL_reference_dict_base256.json',
    'FINAL_NS_reference_dict_base256.json',
    'FINAL_reference_dict_GPP256.json',
    'FINAL_NS_reference_dict_GPP256.json'
]
# One display label per entry of pred_sets / ref_sets, in the same order.
model_names = ['Baseline (BS)', 'Baseline (NS)', 'GPP (BS)', 'GPP (NS)']

# The lists are paired by position, so they must have the same length.
assert len(pred_sets) == len(ref_sets) == len(model_names)

model_grades = []

for ref_path, pred_path in zip(ref_sets, pred_sets):
    # Explicit encoding keeps JSON decoding independent of the platform locale.
    with open(ref_path, 'r', encoding='utf-8') as fp:
        reference_dict = json.load(fp)
    with open(pred_path, 'r', encoding='utf-8') as fp:
        prediction_dict = json.load(fp)

    model_grades.append(score_set(reference_dict, prediction_dict, test_df))

grades_df = pd.DataFrame(model_grades)
grades_df['model'] = model_names
# In the combined metric output the plain 'score' key is shown under the name 'sacrebleu'.
grades_df[['model','bleu','score','meteor','rougeL']].rename(columns = {'score': 'sacrebleu'})



  0%|          | 0/2 [00:00<?, ?it/s]

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ec2-user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ec2-user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Download

Unnamed: 0,model,bleu,sacrebleu,meteor,rougeL
0,Baseline (BS),0.211274,21.127372,0.474482,0.462751
1,Baseline (NS),0.182588,18.258757,0.439014,0.442596
2,GPP (BS),0.165659,16.565876,0.41432,0.40185
3,GPP (NS),0.136425,13.64246,0.374232,0.373236
