# Using ROUGE and BERTScore to evaluate a model's generated summaries

In [1]:
! pip --quiet install transformers datasets evaluate rouge_score torch bert_score py7zr

In [2]:
import locale


def getpreferredencoding(do_setlocale=True):
    """Report UTF-8 as the preferred encoding regardless of the locale.

    ``do_setlocale`` is accepted only so that callers of
    ``locale.getpreferredencoding`` can keep passing it; it is ignored.
    """
    return "UTF-8"


# Replace the stdlib lookup so every later call in this session gets UTF-8.
locale.getpreferredencoding = getpreferredencoding

In [None]:
import os

from huggingface_hub import login, logout

# Never hard-code an access token in a notebook: read it from the HF_TOKEN
# environment variable instead. When the variable is unset, `None` is passed
# and `login` falls back to prompting for the token.
login(os.environ.get("HF_TOKEN"), add_to_git_credential=True)

In [None]:
from datasets import load_dataset

# Load the four book datasets; the names on the left line up one-to-one, in
# order, with the repository ids on the right.
(
    old_books_data_train,
    old_books_data_test,
    new_books_data_train,
    new_books_data_test,
) = map(
    load_dataset,
    (
        "psin/old_books_data_train",
        "psin/old_books_data_test",
        "psin/new_books_data_train",
        "psin/new_books_data_test",
    ),
)


In [11]:
# Materialise the 'text' and 'summary' columns of the modern-books test set as
# plain Python lists (the data lives under the dataset's 'train' key).
_modern_split = new_books_data_test['train']
all_modern_text = list(_modern_split['text'])
all_modern_summaries = list(_modern_split['summary'])

In [None]:
import evaluate
import numpy as np
from datasets import load_metric

# Load both metrics through `evaluate`. `datasets.load_metric` is deprecated in
# favour of `evaluate.load` and no longer exists in datasets>=3.0, so this cell
# does not need the `load_metric` import any more.
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

In [13]:
def compute_metrics(eval_pred, decoded=False):
    """Score generated summaries against references with ROUGE and BERTScore.

    Relies on the module-level ``rouge`` and ``bertscore`` metric objects and,
    when ``decoded`` is false, on the module-level ``tokenizer``.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. With ``decoded=False``
            both are token-id arrays; label positions equal to -100 are
            replaced by the pad token id before decoding. With
            ``decoded=True`` both are sequences of already-decoded strings.
        decoded: Whether ``eval_pred`` already holds text instead of token ids.

    Returns:
        dict: The ROUGE scores and ``gen_len`` rounded to 4 decimals, merged
        with the BERTScore output in which ``precision``, ``recall`` and
        ``f1`` are replaced by their mean rounded to 4 decimals. ``gen_len``
        is the mean number of non-pad tokens per prediction when
        ``decoded=False`` and the mean number of whitespace-separated words
        per prediction when ``decoded=True``.
    """
    predictions, labels = eval_pred
    if decoded:
        decoded_preds = predictions
        decoded_labels = labels
        # No token ids are available here, so measure length in words.
        # (Comparing the strings to a pad id would count every prediction as 1.)
        prediction_lens = [len(pred.split()) for pred in decoded_preds]
    else:
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        # -100 marks ignored label positions and cannot be decoded as a token id.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        prediction_lens = [
            np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions
        ]

    # The BERTScore paper recommends rescaling with the baseline.
    bert_result = bertscore.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        lang="en",
        rescale_with_baseline=True,
    )
    print(bert_result.keys())
    # Work on a copy so the metric's own result dict is left untouched; any
    # extra keys it returns are passed through unchanged.
    bert_result_final = dict(bert_result)
    for key in ("precision", "recall", "f1"):
        # All three are per-example lists; report the mean to 4 decimals.
        bert_result_final[key] = np.round(np.mean(bert_result[key]), 4)

    rouge_result = rouge.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    rouge_result["gen_len"] = np.mean(prediction_lens)
    rouge_result_final = {k: round(v, 4) for k, v in rouge_result.items()}

    final_result = rouge_result_final | bert_result_final
    return final_result

In [23]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM

def get_prediction(text, model, tokenizer, n_gram_block=4, num_beams=3,
                   max_input_length=512, max_new_tokens=200):
    """Generate one summary for ``text`` using non-sampling beam search.

    Args:
        text: Input document handed to ``tokenizer``.
        model: Object exposing ``generate``.
        tokenizer: Callable tokenizer that also provides ``decode``.
        n_gram_block: Forwarded to ``generate`` as ``no_repeat_ngram_size``.
        num_beams: Forwarded to ``generate`` as ``num_beams``.
        max_input_length: Maximum number of input tokens; longer inputs are
            truncated to this length.
        max_new_tokens: Forwarded to ``generate`` as ``max_new_tokens``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # Ask for truncation explicitly: passing only `max_length` relies on a
    # backward-compatibility fallback in the tokenizer that logs a warning.
    encoded = tokenizer(
        text,
        max_length=max_input_length,
        truncation=True,
        return_tensors="pt",
    )
    output = model.generate(
        encoded.input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        no_repeat_ngram_size=n_gram_block,
        num_beams=num_beams,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Show the first three inputs before generating anything.
print(all_modern_text[:3])

for checkpoint in ['psin/xsum_and_billsum_and_samsum_old']:
    print(f"Testing model: {checkpoint}")
    # `tokenizer` has to stay a module-level name: compute_metrics looks it up
    # as a global rather than taking it as an argument.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    # One generated summary per input text, in the same order as the references.
    predicted_text_list = [
        get_prediction(text, model, tokenizer, n_gram_block=4, num_beams=3)
        for text in all_modern_text
    ]
    scores = compute_metrics((predicted_text_list, all_modern_summaries), decoded=True)
    print(scores)