In [1]:
!pip install transformers[torch] datasets==2.10.0 evaluate rouge_score -q
!pip install git+https://github.com/google-research/bleurt.git -q
!pip install bert_score -q
!pip install demoji -q
!pip install py7zr -q

  Preparing metadata (setup.py) ... [?25l[?25hdone


In [3]:
from datasets import load_dataset
from transformers import DataCollatorForSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
import numpy as np
import torch
import demoji
from transformers import pipeline

In [4]:
dataset = load_dataset("samsum")



  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
def preprocess(examples):
  processed = []
  out = {}
  # print(examples)
  for k, v in examples.items():
    if k!="id":
      for x in v:
        processed.append(demoji.replace(x.replace("\r","").replace("\n"," "), ''))
      out[k] = processed
      processed=[]
  return out

test_dataset = dataset['test'].map(preprocess, batched=True)



In [6]:
rouge = evaluate.load("rouge")
bleu = evaluate.load('bleu')
bleurt = evaluate.load('bleurt', module_type='metric', checkpoint='bleurt-base-512')
bert = evaluate.load('bertscore', lang="en", idf=True, model_type="bert-base-uncased", rescale_with_baseline=True)



FLAN T5 evaluation on test split

In [7]:
checkpoint = "sentientconch/flant5_sum_samsum"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [8]:
summarizer = pipeline("summarization", device=0, batch_size=8, model=model, tokenizer=tokenizer, min_length=8, max_length=24)
res=summarizer([x['dialogue'] for x in test_dataset])

Token indices sequence length is longer than the specified maximum sequence length for this model (521 > 512). Running this sequence through the model will result in indexing errors


In [9]:
preds=[x['summary_text'] for x in res]
refs=[x['summary'] for x in test_dataset]

In [10]:
bert_metric = bert.compute(references=refs, predictions=preds, lang="en", model_type="bert-base-uncased")
bert_score = {k:np.mean(np.array(v, dtype=np.float32)) for k,v in bert_metric.items() if k in ["precision","recall","f1"]}
bleu_score = bleu.compute(references=refs, predictions=preds)
rouge_score = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
bleurt_score = np.median(bleurt.compute(predictions=preds, references=refs)['scores'])

In [11]:
print(f"ROUGE: {rouge_score} \nBLEU: {bleu_score}\nBLEURT: {bleurt_score}\nBERT: {bert_score}")

ROUGE: {'rouge1': 0.4849741044952066, 'rouge2': 0.23956116473069355, 'rougeL': 0.401948142655437, 'rougeLsum': 0.4019744500187624} 
BLEU: {'bleu': 0.14764139201351698, 'precisions': [0.5583608318624818, 0.27178185391418275, 0.1647502852628807, 0.10128617363344052], 'brevity_penalty': 0.6581580470530669, 'length_ratio': 0.7050643869711071, 'translation_length': 13031, 'reference_length': 18482}
BLEURT: -0.43030059337615967
BERT: {'precision': 0.7090861, 'recall': 0.6648139, 'f1': 0.6831371}


BART evaluation on test split

In [12]:
checkpoint = "sentientconch/bart_sum_samsum"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/328 [00:00<?, ?B/s]

In [13]:
summarizer = pipeline("summarization", device=0, batch_size=8, model=model, tokenizer=tokenizer, min_length=8, max_length=24)
res=summarizer([x['dialogue'] for x in test_dataset])

In [14]:
preds=[x['summary_text'] for x in res]
refs=[x['summary'] for x in test_dataset]

In [15]:
bert_metric = bert.compute(references=refs, predictions=preds, lang="en", model_type="bert-base-uncased")
bert_score = {k:np.mean(np.array(v, dtype=np.float32)) for k,v in bert_metric.items() if k in ["precision","recall","f1"]}
bleu_score = bleu.compute(references=refs, predictions=preds)
rouge_score = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
bleurt_score = np.median(bleurt.compute(predictions=preds, references=refs)['scores'])

In [16]:
print(f"ROUGE: {rouge_score} \nBLEU: {bleu_score}\nBLEURT: {bleurt_score}\nBERT: {bert_score}")

ROUGE: {'rouge1': 0.4798055647613929, 'rouge2': 0.23596643667519668, 'rougeL': 0.3937004090798598, 'rougeLsum': 0.393820378570265} 
BLEU: {'bleu': 0.1343294199246959, 'precisions': [0.5420371236936515, 0.2536032658502041, 0.14949928469241774, 0.09261939218523878], 'brevity_penalty': 0.6431168826136296, 'length_ratio': 0.693756087003571, 'translation_length': 12822, 'reference_length': 18482}
BLEURT: -0.44436442852020264
BERT: {'precision': 0.70368916, 'recall': 0.65920734, 'f1': 0.6773743}
