# BART: running both BART (pre-trained) and BART-CNN (fine-tuned) and evaluating them on the CNN/DailyMail test dataset.

In [None]:
!pip install transformers datasets rouge-score nltk bert-score

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.0.0->bert-score)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from

In [None]:
#importing libraries
import torch
from datasets import load_dataset, load_metric
from transformers import BartTokenizer, BartForConditionalGeneration
from tqdm import tqdm
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bert_score import score as bert_score

# Tokenizer data for nltk.word_tokenize, which prepares the BLEU inputs below.
# Newer NLTK releases look up 'punkt_tab' instead of the pickled 'punkt' model,
# so fetch both to keep the notebook working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
# WordNet corpus (the METEOR metric also requests it when it is loaded).
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Load only the test split of CNN/DailyMail (configuration '3.0.0');
# the train and validation splits are not used in this notebook.
dataset = load_dataset(path='cnn_dailymail', name='3.0.0', split='test')

Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [None]:
# Display label -> Hugging Face checkpoint for the two BART variants being compared.
checkpoints = {
    "BART-Large": "facebook/bart-large",
    "BART-CNN": "facebook/bart-large-cnn",
}

# Load every model first, then every tokenizer, each keyed by its display label.
models = {
    label: BartForConditionalGeneration.from_pretrained(checkpoint)
    for label, checkpoint in checkpoints.items()
}
tokenizers = {
    label: BartTokenizer.from_pretrained(checkpoint)
    for label, checkpoint in checkpoints.items()
}

# Prefer the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move each model's weights onto the selected device.
for model in models.values():
    model.to(device)

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# One result row (a dict of scores) is collected here per evaluated model.
results = []

# Articles per generation batch; sized for the memory of the A100 used for this run.
batch_size = 16

# ROUGE and METEOR scorers, loaded in that order from the `datasets` metric scripts.
rouge, meteor = (
    load_metric(metric_id, trust_remote_code=True)
    for metric_id in ('rouge', 'meteor')
)

Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# Evaluate every model on the full test split.
# For each model: generate a summary for every article in batches, then score the
# generated summaries against the reference highlights with BLEU, ROUGE, METEOR,
# TF-IDF cosine similarity and BERTScore, and append one row to `results`.
for model_name, model in models.items():
    tokenizer = tokenizers[model_name]

    # Word-tokenised, lower-cased texts feed corpus-level BLEU.
    reference_summaries = []
    generated_summaries = []
    # Raw strings for the WHOLE test set. Cosine similarity and BERTScore must be
    # computed from these; the per-batch variables only hold the final batch once
    # the loop has finished.
    all_references = []
    all_generated = []

    print(f"Processing {model_name}")
    for i in tqdm(range(0, len(dataset), batch_size), desc=f"Processing Test Set with {model_name}"):
        batch = dataset.select(range(i, min(i + batch_size, len(dataset))))
        articles = batch['article']
        references = batch['highlights']

        # Tokenize and encode the articles (truncated to 512 tokens, padded per batch).
        inputs = tokenizer(articles, max_length=512, return_tensors='pt', truncation=True, padding=True)
        inputs = inputs.to(device)

        # Generate summaries with beam search. The attention mask is passed explicitly
        # because the batch is padded, so padding positions must be ignored.
        summary_ids = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            num_beams=4,
            max_length=142,
            early_stopping=True,
        )
        generated_batch = [
            tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
            for g in summary_ids
        ]

        # Accumulate texts for the corpus-level metrics computed after the loop.
        all_references.extend(references)
        all_generated.extend(generated_batch)
        reference_summaries.extend([nltk.word_tokenize(ref.lower()) for ref in references])
        generated_summaries.extend([nltk.word_tokenize(gen.lower()) for gen in generated_batch])

        # ROUGE and METEOR are accumulated incrementally, batch by batch.
        rouge.add_batch(predictions=generated_batch, references=references)
        meteor.add_batch(predictions=generated_batch, references=references)

    # BLEU over the whole corpus (one reference per generated summary).
    bleu_score = corpus_bleu(
        [[ref] for ref in reference_summaries],
        generated_summaries,
        smoothing_function=SmoothingFunction().method7,
    )
    # ROUGE / METEOR over everything added via add_batch for this model.
    rouge_result = rouge.compute()
    meteor_result = meteor.compute()

    # TF-IDF cosine similarity between each reference and its own generated summary,
    # averaged over the whole test set. TfidfVectorizer L2-normalises rows by default,
    # so the row-wise dot product equals the cosine similarity; computing it pairwise
    # avoids building the full n x n similarity matrix just to read its diagonal.
    vectorizer = TfidfVectorizer().fit(all_references + all_generated)
    reference_vectors = vectorizer.transform(all_references)
    generated_vectors = vectorizer.transform(all_generated)
    cosine_scores = float(reference_vectors.multiply(generated_vectors).sum(axis=1).mean())

    # BERTScore over the whole test set (candidates first, references second).
    P, R, F1 = bert_score(all_generated, all_references, lang='en', verbose=True)
    bert_score_f1 = F1.mean().item()

    # One summary row per model.
    results.append({
        "Model": model_name,
        "BLEU": bleu_score,
        "ROUGE-1": rouge_result['rouge1'].mid.fmeasure,
        "ROUGE-2": rouge_result['rouge2'].mid.fmeasure,
        "ROUGE-L": rouge_result['rougeL'].mid.fmeasure,
        "METEOR": meteor_result['meteor'],
        "Cosine Similarity": cosine_scores,
        "BERTScore": bert_score_f1
    })

Processing BART-Large


Processing Test Set with BART-Large: 100%|██████████| 719/719 [1:12:24<00:00,  6.04s/it]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.41 seconds, 4.84 sentences/sec
Processing BART-CNN


Processing Test Set with BART-CNN: 100%|██████████| 719/719 [50:29<00:00,  4.21s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.07 seconds, 28.41 sentences/sec


In [None]:
# Build the comparison table: one row per model, one column per metric,
# then echo it as plain text.
df = pd.DataFrame(data=results)
print(df)

        Model      BLEU   ROUGE-1   ROUGE-2   ROUGE-L    METEOR  \
0  BART-Large  0.171243  0.357737  0.158486  0.224820  0.405350   
1    BART-CNN  0.275657  0.423971  0.203607  0.298136  0.392444   

   Cosine Similarity  BERTScore  
0           0.439498   0.859324  
1           0.439810   0.880669  


        Model      BLEU   ROUGE-1   ROUGE-2   ROUGE-L    METEOR  \
0  BART-Large  0.171243  0.357737  0.158486  0.224820  0.405350   
1    BART-CNN  0.275657  0.423971  0.203607  0.298136  0.392444   

   Cosine Similarity  BERTScore  
0           0.439498   0.859324  
1           0.439810   0.880669

In [None]:
# Show the results table as the cell's output (rendered by the notebook rather than print).
df

Unnamed: 0,Model,BLEU,ROUGE-1,ROUGE-2,ROUGE-L,METEOR,Cosine Similarity,BERTScore
0,BART-Large,0.171243,0.357737,0.158486,0.22482,0.40535,0.439498,0.859324
1,BART-CNN,0.275657,0.423971,0.203607,0.298136,0.392444,0.43981,0.880669
