Evaluation of T5- small and T5--base done on CNN/DailyMail

In [None]:
import torch
from datasets import load_dataset, load_metric
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import pandas as pd

In [None]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
dataset = load_dataset('cnn_dailymail', '3.0.0', split='test')

#initializing models and tokenizers for T5-small and T5-base modekls
models = {
    "T5-small": T5ForConditionalGeneration.from_pretrained("t5-small"),
    "T5-base": T5ForConditionalGeneration.from_pretrained("t5-base")
}

tokenizers = {
    "T5-small": T5Tokenizer.from_pretrained("t5-small"),
    "T5-base": T5Tokenizer.from_pretrained("t5-base")
}

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
for model in models.values():
    model.to(device)

In [None]:
rouge = load_metric('rouge', trust_remote_code=True)
meteor = load_metric('meteor', trust_remote_code=True)
#batch size adjusted in accordance with runtime
batch_size = 16
results = []

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
for model_name in models.keys():
    reference_summaries= []
    generated_summaries= []

    model = models[model_name]
    tokenizer = tokenizers[model_name]

    print(f"Processing {model_name}")

    # Batch processing
    for i in tqdm(range(0, len(dataset), batch_size), desc=f"Processing Test Set with {model_name}"):
        batch = dataset.select(range(i, min(i + batch_size, len(dataset))))

        #accessing the articles and references directly from the batch dictionary
        articles = batch['article']
        references = batch['highlights']

        #tokenizing and encoding of the articles
        inputs = tokenizer(articles, max_length=512, return_tensors='pt', truncation=True, padding=True)
        inputs = inputs.to(device)

        #generating summaries using T5
        summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=142, early_stopping=True)
        generated_batch = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]

        reference_summaries.extend([nltk.word_tokenize(ref.lower()) for ref in references])
        generated_summaries.extend([nltk.word_tokenize(gen.lower()) for gen in generated_batch])

        rouge.add_batch(predictions=generated_batch, references=references)
        meteor.add_batch(predictions=generated_batch, references=references)

    #BLEUscore
    bleu_score = corpus_bleu([[ref] for ref in reference_summaries], generated_summaries, smoothing_function=SmoothingFunction().method7)
    #ROUGE Score
    rouge_result = rouge.compute()
    #METEOR Score
    meteor_result = meteor.compute()

    results.append({
        "Model": model_name,
        "BLEU": bleu_score,
        "ROUGE-1": rouge_result['rouge1'].mid.fmeasure,
        "ROUGE-2": rouge_result['rouge2'].mid.fmeasure,
        "ROUGE-L": rouge_result['rougeL'].mid.fmeasure,
        "METEOR": meteor_result['meteor']
    })


Processing T5-small


Processing Test Set with T5-small: 100%|██████████| 719/719 [32:30<00:00,  2.71s/it]


Processing T5-base


Processing Test Set with T5-base: 100%|██████████| 719/719 [52:14<00:00,  4.36s/it]


In [None]:
df = pd.DataFrame(results)
print(df)

      Model      BLEU   ROUGE-1   ROUGE-2   ROUGE-L    METEOR
0  T5-small  0.214529  0.359605  0.155545  0.249217  0.308347
1   T5-base  0.222824  0.379325  0.169113  0.262846  0.330065


In [None]:
pip install bert-score



In [None]:
from bert_score import score as bertscore
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# If tthe generated_summaries and reference_summaries are tokenized (list of lists), converting them back to strings
generated_summaries = [" ".join(summary) for summary in generated_summaries]
reference_summaries = [" ".join(summary) for summary in reference_summaries]

# BERT_Score calculation
P, R, F1 = bertscore(generated_summaries, reference_summaries, lang="en", rescale_with_baseline=True)

# cosine similarity calculation
def calculate_cosine_similarity(generated_summaries, reference_summaries):
    vectorizer = TfidfVectorizer().fit_transform(generated_summaries + reference_summaries)
    vectors = vectorizer.toarray()

    cosine_similarities = []
    num_summaries = len(generated_summaries)

    for i in range(num_summaries):
        cosine_sim = cosine_similarity([vectors[i]], [vectors[num_summaries + i]])[0][0]
        cosine_similarities.append(cosine_sim)

    return np.mean(cosine_similarities)

cosine_sim = calculate_cosine_similarity(generated_summaries, reference_summaries)

#results
bertscore_results = {
    "BERTScore-P": np.mean(P.numpy()),
    "BERTScore-R": np.mean(R.numpy()),
    "BERTScore-F1": np.mean(F1.numpy()),
    "Cosine Similarity": cosine_sim
}

print(bertscore_results)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'BERTScore-P': 0.24729992, 'BERTScore-R': 0.23538211, 'BERTScore-F1': 0.24161659, 'Cosine Similarity': 0.3853122305489106}


In [None]:
#the df along with BLEU, ROUGE, and METEOR scores is added to the above existing df.

df['BERTScore-P'] = bertscore_results['BERTScore-P']
df['BERTScore-R'] = bertscore_results['BERTScore-R']
df['BERTScore-F1'] = bertscore_results['BERTScore-F1']
df['Cosine Similarity'] = bertscore_results['Cosine Similarity']

print(df)


      Model      BLEU   ROUGE-1   ROUGE-2   ROUGE-L    METEOR  BERTScore-P  \
0  T5-small  0.214529  0.359605  0.155545  0.249217  0.308347       0.2473   
1   T5-base  0.222824  0.379325  0.169113  0.262846  0.330065       0.2473   

   BERTScore-R  BERTScore-F1  Cosine Similarity  
0     0.235382      0.241617           0.385312  
1     0.235382      0.241617           0.385312  


In [None]:
df

Unnamed: 0,Model,BLEU,ROUGE-1,ROUGE-2,ROUGE-L,METEOR,BERTScore-P,BERTScore-R,BERTScore-F1,Cosine Similarity
0,T5-small,0.214529,0.359605,0.155545,0.249217,0.308347,0.2473,0.235382,0.241617,0.385312
1,T5-base,0.222824,0.379325,0.169113,0.262846,0.330065,0.2473,0.235382,0.241617,0.385312


In [None]:
import platform
import psutil
import torch
import os
import sys
import subprocess

# System Info
system = platform.system()
release = platform.release()
version = platform.version()
architecture = platform.architecture()[0]
processor = platform.processor()
python_version = platform.python_version()
machine = platform.machine()
memory = psutil.virtual_memory()
cpu_count = psutil.cpu_count(logical=True)
kernel_version = os.uname()

# GPU Info
if torch.cuda.is_available():
    cuda_version = torch.version.cuda
    gpu_name = torch.cuda.get_device_name(0)
    gpu_count = torch.cuda.device_count()
    gpu_capability = torch.cuda.get_device_capability(0)
    gpu_memory_total = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)  # Converting to GB
else:
    cuda_version = "No CUDA available"
    gpu_name = "No GPU available"
    gpu_count = 0
    gpu_capability = "N/A"
    gpu_memory_total = "N/A"
try:
    nvcc_output = subprocess.check_output(["nvcc", "--version"]).decode("utf-8")
    nvcc_version = nvcc_output.split()[-2]
except:
    nvcc_version = "nvcc not installed"

#installed Packages
installed_packages = subprocess.check_output([sys.executable, "-m", "pip", "list"]).decode("utf-8")
print(f"System: {system}")
print(f"Release: {release}")
print(f"Version: {version}")
print(f"Kernel Version: {kernel_version.sysname} {kernel_version.release}")
print(f"Architecture: {architecture}")
print(f"Processor: {processor}")
print(f"Python Version: {python_version}")
print(f"Machine: {machine}")
print(f"Total Memory: {memory.total / (1024 ** 3):.2f} GB")
print(f"Available Memory: {memory.available / (1024 ** 3):.2f} GB")
print(f"CPU Count: {cpu_count}")
print(f"GPU Name: {gpu_name}")
print(f"Number of GPUs: {gpu_count}")
print(f"CUDA Version (PyTorch): {cuda_version}")
print(f"CUDA Version (nvcc): {nvcc_version}")
print(f"GPU Capabilities: {gpu_capability}")
print(f"GPU Memory Total: {gpu_memory_total:.2f} GB")
print("\nSample of Installed Packages:\n", installed_packages[:500])  # Limit output to first 500 characters


System: Linux
Release: 6.1.85+
Version: #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
Kernel Version: Linux 6.1.85+
Architecture: 64bit
Processor: x86_64
Python Version: 3.10.12
Machine: x86_64
Total Memory: 83.48 GB
Available Memory: 77.67 GB
CPU Count: 12
GPU Name: NVIDIA A100-SXM4-40GB
Number of GPUs: 1
CUDA Version (PyTorch): 12.1
CUDA Version (nvcc): Build
GPU Capabilities: (8, 0)
GPU Memory Total: 39.56 GB

Sample of Installed Packages:
 Package                          Version
-------------------------------- ---------------------
absl-py                          1.4.0
accelerate                       0.32.1
aiohappyeyeballs                 2.3.5
aiohttp                          3.10.2
aiosignal                        1.3.1
alabaster                        0.7.16
albucore                         0.0.13
albumentations                   1.4.13
altair                           4.2.2
annotated-types                  0.7.0
anyio    
