DialogSum BERT Model Evaluation

In [None]:
!pip install -U transformers
!pip install -U accelerate
!pip install -U datasets
!pip install -U bertviz
!pip install -U umap-learn
!pip install -U sentencepiece
!pip install -U urllib3
!pip install -U py7zr

Collecting transformers
  Downloading transformers-4.44.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.44.0-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.42.4
    Uninstalling transformers-4.42.4:
      Successfully uninstalled transformers-4.42.4
Successfully installed transformers-4.44.0
Collecting accelerate
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
 

In [None]:
'''from google.colab import drive
drive.mount('/content/drive')'''

Mounted at /content/drive


In [None]:
from datasets import load_dataset

In [None]:
dataset = load_dataset("knkarthick/dialogsum", split='test',trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12460 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [None]:
dataset

Dataset({
    features: ['id', 'dialogue', 'summary', 'topic'],
    num_rows: 1500
})

In [None]:
!pip install bert-score
!pip install rouge-score

import torch
from datasets import load_dataset, load_metric
from transformers import BertTokenizer, EncoderDecoderModel, AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bert_score import score as bert_score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=051dec027223611e39b159c845f81db1606668d348580f66aaec16ff5501ed26
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
#initializing for BERT and BERTSUM models and its tokenizers
models = {
    "BERT (Basic Encoder-Decoder)": EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased"),
    "BERTSUM": AutoModelForSeq2SeqLM.from_pretrained("Shobhank-iiitdwd/BERT_summary"),
}

tokenizers = {
    "BERT (Basic Encoder-Decoder)": BertTokenizer.from_pretrained("bert-base-uncased"),
    "BERTSUM": AutoTokenizer.from_pretrained("Shobhank-iiitdwd/BERT_summary"),
}

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
for model in models.values():
    model.to(device)

#initializing the evaluation metrics
rouge = load_metric('rouge', trust_remote_code=True)
meteor = load_metric('meteor', trust_remote_code=True)


batch_size = 4 # based on the GPU-RAM if available on runtime 3, using A100 GPU, sufficient memmory was there to support this batch size=16configuration
results = []

# batch wise processing of each model
for model_name in models.keys():
    reference_summaries = []
    generated_summaries = []

    model = models[model_name]
    tokenizer = tokenizers[model_name]

    print(f"Processing {model_name}")

    decoder_start_token_id = tokenizer.pad_token_id

    #  batch processing
    for i in tqdm(range(0, len(dataset), batch_size), desc=f"Processing Test Set with {model_name}"):
        batch = dataset.select(range(i, min(i + batch_size, len(dataset))))

        # accessing the articles and references directly from the batch dictionary
        articles = batch['dialogue']
        references = batch['summary']

        # Tokenizing and encoding the articles with left padding
        inputs = tokenizer(
            articles,
            max_length=512,
            return_tensors='pt',
            truncation=True,
            padding='longest',
            pad_to_multiple_of=512
        )
        inputs = inputs.to(device)

        # Generating  summaries using the model
        summary_ids = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],  # Attention mask for proper decoding as BERT faces has encoder only arc.
            decoder_start_token_id=decoder_start_token_id,
            num_beams=4,
            max_length=142,
            early_stopping=True
        )
        generated_batch = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]

        #reference and generated summaries stored
        reference_summaries.extend([nltk.word_tokenize(ref.lower()) for ref in references])
        generated_summaries.extend([nltk.word_tokenize(gen.lower()) for gen in generated_batch])

        rouge.add_batch(predictions=generated_batch, references=references)
        meteor.add_batch(predictions=generated_batch, references=references)

    # BLEUscore
    bleu_score = corpus_bleu([[ref] for ref in reference_summaries], generated_summaries, smoothing_function=SmoothingFunction().method7)
    # ROUGEscore
    rouge_result = rouge.compute()
    # METEORscore
    meteor_result = meteor.compute()
    # Cosinesimilarity
    vectorizer = TfidfVectorizer().fit(references + generated_batch)
    reference_vectors = vectorizer.transform(references)
    generated_vectors = vectorizer.transform(generated_batch)
    cosine_scores = cosine_similarity(reference_vectors, generated_vectors).diagonal().mean()
    # BERTScore
    P, R, F1 = bert_score(generated_batch, references, lang='en', verbose=True)
    bert_score_f1 = F1.mean().item()

    results.append({
        "Model": model_name,
        "BLEU": bleu_score,
        "ROUGE-1": rouge_result['rouge1'].mid.fmeasure,
        "ROUGE-2": rouge_result['rouge2'].mid.fmeasure,
        "ROUGE-L": rouge_result['rougeL'].mid.fmeasure,
        "METEOR": meteor_result['meteor'],
        "Cosine Similarity": cosine_scores,
        "BERTScore": bert_score_f1
    })

# DF from the results
df = pd.DataFrame(results)
print(df)


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

config.json:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

cuda


  rouge = load_metric('rouge', trust_remote_code=True)


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Processing BERT (Basic Encoder-Decoder)


Processing Test Set with BERT (Basic Encoder-Decoder): 100%|██████████| 375/375 [22:45<00:00,  3.64s/it]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.40 seconds, 10.03 sentences/sec
Processing BERTSUM


Processing Test Set with BERTSUM: 100%|██████████| 375/375 [13:01<00:00,  2.08s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.15 seconds, 27.00 sentences/sec
                          Model      BLEU   ROUGE-1   ROUGE-2   ROUGE-L  \
0  BERT (Basic Encoder-Decoder)  0.065428  0.010812  0.000000  0.010849   
1                       BERTSUM  0.098049  0.189944  0.053992  0.140681   

     METEOR  Cosine Similarity  BERTScore  
0  0.019318           0.000000   0.693550  
1  0.201462           0.130055   0.842697  


In [None]:
df

Unnamed: 0,Model,BLEU,ROUGE-1,ROUGE-2,ROUGE-L,METEOR,Cosine Similarity,BERTScore
0,BERT (Basic Encoder-Decoder),0.065428,0.010812,0.0,0.010849,0.019318,0.0,0.69355
1,BERTSUM,0.098049,0.189944,0.053992,0.140681,0.201462,0.130055,0.842697
