### Model 1: BlenderBot model evaluation

In [4]:
import torch
from torch.utils.data import DataLoader
from transformers import (
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    BlenderbotTokenizer,
    BlenderbotForConditionalGeneration,
    DataCollatorForSeq2Seq,
    AutoModelForSequenceClassification,
    AutoTokenizer
)
from datasets import DatasetDict, Dataset, load_dataset
import numpy as np
import pandas as pd
import random
import evaluate

In [3]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

Using device: mps


In [5]:
sacrebleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

In [None]:
def test_loop(model, tokenizer, test_dataset, data_collator):
    dataloader = DataLoader(test_dataset, batch_size=4, collate_fn=data_collator)
    model.eval()
    # model.to(device)

    all_preds = []
    all_labels = []

    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}

        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=128,
                num_beams=2,
            )

        preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        labels = batch["labels"]
        labels_with_pad = torch.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels_with_pad, skip_special_tokens=True)

        all_preds.extend(preds)
        all_labels.extend(decoded_labels)

    # ROUGE
    rouge_score = rouge.compute(predictions=all_preds, references=all_labels, use_stemmer=True)

    # BLEU
    bleu_score = sacrebleu.compute(predictions=all_preds, references=[[ref] for ref in all_labels])

    # BERTScore
    bert_score = bertscore.compute(predictions=all_preds, references=all_labels, lang="en")
    bertscore_precision = np.mean(bert_score["precision"])
    bertscore_recall = np.mean(bert_score["recall"])
    bertscore_f1 = np.mean(bert_score["f1"])

    return {
        "rouge1": rouge_score["rouge1"],
        "rouge2": rouge_score["rouge2"],
        "rougeL": rouge_score["rougeL"],
        "bleu": bleu_score["score"],
        "bertscore_precision": bertscore_precision,
        "bertscore_recall": bertscore_recall,
        "bertscore_f1": bertscore_f1,
    }

In [8]:
dataset = load_dataset("facebook/empathetic_dialogues")

In [10]:
from datasets import load_from_disk

In [11]:
tokenized_dataset = load_from_disk("./Chatbot Training/blender_tokenized_dataset")

In [12]:
model_path = "./Chatbot Training/blender_empathetic_final/"
tokenizer_final = BlenderbotTokenizer.from_pretrained(model_path)
model_final = BlenderbotForConditionalGeneration.from_pretrained(model_path)

In [13]:
device

device(type='mps')

In [14]:
model_final.to(device)

BlenderbotForConditionalGeneration(
  (model): BlenderbotModel(
    (shared): BlenderbotScaledWordEmbedding(8041, 1280, padding_idx=0)
    (encoder): BlenderbotEncoder(
      (embed_tokens): BlenderbotScaledWordEmbedding(8041, 1280, padding_idx=0)
      (embed_positions): BlenderbotLearnedPositionalEmbedding(128, 1280)
      (layers): ModuleList(
        (0-1): 2 x BlenderbotEncoderLayer(
          (self_attn): BlenderbotAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5

In [15]:
data_collator = DataCollatorForSeq2Seq(tokenizer_final, model=model_final)

In [16]:
test_results = test_loop(model_final, tokenizer_final,tokenized_dataset['test'], data_collator)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
test_results

{'rouge1': np.float64(0.1970258452150462),
 'rouge2': np.float64(0.048995485008865955),
 'rougeL': np.float64(0.1729539956933947),
 'bleu': 2.6440061779484996,
 'bertscore_precision': np.float64(0.8707038615839667),
 'bertscore_recall': np.float64(0.8642908634422989),
 'bertscore_f1': np.float64(0.8673355983103836)}

In [20]:
print("Rouge 1 score is", test_results['rouge1'] * 100)
print("Rouge 2 score is", test_results['rouge2'] * 100)
print("Rouge L score is", test_results['rougeL'] * 100)
print("BLEU score is", test_results['bleu'])
print("Bertscore Precision is", test_results['bertscore_precision'] * 100)
print("Bertscore Recall is", test_results['bertscore_recall'] * 100)
print("Bertscore F1 is", test_results['bertscore_f1'] * 100)

Rouge 1 score is 19.702584521504622
Rouge 2 score is 4.8995485008865955
Rouge L score is 17.29539956933947
BLEU score is 2.6440061779484996
Bertscore Precision is 87.07038615839667
Bertscore Recall is 86.42908634422989
Bertscore F1 is 86.73355983103836
