In [None]:
!pip install -q nltk evaluate rouge-score bert_score transformers accelerate

In [None]:
# Imports
from transformers import AutoModelForCausalLM, AutoTokenizer
from rouge_score import rouge_scorer
from tqdm import tqdm
import pandas as pd
from collections import defaultdict
import json
import nltk
import torch
from evaluate import load as load_metric

In [None]:
!pip uninstall bitsandbytes

In [None]:
!pip install -U bitsandbytes

In [None]:
import bitsandbytes

In [None]:
# Download the WordNet and Punkt resources through NLTK.
# NOTE(review): presumably required by the METEOR metric and its tokenization - confirm.
nltk.download('wordnet')
nltk.download('punkt')

In [None]:
# Log in to the Hugging Face Hub; login() is called with no arguments here.
# NOTE(review): presumably needed to download the checkpoints loaded below - confirm whether they are gated or private.
from huggingface_hub import login
login()

In [None]:
def _load_model_and_tokenizer(repo_id):
    """Load a causal LM in float16 with automatic device placement, plus its tokenizer.

    Args:
        repo_id: Hub repository identifier passed to both `from_pretrained` calls.

    Returns:
        tuple: (model, tokenizer)
    """
    loaded_model = AutoModelForCausalLM.from_pretrained(
        repo_id,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    loaded_tokenizer = AutoTokenizer.from_pretrained(repo_id)
    return loaded_model, loaded_tokenizer


# The three checkpoints compared in this notebook, loaded in the same order as before.
model_a, tokenizer_a = _load_model_and_tokenizer("MarsGray/Gemma-2b-it-merged-1.0")
model_b, tokenizer_b = _load_model_and_tokenizer("MarsGray/Phi-4-mini-instruct-merged-1.2")
model_c, tokenizer_c = _load_model_and_tokenizer("GRojas98/qa-llama-finetuned")

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/9.19M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

In [None]:
for tokenizer in [tokenizer_a, tokenizer_b, tokenizer_c]:
    # Reuse EOS as the padding token so `padding=True` always has a token to pad with.
    tokenizer.pad_token = tokenizer.eos_token
    # Decoder-only models continue from the last input position, so padding must
    # sit on the left for generation. With one prompt per call no padding is
    # added, so this only matters once prompts are batched.
    tokenizer.padding_side = "left"

In [None]:
# Metric backends shared by the generation and evaluation helpers below.
from bert_score import score as bert_score

scorer = rouge_scorer.RougeScorer(
    rouge_types=['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)
bleu_metric = load_metric("bleu")
meteor_metric = load_metric("meteor")


def generate_response(model, tokenizer, prompt, max_new_tokens=256):
  """
  Generate a response from the model and tokenizer based on the prompt.

  Only the newly generated continuation is returned. For causal language
  models, `generate` returns the prompt tokens followed by the continuation,
  so the prompt part is sliced off before decoding; otherwise the prompt
  text would leak into every prediction and distort the metrics.

  Args:
    model: The language model to generate response.
    tokenizer: The tokenizer for the corresponding model.
    prompt: The input instruction/question
    max_new_tokens: Max number of new tokens for generation

  Returns:
    str: The generated response, without the echoed prompt
  """
  inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)

  generate_kwargs = {"max_new_tokens": max_new_tokens}
  # Pass the pad token explicitly (when the tokenizer has one) so `generate`
  # does not fall back to EOS and log a warning for every sample.
  pad_token_id = getattr(tokenizer, "pad_token_id", None)
  if pad_token_id is not None:
    generate_kwargs["pad_token_id"] = pad_token_id

  outputs = model.generate(**inputs, **generate_kwargs)

  # Number of prompt positions at the start of the generated sequence.
  prompt_length = inputs["input_ids"].shape[-1]
  return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

def evaluate_model(model, tokenizer, dataset):
  """
  Run the model over every sample and score each output with ROUGE.

  Args:
    model: The language model to evaluate.
    tokenizer: The tokenizer for the corresponding model.
    dataset: Iterable of dictionaries with "instruction" and "response" keys.

  Returns:
    tuple: Three parallel lists, in dataset order:
        - ROUGE score dictionaries, one per sample
        - model predictions
        - reference responses
  """
  rouge_results, generated, targets = [], [], []
  for example in tqdm(dataset):
    instruction, target = example["instruction"], example["response"]

    # The generated text is scored as returned; no post-processing is applied.
    answer = generate_response(model, tokenizer, instruction)

    rouge_results.append(scorer.score(target, answer))
    generated.append(answer)
    targets.append(target)
  return rouge_results, generated, targets

def average_rouge_scores(scores):
  """
  Average the ROUGE F-measures over a list of per-sample score dictionaries.

  Args:
    scores (list): Score dictionaries whose values expose `.fmeasure`.

  Returns:
    dict: Metric name -> summed F-measure divided by `len(scores)`.
      An empty input yields an empty dict.
  """
  totals = {}
  for entry in scores:
    for metric, value in entry.items():
      totals[metric] = totals.get(metric, 0.0) + value.fmeasure
  sample_count = len(scores)
  return {metric: total / sample_count for metric, total in totals.items()}

def evaluate_with_bertscore(predictions, references, lang='en'):
    """
    Score predictions against references with BERTScore and average the results.

    Args:
        predictions (list of str): Model-generated responses.
        references (list of str): Ground truth answers.
        lang (str): Language code forwarded to `bert_score` (default: 'en').

    Returns:
        dict: Mean values under the keys "bert_precision", "bert_recall"
        and "bert_f1".
    """
    precision, recall, f1 = bert_score(predictions, references, lang=lang)
    labelled = zip(
        ("bert_precision", "bert_recall", "bert_f1"),
        (precision, recall, f1),
    )
    return {label: values.mean().item() for label, values in labelled}

def compute_bleu_meteor(predictions, references):
    """
    Score predictions against references with BLEU and METEOR.

    Args:
        predictions (list): Generated responses.
        references (list): Reference responses, one per prediction.

    Returns:
        dict: The "bleu" and "meteor" values reported by the two metrics.
    """
    # BLEU receives a list of references per prediction, so each one is wrapped.
    wrapped_references = [[reference] for reference in references]
    bleu_value = bleu_metric.compute(
        predictions=predictions, references=wrapped_references
    )["bleu"]

    meteor_value = meteor_metric.compute(
        predictions=predictions, references=references
    )["meteor"]

    return {"bleu": bleu_value, "meteor": meteor_value}


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# Mount Google Drive at /content/gdrive; the paths used below live under it.
# Uses the google.colab package, so this cell assumes a Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Load the evaluation samples from QA_finetuned.jsonl
file_path = "/content/gdrive/MyDrive/QA_finetuned.jsonl"
# Zero-based index of the first line used for evaluation; earlier lines are ignored.
# NOTE(review): presumably the first 430 lines were used for fine-tuning - confirm.
EVAL_START_INDEX = 430
eval_set = []
skipped_lines = 0  # in-range lines that were malformed or lacked a required key
with open(file_path, "r", encoding="utf-8") as f:
  for i, line in enumerate(f):
    if i < EVAL_START_INDEX:
      continue
    line = line.strip()
    if not line:
      continue  # blank lines are not counted as errors
    try:
      sample = json.loads(line)
    except json.JSONDecodeError:
      skipped_lines += 1
      continue
    if "instruction" in sample and "response" in sample:
      eval_set.append(sample)
    else:
      skipped_lines += 1
print(f"Loaded {len(eval_set)} evaluation sample.")
if skipped_lines:
  # Report dropped lines instead of discarding them silently.
  print(f"Warning: skipped {skipped_lines} malformed or incomplete line(s).")

Loaded 108 evaluation sample.


In [None]:
# Run Evaluations
# Generate a prediction for every sample of eval_set with each model and
# collect per-sample ROUGE scores, predictions and references.
scores_a, preds_a, refs_a = evaluate_model(model_a, tokenizer_a, eval_set)
scores_b, preds_b, refs_b = evaluate_model(model_b, tokenizer_b, eval_set)
scores_c, preds_c, refs_c = evaluate_model(model_c, tokenizer_c, eval_set)

# Collapse the per-sample ROUGE scores into one average F-measure per ROUGE type.
avg_a = average_rouge_scores(scores_a)
avg_b = average_rouge_scores(scores_b)
avg_c = average_rouge_scores(scores_c)

# Mean BERTScore precision/recall/F1 of each model's predictions against the references.
bert_a = evaluate_with_bertscore(preds_a, refs_a)
bert_b = evaluate_with_bertscore(preds_b, refs_b)
bert_c = evaluate_with_bertscore(preds_c, refs_c)

100%|██████████| 108/108 [04:22<00:00,  2.43s/it]
100%|██████████| 108/108 [32:04<00:00, 17.82s/it]
  0%|          | 0/108 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 1/108 [00:12<21:30, 12.06s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▏         | 2/108 [00:23<20:49, 11.78s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         | 3/108 [00:35<20:29, 11.71s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  4%|▎         | 4/108 [00:45<19:34, 11.29s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  5%|▍         | 5/108 [00:53<16:57,  9.88s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  6%|▌         | 6/108 [01:05<17:52, 10.51s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  6%|▋         | 7/108 [01:16<18:18, 10.88s/it]Setting `pad_token_id

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# One row per model: averaged ROUGE F-measures followed by the BERTScore means.
results_df = pd.DataFrame(
    [
        {**rouge_avg, **bert_avg}
        for rouge_avg, bert_avg in (
            (avg_a, bert_a),
            (avg_b, bert_b),
            (avg_c, bert_c),
        )
    ],
    index=["Gemma", "Microsoft", "Llama"],
)

display(results_df)


Unnamed: 0,rouge1,rouge2,rougeL,bert_precision,bert_recall,bert_f1
Gemma,0.143823,0.102429,0.125795,0.759889,0.861523,0.807231
Microsoft,0.095097,0.069549,0.084658,0.749458,0.858103,0.799877
Llama,0.103615,0.075338,0.092852,0.749629,0.856993,0.799505


In [None]:
# Persist the comparison table to Drive; the model names in the index are written as the first column.
results_df.to_csv('/content/gdrive/MyDrive/GemmaVsMicrosoftVsLlamaAllMetrics.csv')