In [17]:
import datasets
from transformers import AutoTokenizer, AutoModelForCausalLM
from nltk.translate.bleu_score import sentence_bleu
import torch
# Load fine-tuned model and tokenizer
model_name = "EleutherAI/pythia-70m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Generate predicted answer
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
    # Tokenize
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens
    )
    # Generate
    device = model.device
    attention_mask = torch.ones_like(input_ids)  # Create mask with all 1s
    # Fix: Mask all padding tokens, including the first element
    attention_mask[input_ids == tokenizer.pad_token_id] = 0
    generated_tokens_with_prompt = model.generate(
        input_ids.to(device),
        max_length=max_output_tokens,
        attention_mask=attention_mask,
        pad_token_id=tokenizer.eos_token_id  # Set pad token
    )
    # Decode
    generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)
    # Strip the prompt
    generated_text_answer = generated_text_with_prompt[0][len(text):]
    return generated_text_answer


In [136]:
# Sample question and answer from evaluation dataset
test_question = "What is the capital of France?"
answer = "Paris"
predicted_answer = inference(test_question, model, tokenizer)
print(predicted_answer)



The French are the only ones who can afford to pay their taxes. They are the only ones who can afford to pay their taxes. They are the only ones who can afford to pay their taxes. They are the only ones who can afford to pay their taxes. They are the only ones who can afford to pay their taxes. They are the only ones who can afford to pay their taxes. They are the only ones who can afford to pay their taxes


In [137]:
def exact_match(answer, predicted_answer):
  """
  This function calculates the exact match ratio between the answer and predicted answer.

  Args:
      answer: The ground truth answer (string).
      predicted_answer: The predicted answer by the LLM (string).

  Returns:
      A float value (1.0 for exact match, 0.0 otherwise).
  """
  return 1.0 if answer.lower() == predicted_answer.lower() else 0.0

In [138]:
def bleu_score(answer, predicted_answer):
  """
  This function calculates a BLEU score between the answer and predicted answer using the `nltk` library with smoothing.

  Args:
      answer: The ground truth answer (string).
      predicted_answer: The predicted answer by the LLM (string).

  Returns:
      A float value representing the BLEU score (higher is better).

  **Requires `nltk` library to be installed (`pip install nltk`).**
  """
  from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
  reference = [answer.split()]
  candidate = predicted_answer.split()
  smooth = SmoothingFunction()  # Create a SmoothingFunction object
  return sentence_bleu(reference, candidate, smoothing_function=smooth.method0)  # Use method0 from SmoothingFunction

In [139]:
def rouge_n(answer, predicted_answer, n):
  """
  This function calculates ROUGE-N score (e.g., ROUGE-1, ROUGE-2) between the answer and predicted answer using the `datasets` library.

  Args:
    answer: The ground truth answer (string).
    predicted_answer: The predicted answer by the LLM (string).
    n: The n-gram size for the ROUGE metric (e.g., 1 for unigrams).

  Returns:
    A dictionary containing precision, recall, and F1 score for ROUGE-N.

  **Requires `datasets` library to be installed (`pip install datasets`).**
  """
  from datasets import load_metric
  rouge = load_metric("rouge")

  if n == 1:
    return rouge.compute(predictions=[predicted_answer], references=[[answer]], rouge_types=["rouge1"])
  elif n == 2:
    return rouge.compute(predictions=[predicted_answer], references=[[answer]], rouge_types=["rouge2"])
  # You can add similar logic for ROUGE-L or other variants
  else:
    raise ValueError("ROUGE-N not supported for n > 2. Choose n=1 or n=2.")



In [140]:
def f1_score_a(answer, predicted_answer):
    """
    This function calculates F1 score between the answer and predicted answer 

    Args:
      answer: The ground truth answer (string).
      predicted_answer: The predicted answer by the LLM (string).

    Returns:
      A float value representing the F1 score (higher is better).
    """
    answer_tokens = set(answer.lower().split())
    predicted_tokens = set(predicted_answer.lower().split())

    # Calculate precision
    precision = len(answer_tokens.intersection(predicted_tokens)) / len(predicted_tokens)
    
    # Calculate recall
    recall = len(answer_tokens.intersection(predicted_tokens)) / len(answer_tokens)
    
    # Handle division by zero for precision or recall
    if precision + recall == 0:
        return 0
    
    # Calculate F1 score
    f1_score = 2 * (precision * recall) / (precision + recall)
    
    return f1_score




In [141]:
def f1_score(answer, predicted_answer):
    """
    This function calculates F1 score between the answer and predicted answer 

    Args:
      answer: The ground truth answer (string).
      predicted_answer: The predicted answer by the LLM (string).

    Returns:
      A float value representing the F1 score (higher is better).
    """
    answer_tokens = set(answer.lower().split())
    predicted_tokens = set(predicted_answer.lower().split())

    # Calculate precision
    precision = len(answer_tokens.intersection(predicted_tokens)) / len(predicted_tokens)
    
    # Calculate recall
    recall = len(answer_tokens.intersection(predicted_tokens)) / len(answer_tokens)
    
    # Handle division by zero for precision or recall
    if precision + recall == 0:
        return 0
    
    # Calculate F1 score
    f1_score = 2 * (precision * recall) / (precision + recall)
    
    return f1_score




In [142]:
# Sample usage
test_question = "What is the capital of France?"
answer = "Paris"
predicted_answer = 'The French are the only ones who can afford to pay their taxes. They are the only ones who can afford to pay their taxes.'
predicted_answer = "The capital is Paris"
# Calculate BLEU Score
bleu = sentence_bleu([answer.split()], predicted_answer.split())
# Calculate Exact Match
exact_match_value = int(predicted_answer.strip() == answer.strip())

print(f"Test Question: {test_question}")
print(f"Predicted Answer: {predicted_answer}")
print(f"Ground-Truth Answer: {answer}")
print(f"BLEU Score: {bleu}")
print("BLEU Score:", bleu_score(answer, predicted_answer))
print("ROUGE-1 Score:", rouge_n(answer, predicted_answer, 1))
# You can call rouge_n with n=2 for ROUGE-2 score
print("ROUGE-2 Score:", rouge_n(answer, predicted_answer, 2))
print("F1 Score:", f1_score_a(answer, predicted_answer))
print("F1 Score:", f1_score(answer, predicted_answer))
print("Exact Match Ratio:", exact_match(answer, predicted_answer))
print("exact_match_value",exact_match_value)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Test Question: What is the capital of France?
Predicted Answer: The capital is Paris
Ground-Truth Answer: Paris
BLEU Score: 1.2882297539194154e-231
BLEU Score: 1.2882297539194154e-231
ROUGE-1 Score: {'rouge1': AggregateScore(low=Score(precision=0.25, recall=1.0, fmeasure=0.4), mid=Score(precision=0.25, recall=1.0, fmeasure=0.4), high=Score(precision=0.25, recall=1.0, fmeasure=0.4))}
ROUGE-2 Score: {'rouge2': AggregateScore(low=Score(precision=0.0, recall=0.0, fmeasure=0.0), mid=Score(precision=0.0, recall=0.0, fmeasure=0.0), high=Score(precision=0.0, recall=0.0, fmeasure=0.0))}
F1 Score: 0.4
F1 Score: 0.4
Exact Match Ratio: 0.0
exact_match_value 0
