In [37]:
import datasets
from transformers import AutoTokenizer, AutoModelForCausalLM
from nltk.translate.bleu_score import sentence_bleu
import torch
# Load the tokenizer and the causal-LM weights for `model_name` via `from_pretrained`.
# These two globals (`tokenizer`, `model`) are what the later `inference(...)` call passes in.
# NOTE(review): the original comment described this as a fine-tuned model, but the
# identifier below looks like the stock EleutherAI Pythia-70m checkpoint name —
# confirm whether a fine-tuned checkpoint path was intended here.
model_name = "EleutherAI/pythia-70m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Generate predicted answer
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
    """
    Generate a continuation of `text` with a causal language model and return only
    the newly generated text (the prompt is not included in the result).

    Args:
        text: The prompt string.
        model: A causal LM exposing `.device` and `.generate(...)`; `generate` is
            expected to return the prompt tokens followed by the new tokens.
        tokenizer: The matching tokenizer exposing `.encode(...)`, `.batch_decode(...)`,
            `.pad_token_id` and `.eos_token_id`.
        max_input_tokens: The prompt is truncated to at most this many tokens.
        max_output_tokens: Maximum number of NEW tokens to generate.

    Returns:
        The decoded text of the generated tokens only (string).
    """
    # Tokenize (truncating over-long prompts) and move to the model's device.
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    )
    input_ids = input_ids.to(model.device)

    # A single un-padded prompt attends to every token; only mask pad tokens when
    # the tokenizer actually defines a pad id. The mask is built from the already
    # moved `input_ids`, so it lives on the same device as the model.
    attention_mask = torch.ones_like(input_ids)
    pad_id = tokenizer.pad_token_id
    if pad_id is not None:
        attention_mask[input_ids == pad_id] = 0

    # `max_new_tokens` bounds the generated part only. The previous `max_length`
    # bounded prompt + output, so long prompts left no room for any output.
    generated_tokens_with_prompt = model.generate(
        input_ids,
        max_new_tokens=max_output_tokens,
        attention_mask=attention_mask,
        pad_token_id=tokenizer.eos_token_id,  # Set pad token
    )

    # Strip the prompt at the token level. Slicing the decoded string by
    # `len(text)` is wrong whenever the prompt was truncated or decoding does not
    # reproduce the prompt character-for-character.
    prompt_token_count = input_ids.shape[1]
    generated_tokens = generated_tokens_with_prompt[:, prompt_token_count:]

    # Decode
    generated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return generated_text[0]


In [38]:
# Sample question and its reference answer, hard-coded here for a quick smoke test.
test_question = "What is the capital of France?"
answer = "Paris"
# Run the prompt through `inference` using the `model` / `tokenizer` globals
# loaded earlier in the notebook, then show the raw generated text.
predicted_answer = inference(test_question, model, tokenizer)
print(predicted_answer)



The French are the only ones who can afford to pay their taxes. They are the only ones who can afford to pay their taxes. They are the only ones who can afford to pay their taxes. They are the only ones who can afford to pay their taxes. They are the only ones who can afford to pay their taxes. They are the only ones who can afford to pay their taxes. They are the only ones who can afford to pay their taxes


In [39]:
def exact_match(answer, predicted_answer):
  """
  This function calculates the exact match score between the answer and predicted answer.

  The comparison ignores letter case and leading/trailing whitespace. Generated
  text commonly starts with a space or newline, which would otherwise turn a
  correct prediction into a mismatch.

  Args:
      answer: The ground truth answer (string).
      predicted_answer: The predicted answer by the LLM (string).

  Returns:
      A float value (1.0 for exact match, 0.0 otherwise).
  """
  normalized_answer = answer.strip().lower()
  normalized_prediction = predicted_answer.strip().lower()
  return 1.0 if normalized_answer == normalized_prediction else 0.0

In [40]:

def bleu_score(answer, predicted_answer):
  """
  Sentence-level BLEU of `predicted_answer` against a single reference `answer`,
  computed with `nltk.translate.bleu_score.sentence_bleu`.

  Both strings are tokenized by whitespace. The smoothing function handed to
  NLTK is `SmoothingFunction().method0` and no `weights` are passed, so NLTK's
  default n-gram weights apply.

  Note: a later cell of this notebook defines `bleu_score` again with an extra
  `n` parameter; when the cells run in order, that later definition is the one
  in effect.

  Args:
      answer: The ground truth answer (string).
      predicted_answer: The predicted answer by the LLM (string).

  Returns:
      The value returned by `sentence_bleu` (higher is better).

  **Requires `nltk` library to be installed (`pip install nltk`).**
  """
  from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

  hypothesis_tokens = predicted_answer.split()
  reference_token_lists = [answer.split()]  # sentence_bleu takes a list of references
  return sentence_bleu(
      reference_token_lists,
      hypothesis_tokens,
      smoothing_function=SmoothingFunction().method0,
  )

In [41]:
def bleu_score(answer, predicted_answer, n=2):
    """
    This function calculates a BLEU score between the answer and predicted answer
    using the `nltk` library with smoothing (`SmoothingFunction().method1`).

    Both strings are tokenized by whitespace. N-gram orders 1..n are weighted
    uniformly, i.e. `weights=(1/n,) * n`.

    Args:
        answer: The ground truth answer (string).
        predicted_answer: The predicted answer by the LLM (string).
        n: The highest n-gram order to include (default is 2, i.e. unigrams and bigrams).
           Must be a positive integer.

    Returns:
        A float value representing the BLEU score (higher is better).

    Raises:
        ValueError: If `n` is not a positive integer.

    **Requires `nltk` library to be installed (`pip install nltk`).**
    """
    # Validate up front: n=0 would build an empty weights tuple and a non-integer
    # n would fail with an unrelated TypeError in the tuple repetition below.
    if isinstance(n, bool) or not isinstance(n, int) or n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

    reference = [answer.split()]  # sentence_bleu takes a list of references
    candidate = predicted_answer.split()
    smooth = SmoothingFunction()
    uniform_weights = (1 / n,) * n
    return sentence_bleu(
        reference,
        candidate,
        smoothing_function=smooth.method1,
        weights=uniform_weights,
    )


In [42]:
def rouge_n(answer, predicted_answer, n):
  """
  Compute ROUGE-1 or ROUGE-2 for one prediction against one reference using the
  "rouge" metric loaded through `datasets.load_metric`.

  Args:
    answer: The ground truth answer (string).
    predicted_answer: The predicted answer by the LLM (string).
    n: The n-gram size for the ROUGE metric; only 1 and 2 are accepted.

  Returns:
    Whatever the metric's `compute(...)` returns for the requested variant. In
    this notebook's recorded output that is a dictionary keyed by "rouge1" /
    "rouge2" holding precision, recall and F-measure.

  Raises:
    ValueError: If `n` is neither 1 nor 2.

  **Requires `datasets` library to be installed (`pip install datasets`).**
  """
  from datasets import load_metric
  # NOTE(review): `load_metric` is reportedly deprecated in newer `datasets`
  # releases in favour of the separate `evaluate` package — confirm against the
  # installed version.
  scorer = load_metric("rouge")

  # Only the unigram and bigram variants are wired up here.
  # ROUGE-L or other variants could be added with similar logic.
  if n not in (1, 2):
    raise ValueError("ROUGE-N not supported for n > 2. Choose n=1 or n=2.")

  variant = "rouge1" if n == 1 else "rouge2"
  return scorer.compute(
      predictions=[predicted_answer],
      references=[[answer]],
      rouge_types=[variant],
  )



In [43]:
def f1_score_a(answer, predicted_answer):
    """
    This function calculates a token-level F1 score between the answer and predicted answer.

    Both strings are lower-cased and split on whitespace, and the resulting tokens
    are compared as sets (repeated tokens count once).

    Args:
      answer: The ground truth answer (string).
      predicted_answer: The predicted answer by the LLM (string).

    Returns:
      A float value representing the F1 score (higher is better). Returns 0.0 when
      either string has no tokens or when the two token sets do not overlap.
    """
    answer_tokens = set(answer.lower().split())
    predicted_tokens = set(predicted_answer.lower().split())

    # Guard BEFORE dividing: an empty answer or prediction would otherwise raise
    # ZeroDivisionError in the precision/recall computation below.
    if not answer_tokens or not predicted_tokens:
        return 0.0

    overlap = len(answer_tokens & predicted_tokens)
    if overlap == 0:
        # precision + recall would be 0; F1 is defined as 0 here.
        return 0.0

    precision = overlap / len(predicted_tokens)
    recall = overlap / len(answer_tokens)

    return 2 * (precision * recall) / (precision + recall)




In [44]:
def f1_score(answer, predicted_answer):
    """
    This function calculates a token-level F1 score between the answer and predicted answer.

    Both strings are lower-cased and split on whitespace, and the resulting tokens
    are compared as sets (repeated tokens count once).

    Args:
      answer: The ground truth answer (string).
      predicted_answer: The predicted answer by the LLM (string).

    Returns:
      A float value representing the F1 score (higher is better). Returns 0.0 when
      either string has no tokens or when the two token sets do not overlap.
    """
    reference_vocab = set(answer.lower().split())
    predicted_vocab = set(predicted_answer.lower().split())

    # Check for empty inputs first; dividing by an empty set's length would raise
    # ZeroDivisionError before any later zero check could run.
    if not reference_vocab or not predicted_vocab:
        return 0.0

    shared_count = len(reference_vocab.intersection(predicted_vocab))
    if shared_count == 0:
        return 0.0

    precision = shared_count / len(predicted_vocab)
    recall = shared_count / len(reference_vocab)

    # The result is returned directly so no local name shadows the function name.
    return 2 * (precision * recall) / (precision + recall)




In [None]:
from collections import Counter
import math

def cosine_similarity(str1, str2):
    """
    Cosine similarity of two strings under a Bag-of-Words representation.

    Each string is split on whitespace (case-sensitively) and turned into a
    token-count vector; the result is the dot product of the two vectors divided
    by the product of their Euclidean norms.

    Args:
        str1: The first string.
        str2: The second string.

    Returns:
        The cosine similarity of the two count vectors, or 0 when either string
        contains no tokens.
    """
    counts_a = Counter(str1.split())
    counts_b = Counter(str2.split())

    # Euclidean norm of each count vector; words absent from a string contribute 0.
    norm_a = math.sqrt(sum(count * count for count in counts_a.values()))
    norm_b = math.sqrt(sum(count * count for count in counts_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0

    # Only words present in both strings contribute to the dot product.
    common_words = counts_a.keys() & counts_b.keys()
    dot = sum(counts_a[word] * counts_b[word] for word in common_words)
    return dot / (norm_a * norm_b)


This function calculates the cosine similarity between two strings using the Bag-of-Words model. It tokenizes each string on whitespace, builds a bag of words (token counts) for each string, and computes the dot product of the two count vectors along with the magnitude of each vector. Finally, it returns the dot product divided by the product of the magnitudes, or 0 if either string contains no tokens.


In [60]:
# Sample usage: score one hand-written prediction against one reference answer
# with every metric defined in this notebook.
test_question = "What is the capital of France?"
answer = "The capital is Paris"
# NOTE: this first value is immediately overwritten by the next line, so only
# " The capital Paris" (with its leading space) is actually scored below.
predicted_answer = 'The French are the only ones who can afford to pay their taxes. They are the only ones who can afford to pay their taxes.'
predicted_answer = " The capital Paris"
# Inline exact match: whitespace-stripped but case-sensitive, unlike the
# `exact_match` helper called further down.
exact_match_value = int(predicted_answer.strip() == answer.strip())
# Calculate BLEU Score
# (direct `sentence_bleu` call kept for reference; `bleu_score` is used instead)
#bleu = sentence_bleu([answer.split()], predicted_answer.split())
print(f"Test Question: {test_question}")
print(f"Predicted Answer: {predicted_answer}")
print(f"Ground-Truth Answer: {answer}")
# `bleu_score` is defined in two cells of this notebook; whichever cell ran most
# recently is the one called here.
print("BLEU Score:", bleu_score(answer, predicted_answer))
print("ROUGE-1 Score:", rouge_n(answer, predicted_answer, 1))
# You can call rouge_n with n=2 for ROUGE-2 score
print("ROUGE-2 Score:", rouge_n(answer, predicted_answer, 2))
# The next two lines print under the same "F1 Score:" label: first `f1_score_a`,
# then `f1_score`.
print("F1 Score:", f1_score_a(answer, predicted_answer))
print("F1 Score:", f1_score(answer, predicted_answer))
print("Cosine Similarity:", cosine_similarity(answer, predicted_answer))
print("Exact Match Ratio:", exact_match(answer, predicted_answer))
print("exact_match_value",exact_match_value)


Test Question: What is the capital of France?
Predicted Answer:  The capital Paris
Ground-Truth Answer: The capital is Paris
BLEU Score: 0.5066641486392106
ROUGE-1 Score: {'rouge1': AggregateScore(low=Score(precision=1.0, recall=0.75, fmeasure=0.8571428571428571), mid=Score(precision=1.0, recall=0.75, fmeasure=0.8571428571428571), high=Score(precision=1.0, recall=0.75, fmeasure=0.8571428571428571))}
ROUGE-2 Score: {'rouge2': AggregateScore(low=Score(precision=0.5, recall=0.3333333333333333, fmeasure=0.4), mid=Score(precision=0.5, recall=0.3333333333333333, fmeasure=0.4), high=Score(precision=0.5, recall=0.3333333333333333, fmeasure=0.4))}
F1 Score: 0.8571428571428571
F1 Score: 0.8571428571428571
Cosine Similarity: 0.8660254037844387
Exact Match Ratio: 0.0
exact_match_value 0


In [50]:
# Example usage: BLEU with n=2 for a prediction that drops one word ("sat")
# from the reference. Requires the `bleu_score` definition that accepts `n`.
answer = "The cat sat on the mat"
predicted_answer = "The cat on the mat"
score = bleu_score(answer, predicted_answer, n=2)
print("BLEU Score:", score)

BLEU Score: 0.7090416310250969


In [57]:
from collections import Counter
import math

def cosine_similarity(str1, str2):
    """
    Bag-of-Words cosine similarity between two strings.

    The strings are split on whitespace without case folding, each is converted
    to a vector of token counts, and the cosine of the angle between the two
    vectors is returned.

    Args:
        str1: The first string.
        str2: The second string.

    Returns:
        Dot product of the count vectors divided by the product of their
        magnitudes; 0 if either string has no tokens.
    """
    first_counts, second_counts = Counter(str1.split()), Counter(str2.split())

    def _magnitude(counts):
        # Euclidean length of a token-count vector.
        return math.sqrt(sum(value ** 2 for value in counts.values()))

    first_magnitude = _magnitude(first_counts)
    second_magnitude = _magnitude(second_counts)

    # A string without tokens has a zero vector, for which the cosine is undefined.
    if first_magnitude == 0 or second_magnitude == 0:
        return 0

    # Words missing from either string multiply to 0, so iterating over one
    # string's words is enough (a Counter yields 0 for absent keys).
    dot_product = sum(
        occurrences * second_counts[word]
        for word, occurrences in first_counts.items()
    )
    return dot_product / (first_magnitude * second_magnitude)

# Example usage: compare two short whitespace-separated word lists.
str1 = "apple banana mango"
# NOTE: the first `str2` value is overwritten by the next line, so the printed
# similarity is for "apple banana mango" vs. "apple banana".
str2 = "banana orange"
str2 = "apple banana"
similarity = cosine_similarity(str1, str2)
print("Cosine Similarity:", similarity)




Cosine Similarity: 0.8164965809277259
