In [None]:
import pandas as pd
import re
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
from transformers import BertTokenizer, BertForMaskedLM, BertModel
from bert_score import BERTScorer

def extract_text(results):
    """Pull the ``{...}`` payload that follows ``Response:`` out of each result.

    Args:
        results: Iterable of raw model outputs. Entries are normally strings;
            non-string entries (e.g. the float NaN pandas produces for an
            empty CSV cell, or None) are treated as "pattern not found"
            instead of raising TypeError.

    Returns:
        A list with one entry per input, in order: the first brace-delimited
        span after ``Response:`` (braces included), or "" when no such span
        is found.

    Side effects:
        Prints how many inputs had no match.
    """
    # Non-greedy + DOTALL: stop at the first closing brace, but allow the
    # payload to span several lines.
    pattern = re.compile(r'Response\s*:\s*({.*?})', re.DOTALL)
    extracted_texts = []
    count = 0
    for text in results:
        # Only strings can be searched; anything else counts as a miss.
        match = pattern.search(text) if isinstance(text, str) else None
        if match:
            extracted_texts.append(match.group(1))
        else:
            # Keep a placeholder so the output stays aligned with the input.
            extracted_texts.append("")
            count += 1
    print(f"Pattern not found for {count} results.")
    return extracted_texts

def calculate_bleu_score(machine_results, reference_texts):
    """Print the corpus-level BLEU of the generated texts against the references.

    Both inputs are sequences of strings aligned by position; every text is
    tokenised on whitespace. Nothing is returned.
    """
    # corpus_bleu wants, for each hypothesis, a *list* of tokenised
    # references; here each hypothesis has exactly one reference.
    references_per_hypothesis = []
    for reference in reference_texts:
        references_per_hypothesis.append([reference.split()])

    hypotheses = []
    for candidate in machine_results:
        hypotheses.append(candidate.split())

    score = corpus_bleu(references_per_hypothesis, hypotheses)
    print(f'BLEU Score: {score}')

def calculate_rouge_scores(generated_answers, ground_truth):
    """Print the mean ROUGE-1, ROUGE-2 and ROUGE-L F-measures over all pairs.

    Args:
        generated_answers: Sequence of generated strings.
        ground_truth: Sequence of reference strings, aligned by position with
            ``generated_answers``.

    The averages are taken over the pairs that were actually scored (``zip``
    stops at the shorter input). When there is nothing to score a notice is
    printed instead of dividing by zero. Nothing is returned.
    """
    metrics = ('rouge1', 'rouge2', 'rougeL')
    scorer = rouge_scorer.RougeScorer(list(metrics), use_stemmer=True)

    totals = dict.fromkeys(metrics, 0.0)
    pair_count = 0
    for gen, ref in zip(generated_answers, ground_truth):
        # RougeScorer.score takes (target, prediction): reference first.
        # The F-measure is symmetric, so the printed numbers do not change,
        # but precision/recall are only meaningful with this order.
        scores = scorer.score(ref, gen)
        for metric in metrics:
            totals[metric] += scores[metric].fmeasure
        pair_count += 1

    if pair_count == 0:
        print("No answer pairs to score; skipping ROUGE.")
        return

    average_rouge1 = totals['rouge1'] / pair_count
    average_rouge2 = totals['rouge2'] / pair_count
    average_rougeL = totals['rougeL'] / pair_count
    print(f'Average ROUGE-1: {average_rouge1}')
    print(f'Average ROUGE-2: {average_rouge2}')
    print(f'Average ROUGE-L: {average_rougeL}')

def calculate_bert_score(generated_answers, ground_truth):
    """Print the mean BERTScore precision, recall and F1 over all pairs.

    Args:
        generated_answers: Sequence of candidate strings.
        ground_truth: Sequence of reference strings, aligned by position with
            ``generated_answers``.

    A ``bert-base-uncased`` BERTScorer is created on every call. With no
    candidates a notice is printed and the model is not loaded. Nothing is
    returned.
    """
    if len(generated_answers) == 0:
        print("No answer pairs to score; skipping BERTScore.")
        return

    scorer = BERTScorer(model_type='bert-base-uncased')
    # score(candidates, references) yields three collections (P, R, F1),
    # each holding one value per pair.
    P, R, F1 = scorer.score(generated_answers, ground_truth)

    # One reduction per collection instead of a Python loop over elements.
    avg_precision, avg_recall, avg_f1 = (float(values.mean()) for values in (P, R, F1))
    print(f"\nAverage BERTScore - Precision: {avg_precision:.4f}, Recall: {avg_recall:.4f}, F1: {avg_f1:.4f}")


In [None]:
# Load the evaluation CSV, report its row count, and copy the three model
# output columns plus the reference column into plain Python lists.
df = pd.read_csv('data_final_test.csv')
print(len(df))
(
    machine_results_Finetuned,
    machine_results_Finetuned_witout_vllm_weights,
    machine_results_Non_Finetuned,
    reference_texts,
) = (
    list(df[column_name])
    for column_name in (
        "Result_Finetuned",
        "Result_Finetuned_without_vllm_weights",
        "Result_Non_Finetuned",
        "Ground_Truth",
    )
)



In [None]:
# Spot-check: display the raw "Result_Finetuned" entry at index 10.
machine_results_Finetuned[10]

In [None]:
# Spot-check: display the raw "Result_Non_Finetuned" entry at index 10.
machine_results_Non_Finetuned[10]

In [None]:
# Spot-check: display the "Ground_Truth" entry at index 10 for comparison.
reference_texts[10]

In [None]:
# Reduce every raw output to its extracted `Response: {...}` payload.
# The three lists are processed in order, so the "Pattern not found" lines
# print as: Finetuned, Finetuned without vllm weights, Non-Finetuned.
(
    machine_results_Finetuned_Copy,
    machine_results_Finetuned_witout_vllm_weights_Copy,
    machine_results_Non_Finetuned_Copy,
) = map(
    extract_text,
    (
        machine_results_Finetuned,
        machine_results_Finetuned_witout_vllm_weights,
        machine_results_Non_Finetuned,
    ),
)


In [None]:
# BLEU for each set of extracted outputs, always against the same references.
for bleu_heading, bleu_candidates in (
    ("BLEU Score for Finetuned:", machine_results_Finetuned_Copy),
    ("BLEU Score for Finetuned without vllm weights:", machine_results_Finetuned_witout_vllm_weights_Copy),
    ("BLEU Score for Non-Finetuned:", machine_results_Non_Finetuned_Copy),
):
    print(bleu_heading)
    calculate_bleu_score(bleu_candidates, reference_texts)



In [None]:
# ROUGE for each set of extracted outputs, always against the same references.
for rouge_heading, rouge_candidates in (
    ("ROUGE Scores for Finetuned:", machine_results_Finetuned_Copy),
    ("ROUGE Scores for Finetuned without vllm weights:", machine_results_Finetuned_witout_vllm_weights_Copy),
    ("ROUGE Scores for Non-Finetuned:", machine_results_Non_Finetuned_Copy),
):
    print(rouge_heading)
    calculate_rouge_scores(rouge_candidates, reference_texts)



In [None]:
# BERTScore for the two fine-tuned variants, against the same references.
for bert_heading, bert_candidates in (
    ("BERTScores for Finetuned:", machine_results_Finetuned_Copy),
    ("BERTScores for Finetuned without vllm weights:", machine_results_Finetuned_witout_vllm_weights_Copy),
):
    print(bert_heading)
    calculate_bert_score(bert_candidates, reference_texts)


In [None]:
# BERTScore for the extracted Non-Finetuned outputs against the references.
print("BERTScores for Non-Finetuned:")
calculate_bert_score(machine_results_Non_Finetuned_Copy, reference_texts)