In [59]:
from bert_score import score as bert_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import pandas as pd


In [60]:
# Load sentence transformer model once
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

def compute_bert_score(generated: str, target: str) -> float:
    P, R, F1 = bert_score([generated], [target], lang='en', verbose=False)
    return F1[0].item()

def compute_bleu_score(generated: str, target: str) -> float:
    reference = [target.split()]
    hypothesis = generated.split()
    smoothing = SmoothingFunction().method1
    return sentence_bleu(reference, hypothesis, smoothing_function=smoothing)

def compute_rouge1_score(generated: str, target: str) -> float:
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    scores = scorer.score(target, generated)
    return scores['rouge1'].fmeasure

def compute_semantic_similarity(generated: str, target: str) -> float:
    emb1 = sbert_model.encode(generated, convert_to_tensor=True)
    emb2 = sbert_model.encode(target, convert_to_tensor=True)
    cosine_sim = util.pytorch_cos_sim(emb1, emb2)
    return cosine_sim.item()

def calculate_f1(expected, generated):
    
    # Calculate F1 Score
    expected_tokens = expected.lower().split()
    generated_tokens = generated.lower().split()
    
    common_tokens = set(expected_tokens) & set(generated_tokens)
    
    if len(common_tokens) == 0:
        f1 = 0.0
    else:
        precision = len(common_tokens) / len(generated_tokens)
        recall = len(common_tokens) / len(expected_tokens)
        f1 = 2 * (precision * recall) / (precision + recall)
    
    return f1

In [61]:
def evaluate(filename):
    data = pd.read_csv(filename)
    generated = list(data["Generated Answer"])
    targets = list(data["Expected Summary"])

    bert = 0
    bleu = 0
    rouge = 0
    ss = 0
    f1 = 0
    for i in tqdm(range(len(generated))):
        answer = generated[i]
        target = targets[i]
        bert += compute_bert_score(answer, target)
        bleu += compute_bleu_score(answer, target)
        rouge += compute_rouge1_score(answer, target)
        ss += compute_semantic_similarity(answer, target)
        f1 += calculate_f1(target, answer)

    print(f"Final scores for {filename}:")

    print(f"BERT Score: {bert/len(generated):.5f}")
    print(f"BLEU Score: {bleu/len(generated):.5f}")
    print(f"ROUGE1 Score: {rouge/len(generated):.5f}")
    print(f"Semantic Similarity Score: {ss/len(generated):.5f}")
    print(f"F1 Score: {f1/len(generated):.5f}")



In [41]:
evaluate("baselineRAGresults.csv")

  0%|          | 0/55 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 1/55 [00:02<02:15,  2.51s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  4%|▎         | 2/55 [00:05<02:18,  2.61s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  5%|▌         | 3/55 [00:07<02:06,  2.42s/it]Some weights of RobertaModel were 

Final scores for baselineRAGresults.csv:
BERT Score: 0.82754
BLEU Score: 0.01617
ROUGE1 Score: 0.20950
Semantic Similarity Score: 0.57187
F1 Score: 0.10154





In [None]:
evaluate("finetunedRAGresults.csv")


  0%|          | 0/55 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 1/55 [00:01<01:45,  1.96s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  4%|▎         | 2/55 [00:03<01:44,  1.98s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  5%|▌         | 3/55 [00:05<01:34,  1.82s

Final scores for finetunedRAGresults copy.csv:
BERT Score: 0.85014
BLEU Score: 0.04093
ROUGE1 Score: 0.25526
Semantic Similarity Score: 0.66643
F1 Score: 0.16103





In [None]:
evaluate("baselineGraphrag.csv")


  0%|          | 0/55 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 1/55 [00:01<01:30,  1.67s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  4%|▎         | 2/55 [00:03<01:34,  1.78s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  5%|▌         | 3/55 [00:05<01:27,  1.68s

Final scores for baselineGraphrag copy.csv:
BERT Score: 0.87105
BLEU Score: 0.03647
ROUGE1 Score: 0.29712
Semantic Similarity Score: 0.75946
F1 Score: 0.16993





In [58]:
evaluate("CAGRA results.csv")


  0%|          | 0/55 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 1/55 [00:03<02:45,  3.07s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  4%|▎         | 2/55 [00:05<02:22,  2.68s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  5%|▌         | 3/55 [00:07<02:07,  2.45s

Final scores for CAGRA results.csv:
BERT Score: 0.87115
BLEU Score: 0.03775
ROUGE1 Score: 0.30254
Semantic Similarity Score: 0.76356
F1 Score: 0.17145





In [None]:
12, 16, 22, 26, 30, 36, 39, 46, 52, 60