In [1]:
from bert_score import score as bert_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load sentence transformer model once
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

def compute_bert_score(generated: str, target: str) -> float:
    P, R, F1 = bert_score([generated], [target], lang='en', verbose=False)
    return F1[0].item()

def compute_bleu_score(generated: str, target: str) -> float:
    reference = [target.split()]
    hypothesis = generated.split()
    smoothing = SmoothingFunction().method1
    return sentence_bleu(reference, hypothesis, smoothing_function=smoothing)

def compute_rouge1_score(generated: str, target: str) -> float:
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    scores = scorer.score(target, generated)
    return scores['rouge1'].fmeasure

def compute_semantic_similarity(generated: str, target: str) -> float:
    emb1 = sbert_model.encode(generated, convert_to_tensor=True)
    emb2 = sbert_model.encode(target, convert_to_tensor=True)
    cosine_sim = util.pytorch_cos_sim(emb1, emb2)
    return cosine_sim.item()

def calculate_f1(expected, generated):
    
    # Calculate F1 Score
    expected_tokens = expected.lower().split()
    generated_tokens = generated.lower().split()
    
    common_tokens = set(expected_tokens) & set(generated_tokens)
    
    if len(common_tokens) == 0:
        f1 = 0.0
    else:
        precision = len(common_tokens) / len(generated_tokens)
        recall = len(common_tokens) / len(expected_tokens)
        f1 = 2 * (precision * recall) / (precision + recall)
    
    return f1

In [5]:
def evaluate(filename):
    data = pd.read_csv(filename)
    generated = list(data["Generated Answer"])
    targets = list(data["Expected Summary"])

    bert = 0
    bleu = 0
    rouge = 0
    ss = 0
    f1 = 0
    for i in tqdm(range(len(generated))):
        answer = generated[i]
        target = targets[i]
        bert += compute_bert_score(answer, target)
        bleu += compute_bleu_score(answer, target)
        rouge += compute_rouge1_score(answer, target)
        ss += compute_semantic_similarity(answer, target)
        f1 += calculate_f1(target, answer)

    print(f"Final scores for {filename}:")

    print(f"BERT Score: {bert/len(generated):.5f}")
    print(f"BLEU Score: {bleu/len(generated):.5f}")
    print(f"ROUGE1 Score: {rouge/len(generated):.5f}")
    print(f"Semantic Similarity Score: {ss/len(generated):.5f}")
    print(f"F1 Score: {f1/len(generated):.5f}")



In [6]:
evaluate("baselineRAGresults.csv")

  0%|          | 0/65 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 1/65 [00:06<07:04,  6.64s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  3%|▎         | 2/65 [00:08<04:10,  3.97s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  5%|▍         | 3/65 [00:10<03:10,  3.08s/it]Some weights of RobertaModel were 

Final scores for baselineRAGresults.csv:
BERT Score: 0.81750
BLEU Score: 0.00787
ROUGE1 Score: 0.08559
Semantic Similarity Score: 0.47225
F1 Score: 0.06251





In [7]:
evaluate("finetunedRAGresults.csv")

  0%|          | 0/65 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 1/65 [00:02<02:16,  2.14s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  3%|▎         | 2/65 [00:04<02:10,  2.07s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  5%|▍         | 3/65 [00:06<02:18,  2.23s/it]Some weights of RobertaModel were 

Final scores for finetunedRAGresults.csv:
BERT Score: 0.82519
BLEU Score: 0.01530
ROUGE1 Score: 0.12965
Semantic Similarity Score: 0.52350
F1 Score: 0.09845





In [8]:
evaluate("baselineGraphrag.csv")

  0%|          | 0/65 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 1/65 [00:02<02:16,  2.13s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  3%|▎         | 2/65 [00:04<02:14,  2.14s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  5%|▍         | 3/65 [00:06<02:01,  1.95s/it]Some weights of RobertaModel were 

Final scores for baselineGraphrag.csv:
BERT Score: 0.83166
BLEU Score: 0.01536
ROUGE1 Score: 0.11823
Semantic Similarity Score: 0.55319
F1 Score: 0.08686





In [11]:
evaluate("finetunedGraphrag_1.csv")


  0%|          | 0/65 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 1/65 [00:02<02:30,  2.35s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  3%|▎         | 2/65 [00:04<02:27,  2.35s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  5%|▍         | 3/65 [00:06<02:13,  2.16s

Final scores for finetunedGraphrag_1.csv:
BERT Score: 0.82988
BLEU Score: 0.01557
ROUGE1 Score: 0.12523
Semantic Similarity Score: 0.56335
F1 Score: 0.09231



