In [2]:
from bert_score import score as bert_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load sentence transformer model once
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

def compute_bert_score(generated: str, target: str) -> float:
    P, R, F1 = bert_score([generated], [target], lang='en', verbose=False)
    return F1[0].item()

def compute_bleu_score(generated: str, target: str) -> float:
    reference = [target.split()]
    hypothesis = generated.split()
    smoothing = SmoothingFunction().method1
    return sentence_bleu(reference, hypothesis, smoothing_function=smoothing)

def compute_rouge1_score(generated: str, target: str) -> float:
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    scores = scorer.score(target, generated)
    return scores['rouge1'].fmeasure

def compute_semantic_similarity(generated: str, target: str) -> float:
    emb1 = sbert_model.encode(generated, convert_to_tensor=True)
    emb2 = sbert_model.encode(target, convert_to_tensor=True)
    cosine_sim = util.pytorch_cos_sim(emb1, emb2)
    return cosine_sim.item()

def calculate_f1(expected, generated):
    
    # Calculate F1 Score
    expected_tokens = expected.lower().split()
    generated_tokens = generated.lower().split()
    
    common_tokens = set(expected_tokens) & set(generated_tokens)
    
    if len(common_tokens) == 0:
        f1 = 0.0
    else:
        precision = len(common_tokens) / len(generated_tokens)
        recall = len(common_tokens) / len(expected_tokens)
        f1 = 2 * (precision * recall) / (precision + recall)
    
    return f1

In [4]:
def evaluate(filename):
    data = pd.read_csv(filename)
    generated = list(data["Generated Answer"])
    targets = list(data["Expected Answer"])

    bert = 0
    bleu = 0
    rouge = 0
    ss = 0
    f1 = 0
    for i in tqdm(range(len(generated))):
        answer = generated[i]
        target = targets[i]
        bert += compute_bert_score(answer, target)
        bleu += compute_bleu_score(answer, target)
        rouge += compute_rouge1_score(answer, target)
        ss += compute_semantic_similarity(answer, target)
        f1 += calculate_f1(target, answer)

    print(f"Final scores for {filename}:")

    print(f"BERT Score: {bert/len(generated):.5f}")
    print(f"BLEU Score: {bleu/len(generated):.5f}")
    print(f"ROUGE1 Score: {rouge/len(generated):.5f}")
    print(f"Semantic Similarity Score: {ss/len(generated):.5f}")
    print(f"F1 Score: {f1/len(generated):.5f}")



In [4]:
evaluate("baselineRAGresults.csv")

  0%|          | 0/59 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 1/59 [00:10<10:32, 10.91s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  3%|▎         | 2/59 [00:13<05:33,  5.86s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  5%|▌         | 3/59 [00:15<03:50,  4.11s

Final scores for baselineRAGresults.csv:
BERT Score: 0.83452
BLEU Score: 0.00897
ROUGE1 Score: 0.13509
Semantic Similarity Score: 0.39983
F1 Score: 0.09596





In [5]:
evaluate("finetunedRAGresults.csv")

  0%|          | 0/59 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 1/59 [00:02<02:28,  2.56s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  3%|▎         | 2/59 [00:05<02:35,  2.72s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  5%|▌         | 3/59 [00:08<02:35,  2.78s

Final scores for finetunedRAGresults.csv:
BERT Score: 0.86363
BLEU Score: 0.03433
ROUGE1 Score: 0.26230
Semantic Similarity Score: 0.51775
F1 Score: 0.19500





In [5]:
evaluate("baselinegraphrag.csv")


  0%|          | 0/59 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 1/59 [00:02<02:44,  2.84s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  3%|▎         | 2/59 [00:05<02:23,  2.52s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  5%|▌         | 3/59 [00:07<02:17,  2.45s

Final scores for baselinegraphrag.csv:
BERT Score: 0.86217
BLEU Score: 0.02680
ROUGE1 Score: 0.22821
Semantic Similarity Score: 0.60553
F1 Score: 0.16479





In [None]:
evaluate("CAGRAresults.csv")


  0%|          | 0/59 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 1/59 [00:04<04:23,  4.54s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  3%|▎         | 2/59 [00:06<03:01,  3.18s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  5%|▌         | 3/59 [00:08<02:30,  2.68s/it]Some weights of RobertaModel were 

Final scores for grapht.csv:
BERT Score: 0.86811
BLEU Score: 0.03516
ROUGE1 Score: 0.28862
Semantic Similarity Score: 0.59791
F1 Score: 0.21699



