# Evaluation -> RAG Triad + Answer Correctness
## For -> Llama + Chroma (Embedding model used: BAAI/bge-large-en-v1.5)
### Shruti

In [None]:
# pip install pandas numpy transformers spacy

In [None]:
# !python -m spacy download en_core_web_sm

In [None]:
# !pip install ipywidgets

In [None]:
import ast
import string

import numpy as np
import pandas as pd
import spacy
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

# Hugging Face checkpoint that get_embedding() tokenizes with and runs
model_name = "BAAI/bge-large-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# spaCy English pipeline; provides the sentence splitting, lemmas and
# stop-word flags that the context-relevancy metric relies on
nlp = spacy.load("en_core_web_sm")

In [None]:
# Load the evaluation set: one row per question with the reference answer,
# the RAG answer, the retrieved context and the reverse-engineered questions.
# (pandas is already imported in the imports cell above.)
df = pd.read_csv('output.csv')

# Fail fast, with a readable message, if a column the metric cells read is absent.
required_columns = {'question', 'ground_truth', 'rag_answer', 'context', 'possible_questions'}
missing_columns = required_columns - set(df.columns)
assert not missing_columns, f"output.csv is missing expected columns: {sorted(missing_columns)}"

df

Unnamed: 0,question,ground_truth,rag_answer,context,possible_questions
0,What are the keywords mentioned in the paper t...,The keywords written in the paper are: Earthqu...,"The keywords mentioned in the paper titled ""Re...",['Response Prediction of Structural System Sub...,['What are the key keywords mentioned in the p...
1,What is the purpose of using Artificial Neural...,Artificial Neural Networks (ANNs) are used to ...,\nThe purpose of using Artificial Neural Netwo...,['Response Prediction of Structural System Sub...,['What is the main purpose of using Artificial...
2,How does the frequency of a building's natural...,A building's response to an earthquake is dyna...,The frequency of a building's natural frequenc...,['1 Introduction \n \nReal earthquake ground m...,"[""How does a building's natural frequency affe..."
3,What kind of neural network model is most freq...,The most frequently applied neural network mod...,\nThe most frequently applied neural network m...,['1 Introduction \n \nReal earthquake ground m...,['What type of neural network model is most co...
4,How was the training of the ANN model conducte...,The ANN model was trained using real earthquak...,\nThe training of the ANN model for predicting...,['Response Prediction of Structural System Sub...,['What data was used to train the ANN model fo...
5,What is the significance of the study's findin...,The study's findings demonstrate the ability o...,The study's findings on predicting the safenes...,['Response Prediction of Structural System Sub...,"[""What are the key implications of the study's..."


In [None]:
# Function to calculate the embedding of a given text
def get_embedding(text):
    """Return a mean-pooled embedding of ``text`` as a NumPy array.

    The text is tokenized (truncated to at most 512 tokens), passed through
    the module-level ``model``, and the last hidden state is averaged over
    the token dimension.

    Args:
        text: The string to embed.

    Returns:
        numpy.ndarray: the pooled hidden state with size-1 dimensions
        squeezed out, i.e. a 1-D vector for a single input string.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    # Inference only: without no_grad() every call builds an autograd graph
    # that is immediately thrown away, costing memory and time per embedding.
    with torch.no_grad():
        outputs = model(**inputs)
    # NOTE(review): the BGE model card appears to recommend CLS pooling plus
    # L2 normalisation; mean pooling is kept so existing scores stay
    # comparable — confirm which is intended.
    return outputs.last_hidden_state.mean(dim=1).numpy().squeeze()

# Function to calculate cosine similarity between two embeddings
def calculate_similarity(embedding1, embedding2):
    """Return the cosine similarity between two embedding vectors as a scalar."""
    # cosine_similarity operates on 2-D inputs, so present each vector as a
    # single-row matrix before comparing.
    row_a, row_b = (vector.reshape(1, -1) for vector in (embedding1, embedding2))
    similarity_matrix = cosine_similarity(row_a, row_b)
    # One row against one row gives a 1x1 matrix; unwrap its only entry.
    return similarity_matrix[0][0]

# Function to calculate context relevancy using spaCy
def calculate_context_relevancy(row):
    """Fraction of context sentences that share a content word with the question.

    Args:
        row: Mapping / DataFrame row with 'question' and 'context'.
            'context' may be plain text, a list/tuple of text chunks, or the
            string repr of such a list (which is what a CSV round-trip of a
            list column produces).

    Returns:
        float: relevant sentences divided by total sentences, or 0.0 when the
        context has no sentences or the question yields no keywords.
    """
    context = row['context']
    if isinstance(context, str):
        # Recover the original chunk list from its repr so spaCy sees the
        # actual passage text rather than brackets, quotes and escapes.
        try:
            parsed = ast.literal_eval(context)
        except (ValueError, TypeError, SyntaxError):
            parsed = None  # ordinary prose, not a Python literal
        if isinstance(parsed, (list, tuple)):
            context = parsed
    if isinstance(context, (list, tuple)):
        context = "\n".join(str(chunk) for chunk in context)

    # Punctuation and whitespace tokens are not stop words, so they must be
    # excluded explicitly; otherwise a "?" or "," in the question would make
    # nearly every context sentence count as relevant.
    question_keywords = {
        token.lemma_.lower()
        for token in nlp(row['question'])
        if not (token.is_stop or token.is_punct or token.is_space)
    }

    # Materialise once: the sentences are both scanned and counted.
    sentences = list(nlp(context).sents)
    if not sentences or not question_keywords:
        return 0.0

    relevant_sentences = sum(
        any(token.lemma_.lower() in question_keywords for token in sent)
        for sent in sentences
    )
    return relevant_sentences / len(sentences)

# Function to calculate answer relevancy
def calculate_answer_relevancy(row):
    """Mean cosine similarity between the question and the questions generated from the answer.

    Args:
        row: Mapping / DataFrame row with 'question' and 'possible_questions'.
            'possible_questions' holds the artificially generated questions
            (reverse engineered from the answer), either as a list/tuple of
            strings or as the string repr of such a list (the form a CSV
            round-trip produces).

    Returns:
        The mean similarity score, or NaN when there are no generated questions.
    """
    possible_questions = row['possible_questions']
    if isinstance(possible_questions, str):
        # Iterating a string yields single characters, which would embed
        # every character as if it were a question. Parse the stored list
        # repr back into real question strings first.
        try:
            parsed = ast.literal_eval(possible_questions)
        except (ValueError, TypeError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            possible_questions = list(parsed)
        else:
            # Not a list literal: treat the whole text as one question.
            possible_questions = [possible_questions]

    original_question_embedding = get_embedding(row['question'])
    relevancies = [
        calculate_similarity(original_question_embedding, get_embedding(question))
        for question in possible_questions
    ]

    # The mean of an empty list is undefined; return NaN explicitly instead
    # of letting NumPy emit a RuntimeWarning.
    if not relevancies:
        return float('nan')
    return np.mean(relevancies)

In [None]:
# Compute the four evaluation metrics for every row of the dataframe.

# Each get_embedding() call is a full model forward pass, so embed every
# generated answer once and reuse it for both answer-based metrics.
answer_embeddings = [get_embedding(answer) for answer in df['rag_answer']]

df['Context Relevance'] = df.apply(calculate_context_relevancy, axis=1)
df['Answer Relevance'] = df.apply(calculate_answer_relevancy, axis=1)
# Groundedness: similarity between the generated answer and the retrieved context.
df['Groundedness'] = [
    calculate_similarity(answer_embedding, get_embedding(context))
    for answer_embedding, context in zip(answer_embeddings, df['context'])
]
# Answer Correctness: similarity between the generated answer and the reference answer.
df['Answer Correctness'] = [
    calculate_similarity(answer_embedding, get_embedding(ground_truth))
    for answer_embedding, ground_truth in zip(answer_embeddings, df['ground_truth'])
]

# Display the updated DataFrame with the computed metrics
df[['Context Relevance', 'Answer Relevance', 'Groundedness', 'Answer Correctness']]

Unnamed: 0,Context Relevance,Answer Relevance,Groundedness,Answer Correctness
0,0.791667,0.532465,0.946845,0.869912
1,0.65625,0.490528,0.929689,0.94779
2,0.538462,0.48808,0.822761,0.937468
3,0.605263,0.500364,0.819175,0.972624
4,0.791667,0.507974,0.92398,0.934089
5,0.47619,0.514314,0.928755,0.916381
