In [1]:
import os

# Make the parent of the notebook's start-up directory the working directory
# (presumably so that the project imports and the "../data/..." paths used by
# the following cells resolve -- confirm against the repository layout).
# The start-up directory is captured only on the first execution: a bare
# relative os.chdir('../') climbs one level higher every time this cell is
# re-run, which silently breaks every relative path that follows.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.dirname(NOTEBOOK_DIR))

In [None]:
from langchain_community.vectorstores import FAISS

from llm.chain import get_conversation_chain
from wrappers.langchain_wrappers import VertexAIChat, VertexAIEmbedding

In [26]:
import pandas as pd

# Read the complete RAG evaluation set from disk.
normal = pd.read_csv(filepath_or_buffer="../data/RAG_evaluation_dataset.csv")

# Keep only the first two rows (positional slice) as a smaller sample.
shorter = normal.iloc[:2]

In [4]:
# Persist the shortened evaluation set. index=False keeps the DataFrame's row
# index out of the file; otherwise it is written as an unnamed first column
# that comes back as a stray "Unnamed: 0" column on the next pd.read_csv.
shorter.to_csv("../data/RAG_evaluation_dataset_shorter.csv", index=False)

In [5]:
# Path of the shortened evaluation CSV written by the cell above.
# NOTE: despite the _DIR suffix this names a file, not a directory.
EVAL_CSV_DIR: str = "../data/RAG_evaluation_dataset_shorter.csv"

# Folder holding the persisted FAISS index (same location that is later
# passed to FAISS.load_local under the name FAISS_INDEX_PATH).
FAISS_INDEX_DIR: str = "../data/faiss_index_with_images"

In [None]:
from dotenv import load_dotenv
import google.genai as genai
import os


# Pull variables from a local .env file into the process environment; this
# has to happen before the lookups below.
load_dotenv()

# Constant flag; nothing in the visible cells reads it.
USE_VERTEXAI = True

# Both values are None when the corresponding variable is not set.
LOCATION = os.environ.get("GEMINI_LOCATION")
PROJECT_ID = os.environ.get("GEMINI_PROJECT")

In [37]:
from ragas import SingleTurnSample
from ragas.metrics import Faithfulness, LLMContextPrecisionWithReference, ResponseRelevancy

from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

# Judge LLM handed to the RAGAS metrics, wrapped in the RAGAS LangChain adapter.
evaluator_llm = LangchainLLMWrapper(
    VertexAIChat(
        temperature=0.0,
        model="gemini-2.0-flash",
    )
)

# Embedding model handed to the RAGAS metrics that need one.
evaluator_embeddings = LangchainEmbeddingsWrapper(
    VertexAIEmbedding()
)

FAISS_INDEX_PATH = "../data/faiss_index_with_images"

# A second, separate embedding instance is used to load the vector store.
embedding_model = VertexAIEmbedding()

# NOTE(review): allow_dangerous_deserialization=True lets FAISS.load_local
# unpickle the stored docstore, which can execute arbitrary code. Only load
# an index that was produced by a trusted source.
vector_store = FAISS.load_local(
    folder_path=FAISS_INDEX_PATH,
    embeddings=embedding_model,
    allow_dangerous_deserialization=True,
)

In [None]:
avg_f = 0
all_f = []
avg_c = 0
all_c = []
avg_r = 0
all_r = []

for i in range(len(normal)):
    question = normal.iloc[i]['Question']
    reference = normal.iloc[i]['Ground_Truth_Context']

    chain, retriever = get_conversation_chain(
        vector_store=vector_store,
        re_ranker=False,
        faiss=True,
        user_prompt=question
    )

    response = chain({"question": question})

    retrieved_docs = retriever.get_relevant_documents(question)

    retrieved_contexts = []
    for i, doc in enumerate(retrieved_docs, 1):
        metadata = doc.metadata
        retrieved_contexts.append(doc.page_content)

    test_data = {
        "user_input": question,
        "retrieved_contexts": retrieved_contexts,
        "response": response['answer'],
        "reference": reference,
    }

    sample_f = SingleTurnSample(
        user_input=test_data["user_input"],
        retrieved_contexts=test_data['retrieved_contexts'],
        response=test_data['response'],
    )
    sample_c = SingleTurnSample(
        user_input=test_data["user_input"],
        reference=test_data["reference"],
        retrieved_contexts=test_data['retrieved_contexts'],
    )

    faithfulness_metric = Faithfulness(llm=evaluator_llm)
    context_precision = LLMContextPrecisionWithReference(llm=evaluator_llm)
    scorer = ResponseRelevancy(llm=evaluator_llm, embeddings=evaluator_embeddings)

    faithfulness_score = await faithfulness_metric.single_turn_ascore(sample_f)
    context_precision_score = await context_precision.single_turn_ascore(sample_c)
    response_relevance_score = await scorer.single_turn_ascore(sample_f)

    all_f.append(faithfulness_score)
    avg_f += faithfulness_score

    all_c.append(context_precision_score)
    avg_c += context_precision_score

    all_r.append(response_relevance_score)
    avg_r += response_relevance_score

In [39]:
# Report the mean of each metric: avg_* are the score sums accumulated by the
# evaluation loop, divided here by the number of evaluated rows.
print(
    f"Avg Faithfullness = {avg_f / len(normal)}",
    f"Avg Context Precision = {avg_c / len(normal)}",
    f"Avg Response Relevance = {avg_r / len(normal)}",
    sep="\n",
)

Avg Faithfullness = 0.7518907563025212
Avg Context Precision = 0.5770003988200691
Avg Response Relevance = 0.8534368604970144
