In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell executes, so edits to the local project modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [24]:
import json

import pandas as pd
from langchain_openai.chat_models import ChatOpenAI
from ragas import evaluate, EvaluationDataset
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness
from ragas.run_config import RunConfig

from src.llm.openai_ import OpenLLM
from src.vec_database.dense import DenseDatabase
from src.embedding.fastembed_ import MiniLmEmbedding
from src.embedding.qwen import QwenEmbeddingSmall
from src.embedding.openai_ import OpenEmbeddingSmall
from src.util import answer_questions

from config import OPENAI_KEY, DENSE_CONN_STRING

In [3]:
# Load the reference ("ideal") question/answer pairs used for evaluation.
# The first CSV column is a saved row index (it otherwise shows up as a stray
# "Unnamed: 0" data column), so read it as the DataFrame index instead.
df_ideal = pd.read_csv('../data/ideal_answers.csv', index_col=0)

In [4]:
# Preview the first rows to check which columns were loaded.
df_ideal.head()

Unnamed: 0,Question,Source Docs,Question Type,Source Chunk Type,Answer
0,How has Apple's total net sales changed over t...,*AAPL*,Multi-Doc RAG,Table,"Based on the provided documents, Apple's total..."
1,What are the major factors contributing to the...,*AAPL*,Multi-Doc RAG,Text,In the most recent 10-Q for the quarter ended ...
2,Has there been any significant change in Apple...,*AAPL*,Multi-Doc RAG,Table,"Yes, there has been a change in Apple's operat..."
3,How has Apple's revenue from iPhone sales fluc...,*AAPL*,Multi-Doc RAG,Table,The revenue from iPhone sales for Apple has fl...
4,Can any trends be identified in Apple's Servic...,*AAPL*,Multi-Doc RAG,Table,"Based on the provided documents, there is a tr..."


In [5]:
# Pull the question texts and their reference answers out of the frame as two
# plain Python lists (same row order in both).
questions, references = (
    df_ideal[column].tolist() for column in ('Question', 'Answer')
)

In [6]:
# LLM used to answer the questions in the per-collection run.
base_llm_name = 'gpt-4.1-nano'
base_llm = OpenLLM(base_llm_name, OPENAI_KEY)

# Maps each embedding model to a (collection name, batch size) pair describing
# the dense collection that was built with that embedding.
available_dense_collections = {
    MiniLmEmbedding(): ('1024-txt-minilm', 60),
    QwenEmbeddingSmall(): ('1024-txt-qwen', 60),
    OpenEmbeddingSmall(OPENAI_KEY): ('1024-txt-openai', 60)
}

# Read the JSON explicitly as UTF-8 so decoding does not depend on the
# platform's default locale encoding.
# Note: the parsed object is converted with str(), so the instructions are
# passed on as the Python repr of the loaded JSON, not as JSON text.
with open('../instructions.json', encoding='utf-8') as file:
    base_instructions = str(json.load(file))

In [7]:
for embedding, params in available_dense_collections.items():
    database = DenseDatabase(DENSE_CONN_STRING, embedding)
    collection_name, batch_size = params

    dataset = list()

    try:
        answers = await answer_questions(base_llm, base_instructions, database, collection_name, retrieve_limit=10, questions=questions, batch_size=batch_size)

        for question, answer, reference in zip(questions, answers, references):
            response, retrieved_contexts = answer
            dataset.append(
                {
                    "user_input": question,
                    "retrieved_contexts": [str(chunk.model_dump()) for chunk in retrieved_contexts],
                    "response": response,
                    "reference": reference
                }
            )
    finally:
        await database.client.close()

    with open(f'../data/system_answers/{collection_name}.json', 'w') as file:
        json.dump(dataset, file, indent=4)

In [25]:
database = DenseDatabase(DENSE_CONN_STRING, MiniLmEmbedding())

dataset = list()

try:
    answers = await answer_questions(
        OpenLLM('gpt-4.1-mini', OPENAI_KEY),
        base_instructions, database, '1024-txt-minilm',
        retrieve_limit=20, questions=questions, batch_size=30
    )

    for question, answer, reference in zip(questions, answers, references):
        response, retrieved_contexts = answer
        dataset.append(
            {
                "user_input": question,
                "retrieved_contexts": [str(chunk.model_dump()) for chunk in retrieved_contexts],
                "response": response,
                "reference": reference
            }
        )
finally:
    await database.client.close()

with open(f'../data/system_answers/1024-txt-minilm-dense-gpt-4.1-mini.json', 'w') as file:
    json.dump(dataset, file, indent=4)

In [27]:
# Score the generated answers with ragas (context recall, faithfulness and
# factual correctness), using gpt-4o-mini at temperature 0 as the judge.
#
# NOTE: `dataset` is whichever list the most recently executed
# answer-generation cell built, so run the intended cell right before this one.
evaluator_llm = LangchainLLMWrapper(
    ChatOpenAI(api_key=OPENAI_KEY, model='gpt-4o-mini', temperature=0)
)

# A previous run logged a TimeoutError and an InternalServerError for
# individual jobs. Give each call more time, retry transient failures with
# backoff, and lower the concurrency to put less pressure on the API.
# These values are a starting point; tune them to the account's rate limits.
evaluation_run_config = RunConfig(
    timeout=600,
    max_retries=10,
    max_wait=60,
    max_workers=8,
)

result = evaluate(
    dataset=EvaluationDataset.from_list(dataset),
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness()],
    llm=evaluator_llm,
    run_config=evaluation_run_config,
)

Evaluating:   0%|          | 0/585 [00:00<?, ?it/s]

Exception raised in Job[293]: TimeoutError()
Exception raised in Job[553]: InternalServerError(upstream connect error or disconnect/reset before headers. reset reason: connection termination)


{'context_recall': 0.5272, 'faithfulness': 0.9145, 'factual_correctness(mode=f1)': 0.2858}
