In [13]:
from IPython.display import HTML
from data import documents, distributions
from src.vectorstore.get import embeddings_ft, embeddings
from ragas.testset.generator import TestsetGenerator
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.document_loaders import PyMuPDFLoader
from ragas.testset.evolutions import simple, reasoning, multi_context
from dotenv import load_dotenv
load_dotenv()




True

### Let's test the fine-tuned embedding model.

In [14]:

# Chat models handed to the ragas test-set generator as generator and critic.
generator_llm = ChatOpenAI(model="gpt-3.5-turbo")
critic_llm = ChatOpenAI(model="gpt-4o-mini")

# NOTE(review): this uses `embeddings`, not `embeddings_ft`, for test-set
# generation -- confirm that is the intended embedding model here.
generator = TestsetGenerator.from_langchain(generator_llm, critic_llm, embeddings)

# Generate 20 samples from `documents` using the imported `distributions`;
# debugging logs are switched on, so this cell prints a lot of output.
testset = generator.generate_with_langchain_docs(
    documents,
    test_size=20,
    distributions=distributions,
    with_debugging_logs=True,
)

embedding nodes:   3%|▎         | 8/284 [00:00<00:14, 19.55it/s][ragas.testset.extractor.DEBUG] topics: {'keyphrases': ['Blueprint for an AI Bill of Rights', 'Automated systems', 'American people']}
[ragas.testset.extractor.DEBUG] topics: {'keyphrases': ['Pre-deployment testing', 'GAI applications', 'Structured public feedback', 'Measurement gaps', 'AI Red-teaming']}
embedding nodes:   4%|▍         | 11/284 [00:00<00:18, 15.07it/s][ragas.testset.extractor.DEBUG] topics: {'keyphrases': ['AI Bill of Rights', 'White House Office of Science and Technology Policy', 'Automated Systems', 'Civil Rights', 'Democratic Values']}
[ragas.testset.extractor.DEBUG] topics: {'keyphrases': ['AI model', 'ML explanation results', 'Privacy risk', 'Data provenance', 'Training data samples']}
[ragas.testset.extractor.DEBUG] topics: {'keyphrases': ['GAI risks', 'Adversarial interactions', 'Deepfake images', 'Disinformation on societal trust', 'Risk management resources']}
[ragas.testset.extractor.DEBUG] topic

In [15]:
# Convert the generated test set to a DataFrame and preview the first two rows.
# (A bare `testset` expression used to precede this; it was not the last
# expression in the cell, so it displayed nothing and has been dropped.)
test_df = testset.to_pandas()
test_df.head(2)

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What is the significance of the Executive Orde...,[ \n \n \n \nENDNOTES\n1.The Executive Order O...,The Executive Order On Advancing Racial Equity...,simple,[{'source': '/Users/richardlai/Documents/MyPro...,True
1,How can organizations verify information shari...,[ \n20 \nGV-4.3-003 \nVerify information shari...,Organizations can verify information sharing a...,simple,[{'source': '/Users/richardlai/Documents/MyPro...,True


In [16]:
# Pull the generated questions and their reference answers out of the
# DataFrame as plain Python lists (same row order in both).
test_questions = test_df["question"].tolist()
test_groundtruths = test_df["ground_truth"].tolist()

In [18]:
from src.vectorstore.get import retriever_ft
from src.agents.rag_agent import RagAgent
from langchain_openai import ChatOpenAI

# Parallel lists: answers[i] / contexts[i] belong to test_questions[i].
answers = []
contexts = []

# RAG chain built from a gpt-4o chat model (temperature 0) and `retriever_ft`.
llm = ChatOpenAI(model="gpt-4o", temperature=0)
agent = RagAgent(llm, retriever_ft)
chain = agent.get_chain()

# Run every generated question through the chain, keeping the answer text and
# the page content of each context document returned with it.
for question in test_questions:
    response = chain.invoke({"question": question})
    answers.append(response["response"].content)
    retrieved_docs = response["context"]
    contexts.append([doc.page_content for doc in retrieved_docs])

In [19]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

# ragas metrics to compute, in the order they should be evaluated and listed.
metrics = [faithfulness, answer_relevancy, context_recall,
           context_precision, answer_correctness]



In [20]:
from datasets import Dataset

# Column-oriented evaluation data: index i of every list refers to the same
# generated question.
evaluation_columns = {
    "question": test_questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truth": test_groundtruths,
}
response_dataset = Dataset.from_dict(evaluation_columns)

In [23]:
# Score the collected answers and contexts with the selected ragas metrics.
results = evaluate(dataset=response_dataset, metrics=metrics)

Evaluating: 100%|██████████| 100/100 [01:03<00:00,  1.58it/s]


In [24]:
import pandas as pd
# One row per metric: (metric name, score) pairs taken from the evaluation result.
# NOTE(review): the frame is called `df_baseline` although its score column is
# labelled 'fine-tuned-embedding' -- confirm the name before comparing runs.
metric_rows = list(results.items())
df_baseline = pd.DataFrame(metric_rows, columns=['Metric', 'fine-tuned-embedding'])
df_baseline

Unnamed: 0,Metric,fine-tuned-embedding
0,faithfulness,0.8961
1,answer_relevancy,0.922825
2,context_recall,0.966667
3,context_precision,0.898611
4,answer_correctness,0.624904
