In [98]:
# Imports and Secrets
import os
import numpy as np

from datasets import load_dataset, Dataset
from dotenv import load_dotenv

from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)
from ragas import evaluate

from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WikipediaLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

load_dotenv()

MAX_PAGES=10

## Datasets

In [99]:
# loading the V2 dataset
amnesty_qa = load_dataset("explodinggradients/amnesty_qa", "english_v2")
# loading the WikiEval dataset
wikieval = load_dataset("explodinggradients/WikiEval")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Repo card metadata block was not found. Setting CardData to empty.


## Creating the Vector Index

In [100]:
persist_directory = 'db'
force_rebuild = True

if not os.path.exists(f"./{persist_directory}") or force_rebuild:
    pages_limit = MAX_PAGES

    all_docs = None
    for idx, sample in enumerate(wikieval["train"]):
        if idx >= pages_limit:
            break
        wikipedia_page = sample["source"]
        
        loader = WikipediaLoader(
            query=wikipedia_page,
            load_max_docs=1,
            doc_content_chars_max=100000
        )
        docs = loader.load()
        
        if all_docs is None:
            all_docs = docs
        else:
            all_docs += docs

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(all_docs)

    vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings(), persist_directory=persist_directory)
else:
    vectorstore = Chroma(persist_directory=persist_directory, embedding_function=OpenAIEmbeddings())

In [101]:
print(len(all_docs))
docs_sz = np.array([len(docs.page_content) for docs in all_docs])
print(docs_sz.max())
all_docs[2].page_content[:100]

10
58797


'MedMen Enterprises was a United States-based cannabis company. At its peak, it had operations in Cal'

## Naive RAG

In [102]:
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")
# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | llm
    | StrOutputParser()
)

retrieve_docs = (lambda x: x["question"]) | retriever

chain = RunnablePassthrough.assign(context=retrieve_docs).assign(
    answer=rag_chain_from_docs
)

# response = chain.invoke({"question": "Who directed the film Oppenheimer and who stars as J. Robert Oppenheimer in the film?"})

def naive_rag_pipeline(question, chain=chain):
    response = chain.invoke({"question": question})
    
    build_context = []
    for ctx in response["context"]:
        build_context.append(ctx.page_content)

    return response["question"], build_context, response["answer"]

In [103]:
dataset_dict = {"question": [], "contexts": [], "answer": [], "ground_truth": []}
for sample in list(wikieval["train"])[:MAX_PAGES]:
    print(sample)
    question, contexts, answer = naive_rag_pipeline(sample["question"][10:])
    dataset_dict["question"].append(question)
    dataset_dict["contexts"].append(contexts)
    dataset_dict["answer"].append(answer)
    dataset_dict["ground_truth"].append(sample["answer"][8:])

# dataset_dict

{'answer': 'Answer: The PSLV-C56 mission is scheduled to be launched on Sunday, 30 July 2023 at 06:30 IST / 01:00 UTC. It will be launched from the Satish Dhawan Space Centre, Sriharikota, Andhra Pradesh, India.', 'question': 'Question: When is the scheduled launch date and time for the PSLV-C56 mission, and where will it be launched from?', 'context_v1': ["The PSLV-C56 is the 58th mission of Indian Space Research Organisation's Polar Satellite Launch Vehicle (PSLV) and the 17th flight of the PSLV-CA variant, and will be get launched from Satish Dhawan Space Centre First Launch Pad ( FLP ).\n\nLaunch\nIt is Scheduled to get launched on Sunday, 30 July 2023 at 06:30 IST / 01:00 UTC from Satish Dhawan Space Centre, Sriharikota, Andhra Pradesh, India. This is a dedicated commercial mission through NSIL with DS-SAR as primary satellite and VELOX-AM as a co-passenger satellite With other 5 Satellites, All satellites from this mission belongs to Singapore."], 'context_v2': ["The PSLV-C56 is 

## Ragas Evaluation

In [104]:


dataset = Dataset.from_dict(dataset_dict)
result = evaluate(
    dataset,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
    ],
)

result

Evaluating: 100%|██████████| 40/40 [01:17<00:00,  1.94s/it]


{'context_precision': 0.9639, 'faithfulness': 0.7042, 'answer_relevancy': 0.9451, 'context_recall': 0.8500}

In [105]:
df = result.to_pandas()
df.to_csv("naive.csv")
df.head()

Unnamed: 0,question,contexts,answer,ground_truth,context_precision,faithfulness,answer_relevancy,context_recall
0,When is the scheduled launch date and time for...,[The PSLV-C56 was the 58th mission of Indian S...,"The PSLV-C56 mission was launched on Sunday, J...",The PSLV-C56 mission is scheduled to be launch...,1.0,0.0,0.928534,1.0
1,What is the objective of the Uzbekistan-Afghan...,[The Uzbekistan–Afghanistan–Pakistan Railway P...,The objective of the Uzbekistan-Afghanistan-Pa...,The objective of the Uzbekistan-Afghanistan-Pa...,1.0,1.0,0.957029,1.0
2,When was PharmaCann founded and what is its he...,"[In May 2018, MedMen began construction on a s...",PharmaCann was founded in 2014 and its headqua...,PharmaCann was founded in 2014 by Theodore Sco...,1.0,0.0,0.99035,0.5
3,Who directed the film Oppenheimer and who star...,[Oppenheimer is a 2023 epic biographical thril...,Christopher Nolan directed the film Oppenheime...,Christopher Nolan directed the film Oppenheime...,1.0,1.0,0.980462,1.0
4,What is theranostics and how does it combine d...,[== Therapeutic approaches ==\nTheranostics en...,Theranostics combines diagnostic and therapeut...,"Theranostics, also known as theragnostics, is ...",1.0,0.375,0.91277,1.0
