In [2]:
# Optional one-time environment setup: uncomment the lines below to install
# packages from inside the notebook. NOTE(review): versions are not pinned, so
# a fresh install may pull releases newer than the ones this notebook was
# written against — consider pinning.
# !pip install -U langsmith
# !pip install -U langchain
# !pip install shutup
# !pip install tiktoken
# !pip install nest_asyncio
# !pip install ragas
# !pip install chromadb

In [3]:

import getpass
import os

import openai
import nest_asyncio

# The notebook kernel already runs an asyncio event loop; nest_asyncio patches
# asyncio so code that starts its own loop can run inside it.
nest_asyncio.apply()

# Never hard-code credentials in a notebook: they end up in version control and
# in shared copies. Use the key already exported in the environment if there is
# one; otherwise prompt for it without echoing it into the cell output.
open_ai_key = os.environ.get('OPENAI_API_KEY') or getpass.getpass('OpenAI API key: ')

# Export the key so code that reads OPENAI_API_KEY from the environment sees
# the same value that is passed explicitly via `open_ai_key`.
os.environ['OPENAI_API_KEY'] = open_ai_key

In [4]:

from langchain.document_loaders import WebBaseLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Chat model used to generate answers from the retrieved passages.
llm = ChatOpenAI(openai_api_key=open_ai_key)

# Wikipedia pages that form the knowledge base for retrieval.
wiki_urls = [
    'https://en.wikipedia.org/wiki/India_national_cricket_team',
    'https://en.wikipedia.org/wiki/History_of_the_Indian_cricket_team',
    'https://en.wikipedia.org/wiki/Cricket',
    'https://en.wikipedia.org/wiki/List_of_India_ODI_cricketers',
    'https://en.wikipedia.org/wiki/List_of_India_Test_cricketers',
]
loader = WebBaseLoader(wiki_urls)

# Request-rate setting on the loader (requests per second).
loader.requests_per_second = 2

# Load the pages and build a vector-store index over them.
index = VectorstoreIndexCreator().from_loaders([loader])

# Retrieval QA chain on top of the index; with return_source_documents=True the
# result dict carries the retrieved documents alongside the answer.
retriever = index.vectorstore.as_retriever()
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

# Smoke test: ask one question and show the generated answer.
question = "Who was first indian test captain and when he was given the position"
result = qa_chain({"query": question})
print(result["result"])

The first Indian Test captain was C.K. Nayudu. He was given the position in the year 1932.


## Under the hood

1. **Faithfulness**: This metric measures how factually consistent the generated answer is with the retrieved context, and it is computed in two steps. First, given the question and the generated answer, Ragas uses a large language model (LLM) to extract the individual statements made in the answer. This yields a list of statements whose accuracy must be verified. Second, Ragas makes another LLM call to check each statement against the retrieved context and decide whether the context supports it. The score for an example is the number of supported statements divided by the total number of statements in the generated answer.

2. **Answer Relevancy**: This metric gauges how relevant and to the point an answer is with respect to the question. Ragas uses a large language model (LLM) to generate probable questions that the generated answer could be a response to. It then computes the similarity between these generated questions and the question that was actually asked.

3. **Context Relevancy**: This metric quantifies the ratio of pertinent information to extraneous detail within the retrieved contexts. Given a question, Ragas uses an LLM to identify the sentences in the retrieved context that are needed to answer that question. The score is the number of necessary sentences divided by the total number of sentences in the context.

In [5]:
from ragas.metrics import faithfulness, answer_relevancy, context_relevancy
from ragas.langchain import RagasEvaluatorChain

# Build one Ragas evaluator chain per metric, keyed by the metric's name.
eval_chains = {}
for metric in (faithfulness, answer_relevancy, context_relevancy):
    eval_chains[metric.name] = RagasEvaluatorChain(metric=metric)

Downloading (…)lve/main/config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/57.4M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/517 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [6]:
# Run every evaluator chain on the QA result and print its score.
for name, eval_chain in eval_chains.items():
    # Each chain's output is read under the key "<metric name>_score".
    score_name = f"{name}_score"
    scores = eval_chain(result)
    print(f"{score_name}: {scores[score_name]}")

faithfulness_score: 0.0
answer_relevancy_score: 0.9644678668369756
context_ relevancy_score: 0.05404426985316806
