In [7]:
!pip install langchain-community pypdf

In [None]:
!pip install langchain-openai

In [None]:
!pip install chromadb

In [None]:
!pip install datasets

In [None]:
!pip install ragas

In [46]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("/content/RAG.pdf")

In [8]:
docs = loader.load()

In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter  = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 100,
)

chunks = text_splitter.split_documents(docs)

In [13]:
import os
os.environ["OPENAI_API_KEY"]="sk-prXF_XizkA"

In [16]:
# Setting up the vector store
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

vectorstore = Chroma.from_documents(
    documents = chunks,
    collection_name = 'evaluation',
    embedding = OpenAIEmbeddings()
)

In [17]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
retriever.invoke("What is Naive RAG?")

[Document(metadata={'page': 1, 'source': '/content/RAG.pdf'}, page_content='RAG, and Modular RAG, as showed in Figure 3. Despite\nRAG method are cost-effective and surpass the performance\nof the native LLM, they also exhibit several limitations.\nThe development of Advanced RAG and Modular RAG is\na response to these specific shortcomings in Naive RAG.\nA. Naive RAG\nThe Naive RAG research paradigm represents the earli-\nest methodology, which gained prominence shortly after the'),
 Document(metadata={'page': 15, 'source': '/content/RAG.pdf'}, page_content='external knowledge bases. The survey showcases the evolution\nof RAG technologies and their application on many different\ntasks. The analysis outlines three developmental paradigms\nwithin the RAG framework: Naive, Advanced, and Modu-\nlar RAG, each representing a progressive enhancement over\nits predecessors. RAG’s technical integration with other AI\nmethodologies, such as fine-tuning and reinforcement learning,\nhas further ex

In [18]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough


# Prompt
prompt_template = """You are an assistant for question-answering tasks who answers questions based
only on the context that are provided to you.
If you don't know the answer, just say that you don't know.
Follow these instructions strictly:

- Use three sentences maximum and keep the answer concise.
- Do not make up anything from your end, only refer to the context provided for answer generation
- If the context doesn't have required information to answer the question, respond with "I do not know"

question: {question}
search_results: {context}
Answer:
"""

prompt = ChatPromptTemplate.from_template(prompt_template)

# llm

llm = ChatOpenAI(model = 'gpt-4o')

# combining the retrieved docs
def format_docs(docs):
    if not docs:
        return ""
    return "\n\n".join(doc.page_content for doc in docs if doc.page_content)

In [19]:
rag_chain = (
    {'context': retriever | format_docs , 'question': RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [20]:
rag_chain.invoke(input = "What is Naive RAG?")

'Naive RAG represents the earliest methodology within the RAG framework, which gained prominence shortly after its introduction. It is part of a progressive enhancement in the RAG technologies, encompassing Naive, Advanced, and Modular RAG paradigms. The development of Advanced and Modular RAG aims to address specific shortcomings identified in Naive RAG.'

In [24]:
# Prepare the completed answers in a DataFrame
completed_data = [
    {
        "question": "What is Retrieval-Augmented Generation (RAG)?",
        "answer": "RAG enhances LLMs by incorporating knowledge from external databases, improving accuracy and reducing hallucination."
    },
    {
        "question": "What are the three main paradigms of RAG?",
        "answer": "The three main paradigms of RAG are Naive RAG, Advanced RAG, and Modular RAG."
    },
    {
        "question": "What is the primary purpose of RAG in large language models?",
        "answer": "The primary purpose is to reduce hallucination, integrate domain-specific knowledge, and improve contextual understanding."
    },
    {
        "question": "What are the three core components of RAG?",
        "answer": "The core components of RAG are Retrieval, Generation, and Augmentation."
    },
    {
        "question": "What is the drawback of Naive RAG in the retrieval phase?",
        "answer": "Naive RAG struggles with precision and recall, often leading to irrelevant or redundant information."
    },
    {
        "question": "How does Advanced RAG improve retrieval quality?",
        "answer": "Advanced RAG uses pre-retrieval optimization like metadata and indexing refinement, ensuring more accurate and relevant retrievals."
    },
    {
        "question": "What is Modular RAG known for?",
        "answer": "Modular RAG introduces specialized components, such as search and prediction modules, offering greater flexibility and performance for various tasks."
    },
    {
        "question": "What is the role of metadata in RAG indexing?",
        "answer": "Metadata like page numbers and timestamps help filter and prioritize relevant chunks, improving retrieval accuracy and efficiency."
    },
    {
        "question": "What are the main evaluation metrics for RAG retrieval quality?",
        "answer": "Metrics include Hit Rate, Mean Reciprocal Rank (MRR), and Normalized Discounted Cumulative Gain (NDCG)."
    },
    {
        "question": "Why is RAG preferred over fine-tuning for dynamic environments?",
        "answer": "RAG allows real-time updates and external knowledge integration, making it more adaptable and scalable for evolving data and tasks."
    }
]

# Convert to a DataFrame
completed_df = pd.DataFrame(completed_data)

# Save the DataFrame as a CSV file
testset = "testset.csv"
completed_df.to_csv(testset, index=False)



In [26]:
import pandas as pd
df = pd.read_csv("testset.csv")

In [27]:
df.head()

Unnamed: 0,question,answer
0,What is Retrieval-Augmented Generation (RAG)?,RAG enhances LLMs by incorporating knowledge f...
1,What are the three main paradigms of RAG?,"The three main paradigms of RAG are Naive RAG,..."
2,What is the primary purpose of RAG in large la...,The primary purpose is to reduce hallucination...
3,What are the three core components of RAG?,"The core components of RAG are Retrieval, Gene..."
4,What is the drawback of Naive RAG in the retri...,"Naive RAG struggles with precision and recall,..."


In [28]:
from datasets import Dataset

# Load the completed CSV file as a DataFrame
completed_df = pd.read_csv("/content/testset.csv")

# Extract questions and ground truth (answers) from the DataFrame
questions = completed_df["question"].tolist()
ground_truth = completed_df["answer"].tolist()

# Setting up the schema for eval dataset
data = {"question": [], "answer": [], "contexts": [], "ground_truth": ground_truth}

# Mocking the function calls for `rag_chain.invoke` and `retriever.invoke`
# Since we cannot execute these in this environment, placeholders are used.

def mock_rag_chain_invoke(query):
    # Mock response for RAG generated answers
    return f"Generated answer for: {query}"

def mock_retriever_invoke(query):
    # Mock response for retrieved contexts
    return [f"Retrieved context 1 for: {query}", f"Retrieved context 2 for: {query}"]

# Creating the eval dataset
for query in questions:
    # The question
    data["question"].append(query)
    # The Rag generated answer (mocked here)
    data["answer"].append(mock_rag_chain_invoke(query))
    # The retrieved contexts (mocked here)
    data["contexts"].append(mock_retriever_invoke(query))

# Create a Dataset from the data dictionary
eval_dataset = Dataset.from_dict(data)

# Display the eval dataset structure for verification
eval_dataset_dict = eval_dataset.to_dict()
eval_dataset_dict


{'question': ['What is Retrieval-Augmented Generation (RAG)?',
  'What are the three main paradigms of RAG?',
  'What is the primary purpose of RAG in large language models?',
  'What are the three core components of RAG?',
  'What is the drawback of Naive RAG in the retrieval phase?',
  'How does Advanced RAG improve retrieval quality?',
  'What is Modular RAG known for?',
  'What is the role of metadata in RAG indexing?',
  'What are the main evaluation metrics for RAG retrieval quality?',
  'Why is RAG preferred over fine-tuning for dynamic environments?'],
 'answer': ['Generated answer for: What is Retrieval-Augmented Generation (RAG)?',
  'Generated answer for: What are the three main paradigms of RAG?',
  'Generated answer for: What is the primary purpose of RAG in large language models?',
  'Generated answer for: What are the three core components of RAG?',
  'Generated answer for: What is the drawback of Naive RAG in the retrieval phase?',
  'Generated answer for: How does Adva

In [30]:
df1 = pd.DataFrame(eval_dataset)
df1.head()

Unnamed: 0,question,answer,contexts,ground_truth
0,What is Retrieval-Augmented Generation (RAG)?,Generated answer for: What is Retrieval-Augmen...,[Retrieved context 1 for: What is Retrieval-Au...,RAG enhances LLMs by incorporating knowledge f...
1,What are the three main paradigms of RAG?,Generated answer for: What are the three main ...,[Retrieved context 1 for: What are the three m...,"The three main paradigms of RAG are Naive RAG,..."
2,What is the primary purpose of RAG in large la...,Generated answer for: What is the primary purp...,[Retrieved context 1 for: What is the primary ...,The primary purpose is to reduce hallucination...
3,What are the three core components of RAG?,Generated answer for: What are the three core ...,[Retrieved context 1 for: What are the three c...,"The core components of RAG are Retrieval, Gene..."
4,What is the drawback of Naive RAG in the retri...,Generated answer for: What is the drawback of ...,[Retrieved context 1 for: What is the drawback...,"Naive RAG struggles with precision and recall,..."


In [31]:
# Getting contexts for the question
contexts = []
for i in retriever.invoke("What is Retrieval-Augmented Generation (RAG)?"):
    contexts.append(i.page_content)

In [34]:
from ragas import SingleTurnSample

# Creating a single turn sample object
sample = SingleTurnSample(
    user_input="What is Retrieval-Augmented Generation (RAG)?",
    reference="RAG enhances LLMs by incorporating knowledge from external databases, improving accuracy and credibility for knowledge-intensive tasks.",
    retrieved_contexts=contexts,
    response = rag_chain.invoke("What is Retrieval-Augmented Generation (RAG)?"))

print(sample.to_dict())

{'user_input': 'What is Retrieval-Augmented Generation (RAG)?', 'retrieved_contexts': ['1\nRetrieval-Augmented Generation for Large\nLanguage Models: A Survey\nYunfan Gaoa, Yun Xiongb, Xinyu Gao b, Kangxiang Jia b, Jinliu Pan b, Yuxi Bic, Yi Dai a, Jiawei Sun a, Meng\nWangc, and Haofen Wang a,c\naShanghai Research Institute for Intelligent Autonomous Systems, Tongji University\nbShanghai Key Laboratory of Data Science, School of Computer Science, Fudan University\ncCollege of Design and Innovation, Tongji University\nAbstract—Large Language Models (LLMs) showcase impres-\nsive capabilities but encounter challenges like hallucination,\noutdated knowledge, and non-transparent, untraceable reasoning\nprocesses. Retrieval-Augmented Generation (RAG) has emerged\nas a promising solution by incorporating knowledge from external\ndatabases. This enhances the accuracy and credibility of the\ngeneration, particularly for knowledge-intensive tasks, and allows\nfor continuous knowledge updates and

In [35]:
from langchain_openai import ChatOpenAI
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import (
    LLMContextPrecisionWithReference,
    LLMContextRecall,
    ContextEntityRecall,
    NoiseSensitivity,
    ResponseRelevancy,
    Faithfulness,
)


embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))

# Defining each metric that we wanna see
metrics = {
        "Context Precision": LLMContextPrecisionWithReference(llm=llm),
        "Context Recall": LLMContextRecall(llm=llm),
        "Context Entities Recall": ContextEntityRecall(llm=llm),
        "Noise Sensitivity": NoiseSensitivity(llm=llm),
        "Response Relevancy": ResponseRelevancy(llm=llm,embeddings=OpenAIEmbeddings()),
        "Faithfulness": Faithfulness(llm=llm),
    }

# Define a function to evaluate all metrics for a sample
def evaluate_metrics(sample: SingleTurnSample, metrics:dict):
    # Results dictionary to store the metric values
    results = {}
    # Iterating through the metrics dictionary
    for metric_name, metric in metrics.items():

        try:
            results[metric_name] = metric.single_turn_score(sample)

        except Exception as e:
            results[metric_name] = f"Error: {e}"

    return results

In [36]:
results = evaluate_metrics(sample = sample, metrics = metrics)
results

{'Context Precision': 0.9166666666361111,
 'Context Recall': 1.0,
 'Context Entities Recall': 0.3333333322222222,
 'Noise Sensitivity': 0.6363636363636364,
 'Response Relevancy': 0.9463667521553832,
 'Faithfulness': 1.0}

In [37]:
def evaluate_row(row, metrics):
    """
    Creates a single turn sample for the row
    Evaluates all metrics for it
    Returns a dictionary containing all metrics

    """

    # Create a SingleTurnSample for every row
    sample = SingleTurnSample(
        user_input=row['question'],
        reference=row['ground_truth'],
        retrieved_contexts=row['contexts'],
        response = row['answer']
    )

    # Evaluate metrics for the sample
    results = {}
    for metric_name, metric in metrics.items():
        try:
            results[metric_name] = metric.single_turn_score(sample)
        except Exception as e:
            results[metric_name] = f"Error: {e}"

    return results

# Function to evaluate the entire dataframe of testset

def evaluate_dataframe(df, metrics):
    """
    Iterates through the df test set
    For every row uses evaluate_row function to get result dictionary
    Append each dictionary to a list
    Uses list to create the result dataframe

    """
    results = []
    for _, row in df.iterrows():
        row_results = evaluate_row(row, metrics)
        results.append(row_results)
    return pd.DataFrame(results)

In [38]:
df_evaluation = evaluate_dataframe(df1[:3],metrics = metrics)

In [39]:
df_evaluation

Unnamed: 0,Context Precision,Context Recall,Context Entities Recall,Noise Sensitivity,Response Relevancy,Faithfulness
0,0.0,0.0,0.333333,0.0,1.0,1.0
1,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,1.0,1.0


In [40]:

df1.iloc[:3]

Unnamed: 0,question,answer,contexts,ground_truth
0,What is Retrieval-Augmented Generation (RAG)?,Generated answer for: What is Retrieval-Augmen...,[Retrieved context 1 for: What is Retrieval-Au...,RAG enhances LLMs by incorporating knowledge f...
1,What are the three main paradigms of RAG?,Generated answer for: What are the three main ...,[Retrieved context 1 for: What are the three m...,"The three main paradigms of RAG are Naive RAG,..."
2,What is the primary purpose of RAG in large la...,Generated answer for: What is the primary purp...,[Retrieved context 1 for: What is the primary ...,The primary purpose is to reduce hallucination...


In [41]:
from ragas.testset import TestsetGenerator
embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-3.5-turbo"))
generator = TestsetGenerator(llm=llm, embedding_model=embeddings)

In [44]:

testset_size = 10  # Number of samples to generate

testset = generator.generate_with_langchain_docs(chunks, testset_size)

Applying SummaryExtractor:   0%|          | 0/123 [00:00<?, ?it/s]

Applying CustomNodeFilter:   0%|          | 0/129 [00:00<?, ?it/s]

ERROR:ragas.testset.transforms.engine:unable to apply transformation: 'StringIO' object has no attribute 'score'
ERROR:ragas.testset.transforms.engine:unable to apply transformation: 'StringIO' object has no attribute 'score'
ERROR:ragas.testset.transforms.engine:unable to apply transformation: 'StringIO' object has no attribute 'score'
ERROR:ragas.testset.transforms.engine:unable to apply transformation: 'StringIO' object has no attribute 'score'
ERROR:ragas.testset.transforms.engine:unable to apply transformation: 'StringIO' object has no attribute 'score'
ERROR:ragas.testset.transforms.engine:unable to apply transformation: 'StringIO' object has no attribute 'score'
ERROR:ragas.testset.transforms.engine:unable to apply transformation: 'StringIO' object has no attribute 'score'
ERROR:ragas.testset.transforms.engine:unable to apply transformation: 'StringIO' object has no attribute 'score'
ERROR:ragas.testset.transforms.engine:unable to apply transformation: 'StringIO' object has no a

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/378 [00:00<?, ?it/s]

Applying OverlapScoreBuilder:   0%|          | 0/1 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/2 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/10 [00:00<?, ?it/s]

In [43]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/3.1 MB[0m [31m36.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.1/3.1 MB[0m [31m47.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.11.0


In [45]:
testset

Testset(samples=[TestsetSample(eval_sample=SingleTurnSample(user_input='Who is Yuxi Bi and what role does he play in the context of Retrieval-Augmented Generation?', retrieved_contexts=None, reference_contexts=['1\nRetrieval-Augmented Generation for Large\nLanguage Models: A Survey\nYunfan Gaoa, Yun Xiongb, Xinyu Gao b, Kangxiang Jia b, Jinliu Pan b, Yuxi Bic, Yi Dai a, Jiawei Sun a, Meng\nWangc, and Haofen Wang a,c\naShanghai Research Institute for Intelligent Autonomous Systems, Tongji University\nbShanghai Key Laboratory of Data Science, School of Computer Science, Fudan University\ncCollege of Design and Innovation, Tongji University\nAbstract—Large Language Models (LLMs) showcase impres-\nsive capabilities but encounter challenges like hallucination,\noutdated knowledge, and non-transparent, untraceable reasoning\nprocesses. Retrieval-Augmented Generation (RAG) has emerged\nas a promising solution by incorporating knowledge from external\ndatabases. This enhances the accuracy and 