## 1. Installing Python Packages

In [None]:
# Install required packages
!pip install -q langchain langchain-community python-dotenv
!pip install -q langchain-openai chromadb tiktoken

## 2. Setup OpenAI Key

#### Load the OpenAI API key and set the constants.

In [None]:
import os
from google.colab import userdata
# Export the value returned by Colab's userdata.get as the OPENAI_API_KEY environment variable.
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY') # Alternatively, assign your key string here directly

# Constants
LLM = "gpt-3.5-turbo"  # model name passed to ChatOpenAI for grading and answer generation
ADV_LLM = "gpt-4"  # model name passed to ChatOpenAI for the source-highlighting step
LLM_TEMP = 0  # temperature passed to every ChatOpenAI instance in this notebook
EMBEDDING_MODEL = "text-embedding-3-small"  # model name passed to OpenAIEmbeddings

## 3. Imports

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser, PydanticOutputParser
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI
from typing import List
from pydantic import BaseModel, Field



## 4. Fetch documents from the web and store them in a vector store

#### This block sets up your searchable knowledge base, converting raw articles into structured vector chunks that can later be searched semantically when a user asks a question.

In [None]:
# Embedding model used to vectorise the chunks
# ("text-embedding-3-large" is a higher-quality alternative).
embedding_model = OpenAIEmbeddings(model=EMBEDDING_MODEL)

# Articles that make up the knowledge base
urls = [
    "https://curicious.com/5-healthy-foods-to-eat-for-breakfast-for-energy/",
    "https://curicious.com/fun-facts-about-balanced-diet/",
    "https://curicious.com/vietnam-travel-guide-for-indian-visitors/",
    "https://curicious.com/i-thought-electrical-vehicles-are-the-future-but-this-proved-me-wrong/"
]

# Load: fetch each page, keeping the per-URL results in `docs`
# and every loaded document, in URL order, in the flat `docs_list`.
docs = []
docs_list = []
for url in urls:
    page_docs = WebBaseLoader(url).load()
    docs.append(page_docs)
    docs_list.extend(page_docs)

# Split: token-based chunks of at most 500 tokens, with no overlap
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=0,
    chunk_size=500,
)
doc_splits = text_splitter.split_documents(docs_list)


### 4.1 Load document chunks into the vector store

In [None]:
# Embed the chunks and index them in a Chroma collection named "rag"
vectorstore = Chroma.from_documents(
    collection_name="rag",
    embedding=embedding_model,
    documents=doc_splits,
)

# Similarity-search retriever; k is the number of documents returned per query
retriever = vectorstore.as_retriever(
    search_kwargs=dict(k=4),
    search_type="similarity",
)

## 5. Query & Retrieve Docs

#### This code retrieves the documents most relevant to the question “What is a healthy breakfast?” using a semantic retriever and prints the title, source, and content of the top-ranked one. It enables question-aware document lookup from the vector store.

In [None]:
# Ask a sample question and show the first (top-ranked) document returned.
question = "What is a healthy breakfast ?"
docs = retriever.invoke(question)

if docs:
    top_doc = docs[0]
    # Metadata is read with .get(): a page without a <title> tag has no
    # 'title' key, and indexing it directly would raise KeyError.
    print(f"Title: {top_doc.metadata.get('title', 'N/A')}\n\nSource: {top_doc.metadata.get('source', 'N/A')}\n\nContent: {top_doc.page_content}\n")
else:
    # An empty result would otherwise fail with IndexError on docs[0].
    print("No documents were retrieved for the question.")

Title: 5 Healthy Foods to Eat Breakfast for Energy | Healthy Eating Recipes

Source: https://curicious.com/5-healthy-foods-to-eat-for-breakfast-for-energy/

Content: you feeling full.Almond butter is high in vitamin E and magnesium.Peanut butter provides abundant protein and potassium.Cashew butter is a good source of iron and zinc.You can make this meal even better with banana slices, chia seeds, or fresh berries. These toppings add nutrition and new flavors. Your breakfast becomes even healthier and more delicious.Adding whole grain toast with nut butters to your mornings is easy and smart. It fits any schedule, whether you’re in a rush or have time to relax. This combo gives you steady energy and many health benefits. It’s one of the best breakfast options for staying healthy.ConclusionStarting your day with the right food is key to success. This article showed you healthy breakfast ideas. Each offers nutrients to get your metabolism going and keep your energy up.Oatmeal, Greek yogu

## 6. Document Relevance Grading - Semantic Level filtering

#### Vector retrieval is not always accurate, so this step applies semantic post-filtering. Poorly matched documents reduce answer quality, so we discard irrelevant ones and keep only the useful context.

In [None]:
# Data model
# Schema handed to llm.with_structured_output(...) for the relevance grader.
# NOTE(review): the docstring and the Field description are presumably forwarded
# to the model as part of that schema, so treat them as prompt text - confirm
# before rewording either.
class GradeDocuments(BaseModel):
    """Binary score for relevance check on retrieved documents."""

    # Grader verdict; the filtering loop keeps a document when the verdict is 'yes'.
    binary_score: str = Field(
        description="Documents are relevant to the question, 'yes' or 'no'"
    )


# LLM with function call - using OpenAI
# with_structured_output makes the model answer in the GradeDocuments schema.
llm = ChatOpenAI(model=LLM, temperature=LLM_TEMP)
structured_llm_grader = llm.with_structured_output(GradeDocuments)

# Prompt
# The string literal has to open on the same line as `system =`; a bare
# `system =` followed by a line break is a SyntaxError.
system = """You are a grader assessing relevance of a retrieved document to a user question. \n
If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n
It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."""

# The human turn supplies the two template variables: {document} and {question}.
grade_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
    ]
)

# Chain: fill the prompt, then ask the structured-output model for a verdict.
retrieval_grader = grade_prompt | structured_llm_grader

## 7. LLM-Based Grounding

#### This code filters the retrieved documents by checking their relevance to the question using an LLM-based grader. Only documents with a binary_score of "yes" are kept in docs_to_use for further processing.

In [None]:
# Keep only the retrieved documents that the LLM grader marks as relevant.
docs_to_use = []
for doc in docs:
    print(doc.page_content, '\n', '-'*50)
    res = retrieval_grader.invoke({"question": question, "document": doc.page_content})
    print(res,'\n')
    # binary_score is a free-form str in the schema, so the model may answer
    # "Yes" or " yes"; normalise case and whitespace so a relevant document
    # is not silently dropped by an exact-match comparison.
    if res.binary_score.strip().lower() == 'yes':
        docs_to_use.append(doc)

you feeling full.Almond butter is high in vitamin E and magnesium.Peanut butter provides abundant protein and potassium.Cashew butter is a good source of iron and zinc.You can make this meal even better with banana slices, chia seeds, or fresh berries. These toppings add nutrition and new flavors. Your breakfast becomes even healthier and more delicious.Adding whole grain toast with nut butters to your mornings is easy and smart. It fits any schedule, whether you’re in a rush or have time to relax. This combo gives you steady energy and many health benefits. It’s one of the best breakfast options for staying healthy.ConclusionStarting your day with the right food is key to success. This article showed you healthy breakfast ideas. Each offers nutrients to get your metabolism going and keep your energy up.Oatmeal, Greek yogurt, and eggs are great choices. They balance taste and health. For those in a rush, smoothies and whole grain toast with nut butter are fast yet filling.These breakfa

## 8. LLM writes grounded answers from trusted sources

#### This code uses an LLM to generate a concise answer to the user's question based on the filtered, relevant documents, formatted as a structured RAG (Retrieval-Augmented Generation) pipeline.

In [None]:
# Answer-generation prompt: a system instruction plus a human turn that
# carries the formatted documents and the user's question.
system = (
    "You are an assistant for question-answering tasks. Answer the question based upon your knowledge.\n"
    "Use three-to-five sentences maximum and keep the answer concise."
)
prompt = ChatPromptTemplate.from_messages([
    ("system", system),
    (
        "human",
        # The run of spaces before <docs> mirrors the original template text exactly.
        "Retrieved documents: \n\n "
        "        <docs>{documents}</docs> \n\n User question: <question>{question}</question>",
    ),
])

# Chat model that writes the answer
llm = ChatOpenAI(temperature=LLM_TEMP, model=LLM)
# Post-processing
def format_docs(docs):
    """Render documents as numbered ``<docN>`` blocks for use in a prompt.

    Args:
        docs: Iterable of objects exposing a ``metadata`` mapping and a
            ``page_content`` attribute.

    Returns:
        A single string with one block per document, in input order and
        numbered from 1. Each block puts the title, source and content on
        their own lines; blocks are separated by a blank line. A missing
        ``title`` or ``source`` key is rendered as ``N/A``. An empty input
        gives an empty string.
    """
    blocks = []
    for number, doc in enumerate(docs, start=1):
        # .get() so that a document lacking a metadata key does not raise KeyError.
        title = doc.metadata.get('title', 'N/A')
        source = doc.metadata.get('source', 'N/A')
        # Adjacent literals instead of backslash-continued lines: a continuation
        # inside a string embeds the next line's indentation in the output.
        blocks.append(
            f"<doc{number}>:\n"
            f"Title:{title}\n"
            f"Source:{source}\n"
            f"Content:{doc.page_content}\n"
            f"</doc{number}>\n"
        )
    return "\n".join(blocks)
# Chain: prompt -> chat model -> plain-string output
rag_chain = prompt | llm | StrOutputParser()

# Run the chain on the graded documents and show the answer
generation = rag_chain.invoke(
    dict(
        documents=format_docs(docs_to_use),
        question=question,
    )
)
print(generation)

A healthy breakfast is one that provides a balance of proteins, fibers, and healthy fats to kickstart your day with energy and nutrition. Examples include oatmeal topped with fruits and nuts, Greek yogurt with berries, eggs, smoothies made with fruits, veggies, and proteins, and whole grain toast with nut butters. These options help maintain stable blood sugar levels, keep you full longer, and support overall well-being. A nutritious breakfast is essential for consistent energy levels and improved focus throughout the day.


## 9. Hallucination Detection in LLM Answers

#### This code uses an LLM-based grader to evaluate whether the generated answer is factually grounded in the retrieved documents, returning a simple "yes" or "no" to flag potential hallucinations.

In [None]:
# Schema handed to llm.with_structured_output(...) for the hallucination grader.
class GradeHallucinations(BaseModel):
    """Binary score for hallucination present in 'generation' answer."""
    # Per the description below and the grader prompt, 'yes' means the answer IS
    # grounded in the facts (no hallucination), not that a hallucination is present.
    binary_score: str = Field(
        ...,
        description="Answer is grounded in the facts, 'yes' or 'no'"
    )
# LLM with function call - using OpenAI
# with_structured_output makes the model answer in the GradeHallucinations schema.
llm = ChatOpenAI(model=LLM, temperature=LLM_TEMP)
structured_llm_grader = llm.with_structured_output(GradeHallucinations)

# Prompt
# The string literal has to open on the same line as `system =`; a bare
# `system =` followed by a line break is a SyntaxError.
system = """You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. \n
Give a binary score 'yes' or 'no'. 'Yes' means that the answer is grounded in / supported by the set of facts."""

# The human turn supplies the two template variables: {documents} and {generation}.
hallucination_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Set of facts: \n\n <facts>{documents}</facts> \
        \n\n LLM generation: <generation>{generation}</generation>"),
    ]
)

# Chain: fill the prompt, then ask the structured-output model for a verdict.
hallucination_grader = hallucination_prompt | structured_llm_grader

# Grade the generated answer against the same documents it was written from.
response = hallucination_grader.invoke({"documents": format_docs(docs_to_use), "generation": generation})
print(response)

binary_score='yes'


## 10. Traceability: Linking Answer Back to Source Segments [Explainability & Groundedness]

#### This step uses an advanced LLM (e.g., GPT-4) to extract exact verbatim segments from source documents that were directly used in forming the generated answer — enabling full traceability and explainability.

In [None]:

# Data model
# Target type of the PydanticOutputParser used in the source-highlighting step.
# The four fields are parallel lists: the final loop zips them together, so the
# entries at one index are printed as a single record.
# NOTE(review): the field descriptions feed parser.get_format_instructions(), i.e.
# they are prompt text; confirm before correcting their wording.
class HighlightDocuments(BaseModel):
    """Return the specific part of a document used for answering the question."""

    # Identifiers of the documents used (the recorded output shows values like "doc1").
    id: List[str] = Field(
        ...,
        description="List of id of docs used to answers the question"
    )

    # Titles of the documents used.
    title: List[str] = Field(
        ...,
        description="List of titles used to answers the question"
    )

    # Sources of the documents used (URLs in the recorded output).
    source: List[str] = Field(
        ...,
        description="List of sources used to answers the question"
    )

    # Text segments taken from the documents in support of the answer.
    segment: List[str] = Field(
        ...,
        description="List of direct segements from used documents that answers the question"
    )

# LLM - using OpenAI
# Rebinds `llm` to the ADV_LLM model for this step.
llm = ChatOpenAI(model=ADV_LLM, temperature=LLM_TEMP)  # Using GPT-4 for better parsing accuracy

# parser: supplies the format instructions embedded in the prompt and parses
# the model's reply into a HighlightDocuments instance
parser = PydanticOutputParser(pydantic_object=HighlightDocuments)

# Prompt
# Single template with four placeholders: {documents}, {question} and
# {generation} are filled at invoke time; {format_instructions} is bound
# in advance from the parser when the PromptTemplate is built.
system = """You are an advanced assistant for document search and retrieval. You are provided with the following:
1. A question.
2. A generated answer based on the question.
3. A set of documents that were referenced in generating the answer.

Your task is to identify and extract the exact inline segments from the provided documents that directly correspond to the content used to
generate the given answer.
The extracted segments must be verbatim snippets from the documents, ensuring a word-for-word match with the text
in the provided documents.

Ensure that:
- (Important) Each segment is an exact match to a part of the document and is fully contained within the document text.
- The relevance of each segment to the generated answer is clear and directly supports the answer provided.
- (Important) If you didn't used the specific document don't mention it.

Used documents: <docs>{documents}</docs> \n\n User question: <question>{question}</question> \n\n
Generated answer: <answer>{generation}</answer>

<format_instruction>
{format_instructions}
</format_instruction>
"""

# Template: the format instructions are bound up front; the remaining three
# variables are supplied when the chain is invoked.
prompt = PromptTemplate(
    partial_variables=dict(format_instructions=parser.get_format_instructions()),
    input_variables=["documents", "question", "generation"],
    template=system,
)

# Chain: prompt -> chat model -> parsed HighlightDocuments
doc_lookup = prompt | llm | parser

# Run the lookup for the answer generated earlier
lookup_response = doc_lookup.invoke(
    dict(
        documents=format_docs(docs_to_use),
        question=question,
        generation=generation,
    )
)

## 11. Final Response

In [None]:
# Print each highlighted segment together with the document it came from.
# zip() pairs the four lists by position and stops at the shortest one, so a
# reply with lists of unequal length prints only the complete records.
# The loop variable is doc_id rather than id so the builtin id() is not
# shadowed for the rest of the notebook session.
for doc_id, title, source, segment in zip(
    lookup_response.id,
    lookup_response.title,
    lookup_response.source,
    lookup_response.segment,
):
    print(f"ID: {doc_id}\nTitle: {title}\nSource: {source}\nText Segment: {segment}\n")

ID: doc1
Title: 5 Healthy Foods to Eat Breakfast for Energy | Healthy Eating Recipes
Source: https://curicious.com/5-healthy-foods-to-eat-for-breakfast-for-energy/
Text Segment: Oatmeal, Greek yogurt, and eggs are great choices. They balance taste and health. For those in a rush, smoothies and whole grain toast with nut butter are fast yet filling.

ID: doc2
Title: 5 Healthy Foods to Eat Breakfast for Energy | Healthy Eating Recipes
Source: https://curicious.com/5-healthy-foods-to-eat-for-breakfast-for-energy/
Text Segment: Oatmeal: A Wholesome Start to Your DayAs someone who loves breakfast meal planning, I include oatmeal often. It’s flexible and packed with health perks. This grain is high in fiber, which is great for the heart and feeling full. It has changed my mornings, giving me energy and clear thinking all day.

ID: doc4
Title: 5 Healthy Foods to Eat Breakfast for Energy | Healthy Eating Recipes
Source: https://curicious.com/5-healthy-foods-to-eat-for-breakfast-for-energy/
Tex