Load Libraries

In [1]:
from llama_index.core import (SimpleDirectoryReader,Settings,VectorStoreIndex)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
import re
import chromadb

  from .autonotebook import tqdm as notebook_tqdm


Load data

In [2]:
documents=SimpleDirectoryReader("data",recursive=True).load_data()
for doc in documents:
    if "science" in doc.metadata["file_path"].lower():
        doc.metadata["subject"] = "science"
    elif "english" in doc.metadata["file_path"].lower():
        doc.metadata["subject"] = "english"

In [3]:
print("Number of documents    :", len(documents))

print(documents[0].text[:300])
print(documents[0].metadata)

Number of documents    : 324
Let us Recite
1  Papa’s Spectacles
Today our papa
Is searching everywhere, 
He has lost his spectacles,
It was his only pair!
He can’t read the newspaper,
He can’t watch TV,
He can’t cut the vegetables,
Oh! Where can it be?
He checked inside his pockets,
He glanced under his chair,
He searched near 
{'page_label': '1', 'file_name': 'eesa101.pdf', 'file_path': 'd:\\banus works\\Education_rag_chatbot\\data\\english\\eesa101.pdf', 'file_type': 'application/pdf', 'file_size': 2740653, 'creation_date': '2025-07-23', 'last_modified_date': '2026-01-12', 'subject': 'english'}


Chunking

In [4]:
Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=50)


Embedding

Load embed model

In [5]:
Settings.embed_model=HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")


2026-01-14 15:39:34,949 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


Vector Store

In [6]:
chroma_client=chromadb.PersistentClient(path="./chroma_data")

#load collections
collection_name="education"
chroma_collection=chroma_client.get_or_create_collection(name=collection_name)

#wrap chroma for llamaindex
vector_store=ChromaVectorStore(chroma_collection=chroma_collection)

print(chroma_collection.count())

2026-01-14 15:39:40,343 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


0


Build the Index

In [7]:
storage_context=StorageContext.from_defaults(vector_store=vector_store)

In [8]:
index=VectorStoreIndex.from_documents(documents,storage_context=storage_context)

In [9]:

print("Ingestion Completed")
print("Stored Vectors :", chroma_collection.count())

Ingestion Completed
Stored Vectors : 327


Hugging Face LLM setup

In [10]:
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

Load chroma as a Langchain vector store

use same same embedding model as ingestion

In [11]:
embeddings=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

2026-01-14 15:39:47,973 - INFO - Use pytorch device_name: cpu
2026-01-14 15:39:47,973 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


Load existing chroma DB

In [12]:
vectorstore=Chroma(persist_directory="./chroma_data",
                   collection_name="education",
                   embedding_function=embeddings)

  vectorstore=Chroma(persist_directory="./chroma_data",


In [13]:
print ("Index loaded successfully....")
print(vectorstore._collection.count())

Index loaded successfully....
327


create a retriever

In [14]:
n_retriever=vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k":4}
)

testing

In [15]:
docs=n_retriever.invoke("What is the poem Papa's Spectacles about?")
for d in docs:
    print(d.page_content[:200])
    print(d.metadata)
    print("------")

Let us Recite
1  Papa’s Spectacles
Today our papa
Is searching everywhere, 
He has lost his spectacles,
It was his only pair!
He can’t read the newspaper,
He can’t watch TV,
He can’t cut the vegetable
{'_node_type': 'TextNode', 'creation_date': '2025-07-23', 'doc_id': '30880a02-96c2-4a18-889b-c7319dea183c', 'ref_doc_id': '30880a02-96c2-4a18-889b-c7319dea183c', 'page_label': '1', 'file_type': 'application/pdf', '_node_content': '{"id_": "57aeac6e-7fbc-489c-861d-3d1e2e32f8ba", "embedding": null, "metadata": {"page_label": "1", "file_name": "eesa101.pdf", "file_path": "d:\\\\banus works\\\\Education_rag_chatbot\\\\data\\\\english\\\\eesa101.pdf", "file_type": "application/pdf", "file_size": 2740653, "creation_date": "2025-07-23", "last_modified_date": "2026-01-12", "subject": "english"}, "excluded_embed_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "excluded_llm_metadata_keys": ["file_name", "file_type", "file_size", 

Load LLM

In [16]:
from langchain_huggingface import ChatHuggingFace
from langchain_huggingface.llms import HuggingFaceEndpoint

llm_endpoint=HuggingFaceEndpoint(repo_id="meta-llama/Llama-3.2-3B-Instruct",
                   task="conversational",
                   temperature=0
                   )
llm=ChatHuggingFace(llm=llm_endpoint)

In [17]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

prompt= ChatPromptTemplate.from_template(
    """You are a helpful teacher for 5th grade English and Science.

Use the following context to answer the question.
If the answer is not in the context, say you don't know.

Context:
{context}

Question:
{question}

Answer in simple, clear language:
"""
)

create RAG Chain

In [18]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [19]:
rag_chain=(
    {"context":n_retriever | format_docs,
     "question":RunnablePassthrough()}
     | prompt
     | llm
     | StrOutputParser()
)

In [20]:
rag_chain.invoke("What is the poem Papa’s Spectacles about?")


'The poem "Papa\'s Spectacles" is about a father who loses his glasses and can\'t do things like read, watch TV, or even cut his vegetables. The speaker in the poem finds the glasses on the father\'s head and tells him that they are right there all along.'

In [21]:
rag_chain.invoke("why do some food spoil faster")

'Some food spoils faster because of the tiny living things called microbes that can grow on it. Just like how we need water, air, and right temperatures to survive, microbes need these same things to grow and make food spoil.\n\nWhen microbes grow in the right conditions (like moisture, air, and the right temperature), they can multiply quickly and make the food spoil faster. This is why some food items last for weeks, and others spoil in just a few days.'

Subject-wise retriever

In [22]:
english_retriever=vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k":4,
        "filter":{"subject":"english"}
    }
)

science_retriever=vectorstore.as_retriever(
    search_type='similarity',
    search_kwargs={
        "k":4,
        "filter":{"subject":"science"}
    }
)

In [23]:
def get_rag_chain(subject:str):
    if subject=="english":
        retriever=english_retriever
    elif subject=="science":
        retriever=science_retriever
    else:
        retriever=n_retriever

    return (
        {
            "context":retriever | format_docs,
            "question" : RunnablePassthrough()
        }
        | prompt
        | llm
        | StrOutputParser()
    )

In [24]:
rag=get_rag_chain("english")
rag.invoke("What is the poem Papa’s Spectacles about?")

'The poem "Papa\'s Spectacles" is about Papa who has lost his only pair of spectacles. He\'s unable to perform various tasks because he can\'t see, and his daughter helps him find them.'

In [25]:
rag=get_rag_chain("science")
rag.invoke("why do some food spoil faster")

"Some foods spoil faster because of microbes that are already present in the food, like bacteria and yeast. These microbes need water, air, and the right temperature to grow. If the food is exposed to these conditions, the microbes will multiply quickly and start breaking down the food, causing it to spoil.\n\nThink of it like this: if you leave spoiled food out at room temperature, it's like giving a growing plant the perfect conditions to grow. The plant will start to grow really fast and take over the space, right? Similarly, microbes will multiply quickly and take over the food, making it spoil faster.\n\nOn the other hand, some foods last for a long time because they are made in ways that stop the microbes from growing. For example, when you dry food like chillies, you're taking away the water that microbes need to grow. Or, when you make foods like pickles or ghee, you're using methods that kill the microbes or make it hard for them to grow.\n\nSo, it's like a balance between the

In [27]:
rag=get_rag_chain("english")
rag.invoke("Which game does the poem Gilli Danda mention?")

'The game mentioned in the poem is called "Gilli Danda". To play this game, you need two things: \n\n1. A short stick (gilli) \n2. A long stick (danda)\n\nIn the poem, it also mentions a small gilli to be thrown with the danda.'