In [None]:
# LangChain + Chroma + embeddings
!pip install langchain langchain-community chromadb openai python-dotenv

# Embeddings from HuggingFace
!pip install sentence-transformers

# SpaCy and the English pipeline
!pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
import spacy.cli
spacy.cli.download("en_core_web_sm")

In [13]:
import os
from dotenv import load_dotenv
from openai import OpenAI
import spacy

# LangChain loaders and processing
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import SpacyTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import SpacyTextSplitter

In [8]:

load_dotenv()
API_KEY = os.getenv("DEEPSEEK_API_KEY")
if not API_KEY:
    raise ValueError("Set DEEPSEEK_API_KEY in your .env!")

client = OpenAI(api_key=API_KEY, base_url="https://api.deepseek.com")
MODEL = "deepseek-chat"



In [33]:
# Load and chunk document
loader = TextLoader("docs/sample.txt")
docs = loader.load()

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_documents(docs)

# Embed and persist to Chroma vector store
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(chunks, embedding_model, persist_directory="chroma_store")

# Prompt user for a query
query = input("Ask a question based on the document: ")

# Retrieve top-k chunks
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
relevant_docs = retriever.invoke(query)  # Updated method per deprecation warning

# Print retrieved chunks
print("\n🔍 Retrieved Chunks:\n")
for i, doc in enumerate(relevant_docs, 1):
    print(f"--- Chunk {i} ---\n{doc.page_content}\n")

# Assemble context from retrieved chunks
context = "\n\n".join([doc.page_content for doc in relevant_docs])

# Use strict instruction to enforce retrieval-only answering
prompt = f"""You are a helpful assistant. ONLY use the following context to answer the question.
If the answer is not in the context, reply: 'Not found in the context.'

Context:
{context}

Question:
{query}
"""

# Call DeepSeek
response = client.chat.completions.create(
    model=MODEL,
    messages=[{"role": "user", "content": prompt}],
    max_tokens=1024
)

# Display final answer
print(query)
print("\n🧠 DeepSeek Answer:")
print(response.choices[0].message.content)


🔍 Retrieved Chunks:

--- Chunk 1 ---
After some time, Aslam emerged—walking oddly, face blank. He didn’t recognize us, didn’t speak, just walked away. Then, behind him, followed a tall, skinny, filthy man. None of us had ever seen him. A horrifying smell came from him. We froze.

We ran toward them but both vanished. We searched but found nothing. Word spread. Aslam was found unconscious at the edge of the orchard. His right fingers were gone—eaten or melted. Blood mixed with mud. We took him home.

--- Chunk 2 ---
After some time, Aslam emerged—walking oddly, face blank. He didn’t recognize us, didn’t speak, just walked away. Then, behind him, followed a tall, skinny, filthy man. None of us had ever seen him. A horrifying smell came from him. We froze.

We ran toward them but both vanished. We searched but found nothing. Word spread. Aslam was found unconscious at the edge of the orchard. His right fingers were gone—eaten or melted. Blood mixed with mud. We took him home.

--- Chunk 


🔍 Retrieved Chunks:

--- Chunk 1 ---
After some time, Aslam emerged—walking oddly, face blank. He didn’t recognize us, didn’t speak, just walked away. Then, behind him, followed a tall, skinny, filthy man. None of us had ever seen him. A horrifying smell came from him. We froze.

We ran toward them but both vanished. We searched but found nothing. Word spread. Aslam was found unconscious at the edge of the orchard. His right fingers were gone—eaten or melted. Blood mixed with mud. We took him home.

--- Chunk 2 ---
After some time, Aslam emerged—walking oddly, face blank. He didn’t recognize us, didn’t speak, just walked away. Then, behind him, followed a tall, skinny, filthy man. None of us had ever seen him. A horrifying smell came from him. We froze.

We ran toward them but both vanished. We searched but found nothing. Word spread. Aslam was found unconscious at the edge of the orchard. His right fingers were gone—eaten or melted. Blood mixed with mud. We took him home.

--- Chunk 3 ---
After some time, Aslam emerged—walking oddly, face blank. He didn’t recognize us, didn’t speak, just walked away. Then, behind him, followed a tall, skinny, filthy man. None of us had ever seen him. A horrifying smell came from him. We froze.

We ran toward them but both vanished. We searched but found nothing. Word spread. Aslam was found unconscious at the edge of the orchard. His right fingers were gone—eaten or melted. Blood mixed with mud. We took him home.

Where is Aslam from?

🧠 DeepSeek Answer:
Not found in the context.


#above is failing due to bad chunking or embedding

In [15]:
loader = TextLoader("docs/sample.txt")
docs = loader.load()

splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=500,
    separators=["\n\n", "\n", ".", " ", ""]
)
chunks = splitter.split_documents(docs)

# ————— 3. Embed & persist to Chroma —————
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory="chroma_store"
)

# ————— 4. Ask the user for a query —————
query = input("Ask a question based on the document: ")

# ————— 5. Retrieve a larger set of chunks —————
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
relevant_docs = retriever.invoke(query)

# ————— 6. Print retrieved chunks for debugging —————
print("\n🔍 Retrieved Chunks:\n")
for idx, doc in enumerate(relevant_docs, start=1):
    print(f"--- Chunk {idx} ---\n{doc.page_content}\n")

# ————— 7. Assemble context & build prompt —————
context = "\n\n".join([doc.page_content for doc in relevant_docs])
prompt = f"""You are a helpful assistant. ONLY use the following context to answer the question.
If the answer is not in the context, reply: 'Not found in the context.'

Context:
{context}

Question:
{query}
"""

# ————— 8. Call DeepSeek API and display answer —————
response = client.chat.completions.create(
    model=MODEL,
    messages=[{"role": "user", "content": prompt}],
    max_tokens=1024
)

print(f"\n🧠 Answer to: {query}")
print(response.choices[0].message.content)


🔍 Retrieved Chunks:

--- Chunk 1 ---
After some time, Aslam emerged—walking oddly, face blank. He didn’t recognize us, didn’t speak, just walked away. Then, behind him, followed a tall, skinny, filthy man. None of us had ever seen him. A horrifying smell came from him. We froze.

We ran toward them but both vanished. We searched but found nothing. Word spread. Aslam was found unconscious at the edge of the orchard. His right fingers were gone—eaten or melted. Blood mixed with mud. We took him home.

--- Chunk 2 ---
After some time, Aslam emerged—walking oddly, face blank. He didn’t recognize us, didn’t speak, just walked away. Then, behind him, followed a tall, skinny, filthy man. None of us had ever seen him. A horrifying smell came from him. We froze.

We ran toward them but both vanished. We searched but found nothing. Word spread. Aslam was found unconscious at the edge of the orchard. His right fingers were gone—eaten or melted. Blood mixed with mud. We took him home.

--- Chunk 

In [16]:
loader = TextLoader("docs/sample.txt")
docs = loader.load()

splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=500,
    separators=["\n\n", "\n", ".", " ", ""]
)
chunks = splitter.split_documents(docs)

# ————— 3. Embed & persist to Chroma —————
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory="chroma_store"
)

# ————— 4. Ask the user for a query —————
query = input("Ask a question based on the document: ")

# ————— 5. Retrieve a larger set of chunks —————
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
relevant_docs = retriever.invoke(query)

# ————— 6. Print retrieved chunks for debugging —————
print("\n🔍 Retrieved Chunks:\n")
for idx, doc in enumerate(relevant_docs, start=1):
    print(f"--- Chunk {idx} ---\n{doc.page_content}\n")

# ————— 7. Assemble context & build prompt —————
context = "\n\n".join([doc.page_content for doc in relevant_docs])
prompt = f"""You are a helpful assistant. ONLY use the following context to answer the question.
If the answer is not in the context, reply: 'Not found in the context.'

Context:
{context}

Question:
{query}
"""

# ————— 8. Call DeepSeek API and display answer —————
response = client.chat.completions.create(
    model=MODEL,
    messages=[{"role": "user", "content": prompt}],
    max_tokens=1024
)

print(f"\n🧠 Answer to: {query}")
print(response.choices[0].message.content)


🔍 Retrieved Chunks:

--- Chunk 1 ---
Grandpa’s friend was from a village in Nadia’s Bagula. India had been free for three years. Grandpa had gotten a job in Kolkata in a textile mill. One day, Grandpa and a few others decided to visit this friend’s ancestral home during Kali Puja. Grandpa, originally from across the border (now Bangladesh), hadn’t seen much of this side of Bengal. He was eager.

--- Chunk 2 ---
Grandpa’s friend was from a village in Nadia’s Bagula. India had been free for three years. Grandpa had gotten a job in Kolkata in a textile mill. One day, Grandpa and a few others decided to visit this friend’s ancestral home during Kali Puja. Grandpa, originally from across the border (now Bangladesh), hadn’t seen much of this side of Bengal. He was eager.

--- Chunk 3 ---
Grandpa’s friend was from a village in Nadia’s Bagula. India had been free for three years. Grandpa had gotten a job in Kolkata in a textile mill. One day, Grandpa and a few others decided to visit this fri

In [17]:
loader = TextLoader("docs/sample.txt")
docs = loader.load()

splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=500,
    separators=["\n\n", "\n", ".", " ", ""]
)
chunks = splitter.split_documents(docs)

# ————— 3. Embed & persist to Chroma —————
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory="chroma_store"
)

# ————— 4. Ask the user for a query —————
query = input("Ask a question based on the document: ")

# ————— 5. Retrieve a larger set of chunks —————
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
relevant_docs = retriever.invoke(query)

# ————— 6. Print retrieved chunks for debugging —————
print("\n🔍 Retrieved Chunks:\n")
for idx, doc in enumerate(relevant_docs, start=1):
    print(f"--- Chunk {idx} ---\n{doc.page_content}\n")

# ————— 7. Assemble context & build prompt —————
context = "\n\n".join([doc.page_content for doc in relevant_docs])
prompt = f"""You are a helpful assistant. ONLY use the following context to answer the question.
If the answer is not in the context, reply: 'Not found in the context.'

Context:
{context}

Question:
{query}
"""

# ————— 8. Call DeepSeek API and display answer —————
response = client.chat.completions.create(
    model=MODEL,
    messages=[{"role": "user", "content": prompt}],
    max_tokens=1024
)

print(f"\n🧠 Answer to: {query}")
print(response.choices[0].message.content)


🔍 Retrieved Chunks:

--- Chunk 1 ---
Word spread.

Aslam was found unconscious at the edge of the orchard.

His right fingers were gone—eaten or melted.

Blood mixed with mud.

We took him home.



He regained consciousness at dawn but never spoke again.

His tongue was stiff and black.



“Then?

What happened next?”

my cousin and I asked.



Grandpa said: “We took Aslam back to Kolkata.

He was from Dinajpur.

We arranged to send him home.

His parents were devastated.

We had no face to show.

--- Chunk 2 ---
Word spread.

Aslam was found unconscious at the edge of the orchard.

His right fingers were gone—eaten or melted.

Blood mixed with mud.

We took him home.



He regained consciousness at dawn but never spoke again.

His tongue was stiff and black.



“Then?

What happened next?”

my cousin and I asked.



Grandpa said: “We took Aslam back to Kolkata.

He was from Dinajpur.

We arranged to send him home.

His parents were devastated.

We had no face to show.

--- Chunk 3 --