# 1. Build a knowledge base
 1. Use the following Bangla book: HSC26 Bangla 1st Paper
 2. Apply proper pre-processing and data cleaning for better chunk accuracy
 3. Chunk and vectorize the document

### Import Required Tools

In [1]:
import pandas as pd
import os
import re
import json
import uuid
import shutil

In [2]:
import tiktoken
import faiss
from typing import Dict, List, Optional, Tuple, Union
from langchain_openai import ChatOpenAI
from base64 import b64decode
from nltk.tokenize import word_tokenize
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.storage import LocalFileStore
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.retrievers.parent_document_retriever import ParentDocumentRetriever
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMListwiseRerank
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.docstore.base import Docstore, AddableMixin
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.chains import LLMChain

In [None]:
from dotenv import load_dotenv
from multilingual_pdf2text.pdf2text import PDF2Text
from multilingual_pdf2text.models.document_model.document import Document

### Extract Text from PDF

In [None]:
from multilingual_pdf2text.pdf2text import PDF2Text
# NOTE(review): this `Document` has the same name as the `Document` imported
# from langchain.schema earlier in the notebook; whichever import ran last
# is the one the name refers to.
from multilingual_pdf2text.models.document_model.document import Document

# Describe the source PDF; 'ben' is passed as the document language.
pdf_document = Document(
        document_path='HSC26-Bangla1st-Paper.pdf',
        language='ben'
        )
pdf2text = PDF2Text(document=pdf_document)
# The printed result is a list with one dict per page, each carrying
# 'page_number' and 'text'.
content = pdf2text.extract()
print(content)

INFO:multilingual_pdf2text.doc2img.parse_document:Parsing document from pdf to image
INFO:multilingual_pdf2text.ocr.image_to_text:Extracting text from images via OCR


[{'page_number': 1, 'text': 'অপরিচিতা\n\nঅনলাইন ব্যাচ সম্পর্কিত যেকোনো জিজ্ঞাসায়,\n\nকলকরো ৬ 86919\n'}, {'page_number': 2, 'text': "[10150261\nনী '৯এ]খ৬ছ\n\nঅনলাইন ব্যাচ\n10401:\nভ্ শিখনফল\n\n৮ নিম্নবিত্ত ব্যক্তির হঠাৎ বিত্তশালী হয়ে ওঠার ফলে সমাজে পরিচয় সংকট সম্পর্কে ধারণা লাভ করবে।\n\n৮ তৎকালীন সমাজ-সভ্যতা ও মানবতার অবমাননা সম্পর্কে জানতে পারবে।\n\n৮ তৎকালীন সমাজের পণপ্রথার কুপ্রভাব সম্পর্কে জানতে পারবে।\n\n৮ তৎকালে সমাজে ভদ্রলোকের স্বভাববৈশিষ্ট্য সম্পর্কে জ্ঞানলাভ করবে।\n৮ নারী কোমল ঠিক, কিন্তু দুর্বল নয়- কল্যাণীর জীবনচরিত দ্বারা প্রতিষ্ঠিত এই সত্য অনুধাবন করতে\n\nপারবে।\n৮ মানুষ আশা নিয়ে বেঁচে থাকে- অনুপমের দৃষ্টান্তে মানবজীবনের এই চিরন্তন সত্যদর্শন সম্পর্কে\n\nজ্ঞানলাভ করবে।\n\nন্ট প্রাক-মূল্যায়ন\n\n১। অনুপমের বাবা কী করে জীবিকা নির্বাহ করতেন?\n\nক) ডাক্তারি খ) ওকালতি গ) মাস্টারি ঘ) ব্যবসা\n\n২। মামাকে ভাগ্য দেবতার প্রধান এজেন্ট বলার কারণ, তার-\n\nক) প্রতিপত্তি খ) প্রভাব গ) বিচক্ষণতা ঘ) কুট বুদ্ধি\n\nনিচের অনুচ্ছেদটি পড়ে ৩ ও ৪ সংখ্যক প্রশ্নের উত্তর দাও।\n\nপিতৃহীন দীপুর চাচা

In [None]:
# Write to a text file
# Dump the extracted pages into one UTF-8 text file, each page preceded by a
# "--- Page N ---" separator line.
output_path = 'HSC26-Bangla1st-Paper.txt'

with open(output_path, 'w', encoding='utf-8') as f:
    for page in content:
        # .get() is used so a page dict missing either key does not raise.
        page_number = page.get('page_number')
        text = page.get('text', '')
        f.write(f"--- Page {page_number} ---\n{text}\n\n")

print(f"Content written to {output_path}")


Content written to /Users/mst.sadiakhatun/Desktop/RAG_Multilingual/HSC26-Bangla1st-Paper.txt


### Pre-Processing and data Cleaning

In [35]:
# Substrings that mark question/answer/instruction material rather than story prose.
QUESTION_SECTION_KEYWORDS = [
    "বহুনির্বাচনী",
    "নির্বাচনযোগ্য প্রশ্ন",
    "উত্তর",
    "প্রশ্ন",
    "নির্দেশনা",
    "নির্দেশ",
    "নমুনা",
    "উদাহরণ",
]

# Matches MCQ-style text: a number, a "." or "।" separator, some text, and then
# one of the option markers ক) খ) গ) ঘ).
MCQ_PATTERN = re.compile(
    r'\d+\s*[\.।]\s*.+?(?:ক\)|খ\)|গ\)|ঘ\))'
)


In [36]:
def is_story_element(element):
    """Tell whether an extracted element looks like story prose.

    Args:
        element: Mapping whose ``'text'`` entry is inspected.

    Returns:
        ``False`` when the text is blank, contains a question-section
        keyword, looks like a numbered question or an option line, is
        dominated by characters outside the accepted set, or is a short
        label containing a colon or visarga; ``True`` otherwise.
    """
    text = element.get('text', '').strip()
    if not text:
        return False

    lowered_text = text.lower()

    # Any question-section keyword anywhere in the text disqualifies it.
    for keyword in QUESTION_SECTION_KEYWORDS:
        if keyword in lowered_text:
            return False

    looks_like_question = (
        # Starts with a number followed by "।" or "." and more text.
        re.match(r"^\d+\s*[।\.]\s*.+", text)
        # Contains a parenthesised option marker.
        or re.search(r"\(ক\)|\(খ\)|\(গ\)|\(ঘ\)", text)
        # Contains an answer-selection phrase.
        or "কোনটি সঠিক" in lowered_text
        or "সঠিক উত্তর" in lowered_text
    )
    if looks_like_question:
        return False

    # Share of characters that fall outside the accepted character class
    # (Bengali characters and digits, ASCII letters, whitespace).
    rejected = re.findall(r"[^অ-ঔক-হাি-ৌৎঁ্য়০-৯a-zA-Z\s]", text)
    if len(rejected) / len(text) > 0.6:
        return False

    # Short lines containing "ঃ" or ":" are treated as labels.
    if len(text) < 20 and re.search(r'[ঃ:]', text):
        return False

    return True


In [37]:
def is_paragraph_break(element, prev_element=None):
    """Return whether ``element`` should be treated as starting a new paragraph.

    A break is reported when the element's text is blank, when its metadata
    has a truthy ``'newline'`` entry, when its raw text begins with a space
    or tab, or when the previous element carried any text.

    Args:
        element: Mapping with an optional ``'text'`` string and an optional
            ``'metadata'`` mapping.
        prev_element: The element processed just before this one, or ``None``.

    Returns:
        A real ``bool`` (the earlier version could return a str, dict or None).
    """
    # `or ''` / `or {}` also cover keys that are present but set to None.
    raw_text = element.get('text') or ''
    metadata = element.get('metadata') or {}

    if not raw_text.strip():
        return True
    if metadata.get('newline', False):
        return True

    # Indentation has to be tested on the raw text: once stripped, the text
    # can never start with whitespace, which made this check dead code.
    if raw_text.startswith((' ', '\t')):
        return True

    previous_text = (prev_element.get('text') or '') if prev_element else ''
    return len(previous_text) > 0

In [38]:
def chunk_by_paragraph(extracted_content, max_characters=1000, min_characters=200, overlap_characters=100):
    """Group story elements into overlapping text chunks.

    Elements rejected by ``is_story_element`` or with blank text are skipped.
    The remaining texts are buffered and joined with single spaces. A chunk is
    emitted at a paragraph break once the buffered text has at least
    ``min_characters`` characters, and again whenever the running length
    reaches ``max_characters``. The tail of each emitted chunk seeds the next
    buffer as overlap.

    Args:
        extracted_content: Iterable of mappings with a ``'text'`` entry.
        max_characters: Running length that forces a chunk to be emitted.
        min_characters: Minimum joined length for a chunk to be emitted.
        overlap_characters: Number of trailing characters carried over.

    Returns:
        List of dicts with ``text``, ``chunk_id`` (``chunk_1``, ``chunk_2``,
        ...), ``source`` and ``section`` keys.
    """
    chunks = []
    buffer = []
    buffer_length = 0   # sum of piece lengths; joining spaces are not counted
    previous = None

    def emit(body):
        """Record ``body`` as the next chunk and return its overlap tail."""
        chunks.append({
            "text": body,
            "chunk_id": f"chunk_{len(chunks) + 1}",
            "source": "story_pdf",
            "section": "story"
        })
        return body[-overlap_characters:] if len(body) > overlap_characters else body

    for element in extracted_content:
        if not is_story_element(element):
            continue

        piece = element.get('text', '').strip()
        if not piece:
            continue

        restarted = False
        if is_paragraph_break(element, previous) and buffer:
            joined = " ".join(buffer)
            if len(joined) >= min_characters:
                # Close the paragraph and start a new buffer from its tail.
                tail = emit(joined)
                buffer = [tail, piece]
                buffer_length = len(tail) + len(piece)
                restarted = True
        if not restarted:
            buffer.append(piece)
            buffer_length += len(piece)

        if buffer_length >= max_characters:
            tail = emit(" ".join(buffer))
            buffer = [tail]
            buffer_length = len(tail)

        previous = element

    # Flush whatever is left, provided it is long enough.
    leftover = " ".join(buffer)
    if buffer and len(leftover) >= min_characters:
        emit(leftover)

    return chunks


### Save Chunk

In [39]:
def save_chunks_to_files(chunks, output_dir):
    """Write each chunk's text to ``<output_dir>/<chunk_id>.txt`` as UTF-8.

    The directory is created when missing, and a confirmation line is
    printed for every file written.

    Args:
        chunks: Iterable of dicts with ``'chunk_id'`` and ``'text'`` entries.
        output_dir: Destination directory.
    """
    os.makedirs(output_dir, exist_ok=True)
    for entry in chunks:
        destination = os.path.join(output_dir, f"{entry['chunk_id']}.txt")
        with open(destination, 'w', encoding='utf-8') as handle:
            handle.write(entry['text'])
        print(f"✅ Saved: {destination}")

### Chunking

In [None]:

# The PDF document model is imported here under an alias. The bare name
# `Document` is also bound to LangChain's Document elsewhere in this notebook,
# so relying on it made this cell depend on which import had run last.
from multilingual_pdf2text.models.document_model.document import Document as PdfDocument
from multilingual_pdf2text.pdf2text import PDF2Text

pdf_path = "HSC26-Bangla1st-Paper.pdf"
output_path = "chunks_24_output"

print(f"\n📄 Processing: {pdf_path}")


# Run extraction on the PDF, with 'ben' passed as the document language.
pdf_document = PdfDocument(document_path=pdf_path, language='ben')
pdf2text = PDF2Text(document=pdf_document)
extracted_content = pdf2text.extract()

# Chunk
chunks = chunk_by_paragraph(
    extracted_content,
    max_characters=500,
    min_characters=100,
    overlap_characters=50
)

# Save
save_chunks_to_files(chunks, output_path)

print(f"\n✅ Total story chunks extracted: {len(chunks)}")

# Preview
for chunk in chunks[:3]:  # show first 3 for check
    print(f"\n🔹 Chunk ID: {chunk['chunk_id']}")
    print(f"📝 Preview: {chunk['text'][:100]}...")

INFO:multilingual_pdf2text.doc2img.parse_document:Parsing document from pdf to image



📄 Processing: HSC26-Bangla1st-Paper.pdf


INFO:multilingual_pdf2text.ocr.image_to_text:Extracting text from images via OCR


✅ Saved: /Users/mst.sadiakhatun/Desktop/RAG_Multilingual/chunks_24_output/chunk_1.txt
✅ Saved: /Users/mst.sadiakhatun/Desktop/RAG_Multilingual/chunks_24_output/chunk_2.txt
✅ Saved: /Users/mst.sadiakhatun/Desktop/RAG_Multilingual/chunks_24_output/chunk_3.txt
✅ Saved: /Users/mst.sadiakhatun/Desktop/RAG_Multilingual/chunks_24_output/chunk_4.txt
✅ Saved: /Users/mst.sadiakhatun/Desktop/RAG_Multilingual/chunks_24_output/chunk_5.txt
✅ Saved: /Users/mst.sadiakhatun/Desktop/RAG_Multilingual/chunks_24_output/chunk_6.txt
✅ Saved: /Users/mst.sadiakhatun/Desktop/RAG_Multilingual/chunks_24_output/chunk_7.txt
✅ Saved: /Users/mst.sadiakhatun/Desktop/RAG_Multilingual/chunks_24_output/chunk_8.txt
✅ Saved: /Users/mst.sadiakhatun/Desktop/RAG_Multilingual/chunks_24_output/chunk_9.txt
✅ Saved: /Users/mst.sadiakhatun/Desktop/RAG_Multilingual/chunks_24_output/chunk_10.txt
✅ Saved: /Users/mst.sadiakhatun/Desktop/RAG_Multilingual/chunks_24_output/chunk_11.txt
✅ Saved: /Users/mst.sadiakhatun/Desktop/RAG_Multilin

### Clean after chunking

In [41]:
def clean_text(text):
    """Normalise a chunk so that mostly Bengali text and basic punctuation remain.

    Three substitutions run in order, each replacing its match with one space:

    1. runs of five or more digits;
    2. runs of characters other than a space, the U+0980-U+09FF block,
       the danda "।", comma, full stop and hyphen;
    3. runs of whitespace.

    Returns:
        The result with leading and trailing whitespace removed.
    """
    substitutions = (
        (r'\d{5,}', ' '),
        (r'[^ \u0980-\u09FF\u09E6-\u09EF।,.\-]+', ' '),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()


In [43]:
def clean_chunks_to_new_folder(original_folder, clean_folder):
    """Run ``clean_text`` over every ``.txt`` file of a folder.

    Each cleaned file is written under the same name into ``clean_folder``
    (created when missing); entries not ending in ``.txt`` are ignored. One
    progress line is printed per file.

    Args:
        original_folder: Directory containing the raw chunk files.
        clean_folder: Directory receiving the cleaned copies.
    """
    os.makedirs(clean_folder, exist_ok=True)

    for entry_name in os.listdir(original_folder):
        if not entry_name.endswith('.txt'):
            continue

        source_path = os.path.join(original_folder, entry_name)
        with open(source_path, 'r', encoding='utf-8') as reader:
            raw = reader.read()

        cleaned = clean_text(raw)

        target_path = os.path.join(clean_folder, entry_name)
        with open(target_path, 'w', encoding='utf-8') as writer:
            writer.write(cleaned)

        print(f"🧹 Cleaned and saved: {target_path}")


In [None]:
# Source folder with the raw chunk files and destination for cleaned copies.
original_chunks_dir = "chunks_24_output"
clean_chunks_dir = "clean_chunks_24_output"

# Clean every .txt chunk and write it under the same name in the new folder.
clean_chunks_to_new_folder(original_chunks_dir, clean_chunks_dir)



🧹 Cleaned and saved: /Users/mst.sadiakhatun/Desktop/RAG_Multilingual/clean_chunks_24_output/chunk_18.txt
🧹 Cleaned and saved: /Users/mst.sadiakhatun/Desktop/RAG_Multilingual/clean_chunks_24_output/chunk_2.txt
🧹 Cleaned and saved: /Users/mst.sadiakhatun/Desktop/RAG_Multilingual/clean_chunks_24_output/chunk_3.txt
🧹 Cleaned and saved: /Users/mst.sadiakhatun/Desktop/RAG_Multilingual/clean_chunks_24_output/chunk_1.txt
🧹 Cleaned and saved: /Users/mst.sadiakhatun/Desktop/RAG_Multilingual/clean_chunks_24_output/chunk_4.txt
🧹 Cleaned and saved: /Users/mst.sadiakhatun/Desktop/RAG_Multilingual/clean_chunks_24_output/chunk_5.txt
🧹 Cleaned and saved: /Users/mst.sadiakhatun/Desktop/RAG_Multilingual/clean_chunks_24_output/chunk_7.txt
🧹 Cleaned and saved: /Users/mst.sadiakhatun/Desktop/RAG_Multilingual/clean_chunks_24_output/chunk_6.txt
🧹 Cleaned and saved: /Users/mst.sadiakhatun/Desktop/RAG_Multilingual/clean_chunks_24_output/chunk_8.txt
🧹 Cleaned and saved: /Users/mst.sadiakhatun/Desktop/RAG_Multili

### Load the PDF document corpus into the vector database

#### Setup Faiss DB

In [45]:
# Root folder for what this notebook persists for the vector store.
FAISS_DIR = "./project_rag_24_jul/"
# Path of the FAISS index inside FAISS_DIR.
FAISS_INDEX_PATH = os.path.join(FAISS_DIR, "faiss_index")
# Folder backing the file-based document store.
DOCSTORE_DIR = os.path.join(FAISS_DIR, "docstore")
# Metadata key under which each document's ID is stored.
id_key = "doc_id"

#### Read stored chunk 

In [46]:
def read_text_files(directory_path: str, prefix: str = "", suffix: str = ".txt") -> List[str]:
    """Read the matching files of a directory as UTF-8 text.

    Args:
        directory_path: Directory to scan (not recursive).
        prefix: Required start of the file name.
        suffix: Required end of the file name.

    Returns:
        File contents ordered by plain string sort of the file names, so
        ``chunk_10.txt`` comes before ``chunk_2.txt``.
    """
    def _slurp(file_name):
        with open(os.path.join(directory_path, file_name), encoding="utf-8") as handle:
            return handle.read()

    matching = (
        file_name
        for file_name in sorted(os.listdir(directory_path))
        if file_name.startswith(prefix) and file_name.endswith(suffix)
    )
    return [_slurp(file_name) for file_name in matching]


#### Load Stored Chunk

In [None]:
# Load the cleaned chunk files (names starting with "chunk_") as plain strings.
text_raw = read_text_files("clean_chunks_24_output", "chunk_")

In [48]:
class DocumentLocalFileStore(LocalFileStore):
    """File store that saves Documents as UTF-8 encoded JSON.

    Each value is written as a JSON object with ``page_content`` and
    ``metadata`` keys and turned back into a ``Document`` on read.
    """

    def mset(self, key_value_pairs):
        """Serialise each ``(key, Document)`` pair and hand the bytes to the parent store."""
        encoded_pairs = [
            (
                key,
                json.dumps(
                    {"page_content": doc.page_content, "metadata": doc.metadata}
                ).encode("utf-8"),
            )
            for key, doc in key_value_pairs
        ]
        super().mset(encoded_pairs)

    def mget(self, keys):
        """Return one ``Document`` per key, or ``None`` where the parent store returned nothing."""
        documents = []
        for raw in super().mget(keys):
            if raw:
                documents.append(Document(**json.loads(raw.decode("utf-8"))))
            else:
                documents.append(None)
        return documents

In [49]:
class PersistentDocstore(Docstore, AddableMixin):
    """Docstore that keeps its Documents in a ``DocumentLocalFileStore``.

    All reads and writes are delegated to the wrapped file store.
    """

    def __init__(self, file_store: DocumentLocalFileStore):
        """Wrap ``file_store``; nothing is read or written at construction time."""
        self.file_store = file_store

    def add(self, texts: Dict[str, Document]) -> None:
        """Add multiple Documents by their IDs."""
        self.mset(list(texts.items()))

    def delete(self, ids: List[str]) -> None:
        """Delete Documents by their IDs."""
        # The key-value store interface deletes in bulk through `mdelete`;
        # it has no single-key `delete`, so the previous per-ID call to
        # `file_store.delete` failed with AttributeError.
        self.file_store.mdelete(list(ids))

    def search(self, doc_id: str) -> Union[str, Document]:
        """Search a single Document by ID. Returns a message string if not found."""
        result = self.mget([doc_id])[0]
        return result if result else f"Document with ID '{doc_id}' not found."

    def mget(self, keys: List[str]) -> List[Optional[Document]]:
        """Retrieve multiple Documents by ID (``None`` for missing entries)."""
        return self.file_store.mget(keys)

    def mset(self, key_value_pairs: List[Tuple[str, Document]]) -> None:
        """Store multiple Documents by ID."""
        self.file_store.mset(key_value_pairs)


#### Initialize FAISS Vector Store and Embedding model

In [50]:

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
# Embed a throwaway string only to learn the embedding dimensionality.
sample_embedding = embeddings.embed_query("খাটি")
dim = len(sample_embedding)
# New FAISS index sized to that dimension; nothing is loaded from disk here.
index = faiss.IndexFlatL2(dim)
# Documents are persisted as files under DOCSTORE_DIR.
docstore = PersistentDocstore(DocumentLocalFileStore(DOCSTORE_DIR))

# Initialize FAISS Vector Store
vectorstore = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=docstore,
    index_to_docstore_id={}
)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


#### Initialize Retriever

In [52]:
# Rebinds `docstore` to the plain file-backed store. The PersistentDocstore
# created earlier stays attached to `vectorstore`.
docstore = DocumentLocalFileStore(DOCSTORE_DIR)

# NOTE(review): text_splitter is not referenced by any other cell visible in
# this notebook — confirm whether it is still needed.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=10
)

# Retriever over `vectorstore` and `docstore`, linked through `id_key`.
multi_vector_retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=docstore,
    id_key=id_key,
)


In [53]:
# Chat model shared by the reranker and the title/Q&A chain.
llm_model = ChatOpenAI(model="gpt-4o-mini", temperature=0)
# LLM-based listwise reranker configured with top_n=1.
reranker = LLMListwiseRerank.from_llm(llm_model, top_n=1)

# NOTE(review): `retriever` is not used by the chain assembled later in the
# visible notebook, which calls combine_retrievers instead — confirm intent.
retriever = ContextualCompressionRetriever(
    base_compressor=reranker,
    base_retriever=multi_vector_retriever
)


#### Generate title and metadata 

In [71]:
def safe_extract_json(text: str) -> Optional[str]:
    """Return the span from the first ``{`` to the last ``}`` of ``text``.

    The span is not parsed or validated. ``None`` is returned when either
    brace is missing.
    """
    opening = text.find('{')
    if opening == -1:
        return None
    closing = text.rfind('}')
    if closing == -1:
        return None
    return text[opening:closing + 1]



##### Title Question Generation Prompt

In [55]:
# Prompt asking the model for a JSON object with a short English title and up
# to four question-answer pairs for one chunk. Doubled braces are literal
# braces in the template; {content} is the placeholder filled at invoke time.
title_qa_prompt = ChatPromptTemplate.from_template("""
You are an AI assistant in a multilingual Retrieval-Augmented Generation (RAG) system.

Your task is to extract concise metadata from the given content, which may be in English, Bengali, or a mixture of both.

Generate a valid JSON object with these fields:
- "title": A very short English title summarizing the content in a few words.
- "questionanswer": A list of up to 4 short question-answer pairs directly based on the content.

Instructions:
- Focus on extracting short, fact-based questions and answers that a user might ask exactly.
- Each question and answer can be in Bengali or English, matching the content.
- Answers should be brief and precise, ideally a few words or a short sentence.
- Questions and answers must be strictly grounded in the given passage and phrased concisely.
- Do NOT infer, summarize broadly, or hallucinate any information beyond the content.
- If fewer than 4 pairs can be confidently extracted, return only the justified number.
- Output ONLY strictly valid JSON — no markdown, no extra commentary.

Example Output Format:
{{
  "title": "Anupam's Early Life and Family",
  "questionanswer": [
    {{
      "question": "আজ অনুপমের বয়স কত?",
      "answer": "সাতাশ বছর"
    }},
    {{
      "question": "অনুপমের আসল অভিভাবক কে?",
      "answer": "তার মামা"
    }},
    {{
      "question": "অনুপমের মামা বিবাহ সম্পর্কে কী চায়?",
      "answer": "মাথা হেট করা কনে, ধনী নয় কিন্তু কিছু পণ দেবে"
    }},
    {{
      "question": "অনুপম কেন নিজেকে ভালোমানুষ বলে?",
      "answer": "কারণ তামাক খায় না এবং মাতার আদেশ মানে"
    }}
  ]
}}

Content:
{content}
""")


In [56]:
# Pipeline: fill the prompt, call the chat model, return the reply as a plain string.
title_qa_chain = title_qa_prompt | llm_model | StrOutputParser()

In [72]:
def generate_title_and_qa(content: str) -> Tuple[str, List[Dict[str, str]]]:
    """Ask the title/Q&A chain for metadata describing ``content``.

    The model reply is expected to be, or to contain, a JSON object with
    ``title`` and ``questionanswer`` entries.

    Args:
        content: Chunk text to describe.

    Returns:
        ``(title, qa_pairs)``. On any failure (chain error, missing or
        malformed JSON, reply that is not a JSON object) the error is
        printed and ``("Untitled", [])`` is returned. A missing or empty
        title becomes ``"Untitled"``; a ``questionanswer`` value that is not
        a list becomes ``[]``.
    """
    # Bound before the try block so the error report can tell "no reply yet"
    # from an actual reply without inspecting locals().
    raw_response = None
    try:
        raw_response = title_qa_chain.invoke({"content": content}).strip()
        json_text = raw_response if raw_response.startswith('{') else safe_extract_json(raw_response)
        if not json_text:
            raise ValueError("No valid JSON found in LLM output.")

        parsed = json.loads(json_text)
        if not isinstance(parsed, dict):
            raise ValueError("LLM output is not a JSON object.")

        title = parsed.get("title") or "Untitled"
        qas = parsed.get("questionanswer")
        if not isinstance(qas, list):
            # Covers a missing key as well as JSON null or a scalar value.
            qas = []
        return title, qas

    except Exception as error:
        # Deliberately broad: one bad chunk must not abort ingestion.
        # Replace print with proper logging if desired
        print(f"Error generating title and QA: {error}")
        print(f"Raw response: {raw_response if raw_response is not None else 'No response'}")
        print(f"Content snippet: {content[:300]}")
        return "Untitled", []


#### Prepare Raw Doc

In [73]:
from langchain.schema import Document

In [74]:

def create_doc_with_metadata(text: str, doc_id: str) -> Document:
    """Wrap ``text`` in a ``Document`` that carries generated metadata.

    The metadata holds the document ID under ``id_key``, a fixed
    ``"source": "text"`` marker, and the title and question-answer pairs
    returned by ``generate_title_and_qa``.
    """
    generated_title, generated_qas = generate_title_and_qa(text)
    return Document(
        page_content=text,
        metadata={
            id_key: doc_id,
            "source": "text",
            "title": generated_title,
            "questionanswer": generated_qas,
        },
    )

In [75]:
# One fresh UUID per loaded chunk.
raw_ids = [str(uuid.uuid4()) for _ in range(len(text_raw))]

# Build a Document per chunk; each call runs the title/Q&A generation chain.
all_raw_docs = [create_doc_with_metadata(text, doc_id) for text, doc_id in zip(text_raw, raw_ids)]

# Add the documents to the vector store ...
multi_vector_retriever.vectorstore.add_documents(all_raw_docs)

# ... and store the same documents in the docstore, keyed by their doc_id.
multi_vector_retriever.docstore.mset(
    [(doc.metadata[id_key], doc) for doc in all_raw_docs]
)


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"

In [76]:
# Persist the index at the shared FAISS_INDEX_PATH constant instead of
# re-deriving the path by string concatenation, which only worked because
# FAISS_DIR ends with a slash.
vectorstore.save_local(FAISS_INDEX_PATH)

In [77]:
# Quick check: query the multi-vector retriever directly and inspect the result.
multi_vector_retriever.invoke(" অনুপমের ভাষায় সুপুরুষ কাকে বলা হয়েছে?")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[Document(metadata={'doc_id': '4009b27b-8c58-4d22-a5da-785f352e5dad', 'source': 'text', 'title': "Anupam's Self-Reflection", 'questionanswer': [{'question': 'অনুপমের জীবন কেমন?', 'answer': 'নিতান্তই তুচ্ছ'}, {'question': 'অনুপমের ব্যবহৃত উপমা কী?', 'answer': 'অন্নপূর্ণা অন্নে পরিপূর্ণা'}, {'question': 'দেবী দুর্গার দুই পুত্র কে?', 'answer': 'গণেশ ও কার্তিকেয়'}, {'question': 'ভারতের কোন অঞ্চলের নদী উল্লেখ করা হয়েছে?', 'answer': 'গয়া অঞ্চল'}]}, page_content='অপরিচিতা অনলাইন ব্যাচ সম্পর্কিত যেকোনো জিজ্ঞাসায়, কলকরো ৬ মূল শব্দ এ জীবনটা না দৈর্ঘ্যের হিসাবে বড়ো, না গুণের হিসাবে শব্দার্থ ও টীকা শব্দের অর্থ ও ব্যাখ্যা গল্পের কথক চরিত্র অনুপমের আত্মসমালোচনা। পরিমাণ ও গুণ উভয় দিক দিয়েই যে তার জীবনটি নিতান্তই তুচ্ছ সে কথাই এখানে ব্যক্ত হয়েছে। গুটি এক সময় পূর্ণ ফলে পরিণত হয়। কিন্তু গুটিই যদি ফলের মতো হয় তাহলে তার অসম্পূর্ণ সারবস্তা প্রকট হয়ে ফলের মতো গুটি ওঠে। নিজের নিম্ষল জীবনকে বোঝাতে অনুপমের ব্যবহৃত উপমা। অন্নপূর্ণা অন্নে পরিপূর্ণা। দেবী দুর্গা। দেবী দুর্গার দুই পুত্র অগ্রজ গণেশ ও অন

## Chat

In [78]:
import json
from base64 import b64decode

from langchain_core.documents import Document
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

from langchain_openai import ChatOpenAI

# Optional: for language detection (pip install langdetect)
from langdetect import detect


In [None]:
def optimize_query(query: str) -> dict:
    """Ask the chat model to rephrase ``query`` for vector retrieval.

    Args:
        query: The user's question, in English or Bengali.

    Returns:
        ``{"vector_query": <text>}``. The text is the model's rephrasing when
        the reply contains a JSON object with a non-empty string
        ``vector_query``; otherwise it is the original ``query``.
    """
    # The format example is valid JSON (no trailing comma) and the prompt only
    # asks for what the output format can carry.
    system_prompt = """
You are an expert in multilingual query optimization for a RAG (Retrieval-Augmented Generation) pipeline that supports both English and Bengali (Bangla).

Your job is to reformulate the original question clearly and naturally in its own language (English or Bengali) for use with a vector-based retriever. Do not translate the question.

Output JSON must follow this format:
{
  "vector_query": "rephrased full question (same language)"
}
Only return valid JSON.
"""

    prompt = ChatPromptTemplate.from_messages([
        SystemMessage(content=system_prompt.strip()),
        HumanMessage(content=f"Original Query: {query}")
    ])

    # temperature=0 matches the other models in this notebook and reduces
    # run-to-run variation in the rewritten query.
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    chain = prompt | llm | StrOutputParser()

    response = chain.invoke({"query": query})
    fallback = {"vector_query": query}

    # Tolerate replies that wrap the JSON object in extra text or code fences.
    start, end = response.find('{'), response.rfind('}')
    candidate = response[start:end + 1] if start != -1 and end > start else response

    try:
        result = json.loads(candidate)
    except json.JSONDecodeError:
        print("[Warning] LLM did not return valid JSON. Falling back to original query.")
        return fallback

    # Valid JSON can still be a list, a scalar, or hold a null/empty value.
    rewritten = result.get("vector_query") if isinstance(result, dict) else None
    if not isinstance(rewritten, str) or not rewritten.strip():
        return fallback
    return {"vector_query": rewritten.strip()}


In [112]:
# Splitter settings: 2000-character parents with 200 overlap, 100-character
# children with 10 overlap.
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)
# Uses the same vector store and docstore objects as multi_vector_retriever.
# NOTE(review): no cell visible here adds documents through pd_retriever, so
# the splitters may never run — confirm.
pd_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=docstore,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
    id_key=id_key,
)


In [113]:
def combine_retrievers(query):
    """Retrieve with both retrievers and merge the results without duplicates.

    The query is first rewritten by ``optimize_query``. Multi-vector results
    come first, followed by parent-document results. Documents whose
    ``page_content`` was already seen are dropped, keeping the first one.
    """
    vector_query = optimize_query(query)["vector_query"]

    candidates = multi_vector_retriever.invoke(vector_query) + pd_retriever.invoke(vector_query)

    # A dict keeps insertion order, so the first document per content wins.
    first_by_content = {}
    for candidate in candidates:
        first_by_content.setdefault(candidate.page_content, candidate)

    return list(first_by_content.values())


In [84]:
def parse_docs(docs):
    """Split retrieved documents into base64-encoded images and plain text.

    Args:
        docs: Iterable of retrieved items; anything that is not a
            ``Document`` is reported and skipped.

    Returns:
        ``{"images": [...], "texts": [...]}``. ``images`` holds the
        ``page_content`` strings that are valid base64; ``texts`` holds the
        remaining ``Document`` objects.
    """
    b64 = []
    text = []
    for doc in docs:
        if not isinstance(doc, Document):
            print(f"Skipping non-document: {type(doc)}")
            continue
        try:
            # validate=True makes the decoder reject characters outside the
            # base64 alphabet. Without it those characters are silently
            # dropped, so ordinary ASCII prose could decode and be misfiled
            # as an image.
            b64decode(doc.page_content, validate=True)
        except (ValueError, TypeError):
            # binascii.Error is a ValueError subclass; non-ASCII text also
            # raises ValueError, and non-string content raises TypeError.
            text.append(doc)
        else:
            b64.append(doc.page_content)
    return {"images": b64, "texts": text}


In [85]:
def detect_lang(text):
    """Return the language code detected for ``text``, or ``"en"`` on failure."""
    try:
        return detect(text)
    except Exception:
        # Best-effort fallback. A bare `except:` would also have swallowed
        # KeyboardInterrupt and SystemExit.
        return "en"

In [None]:

def build_prompt(kwargs):
    """Assemble the answer-generation prompt from retrieved context and the question.

    Args:
        kwargs: Mapping with ``"context"`` (a dict whose ``"texts"`` entry
            lists the retrieved documents) and ``"question"`` (the user's
            question).

    Returns:
        A ``ChatPromptTemplate`` holding a single human message with the
        instructions, the source-tagged context and the question.
    """
    docs_by_type = kwargs["context"]
    user_question = kwargs["question"]
    lang = detect_lang(user_question)

    # Each document is rendered as a "[Source: ...]" header followed by its text.
    context_text = "".join(
        f"[Source: {doc.metadata.get('source', 'unknown')}]\n{doc.page_content}\n\n"
        for doc in docs_by_type["texts"]
    )

    prompt_template = f"""
You are an intelligent assistant answering questions in Bengali or English based only on the provided context.

Instructions:
- Give short and direct answers .
- Do NOT explain unless absolutely necessary.
- Focus on accuracy — especially for names, numbers, and definitions.
- Use only the context below — no assumptions, no hallucination.
- Answer in the same language as the question ("{lang}").

Context:
{context_text}

Question:
{user_question}

Answer:
"""

    human_message = HumanMessage(content=[{"type": "text", "text": prompt_template}])
    return ChatPromptTemplate.from_messages([human_message])


In [None]:
# Model that writes the final answer.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.0)

# End-to-end chain: the incoming question is used both to retrieve and parse
# context and, unchanged, as the question. The two are turned into a prompt,
# sent to the model, and the reply is returned as a string.
chain = (
    {
        "context": RunnableLambda(combine_retrievers) | RunnableLambda(parse_docs),
        "question": RunnablePassthrough(),
    }
    | RunnableLambda(build_prompt)
    | llm
    | StrOutputParser()
    
)


In [None]:
# def hyde_generate_hypothetical_answer(llm, query):
#     prompt = f"Write a detailed answer to the question: '{query}'"
#     return llm.invoke(prompt)


In [None]:
# def combine_retrievers(query, llm):
#     # Step 1: Generate hypothetical document using HyDE
#     hypothetical_answer = hyde_generate_hypothetical_answer(llm, query)
    
#     # Step 2: Optimize query (optional)
#     optimized_queries = optimize_query(query)
#     vector_query = optimized_queries.get("vector_query", query)
    
#     # Step 3: Use the hypothetical answer as the vector query
#     mv_docs = multi_vector_retriever.invoke(hypothetical_answer)
#     pd_docs = pd_retriever.invoke(vector_query)  # You can also try hypothetical_answer here
    
#     # Step 4: Combine results without duplication
#     seen = set()
#     combined_docs = []
#     for doc in mv_docs + pd_docs:
#         if doc.page_content not in seen:
#             seen.add(doc.page_content)
#             combined_docs.append(doc)

#     return combined_docs


In [126]:
# Single-question check of the full chain.
query = "অনুপমের বাবা কী করে জীবিকা নির্বাহ করতেন?"
result = chain.invoke(query)
print("Answer:", result)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Answer: ওকালতি করে।


In [98]:
# Single-question check of the full chain.
query = "কাকে অনুপমের ভাগ্য দেবতা বলা হয়েছে?"
result = chain.invoke(query)
print("Answer:", result)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Answer: মামা


In [127]:
# Single-question check of the full chain.
query = "বিয়ের সময় কল্যাণীর প্রকৃত বয়স কত ছিল?"
result = chain.invoke(query)
print("Answer:", result)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Answer: প্রদানকৃত তথ্য অনুযায়ী, কল্যাণীর প্রকৃত বয়স উল্লেখ করা হয়নি।


In [128]:
# Re-run of the same question as the previous cell.
query = "বিয়ের সময় কল্যাণীর প্রকৃত বয়স কত ছিল?"
result = chain.invoke(query)
print("Answer:", result)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Answer: ১৬ বা ১৭ বছর


In [129]:
# Single-question check of the full chain.
query = "অনুপমের মামা বিবাহ সম্পর্কে কী চায়?"
result = chain.invoke(query)
print("Answer:", result)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Answer: মেয়ের গহনা খুলে আনতে চায়।


In [130]:
# NOTE(review): the vowel signs in this query look misordered compared with the
# spelling used in the other cells — confirm whether this is an intentional
# noisy-input test or a copy/paste artefact.
query = "অনুপেমর ভাষায় সুপুরুষ কােক বলা হেয়েছ?"
result = chain.invoke(query)
print("Answer:", result)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Answer: কল্যাণী


In [117]:
# Single-question check of the full chain.
query = "মামা কেন অবাক হলেন?"
result = chain.invoke(query)
print("Answer:", result)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Answer: গাড়ি না দেওয়ায়।


In [118]:
# Single-question check of the full chain.
query = "বক্তার বয়স কত?"
result = chain.invoke(query)
print("Answer:", result)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Answer: বক্তার বয়স উল্লেখিত নয়।


In [119]:
# Single-question check of the full chain.
query = "রবীন্দ্রনাথ ঠাকুর কত বছর বয়সে ব্যারিস্টারি পড়তে ইংল্যান্ড গিয়েছিলেন?"
result = chain.invoke(query)
print("Answer:", result)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Answer: ১৭ বছর


In [120]:
# Single-question check of the full chain.
query = "অপরিচিতা গল্পের নারীর নাম কী?"
result = chain.invoke(query)
print("Answer:", result)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Answer: কল্যাণী


In [121]:
# Bengali questions about the "অপরিচিতা" story (the list holds 25 entries)
questions = [
    "‘অপরিচিতা’ গল্পে কোন দ্বীপের উল্লেখ আছে?",
    "কে কন্যাকে আশীর্বাদ করতে গেল?",
    "বিনোদিনার সাথে অনুপ্রেমের সম্পর্ক কী?",
    "নরম নয় রে, খাঁটি সোনার বাটি। উক্তিটি কার?",
    "বিনোদিনার 'চক্ষুশূল' এর শ্বশুরকে কি বলে?",
    "কল্যাণীর পিতার নাম কী?",
    "শশাঙ্ক বাবুর বয়স কত?",
    "'তাহার বিনোদিনা অপ্রিয় নয়’- কার?",
    "কাঞ্চীপাথর নিয়ে কে বলেছিল?",
    "‘এ্যারিয়ং’ কোথা থেকে আনা হয়েছে?",
    "অনুপম কাকে নিয়ে থিয়েটার শুরু করে?",
    "মা-ছেলের থিয়েটার বাহন কী ছিল?",
    "‘অনুপমের কোল গলানোর ছোট ভাইটি’ এখানে ‘ছোট - ভাইটি’ বলতে কাকে বোঝানো হয়েছে?",
    "‘এখানে জায়গা আছে’ উক্তিটি কার?",
    "স্টেশনে অনুপম কী ফেলে গেল?",
    "ট্রেন দেখা হওয়ার সময় কল্যাণীর বয়স কত ছিল?",
    "অপরিচিতা মেয়েটির সাথে কতোজন মেয়ে ছিল?",
    "ট্রেনের স্টেশনে হতে কী খাওয়া কিনে নেয়?",
    "শশাঙ্ক পেশায় কী ছিলেন?",
    "বিয়ের সময় অনুপমের বয়স কত ছিল?",
    "গলাপানির মায়ের নাম কী?",
    "‘পিপিরির চলে আয়, এই গাড়িতে জায়গা আছে’ উক্তিটি কার?",
    "হরিশ কী উপলক্ষে কল্যাণার এসেছে?",
    "কাকে অনুপমের ভাষা দেবতা বলে উল্লেখ করা হয়েছে?",
    "টাকার প্রতি আসক্তি কার?",
   
]

# Run each question through the RAG chain and print it next to its answer,
# numbered from 1.
for i, query in enumerate(questions, 1):
    result = chain.invoke(query)
    print(f"{i}. প্রশ্ন: {query}")
    print(f"   উত্তর: {result}\n")


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


1. প্রশ্ন: ‘অপরিচিতা’ গল্পে কোন দ্বীপের উল্লেখ আছে?
   উত্তর: আন্ডামান দ্বীপ



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


2. প্রশ্ন: কে কন্যাকে আশীর্বাদ করতে গেল?
   উত্তর: শস্তুনাথ



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


3. প্রশ্ন: বিনোদিনার সাথে অনুপ্রেমের সম্পর্ক কী?
   উত্তর: অনুপ্রেমের বিরহ।



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


4. প্রশ্ন: নরম নয় রে, খাঁটি সোনার বাটি। উক্তিটি কার?
   উত্তর: বিনুদাদার।



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


5. প্রশ্ন: বিনোদিনার 'চক্ষুশূল' এর শ্বশুরকে কি বলে?
   উত্তর: শস্তুনাথবাবু



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


6. প্রশ্ন: কল্যাণীর পিতার নাম কী?
   উত্তর: শস্তুনাথ সেন।



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


7. প্রশ্ন: শশাঙ্ক বাবুর বয়স কত?
   উত্তর: সাতাশ বছর।



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


8. প্রশ্ন: 'তাহার বিনোদিনা অপ্রিয় নয়’- কার?
   উত্তর: শস্তুনাথবাবুর



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


9. প্রশ্ন: কাঞ্চীপাথর নিয়ে কে বলেছিল?
   উত্তর: কাঞ্চীপাথর নিয়ে কিছু বলা হয়নি।



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


10. প্রশ্ন: ‘এ্যারিয়ং’ কোথা থেকে আনা হয়েছে?
   উত্তর: প্রশ্নে উল্লেখ নেই।



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


11. প্রশ্ন: অনুপম কাকে নিয়ে থিয়েটার শুরু করে?
   উত্তর: কল্যাণী।



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


12. প্রশ্ন: মা-ছেলের থিয়েটার বাহন কী ছিল?
   উত্তর: গাড়ি



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


13. প্রশ্ন: ‘অনুপমের কোল গলানোর ছোট ভাইটি’ এখানে ‘ছোট - ভাইটি’ বলতে কাকে বোঝানো হয়েছে?
   উত্তর: অনুপমের ছোট ভাই।



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


14. প্রশ্ন: ‘এখানে জায়গা আছে’ উক্তিটি কার?
   উত্তর: মেয়েটির



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


15. প্রশ্ন: স্টেশনে অনুপম কী ফেলে গেল?
   উত্তর: টিকিট



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


16. প্রশ্ন: ট্রেন দেখা হওয়ার সময় কল্যাণীর বয়স কত ছিল?
   উত্তর: বয়সের উল্লেখ নেই।



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


17. প্রশ্ন: অপরিচিতা মেয়েটির সাথে কতোজন মেয়ে ছিল?
   উত্তর: দুটি-তিনটি ছোটো মেয়ে।



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"




INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


18. প্রশ্ন: ট্রেনের স্টেশনে হতে কী খাওয়া কিনে নেয়?
   উত্তর: চানা-মুঠ



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


19. প্রশ্ন: শশাঙ্ক পেশায় কী ছিলেন?
   উত্তর: উকিল



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


20. প্রশ্ন: বিয়ের সময় অনুপমের বয়স কত ছিল?
   উত্তর: বয়স উল্লেখিত হয়নি।



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


21. প্রশ্ন: গলাপানির মায়ের নাম কী?
   উত্তর: শস্তুনাথ সেন



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


22. প্রশ্ন: ‘পিপিরির চলে আয়, এই গাড়িতে জায়গা আছে’ উক্তিটি কার?
   উত্তর: মেয়েটির



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


23. প্রশ্ন: হরিশ কী উপলক্ষে কল্যাণার এসেছে?
   উত্তর: বিবাহ উপলক্ষে।



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


24. প্রশ্ন: কাকে অনুপমের ভাষা দেবতা বলে উল্লেখ করা হয়েছে?
   উত্তর: মামা



INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


25. প্রশ্ন: টাকার প্রতি আসক্তি কার?
   উত্তর: মামার অস্থিমজ্জায় জড়িত।



In [123]:
# Open-ended check: ask the chain what information is available about
# Anupam's maternal uncle, then print its reply.
# `chain` is not defined in this cell; it comes from earlier in the notebook.
query = "অনুপমের মামা সম্পর্কে কী কী তথ্য পাওয়া যায়?"
result = chain.invoke(query)
print("Answer:", result)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Answer: মামা বিবাহ-বাড়িতে খুশি নন, গহনা পরীক্ষা করেন।


## Evaluation