In [None]:
import easyocr
import fitz  # PyMuPDF
from tqdm import tqdm

# Initialize EasyOCR reader for Bengali
reader = easyocr.Reader(['bn'])

# Open the PDF
pdf_document = fitz.open("HSC26-Bangla1st-Paper.pdf")

# Initialize a variable to store the extracted text
pdf_text = ""

# Iterate through each page of the PDF with a progress bar
for page_num in tqdm(range(pdf_document.page_count), desc="Processing Pages"):
    page = pdf_document.load_page(page_num)
    pix = page.get_pixmap()
    img = pix.tobytes("png")
    result = reader.readtext(img)
    for (_, text, _) in result:
        pdf_text += text + "\n"

# Print the extracted text
print(pdf_text)


In [None]:
file_path = "extracted_text.txt"

with open(file_path, "w", encoding="utf-8") as file:
    file.write(pdf_text)
file_path

'extracted_text.txt'

In [None]:
import re
from bnunicodenormalizer import Normalizer

def clean_bangla_text(text):
    UNWANTED_STRINGS = [
        "অনলাহন ব্যাচ বাংলা - ইংরেজি : আইসিটি বাংলা ম পত্র আলোচ্য বিষয় অপরিচিতা গনবগি য্যাচ সম্পকিত মেকোনো জিঞাসায় কল করো",
        "অনলাইন ব্যাঢচ",
        "গচ",
        "]03#8য{"
    ]

    PUNCTUATION_AND_SYMBOLS = r'[!"#$%&\'()*+,-./:;<=>?@\[\\\]^_`{|}~]'
    CONTENT_IN_BRACKETS = [r"\[.*?\]", r"\{.*?\}", r"\(.*\)"]
    OTHER_ARTIFACTS = [r"[a-zA-Z]+", r"\d+"]
    WORD_MAP = {
        "বল্যণী": "কল্যাণী", "অলুপম": "অনুপম", "মালুষ": "মানুষ", "লিম্নবিত্": "নিম্নবর্ণিত",
        "শিখনফল": "শিক্ষণফল", "লিম্নবিত": "নিম্নবর্ণিত", "শম্তূলাপ": "শম্ভুনাথ", "তিল": "ছিল",
        "ঢিল": "ছিল", "কিল্রু": "কিন্তু", "যখল": "যখন", "তখল": "তখন", "হটবে": "হবে",
        "আতে": "আছে", "লাই": "নাই", "লিয়ে": "নিয়ে", "যাল": "যান", "হইয়া": "হয়ে",
        "গিয়াছে": "গিয়েছে", "গেল": "গেল", "करिता": "करता", "করিয়া": "করে"
    }

    for junk in UNWANTED_STRINGS:
        text = text.replace(junk, ' ')

    for pat in CONTENT_IN_BRACKETS:
        text = re.sub(pat, ' ', text)

    all_artifacts_to_remove = [PUNCTUATION_AND_SYMBOLS] + OTHER_ARTIFACTS
    for pat in all_artifacts_to_remove:
        text = re.sub(pat, ' ', text)

    bn_normalizer = Normalizer()
    try:
        normalized_text = bn_normalizer(text)["normalized"]
        if normalized_text:
             text = normalized_text
    except Exception as e:
        print(f"Could not fully normalize text, proceeding without it. Error: {e}")

    for wrong, correct in WORD_MAP.items():
        text = re.sub(r'\b' + wrong + r'\b', correct, text)

    text = re.sub(r'\s+', ' ', text).strip()
    return text

try:
    with open('extracted_text.txt', 'r', encoding='utf-8') as f:
        raw_text = f.read()

    cleaned_text = clean_bangla_text(raw_text)

    with open('final_cleaned_bangla_text.txt', 'w', encoding='utf-8') as f:
        f.write(cleaned_text)

    print("✅ Successfully cleaned the text.")
    print("   Cleaned data has been saved to 'final_cleaned_bangla_text.txt'")

except FileNotFoundError:
    print("❌ Error: The input file 'extracted_text.txt' was not found.")
    print("   Please make sure the file is in the same directory as the script.")
except Exception as e:
    print(f"❌ An unexpected error occurred: {e}")

In [72]:
import hashlib
import unicodedata

In [None]:
def generate_stable_id(text: str) -> str:
    """Generates a stable MD5 hash ID for a given text string."""
    return hashlib.md5(text.encode('utf-8')).hexdigest()

def clean_bangla_text_simple(text: str) -> str:
    text = re.sub(r'\s+', ' ', text)

    text = unicodedata.normalize('NFKC', text)
    text = re.sub(r'[^\u0980-\u09FF\s.,?!ঃ]', '', text)
    text = text.replace('্ ', '্')

    return text.strip()

In [74]:
input_filename = 'extracted_text.txt'
output_filename = 'simple_cleaned_bangla_text.txt'

try:
    print(f"▶️  Reading raw text from '{input_filename}'...")
    with open(input_filename, 'r', encoding='utf-8') as f:
        raw_text = f.read()

    print("⚙️  Starting the simple cleaning process...")
    cleaned_text = clean_bangla_text_simple(raw_text)

    print(f"💾  Saving cleaned text to '{output_filename}'...")
    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write(cleaned_text)

    print("\n✅ Successfully cleaned the text using the simple method.")
    print(f"   Cleaned data has been saved to '{output_filename}'")
except Exception as e:
    print(f"❌ An unexpected error occurred: {e}")


▶️  Reading raw text from 'extracted_text.txt'...
⚙️  Starting the simple cleaning process...
💾  Saving cleaned text to 'simple_cleaned_bangla_text.txt'...

✅ Successfully cleaned the text using the simple method.
   Cleaned data has been saved to 'simple_cleaned_bangla_text.txt'


In [124]:
from langchain_community.document_loaders import TextLoader
loader = TextLoader("simple_cleaned_bangla_text.txt",encoding="utf-8")
documents = loader.load()

In [125]:
from dotenv import load_dotenv
import os

load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

# Chunking

In [126]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
texts = text_splitter.split_documents(documents)

In [127]:
len(texts)

928

# Store into FAISS database

In [79]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# Initialize embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Initialize Chroma vector store
vector_store = FAISS.from_documents(texts, embeddings)

In [81]:
vector_store.index_to_docstore_id

{0: 'fdea05a2-1ca5-4067-816b-8cd4fd7a2578',
 1: 'c3234fa8-e22c-4073-8d42-d1fd21596bf1',
 2: '7335e1be-c703-4fd7-9668-4b6a628cf8cd',
 3: 'fe7b061c-3425-4179-af3d-fd0dc4bbfbb4',
 4: 'eaa84ac5-d94b-4444-97c2-e619b806b810',
 5: '415889d3-b7f8-4488-9f30-e40b44f27807',
 6: '3100c731-013e-4d1a-bd4f-b5437846292c',
 7: 'ff6975b3-ff46-4ef7-ae66-c23f7c950edb',
 8: '00928c66-2e5d-42b8-82e6-6150441df10a',
 9: '78ad171b-1b91-4cd6-bce0-ab325bd591c7',
 10: '4d7b9484-b5e2-437c-a8a6-b94ea5f156e4',
 11: '4fb0dcf4-94ad-490d-b4dc-36228ee2ccd8',
 12: '70ec9c2b-f964-4eb0-b419-893b124775c9',
 13: 'a670eb1d-0ffd-493d-9cf6-0d7f5d446414',
 14: '5b8ec52b-5ba5-4983-ae46-c242e1bf775e',
 15: '8da4fed2-e023-4773-8c09-28438e9daa2f',
 16: 'e6c04e65-8109-431d-9996-5afd4342fc7c',
 17: '1fcfa01a-40f2-4940-8f3d-d9eb72f612c4',
 18: '0f59e087-2c46-4005-8df8-a9f1ba1fa4cb',
 19: '5b4f7c3b-9a44-46ee-9790-e6ac87f79e50',
 20: 'bb79aa85-fea1-4c25-bc8d-677d10a7ec6b',
 21: 'e6e74180-de47-4246-9bd9-38fe5eee08d3',
 22: '082ef0ed-0b49-

In [83]:
vector_store.get_by_ids(["403332a5-c965-4b2b-b2dd-373a0f6d5fd3"])

[Document(id='403332a5-c965-4b2b-b2dd-373a0f6d5fd3', metadata={'source': 'simple_cleaned_bangla_text.txt'}, page_content='দায়ী সুতরাং যথাথ টমবার্া মেযোটি বল্যাোণী ডবনের এসতি দত্তব্যাট উসে পালিয়ে সনতবাট')]

# Adding retreiver

In [84]:
retreiver = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 4})

# Prompt template & chain

In [85]:
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

In [86]:
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0.2)

In [116]:
prompt = PromptTemplate(
    template="""
    You are a helpful and intelligent assistant for a Retrieval-Augmented Generation (RAG) system.

You will receive user questions in either Bangla or English.
Use **only** the retrieved context provided below to answer the question accurately.
If the answer is not explicitly available, try to infer it **reasonably** from the context.
If no answer can be found or inferred, respond with:
- "দুঃখিত, এই প্রশ্নের উত্তর পাওয়া যায়নি।" (for Bangla questions), or
- "Sorry, I couldn't find the answer to your question." (for English questions).

Always respond **in the same language** as the user's question.

Context to search through:
{context}

Question: {question}

Instructions:
- If the question can be answered by **a single name, phrase, number, or entity**, return the **shortest possible answer** — ideally in **1–3 words**, no extra sentence.
- If the question **requires explanation or description**, then return a **clear full sentence**.
- Never include unnecessary repetition of question terms in the answer.
- Focus on precision and brevity.
- Always respond **in the same language** as the user's question.

Answer:
"""
)

In [88]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [117]:
def format_docs(docs):
    return "\n\n".join([d.page_content for d in docs])

chain = (
    {"context": retreiver | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


In [118]:
response = chain.invoke("অনুপমের ভাষায় সুপুরুষ কাকে বলা হয়েছে?")

print(response)

মামা


In [119]:
response = chain.invoke("অনপমের বাবা কী করে জীবিকা নির্বাহ করতেন?")

print(response)

ডাক্তারি


# Now Langserve

In [130]:
from fastapi import FastAPI
from langserve import add_routes
import uvicorn

In [131]:
app = FastAPI(
    title="LangChain Server",
    version="1.0",
    description="A simple api server using Langchain's Runnable interfaces",
)

In [132]:
add_routes(
    app,
    chain,
    path="/openai",
)

# Traditional

In [122]:
from langchain.chains import LLMChain

traditional_chain = LLMChain(
    llm=llm,
    prompt=prompt
)

traditional_chain.predict(
    question="অনুপমের ভাষায় সুপুরুষ কাকে বলা হয়েছে?",
    context=""
)

'পিতা'