In [None]:
%pip install pdf2image pytesseract pdfplumber langchain-community transformers torch faiss-cpu sentence-transformers

In [None]:
import logging, re
from pathlib import Path

import torch
import pdfplumber
from pdf2image import convert_from_path
import pytesseract
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline

# Keep the PDF tooling quiet: only errors from these loggers are shown.
for noisy_logger in ("pdfminer", "pdfplumber", "pdf2image"):
    logging.getLogger(noisy_logger).setLevel(logging.ERROR)

# Pick the compute device: CUDA first, then Apple MPS (when this torch build
# exposes the backend), otherwise CPU. DEV_ID is the matching integer index
# (-1 for CPU).
mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    DEVICE, DEV_ID = "cuda", 0
elif mps_backend and mps_backend.is_available():
    DEVICE, DEV_ID = "mps", 0
else:
    DEVICE, DEV_ID = "cpu", -1

print("Using device:", DEVICE)


Using device: mps


In [None]:
def extract_text_ocr(pdf_path: Path, lang: str = "hin") -> str:
    """Render a PDF to images and OCR every page with Tesseract.

    Args:
        pdf_path: Path of the PDF file to process.
        lang: Language code handed to ``pytesseract.image_to_string``
            (defaults to ``"hin"``).

    Returns:
        The OCR text of all pages, joined with a single newline, in the order
        ``convert_from_path`` yields the page images.
    """
    # Pages are rasterised at 300 dpi before recognition.
    page_images = convert_from_path(str(pdf_path), dpi=300)
    return "\n".join(
        pytesseract.image_to_string(page_image, lang=lang)
        for page_image in page_images
    )

In [None]:
# Input PDFs / pre-extracted texts live in data_dir; plain-text results are
# cached in ocr_dir so the slow OCR step only runs when a text file is missing.
data_dir = Path("./data")
ocr_dir = Path("ocr_texts")
ocr_dir.mkdir(exist_ok=True)

pdf_hi, txt_hi = data_dir / "Constitution_Hindi.pdf", ocr_dir / "Constitution_Hindi.txt"
pdf_ipc, txt_ipc = data_dir / "IPC_hindi.pdf", ocr_dir / "IPC_hindi.txt"

# Step 1: OCR each PDF that is present and has no cached text yet.
for pdf_file, txt_file in ((pdf_hi, txt_hi), (pdf_ipc, txt_ipc)):
    if pdf_file.exists() and not txt_file.exists():
        txt_file.write_text(extract_text_ocr(pdf_file, lang="hin"), encoding="utf-8")

# Step 2: for anything still missing, fall back to a ready-made .txt of the
# same name in data_dir.
for txt_file in (txt_hi, txt_ipc):
    src = data_dir / txt_file.name
    if src.exists() and not txt_file.exists():
        txt_file.write_text(src.read_text("utf-8"), encoding="utf-8")

In [None]:
# Read the cached UTF-8 text of both sources from ocr_dir.
constitution_hindi = ocr_dir.joinpath("Constitution_Hindi.txt").read_text(encoding="utf-8")
ipc_hindi = ocr_dir.joinpath("IPC_hindi.txt").read_text(encoding="utf-8")


In [None]:
# Wrap each full source text in a Document, tagging it with its file name.
docs = [
    Document(page_content=body, metadata={"source": file_name})
    for file_name, body in (
        ("Constitution_Hindi.txt", constitution_hindi),
        ("IPC_hindi.txt", ipc_hindi),
    )
]
print("Total Hindi source docs:", len(docs))


Total Hindi source docs: 2


In [None]:
def clean_text(text: str) -> str:
    """Normalise whitespace in extracted text.

    Steps, in order:
      1. Convert ``\\r\\n`` / ``\\r`` line endings to ``\\n``.
      2. Collapse every run of non-newline whitespace (spaces, tabs, form
         feeds, ...) to a single space.
      3. Drop the space left directly before or after a newline, so that
         whitespace-only lines become truly empty.
      4. Collapse three or more consecutive newlines to one blank line.
      5. Strip leading/trailing whitespace.

    Horizontal whitespace is handled *before* newline runs; otherwise a line
    containing only spaces or a form feed would split a newline run and
    survive as ``"\\n \\n"``.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text; an empty string stays empty.
    """
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = re.sub(r'[^\S\n]+', ' ', text)
    # After step 2 at most one space can sit on either side of a newline.
    text = re.sub(r' ?\n ?', '\n', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()

# Split each cleaned source into overlapping chunks (1500 chars, 200 overlap).
# Every chunk inherits its parent's metadata plus a per-document "chunk" index.
splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
hindi_chunks = []
for doc in docs:
    pieces = splitter.split_text(clean_text(doc.page_content))
    hindi_chunks.extend(
        Document(page_content=piece, metadata={**doc.metadata, "chunk": idx})
        for idx, piece in enumerate(pieces)
    )
print("Total Hindi chunks:", len(hindi_chunks))


Total Hindi chunks: 926


In [None]:
# Multilingual sentence-embedding model used to index and query the chunks.
embed_model = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
hf_embed   = HuggingFaceEmbeddings(model_name=embed_model)

# The FAISS index is cached on disk. NOTE: the cache is reused whenever the
# directory exists, regardless of whether hindi_chunks changed — delete
# faiss_hindi_index/ to force a rebuild after changing the sources or chunking.
faiss_dir = Path("faiss_hindi_index")
if faiss_dir.exists():
    # load_local unpickles the stored docstore, so current langchain-community
    # releases refuse to load unless the caller opts in explicitly. That is
    # acceptable here because the index was written by this notebook
    # (save_local below); do not point faiss_dir at an untrusted index.
    vectordb = FAISS.load_local(
        str(faiss_dir),
        hf_embed,
        allow_dangerous_deserialization=True,
    )
    print("Loaded existing FAISS index.")
else:
    print("Building FAISS index…")
    vectordb = FAISS.from_documents(hindi_chunks, hf_embed)
    vectordb.save_local(str(faiss_dir))
    print("Built & saved FAISS index.")


  hf_embed   = HuggingFaceEmbeddings(model_name=embed_model)


Building FAISS index…
Built & saved FAISS index.


In [25]:
# Load the seq2seq generator and wrap it for LangChain.
model_id = "bigscience/mt0-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Half precision on accelerators, full precision on CPU.
# NOTE(review): mT5-family checkpoints are reported to be numerically fragile
# in float16 — if answers come out empty or garbled, try torch.float32 here.
dtype = torch.float16 if DEVICE in ("cuda","mps") else torch.float32
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, torch_dtype=dtype).to(DEVICE)

hindi_pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    temperature=0.7,
    # The device-selection cell defines DEV_ID (not DEVICE_ID); the old name
    # only resolved through leftover kernel state and raised NameError on a
    # fresh Restart & Run All.
    device=DEV_ID,
    do_sample=True,
    top_p=0.9,
)
llm = HuggingFacePipeline(pipeline=hindi_pipe)
print(f"Loaded {model_id} on {DEVICE}")


Device set to use mps:0


Loaded bigscience/mt0-small on mps


In [28]:
# Assemble the retrieval-augmented QA chain: fetch the 4 nearest chunks for
# each query, answer with the "stuff" combine chain, and hand the retrieved
# chunks back together with the answer.
combine_chain = load_qa_chain(llm=llm, chain_type="stuff")
retriever = vectordb.as_retriever(search_kwargs={"k":4})
qa_chain = RetrievalQA(
    retriever=retriever,
    combine_documents_chain=combine_chain,
    return_source_documents=True,
)
print("Hindi RetrievalQA chain ready")


Hindi RetrievalQA chain ready


In [None]:
def run_hindi_qa(question: str, verbose: bool=True):
    """Ask the retrieval QA chain a question and optionally print the result.

    The question is prefixed with ``"Answer in Hindi: "`` and sent as the
    chain's ``query`` input; note that this prefixed string is what the chain
    receives for both retrieval and answering.

    Args:
        question: The user's question.
        verbose: When true, print the question, the answer and a 200-character
            snippet of every returned source chunk.

    Returns:
        A ``(answer, source_documents)`` tuple taken from the chain output's
        ``"result"`` and ``"source_documents"`` keys.
    """
    prompt = f"Answer in Hindi: {question}"
    # Use .invoke(): calling the chain object directly is deprecated in
    # current LangChain releases.
    out = qa_chain.invoke({"query": prompt})
    ans = out["result"]
    srcs = out["source_documents"]
    if verbose:
        print("\n>>> प्रश्न:\n", question)
        print("\n>>> उत्तर:\n", ans)
        print("\n>>> स्रोत (chunks):")
        for d in srcs:
            m = d.metadata
            # Flatten newlines so each source prints on a single line.
            snip = d.page_content.replace("\n"," ")[:200]
            print(f" • {m['source']} (chunk {m['chunk']}) → {snip}…")
    return ans, srcs


In [27]:
# Smoke-test the chain with one Constitution question and one IPC question.
for demo_question in (
    "भारतीय संविधान का अनुच्छेद 370 क्या कहता है?",
    "कृत्य या उपेक्षा तत्व क्या हैं?",
):
    _ = run_hindi_qa(demo_question)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



>>> प्रश्न:
 भारतीय संविधान का अनुच्छेद 370 क्या कहता है?

>>> उत्तर:
 संविधान के अनुच्छेद 370 के खंड (1) के साथ पठित अनुच्छेद 370 के खंड (3) द्वारा प्रदत्त शक्तियों का प्रयोग करते हुए राष्ट्रपति ने जम्मू-कश्मीर के महाराजा की 5 मार्च, 1948 की उद्घोषणा के अधीन तत्समय पदस्थ मंत्रि-परिषद् की सलाह पर कार्य करने वाले जम्मू-कश्मीर के राजपाल के लिए निर्देशों को शामिल करता हुआ माना जाएगा ।

>>> स्रोत (chunks):
 • Constitution_Hindi.txt (chunk 406) → जम्मू-कश्मीर राज्य को लागू होंगे ।"। (परिशिष्ट 3 देखें) । 1. भारत का संविधान की खंड (3) द्वारा प्रदत्त शक्तियों का प्रयोग करते हुए राष्ट्रपति ने जम्मू-कश्मीर राज्य की संविधान सभा की सिफारिश पर यह घोषण…
 • Constitution_Hindi.txt (chunk 467) → 2. संविधान (सोलहवां संशोधन) अधिनियम, 1963 की धारा 5 द्वारा (5-10-1963 से) प्ररूप 3 के स्थान पर प्रतिस्थापित । (तीसरी अनुसूची) “मैं, अमुक, जो राज्य सभा (या लोक सभा) में स्थान भरने के लिए अभ्यर्थी के रू…
 • Constitution_Hindi.txt (chunk 468) → [मैं भारत की प्रभुता और अखंडता अक्षुण्ण रखूंगा,] तथा मैं सम्यक् प्रका