In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader,PyMuPDFLoader,TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def process_all_pdfs(pdf_directory: str):
    all_documents = []
    pdf_dir = Path(pdf_directory)
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    print(f"Found {len(pdf_files)} PDF files to process")

    # Map each filename to a human-readable act name
    act_name_map = {
        "pharmacy.pdf": "Nepal Pharmacy Council Act, 2057",
        "immunization.pdf": "Immunization Act, 2072",
        "single_women.pdf": "Single Women Act",          # adjust to actual title if needed
        "constitution.pdf": "Constitution of Nepal",
        "sports.pdf": "Sports Act",                      # adjust to actual title
    }

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            # Choose act_name if we have it, otherwise use file stem
            act_name = act_name_map.get(pdf_file.name, pdf_file.stem)
            for doc in documents:
                doc.metadata["source_file"] = pdf_file.name
                doc.metadata["file_type"] = "pdf"
                doc.metadata["act_name"] = act_name
            all_documents.extend(documents)
            print(f"Loaded {len(documents)} pages")
        except Exception as e:
            print(f"Error: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Call this to load all PDFs in ../data/pdf (from the notebook folder)
all_pdf_documents = process_all_pdfs("../data/pdf")

Found 5 PDF files to process

Processing: immunization.pdf
Loaded 14 pages

Processing: constitution.pdf
Loaded 183 pages

Processing: sports.pdf
Loaded 12 pages

Processing: single_women.pdf
Loaded 7 pages

Processing: pharmacy.pdf
Loaded 13 pages

Total documents loaded: 229


In [3]:
all_pdf_documents

[Document(metadata={'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2024-07-03T14:05:23+05:45', 'author': 'Shivaraj Mr. Poudel', 'moddate': '2024-07-03T14:05:23+05:45', 'source': '../data/pdf/immunization.pdf', 'total_pages': 14, 'page': 0, 'page_label': '1', 'source_file': 'immunization.pdf', 'file_type': 'pdf', 'act_name': 'Immunization Act, 2072'}, page_content='www.lawcommission.gov.np \n1 \n \nखोप ऐन, २०७२ \nप्रमाणीकरण र प्रकाशन मममि \n                     २०७२।१०।१२ \nसंशोधन गने ऐन \nनेपालको संविधान अनुकू ल बनाउन के ही नेपाल ऐनलाई संशोधन  \nगने ऐन, २०७५                   २०७५।११।१९ \n \nसंिि् २०७२ सालको ऐन नं. १३ \nखोप सेिा सम्बन्धमा व्यिस्था गनन बनेको ऐन \nप्रस्िािना : खोपको माध्यमबाट रोगको रोकथाम, मनयन्रण, मनिारण िा उन्मूलन गरी शशशु, बाल, मािृ \nिथा अन्य व्यशिको मुत्यु दर घटाउन िथा खोप सेिाको विकास, विस्िार र सुदृढीकरण गरी \nगुणस्िरीय खोप सेिा प्रदान गने सम्बन्धमा आिश्यक व्यिस्था गनन िाञ्छनीय भएकोले, \nनेपालको संविधानको धारा २९६ को उपधारा

In [4]:
import re
from langchain_community.document_loaders import PyPDFLoader
# from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_all_acts():
    """
    Load all law PDFs and attach act_name and source_file metadata.
    """
    docs = []

    # List all PDFs you want to include, with human-readable act names
    pdf_specs = [
        {"filename": "pharmacy.pdf",      "act_name": "Nepal Pharmacy Council Act, 2057"},
        {"filename": "immunization.pdf",  "act_name": "Immunization Act, 2072"},
        {"filename": "single_women.pdf",  "act_name": "Single Women Act"},        
        {"filename": "constitution.pdf",  "act_name": "Constitution of Nepal"},
        {"filename": "sports.pdf",        "act_name": "Sports Act"},              
    ]

    for spec in pdf_specs:
        pdf_path = f"../data/pdf/{spec['filename']}"
        print(f"Loading {spec['filename']} ...")
        try:
            loader = PyPDFLoader(pdf_path)
            pdf_docs = loader.load()
            for d in pdf_docs:
                d.metadata["act_name"] = spec["act_name"]
                d.metadata["source_file"] = spec["filename"]
                d.metadata["file_type"] = "pdf"
            docs.extend(pdf_docs)
            print(f"  Loaded {len(pdf_docs)} pages")
        except Exception as e:
            print(f"  Error loading {spec['filename']}: {e}")

    print(f"Loaded {len(docs)} documents (pages) from PDFs")
    return docs

# Text splitting into chunks + adding section_number metadata
def split_documents(documents, chunk_size=800, chunk_overlap=150):
    # Try to split first on "धारा", then on newlines, etc.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=[
            "\nधारा",   # split at section headings if present
            "\n\n",
            "\n",
            "। ",       # sentence end
            " "
        ],
    )

    split_docs = text_splitter.split_documents(documents)

    # Optionally extract section/धारा number into metadata if present
    for doc in split_docs:
        text = doc.page_content
        match = re.search(r"धारा\s*([०१२३४५६७८९0-9]+)", text)
        if match:
            doc.metadata["section_number"] = match.group(1)

    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    if split_docs:
        print("\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")

    return split_docs

In [None]:
raw_docs = load_all_acts()
print("Total pages loaded:", len(raw_docs))
print("First doc metadata:", raw_docs[0].metadata)

Loading pharmacy.pdf ...
  Loaded 13 pages
Loading immunization.pdf ...
  Loaded 14 pages
Loading single_women.pdf ...
  Loaded 7 pages
Loading constitution.pdf ...
  Loaded 183 pages
Loading sports.pdf ...
  Loaded 12 pages
Loaded 229 documents (pages) from PDFs
Total pages loaded: 229
First doc metadata: {'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2024-07-03T14:03:51+05:45', 'author': 'Windows User', 'moddate': '2024-07-03T14:03:51+05:45', 'source': '../data/pdf/pharmacy.pdf', 'total_pages': 13, 'page': 0, 'page_label': '1', 'act_name': 'Nepal Pharmacy Council Act, 2057', 'source_file': 'pharmacy.pdf', 'file_type': 'pdf'}


In [None]:
import re
import unicodedata

def clean_nepali_text(text: str) -> str:
    
    text = unicodedata.normalize("NFC", text)

   
    bad_chars = ["\uf0a7", "\uf0b7", "\uf0d8", "\uf0e5", "\uf022"]  
    for ch in bad_chars:
        text = text.replace(ch, " ")

    
    text = re.sub(r"[^\u0900-\u097F\s।,;:?!०-९0-9\-–]", " ", text)

    
    text = re.sub(r"\s+", " ", text).strip()
    return text


for d in chunked_docs:
    d.page_content = clean_nepali_text(d.page_content)

In [None]:
chunks=split_documents(all_pdf_documents)
chunks

Split 229 documents into 587 chunks

Example chunk:
Content: www.lawcommission.gov.np 
1 
 
खोप ऐन, २०७२ 
प्रमाणीकरण र प्रकाशन मममि 
                     २०७२।१०।१२ 
संशोधन गने ऐन 
नेपालको संविधान अनुकू ल बनाउन के ही नेपाल ऐनलाई संशोधन  
गने ऐन, २०७५           ...
Metadata: {'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2024-07-03T14:05:23+05:45', 'author': 'Shivaraj Mr. Poudel', 'moddate': '2024-07-03T14:05:23+05:45', 'source': '../data/pdf/immunization.pdf', 'total_pages': 14, 'page': 0, 'page_label': '1', 'source_file': 'immunization.pdf', 'file_type': 'pdf', 'act_name': 'Immunization Act, 2072', 'section_number': '२९६'}


[Document(metadata={'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2024-07-03T14:05:23+05:45', 'author': 'Shivaraj Mr. Poudel', 'moddate': '2024-07-03T14:05:23+05:45', 'source': '../data/pdf/immunization.pdf', 'total_pages': 14, 'page': 0, 'page_label': '1', 'source_file': 'immunization.pdf', 'file_type': 'pdf', 'act_name': 'Immunization Act, 2072', 'section_number': '२९६'}, page_content='www.lawcommission.gov.np \n1 \n \nखोप ऐन, २०७२ \nप्रमाणीकरण र प्रकाशन मममि \n                     २०७२।१०।१२ \nसंशोधन गने ऐन \nनेपालको संविधान अनुकू ल बनाउन के ही नेपाल ऐनलाई संशोधन  \nगने ऐन, २०७५                   २०७५।११।१९ \n \nसंिि् २०७२ सालको ऐन नं. १३ \nखोप सेिा सम्बन्धमा व्यिस्था गनन बनेको ऐन \nप्रस्िािना : खोपको माध्यमबाट रोगको रोकथाम, मनयन्रण, मनिारण िा उन्मूलन गरी शशशु, बाल, मािृ \nिथा अन्य व्यशिको मुत्यु दर घटाउन िथा खोप सेिाको विकास, विस्िार र सुदृढीकरण गरी \nगुणस्िरीय खोप सेिा प्रदान गने सम्बन्धमा आिश्यक व्यिस्था गनन िाञ्छनीय भएकोले, \nनेपालको संव

In [None]:
TextLoader

langchain_community.document_loaders.text.TextLoader

In [6]:
#embedding and vectorstore db
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List,Dict,Any,Tuple,Optional
from sklearn.metrics.pairwise import cosine_similarity
import torch

In [7]:



class EmbeddingManager:
    def __init__(
        self,
        model_name: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
        device: Optional[str] = None,
        normalize_embeddings: bool = True,
        batch_size: int = 32,
    ):
        """
        Embedding manager for multilingual (including Nepali) legal text.

        - model_name: sentence-transformers model ID
        - device: "cuda", "cpu", or None (auto-detect)
        - normalize_embeddings: if True, L2-normalize embeddings (good for cosine similarity)
        - batch_size: how many texts to encode per batch
        """
        self.model_name = model_name
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.normalize_embeddings = normalize_embeddings
        self.batch_size = batch_size
        self.model: SentenceTransformer | None = None
        self.load_model()
    
    def load_model(self):
        try:
            print(f"Loading embedding model: {self.model_name} on {self.device}")
            self.model = SentenceTransformer(self.model_name, device=self.device)
            dim = self.model.get_sentence_embedding_dimension()
            print(f"Model loaded successfully. Embedding dimension: {dim}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise
    
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        if self.model is None:
            raise ValueError("Model not loaded")
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(
            texts,
            batch_size=self.batch_size,
            show_progress_bar=True,
            convert_to_numpy=True,
            normalize_embeddings=self.normalize_embeddings,
        )
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings


# create global instance
embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: sentence-transformers/paraphrase-multilingual-mpnet-base-v2 on cpu
Model loaded successfully. Embedding dimension: 768


<__main__.EmbeddingManager at 0x1583cbe00>

In [8]:
import os
import uuid
from typing import Any, List

import chromadb
import numpy as np


class VectorStore:
    def __init__(
        self,
        collection_name: str = "pdf_documents_v2",  # new name to avoid mixing old embeddings
        persist_directory: str = "../data/vector_store",
        reset: bool = False,  # if True, delete existing collection on init
    ):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self.reset = reset
        self._initialize_store()
    
    def _initialize_store(self):
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Optionally drop old collection (if you are rebuilding from scratch)
            if self.reset:
                try:
                    self.client.delete_collection(self.collection_name)
                    print(f"Deleted existing collection: {self.collection_name}")
                except Exception:
                    # If it doesn't exist yet, ignore
                    pass

            # Use cosine distance since we normalized embeddings
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={
                    "description": "PDF document embeddings for RAG (Nepali law)",
                    "hnsw:space": "cosine",  # important if you want cosine similarity
                },
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        print(f"Adding {len(documents)} documents to vector store...")

        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            metadata = dict(getattr(doc, "metadata", {}))
            metadata["doc_index"] = i
            metadata["content_length"] = len(getattr(doc, "page_content", ""))
            metadatas.append(metadata)

            documents_text.append(getattr(doc, "page_content", ""))
            embeddings_list.append(embedding.tolist())

        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text,
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise


# create a fresh store; use reset=True once when rebuilding
vectorstore = VectorStore(reset=True)
vectorstore

Deleted existing collection: pdf_documents_v2
Vector store initialized. Collection: pdf_documents_v2
Existing documents in collection: 0


<__main__.VectorStore at 0x1583cbb60>

In [None]:
# 1) Load acts from PDFs
raw_docs = load_all_acts()          
chunked_docs = split_documents(raw_docs)

# 2) Embed chunks with the new EmbeddingManager
texts = [d.page_content for d in chunked_docs]
embeddings = embedding_manager.generate_embeddings(texts)

# 3) Add to the new vector store
vectorstore.add_documents(chunked_docs, embeddings)

Loading pharmacy.pdf ...
  Loaded 13 pages
Loading immunization.pdf ...
  Loaded 14 pages
Loading single_women.pdf ...
  Loaded 7 pages
Loading constitution.pdf ...
  Loaded 183 pages
Loading sports.pdf ...
  Loaded 12 pages
Loaded 229 documents (pages) from PDFs
Split 229 documents into 587 chunks

Example chunk:
Content: www.lawcommission.gov.np 
1 
 
नेपाल फार्मेसी परिषद् ऐन, २०५७ 
लालर्मोहि ि प्रकाशन मर्ममि 
                     
          २०५७।१०।१८ 
संशोधन गने ऐन                                          प्रर्माणीक...
Metadata: {'producer': 'Microsoft® Word 2013', 'creator': 'Microsoft® Word 2013', 'creationdate': '2024-07-03T14:03:51+05:45', 'author': 'Windows User', 'moddate': '2024-07-03T14:03:51+05:45', 'source': '../data/pdf/pharmacy.pdf', 'total_pages': 13, 'page': 0, 'page_label': '1', 'act_name': 'Nepal Pharmacy Council Act, 2057', 'source_file': 'pharmacy.pdf', 'file_type': 'pdf'}
Generating embeddings for 587 texts...


Batches: 100%|██████████| 19/19 [00:14<00:00,  1.27it/s]


Generated embeddings with shape: (587, 768)
Adding 587 documents to vector store...
Successfully added 587 documents to vector store
Total documents in collection: 587


In [10]:
import re

def contains_devanagari(text: str) -> bool:
    """Return True if any Devanagari characters are present."""
    return bool(re.search(r'[\u0900-\u097F]', text))


def normalize_to_nepali(query: str, llm) -> str:
    """
    If query is Romanized Nepali (Latin script), convert it to Nepali (Devanagari).
    If it's already in Devanagari, or clearly English, return as-is.
    """
    if contains_devanagari(query):
        return query

    prompt = f"""तलको इनपुट रोमन नेपाली (Latin script मा लेखिएको नेपाली) पनि हुन सक्छ,
वा अरू भाषा (जस्तै English) पनि हुन सक्छ।

तपाईंको काम:
- यदि इनपुट स्पष्ट रूपमा रोमन नेपाली छ भने, त्यसलाई सही नेपाली (देवनागरी) मा रूपान्तरण गर्नुहोस्।
- यदि इनपुट रोमन नेपाली होइन (जस्तै pure English प्रश्न) छ भने, त्यसलाई जस्ताको तस्तै फर्काउनुहोस्।
- कुनै पनि व्याख्या, अगाडि/पछि extra शब्दहरू नलेख्नुहोस्।
- केवल रूपान्तरण गरिएको वा original वाक्य मात्र आउटपुट गर्नुहोस्।

केही उदाहरण:
- "immunization act kaile aayeko ?" -> "खोप ऐन, २०७२ कहिले आएको हो ?"
- "pharmacy council le ke vanxa?" -> "फार्मेसी काउन्सिलले के भन्छ ?"
- "yo ain kahile lagu bhayo?" -> "यो ऐन कहिले लागू भयो ?"

प्रयोगकर्ताको इनपुट:
{query}

आउटपुट (केवल वाक्य):
"""
    resp = llm.invoke(prompt)
    return resp.content.strip()


def choose_where(query: str, norm_query: str) -> dict | None:
    """
    Decide which PDF to search based on keywords.
    Returns a dict for Chroma 'where' filter, or None to search all.
    """
    text = (query + " " + norm_query).lower()

    # Pharmacy related
    if any(word in text for word in ["pharmacy", "pharmasi", "फार्मेसी"]):
        return {"source_file": "pharmacy.pdf"}

    # Immunization / खोप related
    if any(word in text for word in ["immunization", "khop", "खोप", "इम्युनाइजेशन"]):
        return {"source_file": "immunization.pdf"}

    # Constitution related
    if any(word in text for word in ["constitution", "संविधान"]):
        return {"source_file": "constitution.pdf"}

    # Single women / widows (एकल महिला)
    if any(word in text for word in ["single women", "single woman", "एकल महिला", "विधवा"]):
        return {"source_file": "single_women.pdf"}

    # Sports related
    if any(word in text for word in ["sports", "खेलकुद", "खेल"]):
        return {"source_file": "sports.pdf"}

    # Default: search all PDFs
    return None


def rag_simple(query: str, retriever, llm, top_k: int = 6) -> str:
    # 0) Normalize Roman Nepali -> Devanagari
    norm_query = normalize_to_nepali(query, llm)
    print("Original query:", query)
    print("Normalized query:", norm_query)

    where = choose_where(query, norm_query)
    print("Using where filter:", where)

    # 1) First try: retrieve with normalized Nepali
    results = retriever.retrieve(norm_query, top_k=top_k, where=where)

    # 1b) Fallback: if no results, try original query
    if not results:
        print("No results with normalized query, trying original query...")
        results = retriever.retrieve(query, top_k=top_k, where=where)

    if not results:
        return "सहित सन्दर्भ (context) फेला परेन, त्यसैले म जवाफ दिन सक्दिन।"

    # 2) Build context (truncate if too long)
    max_chars = 3500
    context_parts = []
    current_len = 0
    for doc in results:
        text = doc["content"]
        if current_len + len(text) > max_chars:
            break
        context_parts.append(text)
        current_len += len(text)
    context = "\n\n".join(context_parts)

    prompt = f"""तपाईं नेपाली कानुन बुझ्ने कानुनी सहायक हुनुहुन्छ। तल दिइएको सन्दर्भ 
फार्मेसी, खोप, संविधान, एकल महिला, खेलकुद लगायतका नेपाली कानून तथा नीतिहरूबाट 
लिइएको हो।

नियम:
- केवल सन्दर्भमा स्पष्ट रूपमा लेखिएको आधारमा मात्र जवाफ दिनुहोस्।
- यदि सोधिएको कुरा सन्दर्भमा स्पष्ट रूपमा छैन भने, जवाफमा लेख्नुहोस्:
  "मलाई थाहा छैन। यो जानकारी दिइएको सन्दर्भमा छैन।"
- आफ्नै अनुमान नगर्नुहोस्, अन्य सामान्य ज्ञान प्रयोग नगर्नुहोस्।
- सकेसम्म दफा नम्बर, परिच्छेद वा शीर्षकको नाम उल्लेख गर्नुहोस्।
- जवाफ छोटो तर ठोस, कानुनी रूपमा ठीक र नेपाली भाषामा दिनुहोस्।

सन्दर्भ:
{context}

प्रश्न (प्रयोगकर्ताको मूल इनपुट):
{query}

अन्तर्रूप (normalize) गरिएको प्रश्न:
{norm_query}

जवाफ नेपाली भाषामा:
"""

    response = llm.invoke(prompt)
    return response.content.strip()

In [11]:
from typing import Any, Dict, List, Optional

class RAGRetriever:
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(
        self,
        query: str,
        top_k: int = 5,
        where: Optional[dict] = None,  # <-- NEW PARAM
    ) -> List[Dict[str, Any]]:
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top k: {top_k}, where: {where}")

        # 1) Generate embedding for query
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        retrieved_docs: List[Dict[str, Any]] = []

        try:
            # 2) Query the collection (pass where filter)
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k,
                where=where,  # <-- USE where HERE
            )
            # print("Raw results from vector store:", results)  # optional debug

            # 3) Process results if there are any
            if results and results.get("documents") and results["documents"][0]:
                documents = results["documents"][0]
                metadatas = results["metadatas"][0]
                distances = results["distances"][0]
                ids = results["ids"][0]

                for i, (doc_id, document, metadata, distance) in enumerate(
                    zip(ids, documents, metadatas, distances)
                ):
                    retrieved_docs.append({
                        "id": doc_id,
                        "content": document,
                        "metadata": metadata,
                        "distance": distance,  # smaller = more similar
                        "rank": i + 1,
                    })

                print(f"Retrieved {len(retrieved_docs)} documents")
            else:
                print("No documents found")

        except Exception as e:
            print(f"Error during retrieval: {e}")

        # 4) Always return a list
        return retrieved_docs

In [12]:
rag_retriever = RAGRetriever(vectorstore, embedding_manager)

In [13]:
CATEGORY_TO_SOURCES = {
    "All (auto)": None,  
    "Pharmacy Act": ["pharmacy.pdf"],
    "Immunization Act": ["immunization.pdf"],
    "Constitution of Nepal": ["constitution.pdf"],
    "Single Women Act": ["single_women.pdf"],
    "Sports Act": ["sports.pdf"],
    
}

In [14]:
def build_where_for_category(category: str) -> dict | None:
    sources = CATEGORY_TO_SOURCES.get(category)
    if not sources:
        return None  

    
    if len(sources) == 1:
        return {"source_file": sources[0]}

   
    return {"$or": [{"source_file": s} for s in sources]}

In [15]:
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
import re

load_dotenv()

groq_api_key = os.getenv("GROQ_API_KEY")
if groq_api_key is None:
    raise ValueError("GROQ_API_KEY not set in environment or .env file")

llm = ChatGroq(
    groq_api_key=groq_api_key,
    model_name="llama-3.1-8b-instant",
    temperature=0.1,
    max_tokens=1024,
)


# ---- helpers ----

def correct_nepali_spelling(text: str, llm) -> str:
    """
    Fix spelling and grammar in a Nepali answer without changing meaning.
    """
    prompt = f"""तलको पाठ नेपाली भाषामा लेखिएको छ तर वर्तनी (spelling) 
र व्याकरणमा केही त्रुटि हुन सक्छ।

तपाईंको काम:
- अर्थ (meaning) नबदलिकन, केवल वर्तनी र व्याकरण सुधार्नुहोस्।
- केवल देवनागरी लिपि (क, ख, ग ...) प्रयोग गर्नुहोस् (कुनै रोमन/Latin अक्षर नलेख्नुहोस्)।
- वाक्यसंख्या वा जानकारी नथप्नुहोस् वा नघटाउनुहोस्, केवल सुधार गर्नुहोस्।
- केवल सुधार गरिएको पाठ मात्र आउटपुट गर्नुहोस्, अरू कुनै व्याख्या नगर्नुहोस्।

पाठ:
{text}

सुधार गरिएको पाठ:
"""
    resp = llm.invoke(prompt)
    return resp.content.strip()


def expand_query_with_llm(query: str, llm) -> str:
    """
    Use LLM to expand/reformulate the question (add synonyms/context) in Nepali.
    This is the 'Expands Query' step in the RAG architecture.
    """
    prompt = f"""तपाईं प्रश्न पुनर्लेखन (query expansion) गर्ने सहयोगी हुनुहुन्छ।

कार्य:
- प्रयोगकर्ताको सोधेको प्रश्नलाई (query) परिवर्धित (expand) गर्नुहोस्।
- सोधिएको कानुनी विषय प्रस्ट देखियोस्; आवश्यक भए synonyms / सम्बन्धित शब्दहरू थप्नुहोस्।
- अर्थ (meaning) नबदलिकन, केवल स्पष्ट, विस्तारित प्रश्न नेपाली भाषामा लेख्नुहोस्।
- यदि प्रयोगकर्ताले रोमन नेपालीमा सोधेको भए, पहिले सही देवनागरीमा रूपान्तरण गिी त्यसलाई
  स्पष्ट, विस्तारित प्रश्नको रूपमा लेख्नुहोस्।
- केवल पुनर्लेखन गरिएको प्रश्न मात्र आउटपुट गर्नुहोस्, अरू कुनै व्याख्या नगर्नुहोस्।

उदाहरण:
- इनपुट: "nepali nagrita lina k k rules xa?"
  आउटपुट: "नेपालको संविधान तथा सम्बन्धित कानुन बमोजिम नेपाली नागरिकता लिनका लागि के–के नियम र शर्तहरू छन् ?"

इनपुट प्रश्न:
{query}

पुनर्लेखन/परिवर्धित प्रश्न (नेपालीमा):
"""
    resp = llm.invoke(prompt)
    return resp.content.strip()




def rag_simple(
    query: str,
    retriever,
    llm,
    top_k: int = 6,
    arena: str = "All (auto)",  
) -> str:
    # 0) Expand query using LLM
    expanded_query = expand_query_with_llm(query, llm)
    print("Original query:", query)
    print("Expanded query:", expanded_query)

    # 1) Normalize Roman Nepali -> Devanagari (if needed)
    norm_query = normalize_to_nepali(expanded_query, llm)
    print("Normalized query:", norm_query)

    # 2) Decide which PDF(s) to search

    # a) Category override (user selection)
    category_where = build_where_for_category(arena)

    # b) Auto detection using keywords (your existing choose_where)
    auto_where = choose_where(query, norm_query)

    # Final where:
    # - if user picked a category -> use category_where
    # - else -> fall back to auto_where
    if category_where is not None:
        where = category_where
    else:
        where = auto_where

    print("Using where filter:", where)

    # 3) Decide how deep to search (top_k)
    # If user picked a specific category, dig deeper into that law by using a higher top_k.
    if arena != "All (auto)":
        effective_top_k = max(top_k, 10)  # e.g., look at 10 chunks from that PDF
    else:
        effective_top_k = top_k

    # 4) Retrieve with where filter
    results = retriever.retrieve(norm_query, top_k=effective_top_k, where=where)

    # 4b) Fallback: if no results, try original query
    if not results:
        print("No results with normalized query, trying original query...")
        results = retriever.retrieve(query, top_k=effective_top_k, where=where)

    if not results:
        return "सहित सन्दर्भ (context) फेला परेन, त्यसैले म जवाफ दिन सक्दिन।"

    # ... (keep your context building, prompt, deduplication, spelling correction as you already have)

    # 4) Build context (truncate if too long)
    max_chars = 2000  # slightly smaller to reduce repetition
    context_parts = []
    current_len = 0
    for doc in results:
        text = doc["content"]
        if current_len + len(text) > max_chars:
            break
        context_parts.append(text)
        current_len += len(text)
    context = "\n\n".join(context_parts)

    # 5) Prompt for answer (updated to all laws + paraphrase)
    prompt = f"""तपाईं नेपाली कानुन बुझ्ने कानुनी सहायक हुनुहुन्छ। तल दिइएको सन्दर्भ 
फार्मेसी, खोप, संविधान, एकल महिला, खेलकुद लगायतका नेपाली कानून तथा नीतिहरूबाट 
लिइएको हो। सन्दर्भको मूल पाठमा टाइप/OCR सम्बन्धी त्रुटि हुन सक्छ।

नियम:
- सोधिएको प्रश्नको जवाफ आफूले बुझेको अर्थका आधारमा आफ्नै शब्दमा दिनुहोस्।
- सन्दर्भको ठ्याक्कै वाक्य वा अनुच्छेद जस्ताको तस्तै नक्कल गर्नु भन्दा,
  अर्थ/जानकारी समेटेर पुनर्लेखन (paraphrase) गर्नुहोस्।
- अधिकतम ५ बुँदामा मुख्य कुरा राख्नुहोस्, अनावश्यक रूपमा एउटै बुँदा दोहोर्याउनु हुँदैन।
- मानक, शुद्ध वर्तनी भएको देवनागरी नेपाली प्रयोग गर्नुहोस्।
- सकेसम्म धाराको/परिच्छेदको नम्बर वा शीर्षक (metadata मा section_number भएमा) उल्लेख गर्नुहोस्।
- यदि सन्दर्भमा सोधिएको विषयसँग स्पष्ट रूपमा सम्बन्धित कुनै जानकारी नै छैन भने मात्र
  यो वाक्य लेख्नुहोस्:
  "मलाई थाहा छैन। यो जानकारी दिइएको सन्दर्भमा छैन।"
- सामान्य ज्ञान मात्रबाट नयाँ कानुनी दाबी नबनाउनुहोस्।

सन्दर्भ:
{context}

प्रश्न (प्रयोगकर्ताको मूल इनपुट):
{query}

अन्तर्रूप (normalize) गरिएको प्रश्न:
{norm_query}

जवाफ नेपाली भाषामा:
"""
    response = llm.invoke(prompt)
    raw_answer = response.content.strip()

    # 6) Optionally de-duplicate nearly identical lines (avoid repetition)
    lines = [l.strip() for l in raw_answer.splitlines() if l.strip()]
    seen = set()
    dedup_lines = []
    for line in lines:
        norm_line = re.sub(r"\s+", " ", line)
        if norm_line not in seen:
            seen.add(norm_line)
            dedup_lines.append(line)
    dedup_answer = "\n".join(dedup_lines)

    # 7) Fix spelling/grammar in the answer
    final_answer = correct_nepali_spelling(dedup_answer, llm)

    return final_answer

In [16]:
def choose_where(query: str, norm_query: str) -> dict | None:
    """
    Decide which PDF to search based on keywords in the original and normalized query.
    Returns a dict for Chroma 'where' filter, or None to search all.
    """
    text = (query + " " + norm_query).lower()

    # Pharmacy
    if any(word in text for word in ["pharmacy", "pharmasi", "फार्मेसी"]):
        return {"source_file": "pharmacy.pdf"}

    # Immunization / खोप
    if any(word in text for word in ["immunization", "khop", "खोप", "इम्युनाइजेशन"]):
        return {"source_file": "immunization.pdf"}

    # Constitution – questions about constitution or citizenship
    if any(word in text for word in ["constitution", "संविधान", "citizenship", "नागरिकता", "nagrita"]):
        return {"source_file": "constitution.pdf"}

    # Single women / एकल महिला
    if any(word in text for word in ["single women", "single woman", "एकल महिला", "विधवा"]):
        return {"source_file": "single_women.pdf"}

    # Sports
    if any(word in text for word in ["sports", "खेलकुद", "खेल"]):
        return {"source_file": "sports.pdf"}

    # Default: search all
    return None

In [17]:
import re

def contains_devanagari(text: str) -> bool:
    """Return True if any Devanagari characters are present."""
    return bool(re.search(r'[\u0900-\u097F]', text))

In [18]:
def normalize_to_nepali(query: str, llm) -> str:
    """
    If query is Romanized Nepali (Latin script), convert it to Nepali in Devanagari.
    If it's already in Devanagari, return as-is.
    """
    # If it already has Devanagari, don't touch it
    if contains_devanagari(query):
        return query

    prompt = f"""You are a transliteration engine, not a chatbot.

TASK:
- Convert Romanized Nepali written in Latin script into correct Nepali in Devanagari.
- Do NOT translate, rephrase, or change the meaning.
- Do NOT guess a different question.
- Keep all words; if you don't know how to transliterate a word, copy it as-is.
- Preserve question structure (question marks etc.).
- Output ONLY the converted sentence, no explanation.

GOOD examples (do this):
- "pharmacy ain le ke vanxa?" -> "फार्मेसी ऐनले के भन्छ ?"
- "pharmacy council le yo ainma ke vanxa?" -> "फार्मेसी काउन्सिलले यो ऐनमा के भन्छ ?"
- "immunization act kaile aayeko ?" -> "इम्युनाइजेशन ऐन कहिले आएको ?"
- "yo ain namane kehi karbahi hunxa?" -> "यो ऐन नमाने केहि कारबाही हुन्छ ?"
- "nepalko sambidhan ke ho?" -> "नेपालको संविधान के हो ?"
- "ekal mahila ko ke ke adhikar chan?" -> "एकल महिलाको के के अधिकार छन् ?"
- "khel ko bikas ko lagi ke byabastha cha?" -> "खेलको विकासका लागि के व्यवस्था छ ?"

BAD examples (never do this):
- Changing "pharmacy ain le ke vanxa?" into 
  "फार्मेसी व्यवसाय सञ्चालन गर्न के–के शर्त चाहिन्छ?"  ✗  (WRONG: different meaning)
- Changing topic or inventing extra information.

User input:
{query}

Output (only the transliterated Nepali sentence):
"""

    resp = llm.invoke(prompt)
    normalized = resp.content.strip()
    return normalized

In [None]:
import gradio as gr

def rag_chat(message, history, arena):
    """
    message: user question
    history: chat history (managed by ChatInterface)
    arena: selected legal arena from dropdown
    """
    answer = rag_simple(message, rag_retriever, llm, top_k=6, arena=arena)
    return answer

with gr.Blocks() as demo:
    # Dropdown must be created inside Blocks
    arena_dropdown = gr.Dropdown(
        choices=[
            "All (auto)",
            "Pharmacy Act",
            "Immunization Act",
            "Constitution of Nepal",
            "Single Women Act",
            "Sports Act",
        ],
        value="All (auto)",
        label="Select legal context (arena)",
    )

    chat = gr.ChatInterface(
        fn=rag_chat,  # expects (message, history, arena)
        title="AI Lawyer – Nepali Health Law Assistant",
        description=(
            "फार्मेसी परिषद् ऐन, २०५७, खोप ऐन, २०७२, संविधान, एकल महिला, खेलकुदसम्बन्धी कानुनहरूको RAG‑आधारित सहायक।\n\n"
            "> **Disclaimer:** यो केवल सूचना/शैक्षिक उद्देश्यको AI सहायक हो। "
            "कानुनी सल्लाहका लागि दर्ता भएका वकिलसँग परामर्श गर्नुहोस्।"
        ),
        examples=[
            ["फार्मेसी व्यवसाय दर्ता कसरी हुन्छ?"],
            ["खोप ऐन, २०७२ कहिले लागू भयो?"],
            ["नेपालको संविधानले स्वास्थ्य अधिकारबारे के भन्छ?"],
            ["एकल महिलाको अधिकार के–के छन्?"],
            ["sports development kina chahinxa?"],
        ],
        additional_inputs=[arena_dropdown],  # now Gradio passes arena as 3rd arg
    )

demo.launch()

* Running on local URL:  http://127.0.0.1:7864
* To create a public link, set `share=True` in `launch()`.




Original query: sports development kina chahinxa?
Expanded query: स्पोर्ट्स विकास किन गरिन्छ ? स्पोर्ट्स विकासका लागि के–के कारण र उद्देश्य छन् ?
Normalized query: स्पोर्ट्स विकास किन गरिन्छ ? स्पोर्ट्स विकासका लागि के–के कारण र उद्देश्य छन् ?
Using where filter: {'source_file': 'sports.pdf'}
Retrieving documents for query: 'स्पोर्ट्स विकास किन गरिन्छ ? स्पोर्ट्स विकासका लागि के–के कारण र उद्देश्य छन् ?'
Top k: 6, where: {'source_file': 'sports.pdf'}
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 27.16it/s]

Generated embeddings with shape: (1, 768)
Retrieved 6 documents





Original query: constitution ma k xa?
Expanded query: नेपालको संविधान तथा सम्बन्धित कानुन बमोजिम के नियम र शर्तहरू छन् ?
Normalized query: नेपालको संविधान तथा सम्बन्धित कानुन बमोजिम के नियम र शर्तहरू छन् ?
Using where filter: {'source_file': 'constitution.pdf'}
Retrieving documents for query: 'नेपालको संविधान तथा सम्बन्धित कानुन बमोजिम के नियम र शर्तहरू छन् ?'
Top k: 6, where: {'source_file': 'constitution.pdf'}
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 20.67it/s]

Generated embeddings with shape: (1, 768)
Retrieved 6 documents





Original query: what is pharmacy ain
Expanded query: फार्मेसी के के हो ?
Normalized query: फार्मेसी के के हो ?
Using where filter: {'source_file': 'pharmacy.pdf'}
Retrieving documents for query: 'फार्मेसी के के हो ?'
Top k: 6, where: {'source_file': 'pharmacy.pdf'}
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 18.61it/s]

Generated embeddings with shape: (1, 768)
Retrieved 6 documents





Original query: nepali nagrita lina k garna parxa?
Expanded query: नेपाली नागरिकता लिने प्रक्रिया के हो ?
Normalized query: नेपाली नागरिकता लिने प्रक्रिया के हो ?
Using where filter: {'source_file': 'constitution.pdf'}
Retrieving documents for query: 'नेपाली नागरिकता लिने प्रक्रिया के हो ?'
Top k: 6, where: {'source_file': 'constitution.pdf'}
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 18.74it/s]

Generated embeddings with shape: (1, 768)
Retrieved 6 documents



