In [1]:
# Mount Google Drive into the Colab runtime; DOCS_DIR points below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Connection settings.
# Secrets are read from environment variables so they never have to be typed
# into (and saved or shared with) the notebook. Each value falls back to the
# previous default when the variable is not set.
import os

MONGO_URI = os.environ.get("MONGO_URI", "")
QDRANT_URL = os.environ.get("QDRANT_URL", "")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY", "")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
# Root folder that is searched recursively for PDF files.
DOCS_DIR = os.environ.get("DOCS_DIR", "/content/drive/MyDrive/Legal")

In [3]:
!pip install pymongo qdrant-client pdfplumber sentence-transformers groq tqdm langdetect

Collecting pymongo
  Downloading pymongo-4.15.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (22 kB)
Collecting qdrant-client
  Downloading qdrant_client-1.16.2-py3-none-any.whl.metadata (11 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.8-py3-none-any.whl.metadata (43 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m43.6/43.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting groq
  Downloading groq-1.0.0-py3-none-any.whl.metadata (16 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m981.5/981.5 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspyth

In [4]:
import os
import re
import uuid

import pdfplumber
from groq import Groq
from langdetect import LangDetectException, detect
from pymongo import MongoClient
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

In [6]:
# --- MongoDB: chunk text and metadata --------------------------------------
mongo_client = MongoClient(MONGO_URI)
db = mongo_client["nepali_legal_db"]
meta_col = db["documents"]

# Unique index: at most one metadata document per Qdrant point id.
meta_col.create_index("qdrant_id", unique=True)

# --- Qdrant: embedding vectors ----------------------------------------------
qdrant = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY
)

COLLECTION_NAME = "nepali_law_vectors"

# Vector size of the collection; it has to equal the output dimension of the
# embedding model that produces the stored and query vectors.
EMBEDDING_DIM = 1024

# Create the collection only on first run; existing data is left untouched.
if not qdrant.collection_exists(COLLECTION_NAME):
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(
            size=EMBEDDING_DIM,
            distance=Distance.COSINE
        )
    )

In [7]:
# Shared sentence-embedding model: used to embed chunks during ingestion
# ("passage: " prefix) and to embed questions during retrieval ("query: " prefix).
embedding_model = SentenceTransformer("intfloat/multilingual-e5-large")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

In [8]:
def detect_doc_type(path):
    """Classify a document from keywords found in its (lower-cased) path.

    The rules are tried in order and the first one with a matching keyword
    wins; a path that matches nothing is treated as a plain act.

    Args:
        path: File path of the document (directories included in the match).

    Returns:
        A ``(doc_type, priority)`` tuple: ``("constitution", 1)``,
        ``("muluki_act", 2)``, ``("rule", 4)`` or the fallback ``("act", 3)``.
    """
    lowered = path.lower()

    # (keywords, result) pairs, ordered from first to last rule checked.
    rules = (
        (("‡§∏‡§Ç‡§µ‡§ø‡§ß‡§æ‡§®",), ("constitution", 1)),
        (("‡§Æ‡•Å‡§≤‡•Å‡§ï‡•Ä",), ("muluki_act", 2)),
        (("‡§®‡§ø‡§Ø‡§Æ", "‡§µ‡§ø‡§®‡§ø‡§Ø‡§Æ"), ("rule", 4)),
    )

    for keywords, result in rules:
        if any(word in lowered for word in keywords):
            return result

    return "act", 3

In [9]:
def extract_structure(text):
    """Pull the first part / chapter / section / sub-section markers out of a chunk.

    Each marker is located with ``re.search``, so only its first occurrence in
    ``text`` is used. For part and chapter the remainder of the matching line
    is stored as the title.

    Args:
        text: Text of one chunk.

    Returns:
        A dict with six keys (part, part title, chapter, chapter title,
        section, sub-section); a value is ``None`` when its marker is absent.
    """
    part_match = re.search(r"(‡§≠‡§æ‡§ó‚Äì?\s*\d+)\s*(.*)", text)
    chapter_match = re.search(r"(‡§™‡§∞‡§ø‡§ö‡•ç‡§õ‡•á‡§¶‚Äì?\s*\d+)\s*(.*)", text)
    section_match = re.search(r"(‡§¶‡§´‡§æ\s*\d+)", text)
    subsection_match = re.search(r"(\(\d+\))", text)

    def group_or_none(match, index):
        # Captured group when the marker was found, otherwise None.
        return match.group(index) if match else None

    return {
        "‡§≠‡§æ‡§ó": group_or_none(part_match, 1),
        "‡§≠‡§æ‡§ó_title": group_or_none(part_match, 2),
        "‡§™‡§∞‡§ø‡§ö‡•ç‡§õ‡•á‡§¶": group_or_none(chapter_match, 1),
        "‡§™‡§∞‡§ø‡§ö‡•ç‡§õ‡•á‡§¶_title": group_or_none(chapter_match, 2),
        "‡§¶‡§´‡§æ": group_or_none(section_match, 1),
        "‡§â‡§™‡§¶‡§´‡§æ": group_or_none(subsection_match, 1),
    }

In [10]:
def chunk_text(text, min_chars=100):
    """Split a document into section-level chunks.

    The text is cut at every newline that is immediately followed by a
    section heading (the section keyword followed by a number); the heading
    stays at the start of the chunk it introduces.

    Args:
        text: Full document text. ``None`` or an empty string yields ``[]``.
        min_chars: Chunks whose stripped length is not strictly greater than
            this are discarded (default 100, the previously hard-coded value).

    Returns:
        List of stripped chunk strings in document order.
    """
    if not text:
        return []

    pieces = re.split(r"\n(?=‡§¶‡§´‡§æ\s*\d+)", text)
    stripped = (piece.strip() for piece in pieces)
    return [piece for piece in stripped if len(piece) > min_chars]

In [11]:
def ingest_documents(base_dir):
    """Index every PDF below ``base_dir`` into Qdrant (vectors) and MongoDB (text + metadata).

    For each PDF the text of all pages is joined, split with ``chunk_text``
    and every chunk not yet present in MongoDB (same text and doc_type) is
    embedded, upserted into Qdrant and then recorded in MongoDB.

    Args:
        base_dir: Directory that is walked recursively for ``.pdf`` files
            (extension matched case-insensitively).

    Returns:
        None. A PDF that cannot be read is reported and skipped so that one
        bad file does not abort the whole run.
    """
    pdf_files = sorted(
        os.path.join(root, name)
        for root, _, files in os.walk(base_dir)
        for name in files
        if name.lower().endswith(".pdf")
    )

    for pdf_path in tqdm(pdf_files):
        doc_type, priority = detect_doc_type(pdf_path)

        try:
            with pdfplumber.open(pdf_path) as pdf:
                # extract_text() may return None for a page; treat it as empty.
                full_text = "\n".join(
                    page.extract_text() or "" for page in pdf.pages
                )
        except Exception as exc:
            # Report and move on; the remaining files are still ingested.
            tqdm.write(f"Skipping unreadable PDF {pdf_path}: {exc}")
            continue

        for chunk in chunk_text(full_text):
            # Prevent duplicate insert. Only _id is fetched: existence is all
            # that matters, there is no need to transfer the chunk text back.
            already_stored = meta_col.find_one(
                {"text": chunk, "doc_type": doc_type},
                {"_id": 1},
            )
            if already_stored:
                continue

            structure = extract_structure(chunk)

            # Deterministic id derived from the content: if a run dies between
            # the Qdrant upsert and the MongoDB insert, the retry overwrites
            # the same point instead of leaving an orphan vector behind.
            qdrant_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f"{doc_type}:{chunk}"))

            embedding = embedding_model.encode("passage: " + chunk).tolist()

            qdrant.upsert(
                collection_name=COLLECTION_NAME,
                points=[
                    PointStruct(
                        id=qdrant_id,
                        vector=embedding,
                        payload={"doc_type": doc_type},
                    )
                ],
            )

            # Written last: a metadata document therefore always has a vector.
            meta_col.insert_one({
                "qdrant_id": qdrant_id,
                "doc_type": doc_type,
                "priority": priority,
                "file_path": pdf_path,
                **structure,
                "text": chunk,
            })

In [12]:
# Full ingestion pass over DOCS_DIR. Long-running: the recorded run below
# processed 611 PDFs in roughly 1.5 hours.
ingest_documents(DOCS_DIR)

100%|██████████| 611/611 [1:28:16<00:00,  8.67s/it]


In [22]:
from qdrant_client.models import Filter, FieldCondition, MatchValue

In [28]:
def retrieve_context(query, k=5, authority_k=3, score_threshold=None):
    """Retrieve the legal chunks used as context for answering ``query``.

    Two groups of chunks are returned, in this order:

    1. Chunks from the highest-authority document type that yields results,
       trying "constitution", then "muluki_act", then "act".
    2. Chunks of type "rule", which are always appended.

    Args:
        query: The user's question.
        k: Maximum number of "rule" chunks to append (default 5).
        authority_k: Maximum number of chunks taken from the winning
            authority tier (default 3).
        score_threshold: Optional minimum similarity score passed to Qdrant.
            With ``None`` (default) no cut-off is applied, so a tier wins as
            soon as it has any chunk with metadata, however weak the match;
            set a threshold to let lower tiers be consulted when the higher
            ones have no sufficiently similar chunk.

    Returns:
        List of MongoDB metadata documents, best match first within each group.
    """
    query_vec = embedding_model.encode("query: " + query).tolist()

    def search(doc_type, limit):
        # Metadata documents for the top `limit` hits of one doc_type, in hit order.
        if limit <= 0:
            return []

        hits = qdrant.query_points(
            collection_name=COLLECTION_NAME,
            query=query_vec,
            limit=limit,
            with_payload=True,
            score_threshold=score_threshold,
            query_filter=Filter(
                must=[
                    FieldCondition(
                        key="doc_type",
                        match=MatchValue(value=doc_type)
                    )
                ]
            )
        ).points

        ids = [point.id for point in hits]
        if not ids:
            return []

        # One MongoDB round trip for all hits instead of one per hit.
        by_id = {
            meta["qdrant_id"]: meta
            for meta in meta_col.find({"qdrant_id": {"$in": ids}})
        }
        # Points without a metadata document are skipped.
        return [by_id[point_id] for point_id in ids if point_id in by_id]

    final_docs = []

    # 1) Constitution -> Muluki Act -> Act: stop at the highest authority that
    #    actually contributes chunks (hits lacking metadata do not count).
    for doc_type in ["constitution", "muluki_act", "act"]:
        tier_docs = search(doc_type, authority_k)
        if tier_docs:
            final_docs.extend(tier_docs)
            break

    # 2) Rules & regulations are always appended.
    final_docs.extend(search("rule", k))

    return final_docs

In [29]:
def decide_language(query):
    """Choose the answer language for a query.

    Args:
        query: The user's question.

    Returns:
        ``"ne"`` when the query mentions "nepali" (case-insensitive) or is
        detected as Nepali, otherwise ``"en"``. Text in which the detector
        finds no usable features (empty, digits only, ...) falls back to
        ``"en"`` instead of raising.
    """
    # Explicit request wins and avoids calling the detector at all.
    if "nepali" in query.lower():
        return "ne"

    try:
        detected = detect(query)
    except LangDetectException:
        return "en"

    return "ne" if detected == "ne" else "en"

In [30]:
from qdrant_client.models import PayloadSchemaType

# Keyword index on the "doc_type" payload field, the field that
# retrieve_context uses in its query filters.
qdrant.create_payload_index(
    collection_name=COLLECTION_NAME,
    field_name="doc_type",
    field_schema=PayloadSchemaType.KEYWORD
)

UpdateResult(operation_id=880, status=<UpdateStatus.COMPLETED: 'completed'>)

In [33]:
def build_context(docs, max_chars=9000):
    """Build a compact legal context string without exceeding ``max_chars``.

    Each document becomes a block made of a "Law Type" line, an optional line
    with the structure markers that are actually set (part, chapter, section,
    sub-section) and the chunk text, followed by a blank line.

    Args:
        docs: Metadata documents; each needs "doc_type" and "text".
        max_chars: Upper bound for the length of the returned string.

    Returns:
        The concatenated blocks. When a block does not fit into the remaining
        budget it is truncated to fit (provided at least some body text
        survives) and processing stops, so an oversized first chunk no longer
        produces an empty context.
    """
    structure_keys = ("‡§≠‡§æ‡§ó", "‡§™‡§∞‡§ø‡§ö‡•ç‡§õ‡•á‡§¶", "‡§¶‡§´‡§æ", "‡§â‡§™‡§¶‡§´‡§æ")

    context_blocks = []
    remaining = max_chars

    for doc in docs:
        if remaining <= 0:
            break

        # Stored structure fields may exist with value None; skip those so the
        # header never contains the literal text "None".
        markers = (doc.get(key) for key in structure_keys)
        location = " ".join(str(marker) for marker in markers if marker)

        header = f"Law Type: {doc['doc_type']}\n"
        if location:
            header += location + "\n"

        block = header + doc["text"] + "\n\n"

        if len(block) > remaining:
            # Keep the part that fits, but only if it goes beyond the header.
            if remaining > len(header):
                context_blocks.append(block[:remaining])
            break

        context_blocks.append(block)
        remaining -= len(block)

    return "".join(context_blocks)

In [40]:
# Groq API client; generate_answer uses it to request chat completions.
groq_client = Groq(api_key=GROQ_API_KEY)

def generate_answer(query, model="llama-3.1-8b-instant", max_tokens=600, temperature=0.2):
    """Answer a legal question from retrieved context using the Groq chat API.

    Args:
        query: The user's question.
        model: Groq model name.
        max_tokens: Upper bound on the length of the generated answer; long
            answers are cut off at this limit.
        temperature: Sampling temperature.

    Returns:
        The model's answer as a string (empty string if the API returns no
        content).
    """
    docs = retrieve_context(query)
    lang = decide_language(query)

    context = build_context(docs)

    # Base rules apply to every answer, whatever the answer language.
    system_prompt = (
        "You are a legal assistant for Nepali law. "
        "Follow legal hierarchy strictly:\n"
        "1) Constitution\n"
        "2) Muluki Act\n"
        "3) Acts\n"
        "4) Rules (procedure only).\n\n"
        "Always cite law name, part, chapter (‡§™‡§∞‡§ø‡§ö‡•ç‡§õ‡•á‡§¶), and section (‡§¶‡§´‡§æ). "
        "Do not hallucinate. "
        "Answer only from the provided context; if it does not cover the "
        "question, say so instead of guessing."
    )

    if lang == "ne":
        # Add the Nepali instructions on top of the base rules rather than
        # replacing them, so hierarchy and citation requirements still hold.
        system_prompt += (
            "\n\n"
            "‡§§‡§™‡§æ‡§à‡§Ç ‡§®‡•á‡§™‡§æ‡§≤‡§ï‡•ã ‡§ï‡§æ‡§®‡•Å‡§®‡§Æ‡§æ ‡§Ü‡§ß‡§æ‡§∞‡§ø‡§§ ‡§∏‡§π‡§æ‡§Ø‡§ï ‡§π‡•Å‡§®‡•Å‡§π‡•Å‡§®‡•ç‡§õ‡•§ "
            "‡§ï‡§æ‡§®‡•Å‡§®‡•Ä ‡§™‡•ç‡§∞‡§æ‡§•‡§Æ‡§ø‡§ï‡§§‡§æ ‡§™‡§æ‡§≤‡§®‡§æ ‡§ó‡§∞‡•ç‡§®‡•Å‡§π‡•ã‡§∏‡•ç‡•§"
            " Answer in Nepali."
        )

    completion = groq_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": f"Context:\n{context}\n\nQuestion:\n{query}"
            }
        ],
        temperature=temperature,
        max_tokens=max_tokens  # caps answer length and cost per request
    )

    return completion.choices[0].message.content or ""

In [48]:
# Example end-to-end query: retrieval, prompt construction and generation.
print(generate_answer("My wife uses me for my money, i want to get a divorce, how can i get it?"))

I can provide you with general information on the divorce process in Nepal, but please note that this is not a substitute for personalized legal advice.

According to the Muluki Ain (नेपाली संविधान, २०७२), Chapter 9, Part 3, Section 17, a marriage can be dissolved by a court order. 

You can file a petition for divorce under the Muluki Ain (नेपाली संविधान, २०७२), Chapter 9, Part 3, Section 17. 

You will need to provide grounds for divorce, which can be one of the following:

- Mutual consent (Section 17, Chapter 9, Part 3, Muluki Ain)
- Irreconcilable differences (Section 18, Chapter 9, Part 3, Muluki Ain)
- Desertion (Section 19, Chapter 9, Part 3, Muluki Ain)
- Cruelty (Section 20, Chapter 9, Part 3, Muluki Ain)
- Adultery (Section 21, Chapter 9, Part 3, Muluki Ain)
- Impotency (Section 22, Chapter 9, Part 3, Muluki Ain)
- Bigamy (Section 23, Chapter 9, Part 3, Muluki Ain)
- Rape (Section 24, Chapter 9, Part 3, Mulu