# Neethi App - Offline Data Pipeline

This notebook combines all offline data preparation scripts into a single executable workflow for ingesting data into Qdrant.

## Pipeline Steps
1. **Download Datasets** - Download legal datasets from HuggingFace
2. **Data Cleaning** - Clean and standardize IPC/BNS/BSA data
3. **Semantic Chunking** - Create embeddings-ready chunks
4. **Prepare Training Data** - Generate triplets for fine-tuning (optional; not included in this notebook)
5. **Fine-tune Model** - Fine-tune InLegalBERT (requires GPU; not included in this notebook)
6. **Populate Qdrant** - Index statute chunks into Qdrant
7. **Ingest Bail Judgments** - Download and index bail judgments
8. **Extract SC Judgments** - Extract 26K SC judgments from PDFs
9. **Ingest SC Judgments** - Index SC judgments into Qdrant

## Prerequisites
- Qdrant Cloud account with QDRANT_URL and QDRANT_API_KEY in .env
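- Required packages: `qdrant-client`, `sentence-transformers`, `datasets`, `loguru`, `pandas`, `python-dotenv`, `huggingface_hub`, plus `pdfplumber` and `kagglehub` for the SC judgment extraction step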

## Setup & Dependencies

In [None]:
# Install required packages (uncomment if needed)
# !pip install qdrant-client sentence-transformers datasets loguru pandas python-dotenv pdfplumber kagglehub huggingface_hub

In [None]:
import os
import sys
import json
import re
import pandas as pd
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Generator, Any, Tuple
from dataclasses import dataclass, field, asdict
from dotenv import load_dotenv
from loguru import logger

# Read settings (e.g. QDRANT_URL / QDRANT_API_KEY) from a local .env file
load_dotenv()

# Also send INFO-and-above log records to a timestamped file, rotated at 10 MB
logger.add("logs/offline_pipeline_{time}.log", rotation="10 MB", level="INFO")

# Working directories used by the cells below
for _data_dir in ("data/raw", "data/processed", "data/training"):
    Path(_data_dir).mkdir(parents=True, exist_ok=True)
Path("logs").mkdir(exist_ok=True)

print("‚úÖ Setup complete")
# Only the first 50 characters of the URL are echoed
_qdrant_url = os.getenv('QDRANT_URL', 'NOT SET')
print(f"QDRANT_URL: {_qdrant_url[:50]}...")

## Step 1: Download Datasets from HuggingFace

In [None]:
def download_aalap(output_dir: Path, max_samples: Optional[int] = None) -> Dict[str, int]:
    """Fetch the Aalap instruction dataset and store it as JSONL.

    Loads the ``train`` split of ``opennyaiorg/aalap_instruction_dataset``,
    adds ``id``, ``source`` and ``task`` fields to every example and writes
    the result to ``<output_dir>/aalap_train.jsonl`` (one JSON object per line).

    Args:
        output_dir: Directory that receives ``aalap_train.jsonl``.
        max_samples: Upper bound on the number of examples kept. A falsy
            value (``None`` or ``0``) keeps the whole split.

    Returns:
        Mapping of task name to the number of kept examples with that task.
        If anything fails, the error is logged and an empty dict is returned.
    """
    from datasets import load_dataset

    logger.info("Downloading Aalap instruction dataset...")

    try:
        dataset = load_dataset("opennyaiorg/aalap_instruction_dataset", split="train")
        logger.info(f"Aalap columns: {dataset.column_names}")

        per_task: Dict[str, int] = {}
        records = []

        for index, row in enumerate(dataset):
            limit_reached = bool(max_samples) and index >= max_samples
            if limit_reached:
                break

            # First non-empty of task_type / task, otherwise a generic label
            task_name = row.get("task_type") or row.get("task") or "instruction"

            record = dict(row)
            record.update(id=f"aalap_{index}", source="aalap", task=task_name)
            records.append(record)

            per_task[task_name] = per_task.get(task_name, 0) + 1

        # The file is written only after the whole split was read successfully
        destination = output_dir / "aalap_train.jsonl"
        with open(destination, 'w', encoding='utf-8') as sink:
            for record in records:
                sink.write(json.dumps(record, ensure_ascii=False) + '\n')

        logger.info(f"‚úÖ Aalap saved: {len(records)} examples")
        return per_task
    except Exception as e:
        logger.error(f"‚ùå Failed: {e}")
        return {}

def download_legal_texts(output_dir: Path, max_samples: Optional[int] = None) -> int:
    """Download the Indian legal texts Q&A files and store them as one JSONL file.

    Fetches ``ipc_qa.json``, ``crpc_qa.json`` and ``constitution_qa.json`` from
    the ``Techmaestro369/indian-legal-texts-finetuning`` dataset repository,
    normalizes every item to ``id`` / ``question`` / ``answer`` / ``source`` /
    ``section`` and writes the result to ``<output_dir>/legal_texts.jsonl``.

    Args:
        output_dir: Directory that receives ``legal_texts.jsonl``.
        max_samples: Upper bound on the total number of examples across all
            files. A falsy value (``None`` or ``0``) means no limit.

    Returns:
        Number of examples written. A file that fails to download or parse is
        logged as a warning and skipped; it does not abort the run.
    """
    from huggingface_hub import hf_hub_download

    logger.info("Downloading Indian legal texts...")

    repo_id = "Techmaestro369/indian-legal-texts-finetuning"
    files = ["ipc_qa.json", "crpc_qa.json", "constitution_qa.json"]
    all_examples = []

    for filename in files:
        # Once the cap is reached nothing more can be collected, so the
        # remaining files are not downloaded at all.
        if max_samples and len(all_examples) >= max_samples:
            break

        try:
            file_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="dataset")
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # "ipc_qa.json" -> "IPC"
            source = filename.replace("_qa.json", "").upper()
            added = 0
            for i, item in enumerate(data):
                if max_samples and len(all_examples) >= max_samples:
                    break
                all_examples.append({
                    "id": f"legal_texts_{source}_{i}",
                    "question": item.get("question", ""),
                    "answer": item.get("answer", ""),
                    "source": source,
                    "section": item.get("section", "")
                })
                added += 1
            # Report what was actually kept, which can be fewer than len(data)
            # when max_samples cuts the file short.
            logger.info(f"  ‚úÖ {filename}: {added} examples")
        except Exception as e:
            # Best effort: one bad file must not prevent the others from loading
            logger.warning(f"  ‚ö†Ô∏è Failed {filename}: {e}")

    output_path = output_dir / "legal_texts.jsonl"
    with open(output_path, 'w', encoding='utf-8') as f:
        for ex in all_examples:
            f.write(json.dumps(ex, ensure_ascii=False) + '\n')

    logger.info(f"‚úÖ Legal texts saved: {len(all_examples)} examples")
    return len(all_examples)

In [None]:
# Run dataset download
# Both downloads write their JSONL output into the training-data directory.
output_dir = Path("data/training")

print("üì• Downloading datasets...")
# Per-task example counts for Aalap (an empty dict if the download failed)
task_counts = download_aalap(output_dir)
# Number of legal-text Q&A examples written
legal_count = download_legal_texts(output_dir)

print(f"\nüìä Download Summary:")
print(f"   Aalap tasks: {task_counts}")
print(f"   Legal texts: {legal_count}")

## Step 2: Data Cleaning (IPC/BNS/BSA)

In [None]:
class LegalDataCleaner:
    """Cleans and standardizes legal datasets for RAG ingestion.

    Reads raw CSV files from ``raw_dir``, normalizes section numbers and text
    fields, and writes one JSON object per section to JSONL files in
    ``processed_dir``.

    Attributes:
        raw_dir: Directory containing the raw CSV files.
        processed_dir: Directory receiving the cleaned JSONL files (created
            on construction if missing).
        stats: Running totals across all ``clean_*`` calls on this instance:
            ``total`` rows read, ``cleaned`` records written and ``skipped``
            rows dropped because they had no usable section number.
    """

    def __init__(self, raw_dir: str = "data/raw", processed_dir: str = "data/processed"):
        self.raw_dir = Path(raw_dir)
        self.processed_dir = Path(processed_dir)
        self.processed_dir.mkdir(parents=True, exist_ok=True)
        self.stats = {"total": 0, "cleaned": 0, "skipped": 0}

    @staticmethod
    def _cell_text(value: Any) -> str:
        """Return a CSV cell as stripped text; missing values become ``""``."""
        if value is None:
            return ""
        # NaN is the only float that is not equal to itself
        if isinstance(value, float) and value != value:
            return ""
        text = str(value).strip()
        # A literal "nan" is what a stringified missing value looks like
        return "" if text.lower() == "nan" else text

    @staticmethod
    def _section_id(value: Any) -> str:
        """Reduce a raw section cell to its letters and digits (``"304-A"`` -> ``"304A"``)."""
        # A numeric column that contains blanks is parsed as float, so 302
        # arrives as 302.0. Without this step the "." would be stripped below
        # and the section would silently become "3020".
        if isinstance(value, float) and value == value and value.is_integer():
            value = int(value)
        return re.sub(r'[^\dA-Za-z]', '', LegalDataCleaner._cell_text(value))

    def _load_csv(self, filename: str) -> Optional[pd.DataFrame]:
        """Load ``raw_dir/filename`` with snake_case column names, or ``None`` if absent."""
        filepath = self.raw_dir / filename
        if not filepath.exists():
            logger.warning(f"File not found: {filepath}")
            return None

        # dtype=str keeps every cell exactly as written in the file, so section
        # numbers are never re-interpreted as floats.
        df = pd.read_csv(filepath, encoding='utf-8', dtype=str)
        df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
        return df

    def _write_jsonl(self, records: List[Dict[str, str]], filename: str) -> Path:
        """Write ``records`` as UTF-8 JSONL to ``processed_dir/filename`` and return the path."""
        output_path = self.processed_dir / filename
        with open(output_path, 'w', encoding='utf-8') as f:
            for rec in records:
                f.write(json.dumps(rec, ensure_ascii=False) + '\n')
        return output_path

    def _update_stats(self, total_rows: int, cleaned_rows: int) -> None:
        """Add one file's row counts to the running ``stats`` totals."""
        self.stats["total"] += total_rows
        self.stats["cleaned"] += cleaned_rows
        self.stats["skipped"] += total_rows - cleaned_rows

    def clean_ipc_dataset(self, filename: str = "ipc_sections.csv") -> str:
        """Clean the IPC sections CSV and write ``ipc_clean.jsonl``.

        The section number is read from the ``section`` column, falling back
        to ``section_num`` when there is no ``section`` column. Rows without a
        usable section number are skipped. Blank ``offense`` / ``punishment``
        cells are stored as empty strings and left out of the combined text.

        Args:
            filename: Name of the CSV file inside ``raw_dir``.

        Returns:
            Path of the written JSONL file as a string, or ``""`` when the
            CSV file does not exist.
        """
        df = self._load_csv(filename)
        if df is None:
            return ""

        cleaned_records = []
        for _, row in df.iterrows():
            section_num = self._section_id(row.get('section', row.get('section_num', '')))
            if not section_num:
                continue

            description = self._cell_text(row.get('description', ''))
            offense = self._cell_text(row.get('offense', ''))
            punishment = self._cell_text(row.get('punishment', ''))

            text = f"IPC Section {section_num}: {description}"
            if offense:
                text += f". Offense: {offense}"
            if punishment:
                text += f". Punishment: {punishment}"

            cleaned_records.append({
                "id": f"IPC_{section_num}",
                "section_num": section_num,
                "law_type": "IPC",
                "description": description,
                "offense": offense,
                "punishment": punishment,
                "text": text
            })

        output_path = self._write_jsonl(cleaned_records, "ipc_clean.jsonl")
        self._update_stats(len(df), len(cleaned_records))
        logger.info(f"‚úÖ Cleaned {len(cleaned_records)} IPC sections")
        return str(output_path)

    def clean_bns_dataset(self, filename: str = "bns_sections.csv") -> str:
        """Clean the BNS sections CSV and write ``bns_clean.jsonl``.

        Rows without a usable ``section`` value are skipped. When a row has no
        description (column absent or cell blank) the section name is used as
        the description.

        Args:
            filename: Name of the CSV file inside ``raw_dir``.

        Returns:
            Path of the written JSONL file as a string, or ``""`` when the
            CSV file does not exist.
        """
        df = self._load_csv(filename)
        if df is None:
            return ""

        cleaned_records = []
        for _, row in df.iterrows():
            section_num = self._section_id(row.get('section', ''))
            if not section_num:
                continue

            section_name = self._cell_text(row.get('section_name', ''))
            description = self._cell_text(row.get('description', '')) or section_name

            text = f"BNS Section {section_num}"
            if section_name:
                text += f" ({section_name})"
            text += f": {description}"

            cleaned_records.append({
                "id": f"BNS_{section_num}",
                "section_num": section_num,
                "law_type": "BNS",
                "section_name": section_name,
                "description": description,
                "text": text
            })

        output_path = self._write_jsonl(cleaned_records, "bns_clean.jsonl")
        self._update_stats(len(df), len(cleaned_records))
        logger.info(f"‚úÖ Cleaned {len(cleaned_records)} BNS sections")
        return str(output_path)

In [None]:
# Run data cleaning (requires CSV files in data/raw/)
cleaner = LegalDataCleaner()

print("üßπ Cleaning datasets...")
# Each call returns the output path, or "" when its CSV file was not found
ipc_output = cleaner.clean_ipc_dataset()
bns_output = cleaner.clean_bns_dataset()

print(f"\nüìä Cleaning Summary:")
print(f"   Total cleaned: {cleaner.stats['cleaned']}")
# Only report the files that were actually produced
if ipc_output:
    print(f"   IPC: {ipc_output}")
if bns_output:
    print(f"   BNS: {bns_output}")

## Step 3: Semantic Chunking

In [None]:
@dataclass
class Chunk:
    """One piece of text, with its identifying fields, ready to be embedded.

    ``chunk_id``, ``text`` and ``law_type`` are required. ``section_num``
    defaults to ``None`` and ``metadata`` to a fresh empty dict per instance.
    """
    chunk_id: str
    text: str
    law_type: str
    section_num: Optional[str] = None
    metadata: Dict = field(default_factory=dict)

    def to_dict(self) -> Dict:
        """Return the chunk as a plain dict (a recursive copy made by ``asdict``)."""
        plain = asdict(self)
        return plain

class SemanticChunker:
    """Turns cleaned statute records into ``Chunk`` objects and writes them out.

    ``stats`` counts the chunks produced so far on this instance: the overall
    total under ``"chunks"`` and a per-law-type breakdown under ``"by_type"``.
    """

    def __init__(self, processed_dir: str = "data/processed"):
        self.stats = {"chunks": 0, "by_type": {}}
        self.processed_dir = Path(processed_dir)

    def chunk_statutes(self) -> Generator[Chunk, None, None]:
        """Yield one ``Chunk`` per usable record in the cleaned statute files.

        Reads ``ipc_clean.jsonl`` and ``bns_clean.jsonl`` from
        ``processed_dir`` in that order; a missing file is silently skipped.
        Records whose ``text`` is shorter than 20 characters are dropped.
        ``stats`` is updated just before each chunk is yielded.
        """
        for statute_file in ("ipc_clean.jsonl", "bns_clean.jsonl"):
            source_path = self.processed_dir / statute_file
            if source_path.exists():
                with open(source_path, 'r', encoding='utf-8') as handle:
                    for raw_line in handle:
                        entry = json.loads(raw_line.strip())
                        body = entry.get("text", "")
                        if len(body) >= 20:
                            kind = entry.get("law_type", "UNKNOWN")
                            number = entry.get("section_num", "")

                            # offense / punishment fall back to "" when the
                            # record does not carry them
                            extra = {
                                "offense": entry.get("offense", ""),
                                "punishment": entry.get("punishment", ""),
                            }
                            piece = Chunk(
                                chunk_id=f"{kind}_{number}",
                                text=body,
                                law_type=kind,
                                section_num=number,
                                metadata=extra,
                            )

                            self.stats["chunks"] += 1
                            per_type = self.stats["by_type"]
                            per_type[kind] = per_type.get(kind, 0) + 1
                            yield piece

    def run(self, output_file: str = "chunks.jsonl") -> str:
        """Write every statute chunk to ``processed_dir/output_file`` as JSONL.

        Returns the output path as a string.
        """
        destination = self.processed_dir / output_file

        with open(destination, 'w', encoding='utf-8') as sink:
            for piece in self.chunk_statutes():
                serialized = json.dumps(piece.to_dict(), ensure_ascii=False)
                sink.write(serialized + '\n')

        logger.info(f"‚úÖ Created {self.stats['chunks']} chunks")
        return str(destination)

In [None]:
# Run chunking
chunker = SemanticChunker()
# run() writes the chunks file and returns its path as a string
output_path = chunker.run()

print(f"\nüìä Chunking Summary:")
print(f"   Total chunks: {chunker.stats['chunks']}")
print(f"   By type: {chunker.stats['by_type']}")
print(f"   Output: {output_path}")

## Step 4: Populate Qdrant with Statute Chunks

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from sentence_transformers import SentenceTransformer

def populate_qdrant(
    chunks_file: str = "data/processed/chunks.jsonl",
    collection_name: str = "neethi-legal-kb",
    embedding_model: str = "law-ai/InLegalBERT",
    batch_size: int = 32,
    recreate: bool = False
):
    """Embed the chunks in a JSONL file and upsert them into a Qdrant collection.

    Args:
        chunks_file: JSONL file with one chunk object per line.
        collection_name: Target Qdrant collection; created if it is missing.
        embedding_model: Model name passed to ``SentenceTransformer``.
        batch_size: Used both as the embedding batch size and as the number
            of points per upsert call.
        recreate: When true, an existing collection is deleted and created
            again before indexing.

    Returns:
        Number of points upserted.

    Raises:
        ValueError: If the ``QDRANT_URL`` environment variable is not set.
    """
    # --- Qdrant connection -------------------------------------------------
    endpoint = os.getenv("QDRANT_URL")
    api_key = os.getenv("QDRANT_API_KEY")
    if not endpoint:
        raise ValueError("QDRANT_URL not set!")

    client = QdrantClient(url=endpoint, api_key=api_key, timeout=60)
    logger.info("‚úÖ Connected to Qdrant")

    # --- Embedding model ---------------------------------------------------
    logger.info(f"Loading {embedding_model}...")
    embedder = SentenceTransformer(embedding_model)
    vector_size = embedder.get_sentence_embedding_dimension()
    logger.info(f"‚úÖ Embedder loaded (dim={vector_size})")

    # --- Collection setup --------------------------------------------------
    existing_names = [c.name for c in client.get_collections().collections]
    already_present = collection_name in existing_names
    if already_present and recreate:
        client.delete_collection(collection_name)
        logger.warning(f"Deleted existing collection: {collection_name}")
    elif already_present:
        logger.info(f"Collection exists: {collection_name}")

    if recreate or not already_present:
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
        )
        logger.info(f"‚úÖ Created collection: {collection_name}")

    # --- Load chunks -------------------------------------------------------
    with open(chunks_file, 'r', encoding='utf-8') as source:
        chunks = [json.loads(raw_line.strip()) for raw_line in source]
    logger.info(f"Loaded {len(chunks)} chunks")

    # --- Embed -------------------------------------------------------------
    texts = [record.get("text", "") for record in chunks]
    logger.info("Generating embeddings...")
    embeddings = embedder.encode(texts, batch_size=batch_size, show_progress_bar=True)

    # --- Build points: ids are 1-based positions in the chunks file --------
    points = [
        PointStruct(
            id=position,
            vector=vector.tolist(),
            payload={
                "chunk_id": record.get("chunk_id", f"chunk_{position - 1}"),
                "text": record.get("text", ""),
                "law_type": record.get("law_type", ""),
                "section_num": record.get("section_num"),
            },
        )
        for position, (record, vector) in enumerate(zip(chunks, embeddings), start=1)
    ]

    # --- Upsert in slices of batch_size ------------------------------------
    for start in range(0, len(points), batch_size):
        client.upsert(
            collection_name=collection_name,
            points=points[start:start + batch_size],
        )

    info = client.get_collection(collection_name)
    logger.info(f"‚úÖ Indexed {len(points)} chunks. Collection has {info.points_count} vectors.")

    return len(points)

In [None]:
# Run Qdrant population
# recreate=False keeps an existing collection and upserts into it
indexed = populate_qdrant(recreate=False)
print(f"\n‚úÖ Indexed {indexed} statute chunks into Qdrant!")

## Step 5: Ingest Bail Judgments

In [None]:
def ingest_bail_judgments(collection_name: str = "neethi-bail-judgments", batch_size: int = 50):
    """Download the bail judgments dataset and index it into Qdrant.

    Loads the ``train`` split of ``SnehaDeshmukh/IndianBailJudgments-1200``,
    embeds a short text built from each case (outcome, IPC sections and the
    first 1000 characters of the summary) with ``law-ai/InLegalBERT`` and
    upserts one point per case. Point ids are the 0-based position of the
    case in the split.

    NOTE: an existing collection with the same name is deleted and rebuilt.

    Args:
        collection_name: Qdrant collection to (re)create and fill.
        batch_size: Number of cases embedded and upserted per round trip.

    Returns:
        Number of cases in the downloaded split.
    """
    from datasets import load_dataset
    from qdrant_client import QdrantClient
    from qdrant_client.models import VectorParams, Distance, PointStruct
    from sentence_transformers import SentenceTransformer

    # Download dataset
    logger.info("Downloading IndianBailJudgments-1200...")
    dataset = load_dataset("SnehaDeshmukh/IndianBailJudgments-1200")
    cases = list(dataset["train"])
    logger.info(f"‚úÖ Downloaded {len(cases)} bail judgments")

    # Initialize clients
    client = QdrantClient(url=os.getenv("QDRANT_URL"), api_key=os.getenv("QDRANT_API_KEY"))
    embedder = SentenceTransformer("law-ai/InLegalBERT")
    # Ask the model for its output size instead of assuming 768, so the
    # collection always matches the vectors that are actually produced.
    vector_size = embedder.get_sentence_embedding_dimension()

    # Create collection (always from scratch)
    collections = [c.name for c in client.get_collections().collections]
    if collection_name in collections:
        client.delete_collection(collection_name)

    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
    )
    logger.info(f"‚úÖ Created collection: {collection_name}")

    pending_ids, pending_texts, pending_payloads = [], [], []

    def flush_pending():
        """Embed the buffered texts in one call and upsert them as points."""
        vectors = embedder.encode(pending_texts, show_progress_bar=False)
        client.upsert(
            collection_name=collection_name,
            points=[
                PointStruct(id=point_id, vector=vector.tolist(), payload=payload)
                for point_id, vector, payload in zip(pending_ids, vectors, pending_payloads)
            ],
        )
        pending_ids.clear()
        pending_texts.clear()
        pending_payloads.clear()

    # Process and index
    for i, case in enumerate(cases):
        # Field names are tried in both snake_case and title-case spellings;
        # the dataset schema is not visible here - TODO confirm which applies.
        bail_outcome = case.get("bail_outcome", case.get("Bail Outcome", "Unknown"))
        ipc_raw = case.get("ipc_sections", case.get("IPC Sections", ""))
        if isinstance(ipc_raw, (list, tuple)):
            # Already a sequence: splitting its string form would yield
            # tokens such as "['302'" instead of the section numbers.
            ipc_sections = [str(s).strip() for s in ipc_raw if str(s).strip()]
        elif ipc_raw:
            ipc_sections = [s.strip() for s in str(ipc_raw).split(",") if s.strip()]
        else:
            ipc_sections = []
        # A null summary would otherwise fail on the slice below
        summary = case.get("summary", case.get("Summary", "")) or ""
        if not isinstance(summary, str):
            summary = str(summary)

        pending_ids.append(i)
        pending_texts.append(
            f"Bail {bail_outcome}. IPC: {', '.join(ipc_sections)}. {summary[:1000]}"
        )
        pending_payloads.append({
            "case_id": str(case.get("id", f"bail_{i}")),
            "bail_outcome": bail_outcome,
            "ipc_sections": ipc_sections,
            "summary": summary,
            "law_type": "Bail Judgment",
            "section_num": f"Case-{i}"
        })

        if len(pending_ids) >= batch_size:
            flush_pending()
            logger.info(f"Indexed {i+1}/{len(cases)}")

    # Remainder that did not fill a whole batch
    if pending_ids:
        flush_pending()

    logger.info(f"‚úÖ Indexed {len(cases)} bail judgments")
    return len(cases)

In [None]:
# Run bail judgments ingestion
# NOTE: this deletes and rebuilds the bail judgments collection if it already exists
bail_count = ingest_bail_judgments()
print(f"\n‚úÖ Indexed {bail_count} bail judgments into Qdrant!")

## Step 6: Extract 26K SC Judgments (requires Kaggle)

In [None]:
def extract_sc_judgments(limit: Optional[int] = None):
    """Extract text chunks from the Kaggle Supreme Court judgment PDFs.

    Downloads the ``adarshsingh0903/legal-dataset-sc-judgments-india-19502024``
    dataset through ``kagglehub``, extracts the text of every PDF, splits it
    into 400-word chunks and writes them to
    ``data/processed/sc_judgments_26k.jsonl`` (one JSON object per chunk).

    Args:
        limit: Process at most this many PDFs, taken in sorted path order.
            A falsy value (``None`` or ``0``) processes all of them.

    Returns:
        Total number of chunks written. PDFs that cannot be read, or that
        yield fewer than 100 characters of text, contribute no chunks.
    """
    import kagglehub
    import pdfplumber
    import glob

    words_per_chunk = 400

    logger.info("Downloading SC judgments from Kaggle...")
    cache_path = kagglehub.dataset_download("adarshsingh0903/legal-dataset-sc-judgments-india-19502024")
    logger.info(f"Dataset path: {cache_path}")

    # Collect both extension spellings, de-duplicate, and sort so that
    # `limit` always selects the same PDFs from one run to the next.
    pdf_paths = set(glob.glob(f"{cache_path}/**/*.pdf", recursive=True))
    pdf_paths.update(glob.glob(f"{cache_path}/**/*.PDF", recursive=True))
    all_pdfs = sorted(pdf_paths)
    logger.info(f"Found {len(all_pdfs)} PDFs")

    if limit:
        all_pdfs = all_pdfs[:limit]

    output_file = Path("data/processed/sc_judgments_26k.jsonl")
    # Do not rely on the setup cell having been run first
    output_file.parent.mkdir(parents=True, exist_ok=True)

    def extract_pdf(pdf_path):
        """Return the chunk dicts for one PDF, or None if it is unusable."""
        try:
            filename = os.path.basename(pdf_path)
            case_id = filename.replace(".pdf", "").replace(".PDF", "")

            with pdfplumber.open(pdf_path) as pdf:
                page_texts = (page.extract_text() for page in pdf.pages)
                # Pages without extractable text are left out
                full_text = "".join(f"{text}\n" for text in page_texts if text)

            # Too little text to be a real judgment
            if len(full_text) < 100:
                return None

            # Extract URL
            url_match = re.search(r'indiankanoon\.org/doc/(\d+)', full_text)
            doc_url = f"https://indiankanoon.org/doc/{url_match.group(1)}/" if url_match else ""

            # Chunk text into consecutive, non-overlapping word windows
            words = full_text.split()
            return [
                {
                    "case_id": case_id,
                    "filename": filename,
                    "doc_url": doc_url,
                    "chunk_idx": start // words_per_chunk,
                    "text": " ".join(words[start:start + words_per_chunk]),
                    "year": "Unknown"
                }
                for start in range(0, len(words), words_per_chunk)
            ]
        except Exception as e:
            # Best effort: one unreadable PDF must not stop the run, but the
            # failure is recorded instead of being dropped silently.
            logger.warning(f"Skipping {pdf_path}: {e}")
            return None

    total_chunks = 0
    skipped_pdfs = 0
    with open(output_file, "w", encoding="utf-8") as f:
        for i, pdf_path in enumerate(all_pdfs):
            chunks = extract_pdf(pdf_path)
            if chunks:
                for chunk in chunks:
                    f.write(json.dumps(chunk) + "\n")
                    total_chunks += 1
            else:
                skipped_pdfs += 1
            if (i + 1) % 100 == 0:
                logger.info(f"Processed {i+1}/{len(all_pdfs)} PDFs, {total_chunks} chunks")

    if skipped_pdfs:
        logger.warning(f"{skipped_pdfs} PDFs produced no chunks")
    logger.info(f"‚úÖ Extracted {total_chunks} chunks to {output_file}")
    return total_chunks

In [None]:
# Run SC extraction (set limit for testing)
# Disabled by default; `limit` caps how many PDFs are processed.
# sc_chunks = extract_sc_judgments(limit=100)  # Uncomment to run
print("‚è∏Ô∏è SC extraction skipped (uncomment to run)")

## Step 7: Ingest SC Judgments into Qdrant

In [None]:
def ingest_sc_judgments(
    input_file: str = "data/processed/sc_judgments_26k.jsonl",
    collection_name: str = "neethi-judgments",
    batch_size: int = 100
):
    """Ingest SC judgment chunks into Qdrant.

    Streams the JSONL file line by line, embeds the chunk texts with
    ``law-ai/InLegalBERT`` in batches and upserts them. The collection is
    created if it does not exist; an existing collection is reused.

    Point ids are derived from each chunk's case id, chunk index and text,
    so ingesting the same file again overwrites the same points instead of
    adding duplicates.

    Args:
        input_file: JSONL file with ``case_id`` and ``text`` per line
            (``doc_url`` and ``chunk_idx`` are optional).
        collection_name: Target Qdrant collection.
        batch_size: Number of chunks embedded and upserted per round trip.

    Returns:
        Number of chunks indexed, or ``0`` when ``input_file`` does not exist.
    """
    from qdrant_client import QdrantClient
    from qdrant_client.models import VectorParams, Distance, PointStruct
    from sentence_transformers import SentenceTransformer
    import uuid

    progress_every = 5000

    if not Path(input_file).exists():
        logger.error(f"Input file not found: {input_file}")
        return 0

    # Count chunks (the handle is closed again before ingestion starts)
    with open(input_file, encoding="utf-8") as f:
        total_chunks = sum(1 for _ in f)
    logger.info(f"Total chunks to process: {total_chunks}")

    # Initialize
    client = QdrantClient(url=os.getenv("QDRANT_URL"), api_key=os.getenv("QDRANT_API_KEY"), timeout=120)
    embedder = SentenceTransformer("law-ai/InLegalBERT")
    # Size the collection from the model rather than assuming 768
    vector_size = embedder.get_sentence_embedding_dimension()

    # Check/create collection
    collections = [c.name for c in client.get_collections().collections]
    if collection_name not in collections:
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
        )
        logger.info(f"‚úÖ Created collection: {collection_name}")

    batch_ids, batch_texts, batch_payloads = [], [], []

    def flush_batch():
        """Embed and upsert the buffered chunks; return how many were sent."""
        embeddings = embedder.encode(batch_texts, show_progress_bar=False)
        points = [
            PointStruct(id=point_id, vector=emb.tolist(), payload=pay)
            for point_id, emb, pay in zip(batch_ids, embeddings, batch_payloads)
        ]
        client.upsert(collection_name=collection_name, points=points)
        batch_ids.clear()
        batch_texts.clear()
        batch_payloads.clear()
        return len(points)

    # Process in batches
    indexed = 0
    next_progress_log = progress_every

    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue

            chunk = json.loads(line)
            text = chunk["text"]
            case_id = chunk["case_id"]

            # Deterministic id: identical content always maps to the same point
            id_source = f"{case_id}|{chunk.get('chunk_idx', '')}|{text}"
            batch_ids.append(str(uuid.uuid5(uuid.NAMESPACE_URL, id_source)))
            batch_texts.append(text)
            batch_payloads.append({
                "case_id": case_id,
                "doc_url": chunk.get("doc_url", ""),
                "text": text,
                "law_type": "SC Judgment",
                "section_num": case_id
            })

            if len(batch_texts) >= batch_size:
                indexed += flush_batch()

                # Threshold-based so progress is reported for any batch size,
                # not only those that divide 5000 evenly.
                if indexed >= next_progress_log:
                    logger.info(f"Indexed: {indexed}/{total_chunks}")
                    next_progress_log = (indexed // progress_every + 1) * progress_every

    # Final batch
    if batch_texts:
        indexed += flush_batch()

    logger.info(f"‚úÖ Indexed {indexed} SC judgment chunks")
    return indexed

In [None]:
# Run SC ingestion
# Disabled by default; needs the JSONL file produced by the SC extraction step.
# sc_indexed = ingest_sc_judgments()  # Uncomment to run
print("‚è∏Ô∏è SC ingestion skipped (uncomment to run)")

## Summary

This notebook provides a complete offline data pipeline for the Neethi App:

1. **Dataset Download** - Aalap instruction data + Legal texts from HuggingFace
2. **Data Cleaning** - IPC/BNS sections standardization
3. **Chunking** - Create embedding-ready chunks
4. **Qdrant Population** - Index statute chunks
5. **Bail Judgments** - Download and index 1,200 bail cases
6. **SC Judgments** - Extract and index 26K Supreme Court judgments

### Collections Created
- `neethi-legal-kb` - Statute sections (IPC/BNS)
- `neethi-bail-judgments` - Bail judgment cases
- `neethi-judgments` - SC judgments