In [9]:
import pdfplumber
def extract_judgment_text(pdf_path: str) -> str:
    full_text = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                full_text.append(text)

    return "\n".join(full_text)

In [2]:
import re

def clean_judgment_text(text: str) -> str:
    # Remove page numbers like "623", "624"
    text = re.sub(r"\n\s*\d+\s*\n", "\n", text)

    # Remove repeated site references
    text = re.sub(r"Indian Kanoon.*\n", "", text)

    # Remove multiple spaces
    text = re.sub(r"[ \t]+", " ", text)

    # Fix broken newlines
    text = re.sub(r"\n{2,}", "\n\n", text)

    return text.strip()


In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def chunk_judgment(text: str):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", ".", " "]
    )
    return splitter.split_text(text)


  from .autonotebook import tqdm as notebook_tqdm


In [10]:
import os
COURT_DATA_DIR = "court-data"

all_chunks = []

for filename in os.listdir(COURT_DATA_DIR):
    if filename.lower().endswith(".pdf"):
        pdf_path = os.path.join(COURT_DATA_DIR, filename)

        print(f"Processing: {filename}")

        raw_text = extract_judgment_text(pdf_path)
        clean_text = clean_judgment_text(raw_text)
        chunks = chunk_judgment(clean_text)

        for i, chunk in enumerate(chunks):
            all_chunks.append({
                "text": chunk,
                "metadata": {
                    "case_name": filename.replace(".pdf", ""),
                    "chunk_id": i
                }
            })

print(f"\n✅ Total chunks created: {len(all_chunks)}")


Processing: Gidder_Singh_And_Another_vs_State_Of_Punjab_on_9_November_2009.PDF
Processing: Justice_K_S_Puttaswamy_Retd_And_Anr_vs_Union_Of_India_And_Ors_on_24_August_2017.PDF
Processing: Kapleshwar_Paswan_Ors_vs_State_Of_Bihar_Anr_on_16_August_2011 (1).PDF
Processing: Kesavananda_Bharati_Sripadagalvaru_vs_State_Of_Kerala_And_Anr_on_24_April_1973.PDF


Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss 

Processing: K_Nandakumar_vs_The_Director_Of_Collegiate_Education_on_29_August_2008.PDF
Processing: Management_Of_Kattabomman_Transport_vs_P_Sundaram_And_Anr_on_15_December_2004.PDF
Processing: Maneka_Gandhi_vs_Union_Of_India_on_25_January_1978.PDF
Processing: Shreya_Singhal_vs_U_O_I_on_24_March_2015.PDF
Processing: Vishaka_Ors_vs_State_Of_Rajasthan_Ors_on_13_August_1997.PDF

✅ Total chunks created: 2026


In [11]:
import json
import os

os.makedirs("processed-data", exist_ok=True)

with open("processed-data/judgment_chunks.json", "w", encoding="utf-8") as f:
    json.dump(all_chunks, f, ensure_ascii=False, indent=2)

print(f"✅ Saved {len(all_chunks)} chunks to processed-data/judgment_chunks.json")


✅ Saved 2026 chunks to processed-data/judgment_chunks.json


In [12]:
all_chunks[0]


{'text': "Gidder Singh And Another vs State Of Punjab on 9 November, 2009\nGidder Singh And Another vs State Of Punjab on 9 November,\nAuthor: Ram Chand Gupta\nBench: Ram Chand Gupta\nCrl. Revision No.2592 of 2004 -1-\nIN THE HIGH COURT OF PUNJAB AND HARYANA AT\nCHANDIGARH\nCRIMINAL REVISION No. 2592 OF 2004.\nDATE OF DECISION : 9 -11-2009.\nGidder Singh and another.\n...... PETITIONERS\nVersus\nState of Punjab.\n..... RESPONDENT\nCORAM:- HON'BLE MR.JUSTICE RAM CHAND GUPTA\nPresent: Mr. Jagpal Singh, Advocate\nfor the petitioners.\nMr. Jaspreet Singh, Asstt. Advocate General, Punjab.\n***\nRAM CHAND GUPTA, J.\nThis revision petition is directed against the judgment dated 01.12.2004 rendered by the court of\nAdditional Sessions Judge, Muktsar, vide which it dismissed the appeal against the judgment of\nconviction and order of sentence dated 31.01.2002 rendered by the court of Chief Judicial\nMagistrate, Muktsar, convicting and sentencing the present revision- petitioners as under:-",
 '