In [11]:
# System tools for OCR + PDF rendering
!apt-get -qq update
!apt-get -qq install -y tesseract-ocr poppler-utils
# (Optional for Hindi etc.)  !apt-get -qq install -y tesseract-ocr-hin

# Python libs
!pip -q install pypdf pdf2image pytesseract pandas tqdm pyarrow

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
import os

folder_path = '/content/drive/MyDrive/Central acts'

pdf_files = [f for f in os.listdir(folder_path) if f.lower().endswith('.pdf')]
print(len(pdf_files))

102


In [12]:
from google.colab import drive
drive.mount('/content/drive')  # if not mounted already

import os, re, pandas as pd
from tqdm import tqdm
from pypdf import PdfReader
from pdf2image import convert_from_path
import pytesseract

# === EDIT THESE ===
BASE_DIR    = "/content/drive/MyDrive/Central acts"  # e.g. "Central acts"
OUTPUT_PATH = "/content/drive/MyDrive/acts_ocr_dataset.csv"     # .csv or .parquet
OCR_LANG    = "eng"        # e.g. "eng+hin" if you installed Hindi pack
OCR_DPI     = 300          # 200–300 is a good balance

# ------------------ helpers ------------------
def normalize_text(s: str) -> str:
    s = (s or "").replace("\x00", " ").strip()
    s = re.sub(r"\s+", " ", s)
    return s

def extract_page_text(pdf_path: str, page_index: int) -> tuple[str, bool]:
    """
    Returns (text, ocr_used) for a single page (0-based index).
    - Try embedded text with pypdf
    - If too short, OCR that page with Tesseract
    """
    reader = PdfReader(pdf_path)
    page = reader.pages[page_index]
    txt = normalize_text(page.extract_text() or "")
    if len(txt) >= 15:
        return txt, False

    # OCR this page
    images = convert_from_path(pdf_path, dpi=OCR_DPI,
                               first_page=page_index+1, last_page=page_index+1)
    if images:
        ocr_txt = pytesseract.image_to_string(images[0], lang=OCR_LANG)
        return normalize_text(ocr_txt), True
    return txt, False

def process_pdf(pdf_path: str, root: str) -> list[dict]:
    """Return rows for each page of this PDF."""
    rows = []
    try:
        reader = PdfReader(pdf_path)
        n_pages = len(reader.pages)
    except Exception as e:
        print(f"[SKIP] Failed to read: {pdf_path} -> {e}")
        return rows

    rel = os.path.relpath(pdf_path, root)
    name = os.path.basename(pdf_path)
    for i in range(n_pages):
        try:
            text, used_ocr = extract_page_text(pdf_path, i)
        except Exception as e:
            print(f"[WARN] OCR error {pdf_path} page {i+1}: {e}")
            text, used_ocr = "", False
        rows.append({
            "file_path": pdf_path,
            "relative_path": rel,
            "file_name": name,
            "page": i+1,
            "text": text,
            "ocr_used": used_ocr
        })
    return rows

# ------------------ run ------------------
# gather PDFs recursively
pdfs = []
for root, dirs, files in os.walk(BASE_DIR):
    for f in files:
        if f.lower().endswith(".pdf"):
            pdfs.append(os.path.join(root, f))

print(f"Found {len(pdfs)} PDFs under: {BASE_DIR}")

all_rows = []
for pdf in tqdm(pdfs, desc="Processing PDFs"):
    all_rows.extend(process_pdf(pdf, BASE_DIR))

df = pd.DataFrame(all_rows, columns=["file_path","relative_path","file_name","page","text","ocr_used"])
if OUTPUT_PATH.lower().endswith(".parquet"):
    df.to_parquet(OUTPUT_PATH, index=False)
else:
    df.to_csv(OUTPUT_PATH, index=False)

print(f"Saved {len(df)} rows → {OUTPUT_PATH}")
display(df.head(10))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Found 102 PDFs under: /content/drive/MyDrive/Central acts


Processing PDFs: 100%|██████████| 102/102 [04:18<00:00,  2.54s/it]


Saved 1432 rows → /content/drive/MyDrive/acts_ocr_dataset.csv


Unnamed: 0,file_path,relative_path,file_name,page,text,ocr_used
0,/content/drive/MyDrive/Central acts/List of al...,List of all central acts.pdf,List of all central acts.pdf,1,1 ALPHABETICAL LIST OF CENTRAL ACTS S.No Name ...,False
1,/content/drive/MyDrive/Central acts/List of al...,List of all central acts.pdf,List of all central acts.pdf,2,2 48. Arya Marriage Validation Act 1937 19 49....,False
2,/content/drive/MyDrive/Central acts/List of al...,List of all central acts.pdf,List of all central acts.pdf,3,3 97. Bombay Revenue Jurisdiction Act 1876 10 ...,False
3,/content/drive/MyDrive/Central acts/List of al...,List of all central acts.pdf,List of all central acts.pdf,4,4 145. Chandigarh Disturbed Areas Act 1983 33 ...,False
4,/content/drive/MyDrive/Central acts/List of al...,List of all central acts.pdf,List of all central acts.pdf,5,5 192. Competition Act 2003 12 193. Comptrolle...,False
5,/content/drive/MyDrive/Central acts/List of al...,List of all central acts.pdf,List of all central acts.pdf,6,6 239. Dentists Act 1948 16 240. Deo Estate Ac...,False
6,/content/drive/MyDrive/Central acts/List of al...,List of all central acts.pdf,List of all central acts.pdf,7,7 286. The Essential Defence Services Act 2021...,False
7,/content/drive/MyDrive/Central acts/List of al...,List of all central acts.pdf,List of all central acts.pdf,8,8 334. Government Securities Act 2006 38 335. ...,False
8,/content/drive/MyDrive/Central acts/List of al...,List of all central acts.pdf,List of all central acts.pdf,9,9 379. Indian Maritime University Act 2008 22 ...,False
9,/content/drive/MyDrive/Central acts/List of al...,List of all central acts.pdf,List of all central acts.pdf,10,10 425. Interest-tax Act 1974 45 426. Internat...,False


In [14]:
df.to_csv('/content/drive/MyDrive/Central acts/acts_dataset.csv')

In [23]:
sample_df = df[df['file_name']=="Aadhaar Act, 2016.pdf"]
sample_df.text.to_list()[0]

'THE AADHAAR (TARGETED DELIVERY OF FINANCIAL AND OTHER SUBSIDIES, BENEFITS AND SERVICES) ACT, 2016 NO. 18 OF 2016 [25th March, 2016.] An Act to provide for, as a good governance, efficient, transparent, and targeted delivery of subsidies, benefits and services, the expenditure for which is incurred from the Consolidated Fund of India, to individuals residing in India through assigning of unique identity numbers to such individuals and for matters connected therewith or incidental thereto. BE it enacted by Parliament in the Sixty-seventh Year of the Republic of India as follows:— CHAPTER I PRELIMINARY 1. (1) This Act may be called the Aadhaar (Targeted Delivery of Financial and Other Subsidies, Benefits and Services) Act, 2016. (2) It shall extend to the whole of India except the State of Jammu and Kashmir and save as otherwise provided in this Act, it shall also apply to any offence or contravention thereunder committed outside India by any person. (3) It shall come into force on such 

In [24]:
# ========== FINAL CODE: Split OCR text into Chapters & Sections and Save ==========
import re
import pandas as pd
from typing import List, Tuple

# ---------- Cleaning ----------
def normalize_spaces(s: str) -> str:
    """Basic clean-up: remove nulls, fix hyphenation like 'commen- cement' → 'commencement',
    collapse spaces, keep paragraph breaks."""
    s = (s or "").replace("\x00", " ")
    s = re.sub(r"(\w)-\s+(\w)", r"\1\2", s)         # fix split words across lines
    s = re.sub(r"[ \t\r\f\v]+", " ", s)             # collapse spaces
    s = re.sub(r"\n\s*\n+", "\n\n", s)              # normalize blank lines
    return s.strip()

# ---------- Patterns ----------
CHAPTER_RE = re.compile(r"(?im)^\s*CHAPTER\s+([IVXLCDM]+|\d+)\s*(.*)$")  # e.g., CHAPTER I PRELIMINARY
SECTION_RE = re.compile(r"(?m)^\s*(\d+)\.\s")                            # e.g., "1. ", "2. " at line start

# ---------- Find chapters ----------
def find_chapters(text: str) -> List[Tuple[int, int, str, str]]:
    """
    Returns list of chapter spans: (start_idx, end_idx, chapter_no, chapter_title).
    If no chapters found, returns one pseudo-chapter covering full text.
    """
    matches = list(CHAPTER_RE.finditer(text))
    spans = []
    if not matches:
        spans.append((0, len(text), "0", "Preamble"))
        return spans

    for i, m in enumerate(matches):
        start = m.start()
        end = matches[i+1].start() if i+1 < len(matches) else len(text)
        chap_no = m.group(1).strip()
        chap_title = (m.group(2) or "").strip()

        # Some Acts put the title on the next line in UPPER CASE. Try to fetch it.
        if not chap_title:
            after = text[m.end(): m.end()+160].splitlines()
            if after:
                nxt = after[0].strip()
                if 2 <= len(nxt) <= 100 and nxt.upper() == nxt:
                    chap_title = nxt
        spans.append((start, end, chap_no, chap_title))
    return spans

# ---------- Split sections inside a chapter ----------
def split_sections(chapter_text: str) -> List[Tuple[str, str]]:
    """Split by numbered headings like '1.', '2.' at line starts. Returns (section_no, block)."""
    secs = []
    iters = list(SECTION_RE.finditer(chapter_text))
    if not iters:
        return secs
    for i, m in enumerate(iters):
        start = m.start()
        end = iters[i+1].start() if i+1 < len(iters) else len(chapter_text)
        sec_no = m.group(1)
        block = chapter_text[start:end].strip()
        secs.append((sec_no, block))
    return secs

# ---------- Try to extract a short section heading ----------
def extract_heading_from_block(block: str) -> str:
    """
    After 'N. ' grab a short heading.
    Priority: text before an em dash '—' (common in Gazette), or first line, or first sentence.
    """
    m = re.match(r"^\s*\d+\.\s*(.*)$", block, flags=re.S)
    rest = m.group(1).lstrip() if m else block

    # 1) Stop at em dash if present early
    dash = re.search(r"—|-{2,}", rest)
    if dash and dash.start() < 160:
        heading = rest[:dash.start()].strip()
        heading = re.sub(r"\s+", " ", heading)
        return heading[:160]

    # 2) First line
    first_line = rest.splitlines()[0].strip() if rest.splitlines() else rest.strip()
    if 3 < len(first_line) < 160:
        return re.sub(r"\s+", " ", first_line)

    # 3) Up to first period
    dot = rest.find(".")
    if 0 < dot < 160:
        return re.sub(r"\s+", " ", rest[:dot])

    return ""

# ---------- Main: parse one Act (full text) ----------
def rows_from_text(act_title: str, text: str) -> pd.DataFrame:
    text = normalize_spaces(text)
    rows = []
    for start, end, chap_no, chap_title in find_chapters(text):
        chap_block = text[start:end].strip()
        sections = split_sections(chap_block)
        if not sections:
            # No numbered sections found: store whole chapter
            rows.append({
                "act_title": act_title,
                "chapter_no": chap_no,
                "chapter_title": chap_title,
                "section_no": None,
                "section_heading": None,
                "section_text": chap_block
            })
            continue
        for sec_no, block in sections:
            heading = extract_heading_from_block(block)
            rows.append({
                "act_title": act_title,
                "chapter_no": chap_no,
                "chapter_title": chap_title,
                "section_no": sec_no,
                "section_heading": heading,
                "section_text": block
            })
    return pd.DataFrame(rows, columns=[
        "act_title","chapter_no","chapter_title","section_no","section_heading","section_text"
    ])

# ---------- If you have a page-level OCR DataFrame ----------
def parse_all_acts_from_page_df(page_df: pd.DataFrame,
                                title_col: str = "file_name",
                                text_col: str = "text",
                                page_col: str = "page") -> pd.DataFrame:
    """
    page_df: one row per page, with at least [file_name, page, text]
    Groups pages by file_name, concatenates them in page order, and parses chapters/sections.
    """
    all_rows = []
    # ensure proper ordering
    df_sorted = page_df.copy()
    if page_col in df_sorted.columns:
        df_sorted = df_sorted.sort_values([title_col, page_col])

    for name, g in df_sorted.groupby(title_col, dropna=False):
        full_text = "\n\n".join([str(t) for t in g[text_col].tolist()])
        parsed = rows_from_text(str(name), full_text)
        all_rows.append(parsed)
    return pd.concat(all_rows, ignore_index=True) if all_rows else pd.DataFrame(columns=[
        "act_title","chapter_no","chapter_title","section_no","section_heading","section_text"
    ])

# ================= USAGE =================
# (A) Single full-text string (your pasted Aadhaar example)
# full_text = """<paste your full OCR text here>"""
# df_sections = rows_from_text("Aadhaar Act, 2016", full_text)
# df_sections.to_csv("/content/drive/MyDrive/aadhaar_sections.csv", index=False)  # or .parquet

# (B) From your earlier OCR dataset (one row per page)
# page_df = pd.read_parquet("/content/drive/MyDrive/acts_ocr_dataset.parquet")  # or read_csv(...)
# df_sections = parse_all_acts_from_page_df(page_df, title_col="file_name", text_col="text", page_col="page")
# df_sections.to_parquet("/content/drive/MyDrive/acts_sections.parquet", index=False)

# Quick peek:
# df_sections.head(10)


In [29]:
page_df = df
df_sections = parse_all_acts_from_page_df(page_df, title_col="file_name", text_col="text", page_col="page")
df_sections.to_csv('/content/drive/MyDrive/Central acts/acts_sections.csv', index=False)

102