
# PDF Tables & Charts Extraction — Layout‑First Pipeline (Skeleton)

This notebook provides a **robust + fast** starting point to detect **tables** and **charts (bar/pie/line)** from PDF pages and extract structured data for reconstruction.

**Design highlights**
- Single low‑DPI render for **layout detection**, then high‑DPI crops only where needed.
- **Tables**: Table‑structure model (stub for Table Transformer / TATR) → fallback to **Camelot** (lattice → stream).
- **Charts**: chart **classifier stub** + **chart‑to‑data stub** (plug in **DePlot** / **ChartOCR**).
- **Text mapping** with `pdfplumber` if a native text layer exists (OCR omitted here).

> ⚠️ Heavy ML parts are **stubs**. You can wire your models where marked.


In [None]:

# --- Setup (uncomment if needed) ---
# %pip install -q pymupdf pdfplumber camelot-py
# NOTE: Camelot may require system deps (ghostscript, tk). On Ubuntu:
# sudo apt-get update && sudo apt-get install -y ghostscript python3-tk

import io, os, json
from dataclasses import dataclass, asdict
from typing import List, Dict, Any, Optional, Tuple

try:
    import fitz  # PyMuPDF
except Exception as e:
    print("PyMuPDF not available:", e)
    fitz = None

try:
    import pdfplumber
except Exception as e:
    print("pdfplumber not available:", e)
    pdfplumber = None

try:
    import camelot
except Exception as e:
    print("Camelot not available:", e)
    camelot = None

# Optional: layoutparser (Detectron2). Left out by default to keep notebook lightweight.
try:
    import layoutparser as lp  # requires detectron2 for DL models
except Exception as e:
    print("layoutparser not available:", e)
    lp = None


In [None]:

@dataclass
class Region:
    """A rectangular area of a page produced by layout detection, tagged with its content kind."""
    # Content category; the pipeline dispatches on "table" and "figure" and ignores anything else.
    kind: str  # "text" | "table" | "figure"
    # Corner coordinates. NOTE(review): the coordinate space (rendered pixels vs. PDF points)
    # is not fixed by this class - confirm against the producer and consumer of each Region.
    bbox: Tuple[int, int, int, int]  # x1,y1,x2,y2
    # Detection confidence; defaults to 1.0 when the producer does not supply one.
    score: float = 1.0

@dataclass
class TableResult:
    """Extraction result for one table region."""
    # Bounding box of the region the table was extracted from (copied from the Region).
    bbox: Tuple[int, int, int, int]
    # Table content as a list of rows, each row a list of cell strings; empty when nothing was found.
    cells: List[List[str]]
    # The same rows serialized as CSV text, if available.
    csv: Optional[str] = None

@dataclass
class ChartResult:
    """Extraction result for one chart/figure region."""
    # Bounding box of the region the chart was extracted from (copied from the Region).
    bbox: Tuple[int, int, int, int]
    # Chart kind as reported by the classifier/extractor, e.g. "bar", "pie", "line" or "other".
    type: str
    # Classifier confidence for `type`.
    confidence: float
    # Extracted data series; the plotting helper in this notebook reads "x" and "y" from each entry.
    series: List[Dict[str, Any]]

@dataclass
class PageResult:
    """Container for everything extracted from a single page.

    NOTE(review): the visible pipeline builds plain dicts per page and does not
    instantiate this class - confirm whether it is still needed.
    """
    # Page index this result belongs to.
    page_num: int
    # Tables found on the page.
    tables: List[TableResult]
    # Charts found on the page.
    charts: List[ChartResult]
    # Free-form page metadata.
    meta: Dict[str, Any]


In [None]:

def render_page(pdf_path: str, page_num: int, dpi: int = 180):
    """Render one whole PDF page to PNG bytes.

    Args:
        pdf_path: Path of the PDF to open.
        page_num: Zero-based page index.
        dpi: Render resolution. The zoom factor applied is ``dpi / 72``.

    Returns:
        A ``(png_bytes, width_px, height_px)`` tuple for the rendered page.

    Raises:
        RuntimeError: If PyMuPDF could not be imported.
        IndexError: If ``page_num`` is outside ``0..page_count-1``.
    """
    if fitz is None:
        raise RuntimeError("PyMuPDF (fitz) not installed. Run: %pip install pymupdf")
    doc = fitz.open(pdf_path)
    # try/finally guarantees the document is closed even when the range check
    # or the rendering itself raises.
    try:
        if page_num < 0 or page_num >= len(doc):
            raise IndexError(f"Page {page_num} out of range (0..{len(doc)-1})")
        page = doc[page_num]
        mat = fitz.Matrix(dpi / 72.0, dpi / 72.0)  # 72 DPI base
        pix = page.get_pixmap(matrix=mat, alpha=False)  # RGB
        return pix.tobytes("png"), pix.width, pix.height
    finally:
        doc.close()

def render_crop(pdf_path: str, page_num: int, bbox: Tuple[int, int, int, int], dpi: int = 240):
    """Render a rectangular part of one PDF page to PNG bytes.

    Args:
        pdf_path: Path of the PDF to open.
        page_num: Zero-based page index.
        bbox: ``(x1, y1, x2, y2)`` rectangle used as the render clip.
            NOTE: the rectangle is handed to PyMuPDF unchanged, which
            interprets a clip in page coordinates, not in pixels of a
            previously rendered image - callers must convert if needed.
        dpi: Render resolution. The zoom factor applied is ``dpi / 72``.

    Returns:
        A ``(png_bytes, width_px, height_px)`` tuple for the rendered crop.

    Raises:
        RuntimeError: If PyMuPDF could not be imported.
        IndexError: If ``page_num`` is outside ``0..page_count-1``.
    """
    if fitz is None:
        raise RuntimeError("PyMuPDF (fitz) not installed. Run: %pip install pymupdf")
    x1, y1, x2, y2 = bbox
    doc = fitz.open(pdf_path)
    # try/finally guarantees the document is closed on every exit path.
    try:
        # Same range check as render_page: without it a negative index would
        # silently render a page counted from the end of the document.
        if page_num < 0 or page_num >= len(doc):
            raise IndexError(f"Page {page_num} out of range (0..{len(doc)-1})")
        page = doc[page_num]
        mat = fitz.Matrix(dpi / 72.0, dpi / 72.0)
        rect = fitz.Rect(x1, y1, x2, y2)
        pix = page.get_pixmap(matrix=mat, clip=rect, alpha=False)
        return pix.tobytes("png"), pix.width, pix.height
    finally:
        doc.close()


In [None]:

def detect_layout_regions(image_bytes: bytes, width: int, height: int) -> List[Region]:
    """Detect layout regions on a rendered page image.

    The layout model itself is not wired in: the layoutparser branch only holds
    a commented example, so in its current form this always yields the fallback,
    a single low-confidence "figure" region spanning ``(0, 0, width, height)``.
    """
    detected: List[Region] = []

    if lp is not None:
        try:
            # Example: switch to your actual model (PubLayNet/DocLayNet)
            # model = lp.Detectron2LayoutModel(
            #     config_path="lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config",
            #     label_map={0:"text",1:"title",2:"list",3:"table",4:"figure"},
            #     extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.6]
            # )
            # import numpy as np, cv2
            # nparr = np.frombuffer(image_bytes, np.uint8)
            # image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
            # layout = model.detect(image)
            # for l in layout:
            #     label = l.type if l.type in ("text","table","figure") else "text"
            #     x1, y1, x2, y2 = map(int, l.coordinates)
            #     detected.append(Region(kind=label, bbox=(x1,y1,x2,y2), score=float(l.score)))
            pass
        except Exception as e:
            print("Layout detection error:", e)

    if detected:
        return detected

    # Nothing detected: hand back the whole page as one figure so the rest of
    # the pipeline still has something to work on.
    whole_page = Region(kind="figure", bbox=(0, 0, width, height), score=0.1)
    return [whole_page]


In [None]:

class TableStructureModel:
    """Placeholder for a table-structure model (e.g. DETR/TATR); no weights are loaded."""

    def __init__(self):
        # TODO: load your DETR/TATR weights (e.g., via transformers). Set ready=True when loaded.
        self.ready = False

    def infer(self, image_bytes) -> Dict[str, Any]:
        """Return the table structure for an image; the stub always returns an empty one.

        Expected shape once implemented:
        {"cells": [["A","1"],["B","2"]], "bboxes": [[x1,y1,x2,y2], ...]}
        """
        empty_structure: Dict[str, Any] = dict(cells=[], bboxes=[])
        return empty_structure

def rows_to_csv(rows: List[List[str]]) -> str:
    """Serialize a list of rows into CSV text using the csv module's default dialect."""
    import csv
    from io import StringIO

    buffer = StringIO()
    csv.writer(buffer).writerows(rows)
    return buffer.getvalue()

def bbox_to_camelot(bbox: Tuple[int,int,int,int], page_height: Optional[float] = None) -> str:
    """Format a bounding box as a Camelot ``table_areas`` string ``"x1,y1,x2,y2"``.

    Camelot expects the left-top and right-bottom corners in PDF coordinate
    space, whose origin is the bottom-left corner of the page.

    Args:
        bbox: ``(x1, y1, x2, y2)`` rectangle.
        page_height: Height of the page in the same units as ``bbox``. When
            given, ``bbox`` is treated as top-left-origin (y grows downward)
            and both y values are flipped to the bottom-left origin. When
            omitted, the values are emitted unchanged and the caller is
            responsible for any coordinate mapping.

    Returns:
        The comma-separated coordinate string.
    """
    x1, y1, x2, y2 = bbox
    if page_height is None:
        return f"{x1},{y1},{x2},{y2}"
    # Flip y: a point `y` below the top edge sits `page_height - y` above the bottom edge.
    return f"{x1},{page_height - y1},{x2},{page_height - y2}"

def extract_table_from_region(pdf_path: str, page_num: int, region: Region, prefer_tatr: bool = True) -> Optional[TableResult]:
    """Extract one table from a page region.

    Strategy:
      1. If ``prefer_tatr`` is set and the structure model reports ready, run
         it on a 240-DPI crop of the region and use its cells.
      2. Otherwise, or when the model yields no cells, fall back to Camelot:
         "lattice" first, then "stream" if lattice finds no table.
      3. If nothing is found, return an empty result.

    Args:
        pdf_path: Path of the PDF.
        page_num: Zero-based page index (converted to 1-based for Camelot).
        region: Region whose ``bbox`` delimits the table. The box is forwarded
            unchanged to ``render_crop`` and ``bbox_to_camelot``.
        prefer_tatr: Whether to try the structure model before Camelot.

    Returns:
        A ``TableResult``; ``cells`` is empty and ``csv`` is ``""`` when no
        table could be extracted. Camelot errors are printed, not raised.
    """
    # 1) Try TATR-like structure model (only instantiated when it may be used)
    if prefer_tatr:
        tsm = TableStructureModel()
        if tsm.ready:
            img_bytes, _, _ = render_crop(pdf_path, page_num, region.bbox, dpi=240)
            structure = tsm.infer(img_bytes)
            cells = structure.get("cells", [])
            if cells:
                return TableResult(bbox=region.bbox, cells=cells, csv=rows_to_csv(cells))
            # No cells recovered: fall through so Camelot still gets a chance.

    # 2) Fallback: Camelot (lattice -> stream)
    if camelot is not None:
        try:
            area = [bbox_to_camelot(region.bbox)]
            tables = camelot.read_pdf(
                filepath=pdf_path,
                pages=str(page_num + 1),  # Camelot is 1-based
                flavor="lattice",
                table_areas=area
            )
            if tables.n == 0:
                tables = camelot.read_pdf(
                    filepath=pdf_path,
                    pages=str(page_num + 1),
                    flavor="stream",
                    table_areas=area,
                    strip_text="\n"
                )
            if tables.n > 0:
                df = tables[0].df
                cells = df.values.tolist()
                csv_text = df.to_csv(index=False, header=False)
                return TableResult(bbox=region.bbox, cells=cells, csv=csv_text)
        except Exception as e:
            # Best-effort: report and fall through to the empty result.
            print("Camelot extraction error:", e)

    # 3) Nothing found
    return TableResult(bbox=region.bbox, cells=[], csv="")


In [None]:

class ChartClassifier:
    """Placeholder chart-type classifier; no model is loaded."""

    def __init__(self):
        # TODO: load your small CNN/ViT/YOLO model; set ready=True when loaded.
        self.ready = False

    def predict_type(self, image_bytes) -> Tuple[str, float]:
        """Return ``(chart_type, confidence)``; the stub always answers ("other", 0.0).

        chart_type is meant to be one of "bar" | "pie" | "line" | "other".
        """
        label, confidence = "other", 0.0
        return label, confidence

class ChartToDataModel:
    """Placeholder chart-to-data extractor (intended for DePlot / ChartOCR); no model is loaded."""

    def __init__(self):
        # TODO: wire DePlot or ChartOCR; set ready=True when loaded.
        self.ready = False

    def extract(self, image_bytes, chart_type: str) -> Dict[str, Any]:
        """Return the canonical series payload; the stub echoes ``chart_type`` with no data."""
        payload: Dict[str, Any] = dict(type=chart_type, series=[], meta={})
        return payload

def extract_chart_from_region(pdf_path: str, page_num: int, region: Region) -> Optional[ChartResult]:
    """Classify the chart in a region and extract its data series.

    The region is rendered at 240 DPI, passed to a fresh ChartClassifier for
    the type and confidence, then to a fresh ChartToDataModel for the data.
    The extractor's "type" wins over the classifier's when it provides one.
    """
    crop_png, _crop_w, _crop_h = render_crop(pdf_path, page_num, region.bbox, dpi=240)

    predicted_type, confidence = ChartClassifier().predict_type(crop_png)
    payload = ChartToDataModel().extract(crop_png, predicted_type)

    chart_type = payload.get("type", predicted_type)
    chart_series = payload.get("series", [])
    return ChartResult(
        bbox=region.bbox,
        type=chart_type,
        confidence=confidence,
        series=chart_series,
    )


In [None]:

def inside(box, region):
    """Return True when ``box`` lies entirely within ``region`` (shared edges count as inside).

    Both arguments are ``(x0, y0, x1, y1)`` sequences.
    """
    left, top, right, bottom = box
    r_left, r_top, r_right, r_bottom = region
    # The box's first corner must not fall before the region's first corner ...
    if not (left >= r_left and top >= r_top):
        return False
    # ... and its second corner must not extend past the region's second corner.
    return right <= r_right and bottom <= r_bottom

def extract_text_in_bbox(pdf_path: str, page_num: int, bbox: Tuple[int,int,int,int]) -> str:
    """Return the page's words that lie fully inside ``bbox``, joined by single spaces.

    Returns "" when pdfplumber is unavailable or when anything goes wrong
    while reading the page (the error is printed rather than raised).
    """
    if pdfplumber is None:
        return ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            page = pdf.pages[page_num]
            x0, y0, x1, y1 = bbox
            target = (x0, y0, x1, y1)
            kept = []
            for word in page.extract_words(keep_blank_chars=False, use_text_flow=True):
                # pdfplumber reports a word's vertical extent as "top"/"bottom".
                word_box = (word["x0"], word["top"], word["x1"], word["bottom"])
                if inside(word_box, target):
                    kept.append(word["text"])
            return " ".join(kept)
    except Exception as e:
        print("pdfplumber text mapping error:", e)
        return ""


In [None]:

def process_pdf(pdf_path: str, pages: Optional[List[int]] = None, layout_dpi: int = 180) -> Dict[str, Any]:
    """Run layout detection and table/chart extraction over a PDF.

    Each requested page is rendered once at ``layout_dpi`` for layout
    detection. Detected boxes are in pixels of that render, while cropping
    (PyMuPDF clip) and Camelot work in PDF points (72 per inch), so every box
    is scaled by ``72 / layout_dpi`` before it is handed to an extractor.

    Args:
        pdf_path: Path of the PDF.
        pages: Zero-based page indices to process; ``None`` means all pages.
        layout_dpi: Resolution of the layout-detection render.

    Returns:
        ``{"document": <basename>, "pages": [...]}`` where each page entry has
        ``"page"``, ``"meta"`` (render ``width``/``height`` in pixels plus the
        ``layout_dpi`` used), ``"tables"`` and ``"charts"``. The ``bbox`` of
        every table/chart is expressed in PDF points.

    Raises:
        RuntimeError: If PyMuPDF could not be imported.
        ValueError: If ``layout_dpi`` is not positive.
        IndexError: If a requested page is out of range (from ``render_page``).
    """
    if fitz is None:
        raise RuntimeError("PyMuPDF (fitz) not installed. Run: %pip install pymupdf")
    if layout_dpi <= 0:
        raise ValueError(f"layout_dpi must be positive, got {layout_dpi}")

    doc = fitz.open(pdf_path)
    try:
        total_pages = len(doc)
    finally:
        doc.close()

    if pages is None:
        pages = list(range(total_pages))

    px_to_pt = 72.0 / layout_dpi  # layout-render pixels -> PDF points
    all_pages = []

    for p in pages:
        img_bytes, w, h = render_page(pdf_path, p, dpi=layout_dpi)
        regions = detect_layout_regions(img_bytes, w, h)

        tables, charts = [], []

        for r in regions:
            if r.kind not in ("table", "figure"):
                # ignore text blocks here
                continue

            # Re-express the detected box in PDF points for the extractors.
            bbox_pt = tuple(round(v * px_to_pt, 3) for v in r.bbox)
            r_pt = Region(kind=r.kind, bbox=bbox_pt, score=r.score)

            if r.kind == "table":
                t_res = extract_table_from_region(pdf_path, p, r_pt, prefer_tatr=True)
                if t_res:
                    tables.append(asdict(t_res))
            else:
                c_res = extract_chart_from_region(pdf_path, p, r_pt)
                if c_res:
                    charts.append(asdict(c_res))

        all_pages.append({
            "page": p,
            "meta": {"width": w, "height": h, "layout_dpi": layout_dpi},
            "tables": tables,
            "charts": charts
        })

    return {"document": os.path.basename(pdf_path), "pages": all_pages}



## Demo

1. Upload a PDF to this environment or reference a path (e.g., `/mnt/data/your.pdf`).  
2. (Optional) Uncomment the `%pip install` line in the setup cell if modules are missing.  
3. Run the cell below to process the PDF.  
4. Inspect the returned JSON; use it to **recreate** tables (CSV/HTML) and charts (matplotlib/Altair).

> Tip: Start with **layout DPI = 180**. Increase **crop DPI to 240–300** inside the extractors if needed.


In [None]:

# --- Run the pipeline on your PDF ---
# The default path is only a placeholder; point it at a real file before running.
pdf_path = "/mnt/data/your.pdf"   # <-- replace with your file
# None processes every page; otherwise give zero-based page indices.
pages = None                      # e.g., [0, 2] for specific pages
result = process_pdf(pdf_path, pages=pages, layout_dpi=180)
# The result is a plain dict, so it can be dumped directly; ensure_ascii=False
# keeps non-ASCII text readable in the printed JSON.
print(json.dumps(result, indent=2, ensure_ascii=False))



## (Optional) Recreate Outputs

Below are tiny helpers to recreate:
- **Tables** → CSV/HTML
- **Charts** → matplotlib from the `series` data


In [None]:

import pandas as pd
import matplotlib.pyplot as plt

def save_table_csv(table_obj, path):
    """Write the rows under ``table_obj["cells"]`` to ``path`` as UTF-8 CSV and return ``path``.

    A missing "cells" key produces an empty file.
    """
    import csv

    rows = table_obj.get("cells", [])
    # newline="" lets the csv module control line endings itself.
    with open(path, "w", newline="", encoding="utf-8") as handle:
        csv.writer(handle).writerows(rows)
    return path

def plot_chart_from_series(chart_obj):
    """Re-plot a chart from an extracted chart object using matplotlib.

    Supports the "bar", "pie" and "line" chart types. Only the first entry of
    ``chart_obj["series"]`` is drawn; its "x" values are used as categories /
    labels / x-coordinates and its "y" values as the data.

    Args:
        chart_obj: Dict with a "type" string and a "series" list of dicts.

    Returns:
        None. Prints a notice instead of plotting when the type is not
        supported or there is no series.
    """
    ctype = chart_obj.get("type", "other")
    series = chart_obj.get("series", [])
    if ctype not in ("bar", "pie", "line") or not series:
        print("No supported chart data to plot (need 'type' and 'series').")
        return

    # Simple single-series demo: only the first series is drawn.
    first = series[0]
    xs = first.get("x", [])
    ys = first.get("y", [])

    plt.figure()
    if ctype == "bar":
        plt.bar(xs, ys)
        plt.title("Recreated Bar Chart")
    elif ctype == "pie":
        plt.pie(ys, labels=xs, autopct="%1.1f%%")
        plt.title("Recreated Pie Chart")
    else:
        # "line" is one of the chart types the pipeline reports; previously it
        # fell through to the "unsupported" message.
        plt.plot(xs, ys)
        plt.title("Recreated Line Chart")
    plt.show()
