# PDF Region Classifier (5 types): Text, Table, Bar, Pie, Image

This notebook detects **five** region types on a PDF page:
- `text`
- `table`
- `bar_chart`
- `pie_chart`
- `image_other` (general image/figure)

It uses both **vector-first** (pdfplumber/PyMuPDF) and **vision-first** (OpenCV) signals and fuses them into final regions.

### Outputs
- `regions_final.csv` — region bounding boxes + labels
- `preview_final.png` — page image with annotated boxes
- `crops/<label>_N.png` — cropped images for each detected region

### Requirements
- `pymupdf` (fitz) **or** `pdf2image` for rasterization
- `opencv-python`, `numpy`, `pandas`
- `pdfplumber` (recommended) for text/objects and optional tables
- (optional) `camelot-py` for table extraction hints

If some libraries are missing, the notebook will fall back to heuristics when possible.

## 0) Install (run locally if needed)

In [None]:
# %%bash
# pip install --upgrade pip
# pip install pymupdf pdf2image opencv-python numpy pandas pdfplumber camelot-py matplotlib
# # OS deps: poppler (for pdf2image), ghostscript (for camelot lattice), Java (for tabula if you choose)

## 1) Inputs

In [None]:
from pathlib import Path

# Notebook configuration: edit these values before running the cells below.
PDF_PATH = Path('your_file.pdf')  # <-- set your PDF
PAGE_INDEX = 0                    # 0-based
# Rasterization resolution; later cells also derive the PDF-point -> pixel
# scale from it (DPI/72).
DPI = 350
# Directory that receives the rasterized page, CSV, preview and crops.
OUT_DIR = Path('region_classifier_output')
OUT_DIR.mkdir(parents=True, exist_ok=True)
# Cell output: echo the resolved settings as a quick sanity check.
(PDF_PATH.resolve(), PAGE_INDEX, OUT_DIR.resolve())

## 2) Rasterize page to image (PyMuPDF preferred)

In [None]:
import importlib

def page_to_image(pdf_path, page_index, out_dir, dpi=350):
    """Rasterize one PDF page to a PNG file and return the PNG path.

    PyMuPDF (``fitz``) is used when installed; otherwise ``pdf2image`` is
    tried.

    Args:
        pdf_path: Path (``str`` or ``pathlib.Path``) of the source PDF.
        page_index: 0-based index of the page to render.
        out_dir: Existing directory (``str`` or ``pathlib.Path``) that
            receives ``page_NNN.png`` (``NNN`` is the 1-based page number).
        dpi: Render resolution in dots per inch.

    Returns:
        ``pathlib.Path`` of the written PNG.

    Raises:
        RuntimeError: If neither PyMuPDF nor pdf2image is installed.
    """
    # ``import importlib`` alone does not guarantee that the ``util``
    # submodule is loaded, so import it explicitly.
    import importlib.util
    from pathlib import Path

    out = Path(out_dir) / f'page_{page_index+1:03d}.png'

    if importlib.util.find_spec('fitz') is not None:
        import fitz
        zoom = dpi / 72  # PDF user space is 72 units per inch
        # The context manager closes the document even if rendering fails.
        with fitz.open(str(pdf_path)) as doc:
            page = doc[page_index]
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
            pix.save(out.as_posix())
        return out

    if importlib.util.find_spec('pdf2image') is not None:
        from pdf2image import convert_from_path
        # pdf2image page numbers are 1-based.
        imgs = convert_from_path(pdf_path, dpi=dpi, first_page=page_index+1, last_page=page_index+1)
        imgs[0].save(out)
        return out

    raise RuntimeError('Install PyMuPDF or pdf2image to rasterize PDF pages.')

# Render the configured page once; later cells load this PNG from disk.
PAGE_IMG = page_to_image(
    pdf_path=PDF_PATH,
    page_index=PAGE_INDEX,
    out_dir=OUT_DIR,
    dpi=DPI,
)
PAGE_IMG

## 3) Collect vector hints (pdfplumber & PyMuPDF if available)

In [None]:
import importlib
import importlib.util  # find_spec lives in the ``util`` submodule
vector = {
    'text_boxes': [],   # list of (x0,y0,x1,y1) in pixel coords
    'image_boxes': [],  # embedded images in PDF
    'rect_hits': None,  # heatmap for rectangles (bar hints)
    'circ_hits': None,  # heatmap for circular-ish paths (pie hints)
}

import cv2, numpy as np
page_img = cv2.imread(str(PAGE_IMG))
if page_img is None:
    # cv2.imread signals failure by returning None instead of raising.
    raise FileNotFoundError(f'Could not read rasterized page image: {PAGE_IMG}')
H, W = page_img.shape[:2]

# PDF coordinates are in points (72 per inch); the raster is DPI per inch.
scale = DPI/72.0


def _pdf_box_to_px(bx0, by0, bx1, by1):
    """Scale a PDF-point box to pixels and clamp it to the page image.

    The result is used as a slice, whose end is exclusive, so the upper
    bounds are clamped to W/H rather than W-1/H-1.
    """
    px0 = max(0, min(W, int(min(bx0, bx1) * scale)))
    px1 = max(0, min(W, int(max(bx0, bx1) * scale)))
    py0 = max(0, min(H, int(min(by0, by1) * scale)))
    py1 = max(0, min(H, int(max(by0, by1) * scale)))
    return px0, py0, px1, py1


# pdfplumber text & images
if importlib.util.find_spec('pdfplumber') is not None:
    import pdfplumber
    with pdfplumber.open(PDF_PATH) as pdf:
        page = pdf.pages[PAGE_INDEX]
        words = page.extract_words(use_text_flow=True) or []
        # Merge words into line boxes: words whose vertical centre falls in
        # the same 6pt bucket are treated as one text line.
        lines = {}
        for w in words:
            yc = (w['top'] + w['bottom'])/2
            key = round(yc/6)*6
            lines.setdefault(key, []).append(w)
        for ws in lines.values():
            x0 = min(w['x0'] for w in ws); y0 = min(w['top'] for w in ws)
            x1 = max(w['x1'] for w in ws); y1 = max(w['bottom'] for w in ws)
            # convert PDF pts to pixels; pdfplumber origin top-left already
            vector['text_boxes'].append((int(x0*scale), int(y0*scale), int(x1*scale), int(y1*scale)))
        # Embedded images (figures)
        for im in page.images:
            x0,y0,x1,y1 = im['x0'], im['top'], im['x1'], im['bottom']
            vector['image_boxes'].append((int(x0*scale), int(y0*scale), int(x1*scale), int(y1*scale)))

# PyMuPDF drawings for rect/circle hints
if importlib.util.find_spec('fitz') is not None:
    import fitz
    rect_hits = np.zeros((H,W), dtype=np.uint8)
    circ_hits = np.zeros_like(rect_hits)
    # The context manager closes the document once the drawings are read.
    with fitz.open(str(PDF_PATH)) as doc:
        page = doc[PAGE_INDEX]
        drawings = page.get_drawings()
        for d in drawings:
            for it in d['items']:
                kind = it[0]
                if kind == 're':
                    # ('re', Rect, ...): an axis-aligned rectangle.
                    rx0, ry0, rx1, ry1 = it[1]
                    x0, y0, x1, y1 = _pdf_box_to_px(rx0, ry0, rx1, ry1)
                    rect_hits[y0:y1, x0:x1] = 1
                elif kind == 'c':
                    # ('c', p1, p2, p3, p4): a cubic Bezier; mark the
                    # bounding box of its four control points.
                    pts = it[1:5]
                    xs = [p[0] for p in pts]; ys = [p[1] for p in pts]
                    x0, y0, x1, y1 = _pdf_box_to_px(min(xs), min(ys), max(xs), max(ys))
                    circ_hits[y0:y1, x0:x1] = 1
    vector['rect_hits'] = rect_hits
    vector['circ_hits'] = circ_hits

len(vector['text_boxes']), len(vector['image_boxes'])

## 4) Vision-first region proposals (non-text graphics)

In [None]:
import cv2, numpy as np
# Edge map -> dilation -> external contours yields coarse blobs of page content.
gray = cv2.cvtColor(page_img, cv2.COLOR_BGR2GRAY)
edges = cv2.Canny(gray, 50, 150)
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
dilated = cv2.dilate(edges, kernel, iterations=2)
cnts, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

# Keep bounding boxes (x, y, w, h) that are neither tiny nor page-sized.
candidates = []
for x, y, w, h in map(cv2.boundingRect, cnts):
    too_small = w * h <= 8000 or w <= 60 or h <= 60
    spans_page = w >= W * 0.98 or h >= H * 0.98
    if too_small or spans_page:
        continue
    candidates.append((x, y, w, h))

def merge_boxes_xyxy(boxes, pad=8, width=None, height=None):
    """Pad ``(x, y, w, h)`` boxes, merge overlapping ones, and clip to the page.

    Args:
        boxes: Iterable of ``(x, y, w, h)`` boxes in pixels.
        pad: Pixels added on every side before overlap testing, so that
            near-touching boxes merge as well.
        width: Page width in pixels used for clipping. Defaults to the
            notebook-level ``W``.
        height: Page height in pixels used for clipping. Defaults to the
            notebook-level ``H``.

    Returns:
        List of merged ``(x0, y0, x1, y1)`` boxes with ``x1 <= width - 1``
        and ``y1 <= height - 1``. Empty input returns ``[]``.
    """
    if not boxes:
        return []
    # Fall back to the page size defined by the notebook when not given.
    if width is None:
        width = W
    if height is None:
        height = H

    pending = [(x - pad, y - pad, x + w + pad, y + h + pad)
               for (x, y, w, h) in sorted(boxes)]
    changed = True
    # Repeat full passes until a pass performs no merge: a merged box can
    # grow enough to overlap boxes it did not touch before.
    while changed:
        changed = False
        merged = []
        while pending:
            ax0, ay0, ax1, ay1 = pending.pop(0)
            for i, (bx0, by0, bx1, by1) in enumerate(pending):
                if not (ax1 < bx0 or bx1 < ax0 or ay1 < by0 or by1 < ay0):
                    # Absorb the first overlapping box; the rest are
                    # picked up on the next pass.
                    ax0, ay0 = min(ax0, bx0), min(ay0, by0)
                    ax1, ay1 = max(ax1, bx1), max(ay1, by1)
                    del pending[i]
                    changed = True
                    break
            merged.append((ax0, ay0, ax1, ay1))
        pending = merged

    # Clip to the page.
    return [(max(0, x0), max(0, y0), min(width - 1, x1), min(height - 1, y1))
            for (x0, y0, x1, y1) in pending]

# Fuse the raw contour boxes into non-overlapping (x0, y0, x1, y1) regions.
regions_img = merge_boxes_xyxy(boxes=candidates)
# Cell output: region count plus the first few boxes.
(len(regions_img), regions_img[:5])


## 5) Classifiers: table, bar, pie (vision heuristics) + text/image masks

In [None]:
def detect_tables_mask(gray, min_joints=4):
    """Return a mask of ruling-line networks that look like table grids.

    Long horizontal and vertical strokes are extracted with morphological
    opening. Their union forms the candidate grid; their intersection gives
    the line crossings ("joints"). A connected line network is kept only
    when it contains at least ``min_joints`` separate joints. Intersecting
    the two line masks directly would leave only the tiny crossing points,
    which can never pass a size filter on their bounding boxes.

    Args:
        gray: Single-channel uint8 page image.
        min_joints: Minimum number of distinct line crossings a connected
            line network needs to be kept.

    Returns:
        uint8 mask of the same shape as ``gray``; 255 on the ruling lines of
        accepted networks, 0 elsewhere.
    """
    import numpy as np

    height, width = gray.shape[:2]
    thr = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    ink = 255 - thr  # dark strokes become white foreground

    # Opening with a 1-pixel-thick kernel keeps only runs at least as long
    # as the kernel in that direction.
    h_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(10, width // 60), 1))
    v_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(10, height // 60)))
    h_lines = cv2.morphologyEx(ink, cv2.MORPH_OPEN, h_kernel)
    v_lines = cv2.morphologyEx(ink, cv2.MORPH_OPEN, v_kernel)

    joints = cv2.bitwise_and(h_lines, v_lines)
    grid = cv2.bitwise_or(h_lines, v_lines)

    mask = np.zeros_like(grid)
    n_components, labels, stats, _ = cv2.connectedComponentsWithStats(grid)
    for comp in range(1, n_components):  # label 0 is the background
        # The first four stats columns are left, top, width, height.
        x, y, w, h = (int(v) for v in stats[comp, :4])
        comp_px = labels[y:y + h, x:x + w] == comp
        comp_joints = np.where(comp_px, joints[y:y + h, x:x + w], 0).astype(np.uint8)
        n_joints = cv2.connectedComponents(comp_joints)[0] - 1
        if n_joints >= min_joints:
            mask[y:y + h, x:x + w][comp_px] = 255
    return mask

def bar_score(roi):
    """Score how strongly a BGR crop resembles a vertical bar chart.

    Tall foreground blobs are treated as bars. The score grows with the
    number of bars and shrinks as their bottom edges spread apart, because
    bars of one chart normally share a baseline.

    Args:
        roi: BGR image crop (``H x W x 3`` uint8 array).

    Returns:
        ``(score, bars)``: ``score`` is a float (0.0 when fewer than three
        bar-like blobs are found or the crop is empty) and ``bars`` is the
        list of ``(x, y, w, h)`` boxes of the bar-like blobs.
    """
    import numpy as np

    if roi is None or roi.size == 0:
        return 0.0, []

    g = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    b = cv2.GaussianBlur(g, (3, 3), 0)
    t = cv2.threshold(b, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    # findContours traces non-zero (white) blobs, so the bars must be white.
    # The background is assumed to be the majority class: when white
    # dominates, invert so that the minority (the drawn content) is white.
    if (t == 255).sum() > (t == 0).sum():
        t = cv2.bitwise_not(t)
    t = cv2.morphologyEx(t, cv2.MORPH_CLOSE, cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)), iterations=1)
    cnts, _ = cv2.findContours(t, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    bars = []
    for c in cnts:
        x, y, w, h = cv2.boundingRect(c)
        aspect = h / max(1.0, w)
        area = w * h
        # Bar-like: clearly taller than wide and not a speck.
        if aspect > 1.3 and area > 200 and h > 20:
            bars.append((x, y, w, h))
    if len(bars) < 3:
        return 0.0, bars

    bottoms = [y + h for (x, y, w, h) in bars]
    std_baseline = np.std(bottoms)
    return float(len(bars) / (1 + std_baseline / 5.0)), bars

def _point_segment_distance(px, py, a, b):
    """Return the distance from point (px, py) to the line segment a-b."""
    import math

    ax, ay = a
    bx, by = b
    seg_len = math.hypot(bx - ax, by - ay)
    if seg_len == 0:
        return math.hypot(px - ax, py - ay)
    # Project the point onto the segment and clamp to its end points.
    t = max(0, min(1, ((px - ax) * (bx - ax) + (py - ay) * (by - ay)) / (seg_len * seg_len)))
    qx = ax + t * (bx - ax)
    qy = ay + t * (by - ay)
    return math.hypot(px - qx, py - qy)


def pie_score(roi):
    """Score how strongly a BGR crop resembles a pie chart.

    The largest Hough circle is taken as the pie outline; straight segments
    that pass close to its centre are counted as slice boundaries.

    Args:
        roi: BGR image crop (``H x W x 3`` uint8 array).

    Returns:
        ``(score, circle, radials)``: ``score`` is ``1.0 + 0.3 * len(radials)``
        when a circle is found, else 0.0; ``circle`` is ``(cx, cy, r)`` or
        ``None``; ``radials`` is a list of ``(x1, y1, x2, y2)`` segments.
    """
    import numpy as np

    if roi is None or roi.size == 0:
        return 0.0, None, []

    g = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    g = cv2.medianBlur(g, 5)
    circles = cv2.HoughCircles(g, cv2.HOUGH_GRADIENT, dp=1.2, minDist=30, param1=120, param2=40, minRadius=20, maxRadius=0)
    if circles is None:
        return 0.0, None, []

    # Keep the circle with the largest radius.
    c = max(np.uint16(np.around(circles))[0], key=lambda z: z[2])
    cx, cy, r = int(c[0]), int(c[1]), int(c[2])

    edges = cv2.Canny(g, 60, 180)
    lines = cv2.HoughLinesP(edges, 1, np.pi / 180, threshold=60, minLineLength=int(r * 0.6), maxLineGap=10)
    radials = []
    if lines is not None:
        for l in lines[:, 0, :]:
            # Plain ints avoid fixed-width integer arithmetic in the
            # distance computation.
            x1, y1, x2, y2 = (int(v) for v in l)
            # A slice boundary passes (almost) through the circle centre.
            if _point_segment_distance(cx, cy, (x1, y1), (x2, y2)) < r * 0.08:
                radials.append((x1, y1, x2, y2))
    score = 1.0 + 0.3 * len(radials)
    return float(score), (cx, cy, r), radials

# Bounding boxes (x0, y0, x1, y1) of table-mask blobs big enough to be a table.
table_mask = detect_tables_mask(gray)
table_cnts, _ = cv2.findContours(table_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
table_regions = []
for cnt in table_cnts:
    bx, by, bw, bh = cv2.boundingRect(cnt)
    if bw * bh > 5000 and bw > 80 and bh > 60:
        table_regions.append((bx, by, bx + bw, by + bh))

# Rasterize the vector text boxes (if any) into a filled 0/255 mask.
text_mask = np.zeros((H, W), dtype=np.uint8)
for box in vector.get('text_boxes', []):
    cv2.rectangle(text_mask, (box[0], box[1]), (box[2], box[3]), 255, -1)

classifications = []
for (x0, y0, x1, y1) in regions_img:
    roi = page_img[y0:y1, x0:x1]

    # A region is a table when it shares more than 10px with any table box
    # along both axes (an intersection that large implies the boxes overlap).
    is_table = any(
        min(x1, tx1) - max(x0, tx0) > 10 and min(y1, ty1) - max(y0, ty0) > 10
        for (tx0, ty0, tx1, ty1) in table_regions
    )

    if is_table:
        label = 'table'
    else:
        bs, _ = bar_score(roi)
        ps, _, _ = pie_score(roi)

        # Vector hints: count of pixels covered by rect / curve drawings.
        rect_hits = vector.get('rect_hits')
        circ_hits = vector.get('circ_hits')
        rect_sum = 0 if rect_hits is None else int(rect_hits[y0:y1, x0:x1].sum())
        circ_sum = 0 if circ_hits is None else int(circ_hits[y0:y1, x0:x1].sum())

        # Fraction of the region covered by vector text boxes.
        tm = text_mask[y0:y1, x0:x1]
        text_ratio = tm.mean() / 255.0 if tm.size > 0 else 0.0

        # Priority: text, then pie, then bar, else a generic image.
        if text_ratio > 0.3:
            label = 'text'
        elif ps >= max(1.2, bs * 1.3) or (circ_sum > rect_sum and circ_sum > 1000):
            label = 'pie_chart'
        elif bs >= max(1.0, ps * 1.2) or (rect_sum > circ_sum and rect_sum > 1000):
            label = 'bar_chart'
        else:
            label = 'image_other'

    classifications.append(dict(x0=x0, y0=y0, x1=x1, y1=y1, w=x1 - x0, h=y1 - y0, label=label))

import pandas as pd
# Fix the column set explicitly: with zero detected regions a bare
# DataFrame([]) has no columns, and df['label'] below would raise KeyError.
df = pd.DataFrame(classifications, columns=['x0', 'y0', 'x1', 'y1', 'w', 'h', 'label'])
CSV_PATH = OUT_DIR / 'regions_final.csv'
df.to_csv(CSV_PATH, index=False)
# Cell output: CSV location and the number of regions per label.
CSV_PATH, df['label'].value_counts().to_dict()

## 6) Save annotated preview and per-region crops

In [None]:
# Draw every classified region on a copy of the page image.
annot = page_img.copy()
# Per-label box colours as OpenCV BGR triples.
colors = dict(
    text=(255, 0, 0),         # blue
    table=(0, 255, 255),      # yellow
    bar_chart=(0, 255, 0),    # green
    pie_chart=(0, 165, 255),  # orange
    image_other=(0, 0, 255),  # red
)
for _, row in df.iterrows():
    x0, y0, x1, y1 = (int(row[k]) for k in ('x0', 'y0', 'x1', 'y1'))
    name = row['label']
    color = colors.get(name, (255, 255, 255))  # white for unknown labels
    cv2.rectangle(annot, (x0, y0), (x1, y1), color, 2)
    # Put the label just above the box, clamped to the top of the image.
    anchor = (x0, max(0, y0 - 5))
    cv2.putText(annot, name, anchor, cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2, cv2.LINE_AA)

PREVIEW = OUT_DIR / 'preview_final.png'
cv2.imwrite(PREVIEW.as_posix(), annot)

# Write one PNG per region, named <label>_<running index per label>.png
crop_dir = OUT_DIR / 'crops'
crop_dir.mkdir(exist_ok=True)
counts = {}
for _, row in df.iterrows():
    label = row['label']
    counts.setdefault(label, 0)
    counts[label] += 1
    x0, y0, x1, y1 = (int(row[k]) for k in ('x0', 'y0', 'x1', 'y1'))
    roi = page_img[y0:y1, x0:x1]
    outp = crop_dir / f'{label}_{counts[label]}.png'
    cv2.imwrite(outp.as_posix(), roi)
PREVIEW, crop_dir

## 7) Notes & tuning
- **Priority logic**: tables are identified by gridlines; text is recognized via vector text boxes if available; charts are classified by shape heuristics and nudged by vector hints; remaining regions default to `image_other`.
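- If tables are missed, shorten the line kernels in `detect_tables_mask` (use a divisor larger than `60`) so that shorter ruling lines survive the morphological opening; lengthen them if text strokes are being picked up as lines.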
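- If text regions are over-detected, raise the `text_ratio > 0.3` threshold in the classification loop; if they are under-detected, lower it. For scanned PDFs with no vector text, add an OCR-based text heatmap to `text_mask`.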
- Bar vs pie thresholds (`bar_score` / `pie_score`) can be tuned for your document style.
- For multi-page processing, wrap the pipeline in a loop over page indices.
