# Detect Bar Chart or Pie Chart from PDF

This notebook auto-detects **bar charts** and **pie charts** on a PDF page and classifies candidate figure regions as: `bar_chart`, `pie_chart`, or `other`.

### What it does
1. **Rasterize PDF page** to an image (PyMuPDF or pdf2image)
2. **Find candidate figure regions** (non-text, high-graphics areas)
3. **Classify** each region with robust heuristics:
   - **Bar chart**: several tall rectangles (taller than they are wide) whose bottom edges line up on a common baseline
   - **Pie chart**: a strong circular outline plus straight boundaries that pass close to the circle's center
4. **Export** a CSV of regions & labels and an annotated preview image.

### Optional (advanced)
- **Vector-first hints**: with PyMuPDF, inspect paths/rectangles to boost classification accuracy.

### Requirements
- `pymupdf` (fitz) **or** `pdf2image`
- `opencv-python`, `numpy`, `pandas`
- `matplotlib` (for quick visualization)


## 0) Install (run locally if missing)

In [None]:
# %%bash
# pip install --upgrade pip
# pip install pymupdf pdf2image opencv-python numpy pandas matplotlib
# # For pdf2image, you may need poppler on your OS (e.g., brew install poppler, or apt-get install poppler-utils)

## 1) Inputs
- Set the PDF path and page index.
- Configure output directory and DPI.


In [None]:
from pathlib import Path

# --- User-editable settings -------------------------------------------------
# Document to analyse; replace the placeholder with the path to your PDF.
PDF_PATH = Path('your_file.pdf')
# Zero-based index of the page to analyse.
PAGE_INDEX = 0
# Rasterization resolution in dots per inch.
DPI = 350
# Folder that receives the generated files.
OUT_DIR = Path('chart_detect_output')

# Make sure the output folder exists before anything is written into it.
OUT_DIR.mkdir(exist_ok=True, parents=True)

# Echo the effective configuration (the page number is shown 1-based).
for _tag, _value in (
    ('PDF:', PDF_PATH.resolve()),
    ('Page:', PAGE_INDEX + 1),
    ('OUT_DIR:', OUT_DIR.resolve()),
):
    print(_tag, _value)


## 2) PDF → Image (PyMuPDF preferred)

In [None]:
import importlib

def page_to_image(pdf_path, page_index, out_dir, dpi=350):
    """Rasterize one PDF page to a PNG file and return the PNG's path.

    PyMuPDF (``fitz``) is used when it is importable; otherwise ``pdf2image``
    is tried.

    Args:
        pdf_path: Path of the PDF to read.
        page_index: Zero-based index of the page to render.
        out_dir: ``pathlib.Path`` of the directory that receives the PNG.
            The directory is not created here.
        dpi: Rendering resolution in dots per inch.

    Returns:
        ``out_dir / 'page_NNN.png'`` where ``NNN`` is the 1-based page number.

    Raises:
        RuntimeError: If neither PyMuPDF nor pdf2image is installed.
    """
    # A bare ``import importlib`` does not guarantee that the ``util``
    # submodule is loaded, so import it explicitly before using find_spec.
    import importlib.util

    out = out_dir / f'page_{page_index+1:03d}.png'

    if importlib.util.find_spec('fitz') is not None:
        import fitz
        # The context manager closes the document even if rendering fails.
        with fitz.open(pdf_path) as doc:
            page = doc[page_index]
            # PDF user space is 72 units per inch; scale it up to the target DPI.
            mat = fitz.Matrix(dpi/72, dpi/72)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            pix.save(out.as_posix())
        return out

    if importlib.util.find_spec('pdf2image') is not None:
        from pdf2image import convert_from_path
        # pdf2image numbers pages from 1; request exactly one page.
        imgs = convert_from_path(pdf_path, dpi=dpi, first_page=page_index+1, last_page=page_index+1)
        imgs[0].save(out)
        return out

    raise RuntimeError('Install PyMuPDF or pdf2image to rasterize PDF pages.')

# Render the configured page to a PNG; PAGE_IMG holds the path of that file.
PAGE_IMG = page_to_image(
    pdf_path=PDF_PATH,
    page_index=PAGE_INDEX,
    out_dir=OUT_DIR,
    dpi=DPI,
)
PAGE_IMG

## 3) Candidate figure region proposal (OpenCV)

In [None]:
import cv2, numpy as np

img = cv2.imread(str(PAGE_IMG))
if img is None:
    # cv2.imread signals an unreadable/missing file by returning None.
    raise FileNotFoundError(f'Could not read page image: {PAGE_IMG}')
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
edges = cv2.Canny(gray, 50, 150)
# Dilate the edge map so nearby strokes fuse into larger blobs.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5,5))
dilated = cv2.dilate(edges, kernel, iterations=2)

# Outer contours of the dilated edge map serve as candidate graphic regions.
cnts,_ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
candidates = []
H, W = gray.shape
for c in cnts:
    x,y,w,h = cv2.boundingRect(c)
    area = w*h
    # Keep graphic-sized boxes and drop those spanning (almost) the whole page.
    # Each side is compared against its own page dimension: width vs W, height vs H.
    if area > 8000 and w>60 and h>60 and (w/W < 0.95) and (h/H < 0.95):
        candidates.append((x,y,w,h))

def merge_boxes(boxes, pad=10):
    """Pad boxes and repeatedly union any that overlap or touch.

    Args:
        boxes: Boxes given as ``(x, y, w, h)`` tuples.
        pad: Margin added on every side of each box before overlap testing.

    Returns:
        A list of ``(x0, y0, x1, y1)`` corner tuples (padding included, so
        coordinates may be negative). No two returned boxes overlap or touch.
    """
    if not boxes:
        return []

    def touches(p, q):
        # Closed-interval overlap test on both axes (shared edges count).
        return p[2] >= q[0] and q[2] >= p[0] and p[3] >= q[1] and q[3] >= p[1]

    pending = [
        [x - pad, y - pad, x + w + pad, y + h + pad]
        for x, y, w, h in sorted(boxes)
    ]

    merged_any = True
    while merged_any:
        merged_any = False
        settled = []
        while pending:
            current = pending.pop(0)
            # Absorb at most one partner per box per pass; the outer loop
            # keeps sweeping until a pass performs no merge at all.
            hit = next(
                (k for k, other in enumerate(pending) if touches(current, other)),
                None,
            )
            if hit is not None:
                partner = pending.pop(hit)
                current = [
                    min(current[0], partner[0]),
                    min(current[1], partner[1]),
                    max(current[2], partner[2]),
                    max(current[3], partner[3]),
                ]
                merged_any = True
            settled.append(current)
        pending = settled

    return [tuple(box) for box in pending]

# Fuse the candidate boxes into merged regions, then show how many there are
# together with the first five.
regions_img = merge_boxes(boxes=candidates)
(len(regions_img), regions_img[:5])


## 4) Region classifiers: bar vs pie vs other (heuristics)

In [None]:
import math
import numpy as np, pandas as pd, cv2

def bar_score(roi):
    """Score how strongly an image region resembles a vertical bar chart.

    The region is binarized with Otsu's threshold, tall connected shapes are
    collected as bar candidates, and the score rewards many candidates whose
    bottom edges share a common baseline.

    Args:
        roi: BGR image region (NumPy array). ``None`` or an empty array
            yields a zero score.

    Returns:
        ``(score, bars)`` where ``score`` is a float and ``bars`` is a list of
        ``(x, y, w, h)`` boxes. ``score`` is 0.0 when fewer than three bar
        candidates are found.
    """
    if roi is None or roi.size == 0:
        # Nothing to analyse; cv2.cvtColor would raise on an empty image.
        return 0.0, []
    g = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    b = cv2.GaussianBlur(g, (3,3), 0)
    t = cv2.threshold(b, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU)[1]
    # findContours traces white shapes on a black background. Treat the
    # majority class as background: if white dominates, invert so the drawn
    # marks (the minority) become the white foreground.
    if (t==255).sum() > (t==0).sum():
        t = cv2.bitwise_not(t)
    # Close small gaps so each bar stays a single connected shape.
    t = cv2.morphologyEx(t, cv2.MORPH_CLOSE, cv2.getStructuringElement(cv2.MORPH_RECT,(3,3)), iterations=1)
    cnts,_ = cv2.findContours(t, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    bars=[]
    for c in cnts:
        x,y,w,h = cv2.boundingRect(c)
        area=w*h; aspect=h/max(1.0,w)
        # Keep shapes that are clearly taller than wide and not tiny.
        if aspect>1.3 and area>200 and h>20:
            bars.append((x,y,w,h))
    if len(bars)<3:
        return 0.0, bars
    # The more the bottom edges scatter, the less the shapes look like bars
    # standing on one axis; every 5 px of spread adds 1 to the divisor.
    bottoms = [y+h for (x,y,w,h) in bars]
    std_baseline = float(np.std(bottoms))
    score = len(bars) / (1.0 + std_baseline/5.0)
    return float(score), bars

def _point_segment_dist(px, py, a, b):
    """Return the Euclidean distance from point (px, py) to segment a-b."""
    ax, ay = a
    bx, by = b
    lab = math.hypot(bx-ax, by-ay)
    if lab == 0:
        # Degenerate segment: both endpoints coincide.
        return math.hypot(px-ax, py-ay)
    # Project the point onto the segment, clamping to its endpoints.
    t = max(0, min(1, ((px-ax)*(bx-ax)+(py-ay)*(by-ay))/(lab*lab)))
    qx = ax + t*(bx-ax); qy = ay + t*(by-ay)
    return math.hypot(px-qx, py-qy)


def pie_score(roi):
    """Score how strongly an image region resembles a pie chart.

    The largest Hough circle is taken as the pie outline; Hough line segments
    that pass close to its center are counted as slice boundaries.

    Args:
        roi: BGR image region (NumPy array). ``None`` or an empty array
            yields a zero score.

    Returns:
        ``(score, circle, radials)``. ``circle`` is ``(cx, cy, r)`` or
        ``None`` when no circle is found (then ``score`` is 0.0 and
        ``radials`` is empty). ``radials`` is a list of ``(x1, y1, x2, y2)``
        segments. ``score`` is ``1.0 + 0.3 * len(radials)`` when a circle
        exists.
    """
    if roi is None or roi.size == 0:
        # Nothing to analyse; cv2.cvtColor would raise on an empty image.
        return 0.0, None, []
    g = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    g = cv2.medianBlur(g, 5)
    circles = cv2.HoughCircles(g, cv2.HOUGH_GRADIENT, dp=1.2, minDist=30,
                               param1=120, param2=40, minRadius=20, maxRadius=0)
    if circles is None:
        return 0.0, None, []
    circles = np.uint16(np.around(circles))[0]
    # Use the largest detected circle as the pie outline.
    c = max(circles, key=lambda c: c[2])
    cx, cy, r = int(c[0]), int(c[1]), int(c[2])
    edges = cv2.Canny(g, 60, 180)
    # Only segments at least 60% of the radius long are considered.
    lines = cv2.HoughLinesP(edges, 1, np.pi/180, threshold=60, minLineLength=int(r*0.6), maxLineGap=10)
    radials=[]
    if lines is not None:
        for l in lines[:,0,:]:
            # Plain ints keep the arithmetic below out of fixed-width NumPy types.
            x1,y1,x2,y2 = (int(v) for v in l)
            d = _point_segment_dist(cx,cy,(x1,y1),(x2,y2))
            # A segment counts when it comes within 8% of the radius of the center.
            if d < r*0.08:
                radials.append((x1,y1,x2,y2))
    radial_count = len(radials)
    score = 1.0 + radial_count*0.3
    return float(score), (cx,cy,r), radials

# Classify every merged region, collect one record per region, and draw the
# result on a copy of the page image.
labels=[]
annot=img.copy()
for (x0,y0,x1,y1) in regions_img:
    # Clamp the padded box to the image. Slice ends are exclusive, so the
    # upper bounds are W and H (not W-1 / H-1, which would drop the last
    # column/row).
    x0=max(0,x0); y0=max(0,y0); x1=min(W,x1); y1=min(H,y1)
    if x1<=x0 or y1<=y0:
        # Degenerate region after clamping; nothing to classify.
        continue
    roi = img[y0:y1, x0:x1]
    bs, bars = bar_score(roi)
    ps, circ, radials = pie_score(roi)
    # A label wins only if its score clears an absolute floor and beats the
    # other score by 20%; otherwise the region is 'other'.
    if ps >= max(1.2, bs*1.2):
        label='pie_chart'; color=(0,165,255)
    elif bs >= max(1.0, ps*1.2):
        label='bar_chart'; color=(0,255,0)
    else:
        label='other'; color=(0,0,255)
    labels.append({'x0':x0,'y0':y0,'x1':x1,'y1':y1,'w':x1-x0,'h':y1-y0,'label':label,'bar_score':bs,'pie_score':ps})
    cv2.rectangle(annot,(x0,y0),(x1,y1),color,2)
    cv2.putText(annot,label,(x0,max(0,y0-5)),cv2.FONT_HERSHEY_SIMPLEX,0.6,color,2,cv2.LINE_AA)

import pandas as pd
# One CSV row per classified region.
df = pd.DataFrame(labels)
CSV_PATH = OUT_DIR / 'regions_classified.csv'
df.to_csv(CSV_PATH, index=False)
CSV_PATH

## 5) Save annotated preview

In [None]:
PREVIEW_PATH = OUT_DIR / 'classified_preview.png'
# cv2.imwrite reports a failed write through its boolean return value, so
# check it instead of silently continuing without a preview file.
if not cv2.imwrite(PREVIEW_PATH.as_posix(), annot):
    raise OSError(f'Failed to write preview image: {PREVIEW_PATH}')
PREVIEW_PATH

## 6) (Optional) Vector-first hints (PyMuPDF)

In [None]:
# A bare ``import importlib`` does not guarantee the ``util`` submodule is
# loaded, so import it explicitly.
import importlib.util

# Vector-first hints: rasterize the bounding boxes of vector rectangles and
# curves into two masks, then use the mask coverage inside each region to
# relabel regions the image heuristics left as 'other'.
if importlib.util.find_spec('fitz') is None:
    print('PyMuPDF not installed; skipping vector-first hints.')
else:
    import fitz, numpy as np, pandas as pd

    img_h, img_w = gray.shape[:2]
    rect_hits = np.zeros((img_h, img_w), dtype=np.uint8)
    circ_hits = np.zeros_like(rect_hits)
    # PDF user space is 72 units per inch; the page image was rendered at DPI.
    scale = DPI/72.0

    def _mark_bbox(mask, xs, ys):
        """Set to 1 the mask pixels inside the bounding box of the given page-space points."""
        # Clamp to [0, size]: slice ends are exclusive, so `size` is a valid end.
        bx0 = max(0, min(img_w, int(min(xs)*scale)))
        bx1 = max(0, min(img_w, int(max(xs)*scale)))
        by0 = max(0, min(img_h, int(min(ys)*scale)))
        by1 = max(0, min(img_h, int(max(ys)*scale)))
        mask[by0:by1, bx0:bx1] = 1

    # The returned drawings are plain Python data, so the document can be
    # closed as soon as they are extracted.
    with fitz.open(PDF_PATH) as doc:
        page = doc[PAGE_INDEX]
        drawings = page.get_drawings()

    for d in drawings:
        for it in d['items']:
            # PyMuPDF item tuples start with a short type code:
            # 're' = rectangle (it[1] is the rect), 'c' = Bezier curve
            # (it[1:5] are its four control points).
            if it[0] == 're':
                rx0,ry0,rx1,ry1 = it[1]
                _mark_bbox(rect_hits, (rx0,rx1), (ry0,ry1))
            elif it[0] == 'c':
                pts = it[1:5]
                _mark_bbox(circ_hits, [p[0] for p in pts], [p[1] for p in pts])

    fused=[]
    for rec in labels:
        x0,y0,x1,y1 = rec['x0'],rec['y0'],rec['x1'],rec['y1']
        # Number of region pixels covered by vector rectangles / curves.
        r_sum = int(rect_hits[y0:y1,x0:x1].sum())
        c_sum = int(circ_hits[y0:y1,x0:x1].sum())
        # Note: `rec` is the same dict object stored in `labels`, so these
        # updates are visible there too.
        rec['rect_hits'] = r_sum
        rec['circ_hits'] = c_sum
        # Only undecided regions are relabeled, and only when one primitive
        # type dominates with more than 1000 covered pixels.
        if rec['label']=='other':
            if r_sum>c_sum and r_sum>1000:
                rec['label']='bar_chart'
            elif c_sum>r_sum and c_sum>1000:
                rec['label']='pie_chart'
        fused.append(rec)
    df2 = pd.DataFrame(fused)
    FUSED_CSV = OUT_DIR / 'regions_classified_fused.csv'
    df2.to_csv(FUSED_CSV, index=False)
    # A bare expression inside a branch is not echoed by the notebook; print it.
    print('Fused CSV:', FUSED_CSV)