# PDF Region → Extract → Recreate (Text • Table • Bar • Pie • Image)

This notebook:
1) **Detects** five region types on a PDF page: `text`, `table`, `bar_chart`, `pie_chart`, `image_other`.
2) **Extracts** content from each region:
   - Text → TXT
   - Table → CSV
   - Bar chart → values + optional x labels (OCR) + recreated chart (PNG)
   - Pie chart → slice percentages (slices are numbered; label OCR is not implemented yet) + recreated chart (PNG)
   - Images → PNG crops
3) **Recreates** content and (optionally) builds a PPTX with everything.

It uses both vector-first (pdfplumber/PyMuPDF) and computer-vision (OpenCV + OCR) approaches and falls back gracefully if a method isn't available.


## 0) Install (run locally if missing)

In [None]:
# %%bash
# pip install --upgrade pip
# pip install pymupdf pdf2image opencv-python pdfplumber pytesseract pandas numpy matplotlib python-pptx camelot-py
# # OS deps:
# # macOS: brew install tesseract ghostscript poppler
# # Ubuntu/Debian: sudo apt-get update && sudo apt-get install -y tesseract-ocr ghostscript poppler-utils default-jre


## 1) Inputs

In [None]:
from pathlib import Path

# Source document to process.
PDF_PATH = Path('your_file.pdf')  # <-- set your PDF
PAGE_INDEX = 0                    # 0-based page index
# Rasterization resolution; later cells convert PDF points to pixels with DPI/72.
DPI = 400
# Every artifact produced by the notebook is written below this directory.
OUT_DIR = Path('pdf_extract_recreate_output')
OUT_DIR.mkdir(parents=True, exist_ok=True)
# Echo the resolved settings as the cell output.
PDF_PATH.resolve(), PAGE_INDEX, OUT_DIR.resolve()

## 2) Rasterize page (PyMuPDF preferred)

In [None]:
import importlib

def page_to_image(pdf_path, page_index, out_dir, dpi=400):
    """Rasterize one PDF page to a PNG file and return the PNG path.

    PyMuPDF (``fitz``) is used when installed, otherwise ``pdf2image``.

    Args:
        pdf_path: Path (or string) of the PDF file.
        page_index: 0-based page index.
        out_dir: Directory (Path or string) that receives ``page_NNN.png``,
            where NNN is the 1-based page number.
        dpi: Rendering resolution in dots per inch.

    Returns:
        ``pathlib.Path`` of the written PNG.

    Raises:
        RuntimeError: if neither PyMuPDF nor pdf2image is installed.
    """
    # `import importlib` alone does not guarantee the `util` submodule is
    # loaded, so import it explicitly before calling find_spec.
    import importlib.util
    from pathlib import Path

    if importlib.util.find_spec('fitz') is not None:
        import fitz
        out = Path(out_dir) / f'page_{page_index+1:03d}.png'
        doc = fitz.open(str(pdf_path))
        try:
            page = doc[page_index]
            # PDF user space is 72 units per inch; scale to the requested DPI.
            mat = fitz.Matrix(dpi/72, dpi/72)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            pix.save(out.as_posix())
        finally:
            doc.close()
        return out
    if importlib.util.find_spec('pdf2image') is not None:
        from pdf2image import convert_from_path
        out = Path(out_dir) / f'page_{page_index+1:03d}.png'
        # pdf2image page numbers are 1-based.
        imgs = convert_from_path(pdf_path, dpi=dpi, first_page=page_index+1, last_page=page_index+1)
        imgs[0].save(out)
        return out
    raise RuntimeError('Install PyMuPDF or pdf2image to rasterize PDF pages.')

# Render the configured page once; the next cell reads this PNG back with OpenCV.
PAGE_IMG = page_to_image(PDF_PATH, PAGE_INDEX, OUT_DIR, dpi=DPI)
PAGE_IMG

## 3) Collect vector hints (text boxes, embedded images, drawings)

In [None]:
import importlib.util

import cv2
import numpy as np

page_bgr = cv2.imread(str(PAGE_IMG))
if page_bgr is None:
    # cv2.imread signals failure by returning None instead of raising.
    raise FileNotFoundError(f'Could not read rasterized page image: {PAGE_IMG}')
H, W = page_bgr.shape[:2]
scale = DPI/72.0  # PDF points -> raster pixels

# text_boxes / image_boxes: (x0, y0, x1, y1) pixel boxes.
# rect_hits / circ_hits: uint8 page-sized masks (1 = covered), or None when
# PyMuPDF is not installed.
vector = {'text_boxes':[], 'image_boxes':[], 'rect_hits':None, 'circ_hits':None}

# pdfplumber text+images
if importlib.util.find_spec('pdfplumber') is not None:
    import pdfplumber
    with pdfplumber.open(PDF_PATH) as pdf:
        page = pdf.pages[PAGE_INDEX]
        words = page.extract_words(use_text_flow=True) or []
        # Bucket words into lines by vertical centre (6 pt bins).
        lines = {}
        for w in words:
            yc = (w['top'] + w['bottom'])/2
            key = round(yc/6)*6
            lines.setdefault(key, []).append(w)
        for ws in lines.values():
            x0 = min(w['x0'] for w in ws); y0 = min(w['top'] for w in ws)
            x1 = max(w['x1'] for w in ws); y1 = max(w['bottom'] for w in ws)
            vector['text_boxes'].append((int(x0*scale), int(y0*scale), int(x1*scale), int(y1*scale)))
        for im in page.images:
            x0,y0,x1,y1 = im['x0'], im['top'], im['x1'], im['bottom']
            vector['image_boxes'].append((int(x0*scale), int(y0*scale), int(x1*scale), int(y1*scale)))


def _mark_bbox(mask, xs, ys):
    """Set to 1 the pixel bounding box spanned by PDF-point coordinates xs/ys."""
    # Clamp to [0, W] / [0, H]: these are slice ends, so W and H are valid.
    bx0 = max(0, min(W, int(min(xs)*scale))); bx1 = max(0, min(W, int(max(xs)*scale)))
    by0 = max(0, min(H, int(min(ys)*scale))); by1 = max(0, min(H, int(max(ys)*scale)))
    mask[by0:by1, bx0:bx1] = 1


# PyMuPDF drawings
if importlib.util.find_spec('fitz') is not None:
    import fitz
    doc = fitz.open(str(PDF_PATH))
    try:
        page = doc[PAGE_INDEX]
        drawings = page.get_drawings()
    finally:
        doc.close()
    rect_hits = np.zeros((H,W), dtype=np.uint8)
    circ_hits = np.zeros_like(rect_hits)
    for d in drawings:
        for it in d['items']:
            # get_drawings() tags items as 'l' (line), 'c' (Bezier curve),
            # 're' (rectangle) and 'qu' (quad); it never emits 'rect'/'curve'.
            kind = it[0]
            if kind == 're':
                rx0, ry0, rx1, ry1 = it[1]
                _mark_bbox(rect_hits, (rx0, rx1), (ry0, ry1))
            elif kind == 'qu':
                quad = it[1]
                corners = (quad.ul, quad.ur, quad.ll, quad.lr)
                _mark_bbox(rect_hits, [p[0] for p in corners], [p[1] for p in corners])
            elif kind == 'c':
                # A curve item carries its four control points in it[1:5];
                # their bounding box approximates the area the curve covers.
                ctrl = it[1:5]
                _mark_bbox(circ_hits, [p[0] for p in ctrl], [p[1] for p in ctrl])
    vector['rect_hits']=rect_hits; vector['circ_hits']=circ_hits

len(vector['text_boxes']), len(vector['image_boxes'])

## 4) Vision proposals + 5-type classification

In [None]:
import cv2, numpy as np, pandas as pd, math

# Edge map -> dilation -> outer contours: each sufficiently large contour
# bounding box becomes a candidate region (x, y, w, h).
gray = cv2.cvtColor(page_bgr, cv2.COLOR_BGR2GRAY)
edges = cv2.Canny(gray, 50, 150)
_dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
dil = cv2.dilate(edges, _dilate_kernel, iterations=2)
cnts, _ = cv2.findContours(dil, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
# Keep boxes that are big enough to matter but do not span (almost) the whole page.
cands = [
    (x, y, w, h)
    for (x, y, w, h) in map(cv2.boundingRect, cnts)
    if w * h > 8000 and w > 60 and h > 60 and w < W * 0.98 and h < H * 0.98
]

def merge_xyxy(boxes, pad=8, width=None, height=None):
    """Pad (x, y, w, h) boxes and merge every group of overlapping ones.

    Args:
        boxes: Iterable of (x, y, w, h) pixel boxes.
        pad: Pixels added on every side before testing for overlap.
        width: Image width used to clamp results; defaults to the
            module-level ``W`` of the current page.
        height: Image height used to clamp results; defaults to the
            module-level ``H`` of the current page.

    Returns:
        List of merged (x0, y0, x1, y1) boxes clamped to
        [0, width-1] x [0, height-1]. Boxes that merely touch are merged.
    """
    if not boxes:
        return []
    if width is None:
        width = W
    if height is None:
        height = H
    pending = [(x - pad, y - pad, x + w + pad, y + h + pad) for (x, y, w, h) in sorted(boxes)]
    changed = True
    # Repeat full passes until one completes without merging anything.
    while changed:
        changed = False
        survivors = []
        while pending:
            ax0, ay0, ax1, ay1 = pending.pop(0)
            for i, (bx0, by0, bx1, by1) in enumerate(pending):
                disjoint = ax1 < bx0 or bx1 < ax0 or ay1 < by0 or by1 < ay0
                if not disjoint:
                    # Absorb the first overlapping box; the grown box is
                    # re-tested against the rest on the next pass.
                    ax0, ay0 = min(ax0, bx0), min(ay0, by0)
                    ax1, ay1 = max(ax1, bx1), max(ay1, by1)
                    del pending[i]
                    changed = True
                    break
            survivors.append((ax0, ay0, ax1, ay1))
        pending = survivors
    return [
        (max(0, x0), max(0, y0), min(width - 1, x1), min(height - 1, y1))
        for (x0, y0, x1, y1) in pending
    ]

# Merge the padded candidate boxes; `regions` holds (x0, y0, x1, y1) pixel boxes.
regions = merge_xyxy(cands)

def table_mask(gray, min_joints=4):
    """Return a uint8 mask (255 = ruled-table grid) for a grayscale image.

    Long horizontal and vertical strokes are isolated with morphological
    opening. Their union forms candidate grids and their intersection marks
    line crossings. Intersecting the two line images alone keeps only the
    tiny crossing points, which can never pass a table-sized bounding-box
    filter, so the union is returned instead. It is restricted to connected
    grid components containing at least ``min_joints`` crossings, which
    drops isolated rules and simple L-shaped chart axes.

    Args:
        gray: Single-channel uint8 image.
        min_joints: Minimum number of line crossings a connected grid
            component must contain to be kept.

    Returns:
        uint8 array of the same shape as ``gray`` with values 0 or 255.
    """
    img_h, img_w = gray.shape[:2]
    thr = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    ink = 255 - thr  # dark strokes become white foreground
    h_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(10, img_w // 60), 1))
    v_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(10, img_h // 60)))
    h_lines = cv2.morphologyEx(ink, cv2.MORPH_OPEN, h_kernel)
    v_lines = cv2.morphologyEx(ink, cv2.MORPH_OPEN, v_kernel)
    grid = cv2.bitwise_or(h_lines, v_lines)
    joints = cv2.bitwise_and(h_lines, v_lines)

    at_joint = joints > 0
    if not at_joint.any():
        return np.zeros_like(grid)

    n_grid, grid_lbl = cv2.connectedComponents(grid)
    _, joint_lbl = cv2.connectedComponents(joints)
    # One (joint id, grid id) row per crossing: every crossing lies inside
    # exactly one grid component because joints is a subset of grid.
    pairs = np.unique(np.stack([joint_lbl[at_joint], grid_lbl[at_joint]], axis=1), axis=0)
    joints_per_grid = np.bincount(pairs[:, 1], minlength=n_grid)
    keep = joints_per_grid >= min_joints
    keep[0] = False  # label 0 is the background
    return np.where(keep[grid_lbl], 255, 0).astype(np.uint8)

def bar_score(roi):
    """Score how much a BGR crop looks like a vertical bar chart.

    Args:
        roi: BGR image crop.

    Returns:
        (score, bars): ``score`` is 0.0 when fewer than three bar-like
        blobs are found; otherwise it grows with the number of bars and
        shrinks as their bottom edges become less aligned. ``bars`` is the
        list of (x, y, w, h) boxes of the bar-like blobs.
    """
    g = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    t = cv2.threshold(cv2.GaussianBlur(g, (3, 3), 0), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    # findContours treats non-zero pixels as objects, so the majority
    # (background) must end up black and the bars white.
    if cv2.countNonZero(t) * 2 > t.size:
        t = cv2.bitwise_not(t)
    t = cv2.morphologyEx(t, cv2.MORPH_CLOSE, cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)), iterations=1)
    cnts, _ = cv2.findContours(t, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # NOTE(review): bars drawn touching a dark axis line will fuse with it
    # into one wide blob and be rejected by the aspect test below.
    bars = []
    for c in cnts:
        x, y, w, h = cv2.boundingRect(c)
        # Bar-like: clearly taller than wide and not a speck.
        if h / max(1.0, w) > 1.3 and w * h > 200 and h > 20:
            bars.append((x, y, w, h))
    if len(bars) < 3:
        return 0.0, bars
    # Bars of one chart share a baseline, so their bottoms should line up.
    bottoms = [y + h for (x, y, w, h) in bars]
    std = np.std(bottoms)
    return float(len(bars) / (1 + std / 5.0)), bars

def pie_score(roi):
    """Score how much a BGR crop looks like a pie chart.

    Returns ``(score, circle, radials)``. When no Hough circle is found the
    result is ``(0.0, None, [])``. Otherwise ``circle`` is ``(cx, cy, r)``
    of the largest circle, ``radials`` lists the Hough line segments that
    pass within 8% of the radius from its centre, and the score is
    ``1.0 + 0.3 * len(radials)``.
    """
    import numpy as np

    def _segment_distance(px, py, a, b):
        # Distance from point (px, py) to the closed segment a-b.
        ax, ay = a; bx, by = b
        lab = math.hypot(bx-ax, by-ay)
        if lab == 0:
            return math.hypot(px-ax, py-ay)
        t = max(0, min(1, ((px-ax)*(bx-ax)+(py-ay)*(by-ay))/(lab*lab)))
        qx = ax+t*(bx-ax); qy = ay+t*(by-ay)
        return math.hypot(px-qx, py-qy)

    smoothed = cv2.medianBlur(cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY), 5)
    found = cv2.HoughCircles(
        smoothed, cv2.HOUGH_GRADIENT, dp=1.2, minDist=30,
        param1=120, param2=40, minRadius=20, maxRadius=0,
    )
    if found is None:
        return 0.0, None, []

    largest = max(np.uint16(np.around(found))[0], key=lambda circ: circ[2])
    cx, cy, r = int(largest[0]), int(largest[1]), int(largest[2])

    edge_map = cv2.Canny(smoothed, 60, 180)
    segments = cv2.HoughLinesP(
        edge_map, 1, np.pi/180, threshold=60,
        minLineLength=int(r*0.6), maxLineGap=10,
    )
    radials = []
    if segments is not None:
        for x1, y1, x2, y2 in segments[:, 0, :]:
            if _segment_distance(cx, cy, (x1, y1), (x2, y2)) < r*0.08:
                radials.append((x1, y1, x2, y2))
    return float(1.0 + 0.3*len(radials)), (cx, cy, r), radials

# Table candidates: large connected components of the ruled-line mask.
tmask = table_mask(gray)
tcnts, _ = cv2.findContours(tmask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
table_regions = []
for c in tcnts:
    x, y, w, h = cv2.boundingRect(c)
    if w*h > 5000 and w > 80 and h > 60:
        table_regions.append((x, y, x+w, y+h))

# text coverage mask
text_mask = np.zeros((H, W), dtype=np.uint8)
for (x0, y0, x1, y1) in vector.get('text_boxes', []):
    cv2.rectangle(text_mask, (x0, y0), (x1, y1), 255, -1)

rect_hits = vector.get('rect_hits'); circ_hits = vector.get('circ_hits')
labels = []
for (x0, y0, x1, y1) in regions:
    if x1 <= x0 or y1 <= y0:
        # A zero-area crop cannot be classified (cvtColor rejects empty images).
        continue
    roi = page_bgr[y0:y1, x0:x1]
    # table overlap check: any intersection larger than 10x10 px wins
    is_table = False
    for (tx0, ty0, tx1, ty1) in table_regions:
        iw = min(x1, tx1) - max(x0, tx0); ih = min(y1, ty1) - max(y0, ty0)
        if iw > 10 and ih > 10:
            is_table = True
            break
    if is_table:
        label = 'table'
    else:
        # Fraction of the region covered by vector text boxes.
        tr = text_mask[y0:y1, x0:x1].mean()/255.0
        if tr > 0.3:
            label = 'text'
        else:
            # Chart scoring is only needed once text has been ruled out.
            bs, _ = bar_score(roi); ps, _, _ = pie_score(roi)
            rsum = int(rect_hits[y0:y1, x0:x1].sum()) if rect_hits is not None else 0
            csum = int(circ_hits[y0:y1, x0:x1].sum()) if circ_hits is not None else 0
            if ps >= max(1.2, bs*1.3) or (csum > rsum and csum > 1000):
                label = 'pie_chart'
            elif bs >= max(1.0, ps*1.2) or (rsum > csum and rsum > 1000):
                label = 'bar_chart'
            else:
                label = 'image_other'
    labels.append({'x0':x0,'y0':y0,'x1':x1,'y1':y1,'w':x1-x0,'h':y1-y0,'label':label})

# Explicit columns keep the frame usable (and the 'label' lookup below valid)
# even when no region was detected on the page.
regions_df = pd.DataFrame(labels, columns=['x0','y0','x1','y1','w','h','label'])
REGIONS_CSV = OUT_DIR / 'regions_5types.csv'
regions_df.to_csv(REGIONS_CSV, index=False)
REGIONS_CSV, regions_df['label'].value_counts().to_dict()

## 5) Extractors per region type

In [None]:
import os, csv, pandas as pd, numpy as np, cv2
from pathlib import Path
# Root folder for all per-region extraction artifacts, created under OUT_DIR.
EXTRACT_DIR = OUT_DIR / 'extracts'
EXTRACT_DIR.mkdir(exist_ok=True)

# --- Utility: crop save ---
def save_crop(label, idx, x0,y0,x1,y1):
    """Write page pixels [y0:y1, x0:x1] to EXTRACT_DIR/'<label>_<idx>.png' and return that path."""
    target = EXTRACT_DIR / f'{label}_{idx}.png'
    cv2.imwrite(target.as_posix(), page_bgr[y0:y1, x0:x1])
    return target

# --- TEXT extraction --- vector preferred, else OCR ---
TEXT_DIR = EXTRACT_DIR / 'text'
TEXT_DIR.mkdir(exist_ok=True)
# Import the submodule explicitly: `import importlib` alone does not
# guarantee that `importlib.util` is loaded.
import importlib.util
if importlib.util.find_spec('pdfplumber') is not None:
    import pdfplumber
    with pdfplumber.open(PDF_PATH) as pdf:
        page = pdf.pages[PAGE_INDEX]
        # Whole-page vector text ('' when the page has no text layer).
        # NOTE(review): full_text is not read by any code visible in this
        # notebook; kept in case later cells rely on it.
        full_text = page.extract_text(x_tolerance=1.5, y_tolerance=2) or ''
else:
    full_text = ''

def extract_text_region(idx, x0,y0,x1,y1):
    """Extract the text of one pixel region and write it to TEXT_DIR.

    The vector text layer (pdfplumber) is tried first. If it yields nothing,
    or pdfplumber is not installed, the raster crop is OCR'd with
    pytesseract when available.

    Args:
        idx: Region number used in the output filename.
        x0, y0, x1, y1: Region box in raster pixel coordinates.

    Returns:
        Path of the written ``text_<idx>.txt`` (empty file when no text
        could be obtained).
    """
    import importlib.util

    txt = ''
    if importlib.util.find_spec('pdfplumber') is not None:
        import pdfplumber
        with pdfplumber.open(PDF_PATH) as pdf:
            page = pdf.pages[PAGE_INDEX]
            # Convert the pixel box to PDF points, then clamp it to the page:
            # within_bbox raises ValueError for a box that is not fully
            # inside the page, which padding and rounding can produce.
            px0, ptop, px1, pbottom = page.bbox
            bx0 = min(max(x0/scale, px0), px1)
            bx1 = min(max(x1/scale, px0), px1)
            by0 = min(max(y0/scale, ptop), pbottom)
            by1 = min(max(y1/scale, ptop), pbottom)
            if bx1 > bx0 and by1 > by0:
                try:
                    cropped = page.within_bbox((bx0, by0, bx1, by1))
                    txt = cropped.extract_text(x_tolerance=1.5, y_tolerance=2) or ''
                except ValueError:
                    # Box rejected by pdfplumber: fall through to OCR.
                    txt = ''
    if not txt and importlib.util.find_spec('pytesseract') is not None:
        import pytesseract
        crop = page_bgr[y0:y1, x0:x1]
        g = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
        g = cv2.threshold(g, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
        txt = pytesseract.image_to_string(g)
    fp = TEXT_DIR / f'text_{idx}.txt'
    with open(fp, 'w', encoding='utf-8') as f:
        f.write(txt or '')
    return fp

# --- TABLE extraction --- OCR word grouping on the region crop (Camelot is not wired in yet) ---
TABLE_DIR = EXTRACT_DIR / 'tables'
TABLE_DIR.mkdir(exist_ok=True)
def extract_table_region(idx, x0,y0,x1,y1):
    """OCR a table region and write its rows to TABLE_DIR/'table_<idx>.csv'.

    The region crop is also saved as a PNG via ``save_crop``. Words returned
    by pytesseract are grouped into rows by vertical centre, with a
    tolerance of half the median word height, and ordered left to right
    within each row. Each OCR word becomes one CSV cell; no column
    alignment is attempted, so rows may have different lengths.

    Args:
        idx: Region number used in the output filenames.
        x0, y0, x1, y1: Region box in raster pixel coordinates.

    Returns:
        Path of the written CSV. If OCR fails for any reason (for example
        pytesseract is missing) the CSV holds the single cell
        ``(table OCR failed)`` and the error is printed.
    """
    save_crop('table', idx, x0,y0,x1,y1)
    out_csv = TABLE_DIR / f'table_{idx}.csv'
    rows = []
    try:
        import pytesseract
        crop = page_bgr[y0:y1, x0:x1]
        g = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
        g = cv2.threshold(g, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
        data = pytesseract.image_to_data(g, output_type=pytesseract.Output.DICT)
        # (centre_y, left, height, text) for every non-empty OCR word.
        words = []
        for i, raw in enumerate(data['text']):
            t = raw.strip()
            if not t:
                continue
            words.append((data['top'][i] + data['height'][i] / 2.0, data['left'][i], data['height'][i], t))
        if words:
            # A fixed pixel bin splits one visual row at high DPI, so the
            # tolerance scales with the typical word height instead.
            heights = sorted(w[2] for w in words)
            tol = max(5.0, 0.5 * heights[len(heights) // 2])
            words.sort()
            line_groups = []
            current = [words[0]]
            row_y = words[0][0]
            for w in words[1:]:
                if w[0] - row_y <= tol:
                    current.append(w)
                    row_y = sum(c[0] for c in current) / len(current)
                else:
                    line_groups.append(current)
                    current = [w]
                    row_y = w[0]
            line_groups.append(current)
            for group in line_groups:
                rows.append([t for _, t in sorted((left, t) for _, left, _, t in group)])
    except Exception as e:
        # Best-effort extraction: report why OCR failed, keep the pipeline going.
        print(f'[table {idx}] OCR failed: {e!r}')
        rows = [['(table OCR failed)']]
    with open(out_csv, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(rows)
    return out_csv

# --- BAR chart extraction --- detect bars, infer y mapping from tick OCR when possible ---
BAR_DIR = EXTRACT_DIR / 'bar_charts'
BAR_DIR.mkdir(exist_ok=True)
def ocr_numeric_ticks(roi, side='left', margin=40):
    """OCR numeric y-axis tick labels from the left or right margin of a crop.

    Args:
        roi: BGR image crop of the chart.
        side: ``'left'`` reads the first ``margin`` columns; any other
            value reads the last ``margin`` columns.
        margin: Width in pixels of the strip that is OCR'd.

    Returns:
        List of ``{'y': centre_row, 'val': float}`` dicts sorted by ``y``
        (row coordinates relative to ``roi``), with entries closer than
        7 px to the previously kept one dropped. Empty when pytesseract is
        not installed.
    """
    import importlib.util
    import re

    h, w = roi.shape[:2]
    sub = roi[:, :min(margin, w)] if side == 'left' else roi[:, max(0, w - margin):]
    if importlib.util.find_spec('pytesseract') is None:
        return []
    import pytesseract
    g = cv2.cvtColor(sub, cv2.COLOR_BGR2GRAY)
    g = cv2.threshold(g, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    data = pytesseract.image_to_data(g, output_type=pytesseract.Output.DICT, config='--psm 6')
    # Plain decimal numbers only. str.isdigit() would also accept characters
    # such as superscript digits that float() cannot parse.
    number = re.compile(r'[-+]?(?:\d+\.?\d*|\.\d+)')
    ticks = []
    for i, txt in enumerate(data['text']):
        # Drop thousands separators and normalise the Unicode minus sign.
        s = txt.strip().replace(',', '').replace('\u2212', '-')
        if not number.fullmatch(s):
            continue
        try:
            val = float(s)
        except ValueError:
            continue
        y = data['top'][i] + data['height'][i] // 2
        ticks.append({'y': y, 'val': val})
    # dedupe by close y
    ticks.sort(key=lambda d: d['y'])
    uniq = []
    for t in ticks:
        if not uniq or abs(t['y'] - uniq[-1]['y']) > 6:
            uniq.append(t)
    return uniq

def bars_from_roi(roi):
    """Return the bar-like blobs of a BGR chart crop, ordered left to right.

    Args:
        roi: BGR image crop.

    Returns:
        List of (x, y, w, h) boxes (crop coordinates) of blobs that are
        clearly taller than wide, sorted by x.
    """
    g = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    t = cv2.threshold(cv2.GaussianBlur(g, (3, 3), 0), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    # findContours treats non-zero pixels as objects, so the majority
    # (background) must end up black and the bars white.
    if cv2.countNonZero(t) * 2 > t.size:
        t = cv2.bitwise_not(t)
    t = cv2.morphologyEx(t, cv2.MORPH_CLOSE, cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)), iterations=1)
    cnts, _ = cv2.findContours(t, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # NOTE(review): bars drawn touching a dark axis line will fuse with it
    # into one wide blob and be rejected by the aspect test below.
    bars = []
    for c in cnts:
        x, y, w, h = cv2.boundingRect(c)
        if h / max(1.0, w) > 1.3 and w * h > 200 and h > 20:
            bars.append((x, y, w, h))
    bars.sort(key=lambda b: b[0])
    return bars

def map_pixels_to_values(bars, ypix_baseline, ypix_top, yval_min, yval_max):
    """Convert bar top edges from pixel rows to axis values by linear mapping.

    Row ``ypix_baseline`` corresponds to ``yval_min`` and row ``ypix_top``
    to ``yval_max``.

    Args:
        bars: Iterable of (x, y, w, h) bar boxes; ``y`` is the top row.
        ypix_baseline: Pixel row of the lower reference.
        ypix_top: Pixel row of the upper reference.
        yval_min: Axis value at ``ypix_baseline``.
        yval_max: Axis value at ``ypix_top``.

    Returns:
        DataFrame with columns index, x, y_top, w, h_px, value (one row per
        bar, ``index`` starting at 1). The columns are present even when
        ``bars`` is empty so callers can always read ``df['value']``.
    """
    # Guard against a zero or negative pixel span.
    span = max(1, ypix_baseline - ypix_top)
    out = []
    for i, (x, y, w, h) in enumerate(bars, start=1):
        ratio = (ypix_baseline - y) / span
        val = yval_min + ratio * (yval_max - yval_min)
        out.append({'index': i, 'x': x, 'y_top': y, 'w': w, 'h_px': h, 'value': val})
    return pd.DataFrame(out, columns=['index', 'x', 'y_top', 'w', 'h_px', 'value'])

def _ocr_bar_labels(crop, bars, baseline):
    """Best-effort OCR of x-axis labels in the strip just below `baseline`.

    Returns one string per bar ('' where nothing could be read). Each bar
    takes the OCR word whose centre is horizontally closest to its own.
    """
    assigned = [''] * len(bars)
    if not bars:
        return assigned
    try:
        import pytesseract
    except ImportError:
        return assigned
    band = crop[min(crop.shape[0]-1, baseline+3): min(crop.shape[0], baseline+50), :]
    if band.size == 0:
        return assigned
    try:
        g = cv2.cvtColor(band, cv2.COLOR_BGR2GRAY)
        g = cv2.threshold(g, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
        data = pytesseract.image_to_data(g, output_type=pytesseract.Output.DICT, config='--psm 6')
    except Exception as exc:
        # Labels are optional: report the failure and continue without them.
        print(f'x-label OCR failed: {exc!r}')
        return assigned
    items = []
    for i, raw in enumerate(data['text']):
        s = raw.strip()
        if s:
            items.append((data['left'][i] + data['width'][i] // 2, s))
    if not items:
        return assigned
    items.sort()
    for j, (x, y, w, h) in enumerate(bars):
        centre = x + w // 2
        assigned[j] = min(items, key=lambda item: abs(item[0] - centre))[1]
    return assigned


def extract_bar_region(idx, x0,y0,x1,y1):
    """Extract bar values from a chart region and redraw the chart.

    Bar tops are mapped to axis values using OCR'd y-axis ticks (left
    margin first, then right). If fewer than two ticks are read, values are
    normalized to 0..1 over the pixel extent of the bars. X labels are
    OCR'd from the strip below the baseline when pytesseract is available.

    Args:
        idx: Region number used in the output filenames and chart title.
        x0, y0, x1, y1: Region box in raster pixel coordinates.

    Returns:
        (csv_path, png_path, mapping): the CSV of per-bar values (with a
        ``label`` column), the recreated chart PNG, and a dict describing
        the pixel-to-value mapping that was used.
    """
    import matplotlib.pyplot as plt

    crop = page_bgr[y0:y1, x0:x1]
    bars = bars_from_roi(crop)
    # try OCR for y ticks on left then right
    ticks = ocr_numeric_ticks(crop,'left') or ocr_numeric_ticks(crop,'right')
    if len(ticks)>=2:
        # Ticks are sorted by row, so first/last are the farthest apart.
        t1=ticks[0]; t2=ticks[-1]
        ypix_baseline = max(t1['y'], t2['y'])
        ypix_top      = min(t1['y'], t2['y'])
        yval_min      = min(t1['val'], t2['val'])
        yval_max      = max(t1['val'], t2['val'])
        source = 'ocr_ticks'
    else:
        # Fallback: normalized 0..1 based on pixel height
        if bars:
            ypix_baseline = max(y+h for (x,y,w,h) in bars)
            ypix_top = min(y for (x,y,w,h) in bars)
        else:
            ypix_baseline = (y1-y0)-5; ypix_top=5
        yval_min, yval_max = 0.0, 1.0
        source = 'normalized'
    df = map_pixels_to_values(bars, ypix_baseline, ypix_top, yval_min, yval_max)
    mapping = {'source':source,'ypix_baseline':ypix_baseline,'ypix_top':ypix_top,'yval_min':yval_min,'yval_max':yval_max}

    # Guarantee the expected columns even when no bars were detected, and
    # always provide a 'label' column so chart recreation cannot KeyError.
    df = df.reindex(columns=['index','x','y_top','w','h_px','value'])
    df['label'] = _ocr_bar_labels(crop, bars, ypix_baseline)
    csv_path = BAR_DIR / f'bar_{idx}.csv'
    df.to_csv(csv_path, index=False)

    # Recreate bar chart. Bars are placed by position so that duplicate or
    # empty labels cannot collapse several bars onto one category.
    positions = list(range(len(df)))
    tick_text = [lab if lab else str(pos) for pos, lab in enumerate(df['label'], start=1)]
    fig, ax = plt.subplots(figsize=(8,5))
    ax.bar(positions, df['value'].astype(float).values)
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_text)
    ax.set_ylabel('Value')
    ax.set_title(f'Bar chart {idx}')
    fig.tight_layout()
    png = BAR_DIR / f'bar_{idx}.png'
    fig.savefig(png.as_posix(), dpi=200)
    plt.show()
    # Release the figure; otherwise one stays open per extracted chart.
    plt.close(fig)
    return csv_path, png, mapping

# --- PIE chart extraction --- determine slice angles → percentages (slice label OCR is not implemented yet) ---
PIE_DIR = EXTRACT_DIR / 'pie_charts'
PIE_DIR.mkdir(exist_ok=True)
import math
def pie_geometry(roi):
    """Find the largest Hough circle in a BGR crop.

    Returns ``(cx, cy, r)`` as ints in crop coordinates, or ``None`` when
    no circle is detected.
    """
    import numpy as np

    smoothed = cv2.medianBlur(cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY), 5)
    found = cv2.HoughCircles(
        smoothed, cv2.HOUGH_GRADIENT, dp=1.2, minDist=30,
        param1=120, param2=40, minRadius=20, maxRadius=0,
    )
    if found is None:
        return None
    largest = max(np.uint16(np.around(found))[0], key=lambda circ: circ[2])
    return int(largest[0]), int(largest[1]), int(largest[2])

def _point_segment_distance(px, py, ax, ay, bx, by):
    """Distance from point (px, py) to the closed segment (ax, ay)-(bx, by)."""
    seg_len = math.hypot(bx - ax, by - ay)
    if seg_len == 0:
        return math.hypot(px - ax, py - ay)
    t = max(0.0, min(1.0, ((px - ax) * (bx - ax) + (py - ay) * (by - ay)) / (seg_len * seg_len)))
    return math.hypot(px - (ax + t * (bx - ax)), py - (ay + t * (by - ay)))


def _cluster_angles(angles, tol):
    """Merge angles (degrees in [0, 360)) closer than `tol`, including across 0/360."""
    if not angles:
        return []
    ordered = sorted(angles)
    groups = [[ordered[0]]]
    for a in ordered[1:]:
        if a - groups[-1][-1] <= tol:
            groups[-1].append(a)
        else:
            groups.append([a])
    # The last group may continue into the first one across the 360 -> 0 seam.
    if len(groups) > 1 and (groups[0][0] + 360.0) - groups[-1][-1] <= tol:
        tail = groups.pop()
        groups[0] = [a - 360.0 for a in tail] + groups[0]
    return sorted((sum(g) / len(g)) % 360.0 for g in groups)


def pie_slices(roi, cx,cy,r, angle_tol=3.0):
    """Estimate pie slices from the radial boundary lines around a circle.

    Hough segments passing within 8% of the radius from the centre are
    treated as slice boundaries. Each boundary angle is taken from the
    endpoint farther from the centre, because the near-centre endpoint has
    no meaningful direction. The nearer endpoint is used too when it lies
    at least half a radius out, i.e. the segment is a diameter. Angles
    within ``angle_tol`` degrees of each other are merged so that the
    double edges of a single stroke do not create sliver slices.

    Args:
        roi: BGR image crop containing the pie.
        cx, cy, r: Circle centre and radius in crop coordinates.
        angle_tol: Merge tolerance in degrees for near-duplicate boundaries.

    Returns:
        List of ``{'slice', 'angle_deg', 'pct'}`` dicts ordered by start
        angle, or ``[]`` when fewer than two distinct boundaries are found.
    """
    edges = cv2.Canny(cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY), 60, 180)
    lines = cv2.HoughLinesP(edges, 1, np.pi/180, threshold=60, minLineLength=int(r*0.6), maxLineGap=10)
    if lines is None:
        return []
    raw_angles = []
    for seg in lines[:, 0, :]:
        # Plain floats avoid fixed-width integer arithmetic on the Hough output.
        x1, y1, x2, y2 = (float(v) for v in seg)
        if _point_segment_distance(cx, cy, x1, y1, x2, y2) >= r*0.08:
            continue
        ends = sorted(
            ((math.hypot(ex - cx, ey - cy), ex, ey) for ex, ey in ((x1, y1), (x2, y2))),
            reverse=True,
        )
        for rank, (dist, ex, ey) in enumerate(ends):
            if rank == 0 or dist >= 0.5 * r:
                raw_angles.append(math.degrees(math.atan2(ey - cy, ex - cx)) % 360.0)
    boundaries = _cluster_angles(raw_angles, angle_tol)
    # Two boundaries already define a valid two-slice pie.
    if len(boundaries) < 2:
        return []
    n = len(boundaries)
    # Consecutive differences, wrapping from the last boundary to the first.
    diffs = [(boundaries[(i + 1) % n] - boundaries[i]) % 360.0 for i in range(n)]
    total = sum(diffs)
    return [{'slice':i+1,'angle_deg':d,'pct':(d/total*100.0 if total else 0)} for i,d in enumerate(diffs)]

def extract_pie_region(idx, x0,y0,x1,y1):
    """Extract slice percentages from a pie region and redraw the chart.

    Falls back to a single 100% slice when no circle or no slice boundaries
    are detected.

    Args:
        idx: Region number used in the output filenames and chart title.
        x0, y0, x1, y1: Region box in raster pixel coordinates.

    Returns:
        (csv_path, png_path) written under PIE_DIR.
    """
    import matplotlib.pyplot as plt

    whole_pie = [{'slice':1,'angle_deg':360,'pct':100.0}]
    crop = page_bgr[y0:y1, x0:x1]
    geom = pie_geometry(crop)
    if geom is None:
        data = whole_pie
    else:
        cx,cy,r = geom
        data = pie_slices(crop, cx,cy,r) or whole_pie
    df = pd.DataFrame(data)
    csv_path = PIE_DIR / f'pie_{idx}.csv'
    df.to_csv(csv_path, index=False)
    # Recreate pie chart; slices are labelled by their number.
    fig, ax = plt.subplots(figsize=(5,5))
    ax.pie(df['pct'].astype(float).values, labels=df['slice'].astype(str).values, autopct='%1.1f%%')
    ax.set_title(f'Pie chart {idx}')
    fig.tight_layout()
    png = PIE_DIR / f'pie_{idx}.png'
    fig.savefig(png.as_posix(), dpi=200)
    plt.show()
    # Release the figure; otherwise one stays open per extracted chart.
    plt.close(fig)
    return csv_path, png

# --- IMAGE regions --- simply export crops ---
IMG_DIR = EXTRACT_DIR / 'images'
IMG_DIR.mkdir(exist_ok=True)
def extract_image_region(idx, x0,y0,x1,y1):
    """Save the page pixels of one region as IMG_DIR/'image_<idx>.png'; return the path.

    The crop is written into IMG_DIR, the folder created for image regions,
    rather than into the root of EXTRACT_DIR.
    """
    out_path = IMG_DIR / f'image_{idx}.png'
    cv2.imwrite(out_path.as_posix(), page_bgr[y0:y1, x0:x1])
    return out_path

# Main extraction loop
# Run the extractor matching each region's label and record what it wrote.
outputs = []
for i, row in regions_df.iterrows():
    number = i + 1  # 1-based region number used in filenames
    x0, y0, x1, y1 = (int(row[c]) for c in ('x0', 'y0', 'x1', 'y1'))
    label = row['label']
    if label in ('text', 'table'):
        extractor = extract_text_region if label == 'text' else extract_table_region
        fp = extractor(number, x0, y0, x1, y1)
        outputs.append({'type': label, 'index': number, 'path': str(fp)})
    elif label == 'bar_chart':
        csvp, pngp, mapping = extract_bar_region(number, x0, y0, x1, y1)
        outputs.append({'type': 'bar_chart', 'index': number, 'csv': str(csvp), 'png': str(pngp), 'mapping': mapping})
    elif label == 'pie_chart':
        csvp, pngp = extract_pie_region(number, x0, y0, x1, y1)
        outputs.append({'type': 'pie_chart', 'index': number, 'csv': str(csvp), 'png': str(pngp)})
    else:
        # Anything unrecognised is exported as a plain image crop.
        p = extract_image_region(number, x0, y0, x1, y1)
        outputs.append({'type': 'image_other', 'index': number, 'path': str(p)})

import json
OUT_JSON = OUT_DIR / 'extraction_summary.json'
with open(OUT_JSON, 'w', encoding='utf-8') as f:
    json.dump(outputs, f, ensure_ascii=False, indent=2)
OUT_JSON, len(outputs)

## 6) Annotated preview + crops

In [None]:
# Draw every labelled region on a copy of the page and save it as a preview.
annot = page_bgr.copy()
colors = {
    'text': (255,0,0),        # BGR colors for OpenCV draw only
    'table': (0,255,255),
    'bar_chart': (0,255,0),
    'pie_chart': (0,165,255),
    'image_other': (0,0,255)
}
for _, r in regions_df.iterrows():
    left, top = int(r.x0), int(r.y0)
    right, bottom = int(r.x1), int(r.y1)
    color = colors.get(r.label, (255,255,255))  # white for unknown labels
    cv2.rectangle(annot, (left, top), (right, bottom), color, 2)
    # Caption sits just above the box, clamped to the top of the image.
    cv2.putText(annot, r.label, (left, max(0, top-5)), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2, cv2.LINE_AA)
PREVIEW = OUT_DIR / 'preview_5types.png'
cv2.imwrite(PREVIEW.as_posix(), annot)
PREVIEW

## 7) Optional: Build a PPTX with extracts (text snippet, first table, first bar/pie)

In [None]:
# Import the submodule explicitly: `import importlib` alone does not
# guarantee that `importlib.util` is loaded.
import importlib.util
if importlib.util.find_spec('pptx') is None:
    print('python-pptx not installed; skipping PPT build.')
else:
    import csv
    from pptx import Presentation
    from pptx.util import Inches
    prs = Presentation()
    # Title
    slide = prs.slides.add_slide(prs.slide_layouts[0])
    slide.shapes.title.text = 'PDF Extract & Recreate Summary'
    slide.placeholders[1].text = f'Page {PAGE_INDEX+1} — {PDF_PATH.name}'
    # Text slide (first)
    texts=[o for o in outputs if o['type']=='text']
    if texts:
        slide2 = prs.slides.add_slide(prs.slide_layouts[1])
        slide2.shapes.title.text = 'Text Extract (snippet)'
        with open(texts[0]['path'],'r',encoding='utf-8') as f:
            snip=f.read()[:2000]
        slide2.placeholders[1].text = snip
    # Table slide (first)
    tables=[o for o in outputs if o['type']=='table']
    if tables:
        # The OCR table CSV is ragged (rows differ in length) and may be
        # empty, so read it with the csv module and pad short rows.
        with open(tables[0]['path'], newline='', encoding='utf-8') as f:
            grid = list(csv.reader(f))
        cols = max((len(cells) for cells in grid), default=0)
        if grid and cols:
            slide3 = prs.slides.add_slide(prs.slide_layouts[6])
            table = slide3.shapes.add_table(len(grid)+1, cols, Inches(0.5), Inches(1), Inches(9), Inches(5)).table
            for j in range(cols):
                table.cell(0,j).text = f'Col {j+1}'
            for i, cells in enumerate(grid, start=1):
                for j in range(cols):
                    table.cell(i,j).text = cells[j] if j < len(cells) else ''
    # Bar chart slide
    bars=[o for o in outputs if o['type']=='bar_chart']
    if bars:
        slide4 = prs.slides.add_slide(prs.slide_layouts[6])
        slide4.shapes.add_picture(bars[0]['png'], Inches(1), Inches(1), width=Inches(8))
    # Pie chart slide
    pies=[o for o in outputs if o['type']=='pie_chart']
    if pies:
        slide5 = prs.slides.add_slide(prs.slide_layouts[6])
        slide5.shapes.add_picture(pies[0]['png'], Inches(2.5), Inches(1.5), width=Inches(5))
    PPTX_PATH = OUT_DIR / 'extract_recreate_summary.pptx'
    prs.save(PPTX_PATH.as_posix())
    # A bare expression inside an `else:` block is not displayed by the
    # notebook, so report the output path explicitly.
    print(PPTX_PATH)

## 8) Notes
- If bar tick OCR fails, the bar values will be **normalized 0–1**. You can set axis min/max manually by editing the `extract_bar_region` function.
- Pie slice detection uses radial line detection; for very stylized pies, consider legend-percentage OCR.
- Tables are currently extracted by OCR word grouping on the region crop. Camelot (listed in the install cell) is not wired in yet, but it is typically more accurate on vector PDFs.
- Run one page at a time; to process multiple pages, loop over `PAGE_INDEX` and aggregate outputs.