In [1]:
import os
import json
import uuid
import pdfplumber
import camelot
import fitz  # PyMuPDF
import pandas as pd
from PIL import Image


In [3]:
def save_table_image(pdf_path, page_number, bbox, out_path):
    """
    Render one rectangular region of a PDF page and write it to an image file.

    Args:
        pdf_path: Path of the PDF, opened with PyMuPDF (``fitz``).
        page_number: 1-based page number; converted to a 0-based index here.
        bbox: (x0, top, x1, bottom) in pdfplumber coords.
        out_path: Destination path passed to ``PIL.Image.save``.

    Raises:
        ValueError: If ``page_number`` is smaller than 1.
    """
    # Without this guard page_number=0 would index doc[-1] and quietly
    # render a page counted from the end of the document.
    if page_number < 1:
        raise ValueError(f"page_number must be >= 1, got {page_number}")

    doc = fitz.open(pdf_path)
    try:
        page = doc[page_number - 1]

        # The four bbox values are used unchanged as the clip rectangle.
        # NOTE(review): assumes an unrotated page whose origin is (0, 0), so
        # pdfplumber and PyMuPDF coordinates coincide - confirm for odd PDFs.
        rect = fitz.Rect(bbox[0], bbox[1], bbox[2], bbox[3])
        pix = page.get_pixmap(clip=rect, dpi=200)

        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        img.save(out_path)
    finally:
        # Always release the document handle; this function is called once
        # per detected table, so a leak here accumulates quickly.
        doc.close()


In [4]:
def normalize_table_for_embedding(df, title=None):
    """
    Flatten a table into a plain-text block suitable for embedding.

    The text lists an optional title line, then every column label, then one
    "Row N: ..." line per row. Cells whose string form is blank after
    stripping are left out, and rows with no remaining cells are skipped.
    N is the row's index label plus one.

    Args:
        df: DataFrame to describe.
        title: Optional table title; omitted from the text when falsy.

    Returns:
        The newline-joined description as a single string.
    """
    parts = [f"Table Title: {title}"] if title else []

    parts.append("Columns:")
    parts.extend(f"- {name}" for name in df.columns)

    parts.append("\nRows:")
    for label, record in df.iterrows():
        kept_cells = [str(cell) for cell in record if str(cell).strip()]
        if not kept_cells:
            continue
        parts.append(f"Row {label + 1}: {', '.join(kept_cells)}")

    return "\n".join(parts)


In [5]:
def table_confidence_score(df):
    """
    Score how trustworthy an extracted table looks, based on how full it is.

    A cell counts as empty when it is missing (None/NaN) or when its string
    form is blank after stripping whitespace - the same blank test used when
    building the embedding text. The fill ratio is then bucketed into one of
    four fixed scores.

    Args:
        df: DataFrame holding the extracted table.

    Returns:
        0.9, 0.75, 0.5 or 0.3 (float). A table with no cells at all returns
        0.3, since there is nothing to be confident about.
    """
    total_cells = df.shape[0] * df.shape[1]
    if total_cells == 0:
        # Previously an empty table produced fill_ratio == 1 and the top score.
        return 0.3

    # Iterate raw values instead of comparing frames so duplicate column
    # labels (common in PDF header rows) cannot cause alignment errors.
    empty_cells = sum(
        1
        for value in df.to_numpy(dtype=object).ravel()
        if pd.isna(value) or not str(value).strip()
    )

    fill_ratio = 1 - (empty_cells / total_cells)

    if fill_ratio > 0.85:
        return 0.9
    elif fill_ratio > 0.6:
        return 0.75
    elif fill_ratio > 0.4:
        return 0.5
    else:
        return 0.3


In [6]:
def _unique_column_names(raw_names):
    """Return string column names with blanks filled in and repeats suffixed (_2, _3, ...)."""
    used = set()
    names = []
    for position, raw in enumerate(raw_names, start=1):
        base = "" if raw is None else str(raw)
        if not base.strip():
            base = f"column_{position}"
        candidate, suffix = base, 1
        while candidate in used:
            suffix += 1
            candidate = f"{base}_{suffix}"
        used.add(candidate)
        names.append(candidate)
    return names


def extract_tables_from_pdf(pdf_path, output_dir="tables_output"):
    """
    Detect every table in a PDF and export each one as JSON plus a PNG crop.

    Tables are located with pdfplumber. For each detected table the content
    is read with Camelot (stream flavor) restricted to the detected area; if
    that yields nothing, pdfplumber's own extraction is used with the first
    extracted row as the header. Each table is scored, rendered to an image,
    converted to embedding text, and written to ``<output_dir>/<table_id>.json``.

    Args:
        pdf_path: Path of the PDF to process.
        output_dir: Directory for the JSON and PNG files; created if missing.

    Returns:
        A list with one dict per exported table, containing the keys
        table_id, page, bbox, rows, columns, extraction_method, confidence,
        needs_review, table_data, embedding_text and image_path.
    """
    os.makedirs(output_dir, exist_ok=True)

    results = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_index, page in enumerate(pdf.pages, start=1):

            detected_tables = page.find_tables()

            for t_index, table in enumerate(detected_tables):
                table_id = f"table_{page_index}_{t_index}_{uuid.uuid4().hex[:6]}"
                bbox = table.bbox

                df = None
                extraction_method = None

                # pdfplumber measures y downward from the top of the page,
                # while Camelot's table_areas expects "x1,y1,x2,y2" (left-top,
                # right-bottom) in PDF space with the origin at the bottom-left,
                # so the two y values are flipped against the page height.
                # NOTE(review): assumes an unrotated page with origin (0, 0).
                x0, top, x1, bottom = (float(value) for value in bbox)
                page_height = float(page.height)
                camelot_area = ",".join(
                    map(str, (x0, page_height - top, x1, page_height - bottom))
                )

                # 1️⃣ Try Camelot (stream)
                try:
                    camelot_tables = camelot.read_pdf(
                        pdf_path,
                        pages=str(page_index),
                        flavor="stream",
                        table_areas=[camelot_area]
                    )
                    if camelot_tables and not camelot_tables[0].df.empty:
                        df = camelot_tables[0].df
                        extraction_method = "camelot_stream"
                except Exception:
                    # Best effort: any Camelot failure falls through to
                    # pdfplumber. KeyboardInterrupt/SystemExit still propagate.
                    df = None

                # 2️⃣ Fallback to pdfplumber
                if df is None:
                    try:
                        extracted = table.extract()
                        if not extracted:
                            # Nothing extracted, so there is no header row.
                            continue
                        # Header cells may be None or repeated; non-unique
                        # labels would make to_dict(orient="records") drop
                        # columns, so they are made unique first.
                        df = pd.DataFrame(
                            extracted[1:],
                            columns=_unique_column_names(extracted[0]),
                        )
                        extraction_method = "pdfplumber"
                    except Exception:
                        # Skip tables that cannot be read by either method.
                        continue

                # Clean dataframe
                df = df.fillna("").astype(str)

                confidence = table_confidence_score(df)

                # Save table image
                img_path = os.path.join(output_dir, f"{table_id}.png")
                save_table_image(pdf_path, page_index, bbox, img_path)

                # Normalize text
                normalized_text = normalize_table_for_embedding(df)

                # Save JSON
                table_json = {
                    "table_id": table_id,
                    "page": page_index,
                    "bbox": bbox,
                    "rows": df.shape[0],
                    "columns": df.shape[1],
                    "extraction_method": extraction_method,
                    "confidence": confidence,
                    "needs_review": confidence < 0.6,
                    "table_data": df.to_dict(orient="records"),
                    "embedding_text": normalized_text,
                    "image_path": img_path
                }

                json_path = os.path.join(output_dir, f"{table_id}.json")
                with open(json_path, "w", encoding="utf-8") as f:
                    json.dump(table_json, f, indent=2)

                results.append(table_json)

    return results


In [7]:
# Input PDF for this run, given relative to the notebook's working directory.
pdf_path = "data/research_paper/ocr_llm.pdf"

# Run the full pipeline; JSON/PNG files go to the default "tables_output" dir.
tables = extract_tables_from_pdf(pdf_path)

# Report how many table records were returned.
print(f"Extracted {len(tables)} tables")


  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  "table_data": df.to_dict(orient="records"),
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)


Extracted 6 tables
