<a href="https://colab.research.google.com/github/smypmsa/pdf-to-table/blob/main/OCR_ed_PDF_to_Excel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📊 PDF Table Extractor for Excel

## Overview
This notebook extracts high-quality tables from PDF files and formats them for Excel. Perfect for processing OCR-ed documents with structured data.

## How to Use
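1. Click the **"Install Dependencies"** button once per session
2. Click the **"Extract Tables from PDF"** button for each file (use the **"with OCR"** variant for scanned, image-based PDFs)
3. Open the downloaded `<name>_tables.xlsx` file in Excel (the notebook downloads it automatically when extraction finishes)
4. Run the **"Clear Input & Output Directory"** cell between files to remove uploaded and generated files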

---

In [5]:
#@title 🔧 Install Dependencies (Run Once) { display-mode: "form" }
#@markdown Click the **Run** button to install required packages. This takes 2-3 minutes.

import sys
import subprocess

def install_packages():
    """Install the system and Python packages this notebook depends on.

    Runs ``apt-get update``, ``apt-get install`` (ghostscript, python3-tk,
    poppler-utils) and ``pip install`` for the Python libraries, capturing
    their output. If every command exits with status 0, the pandas display
    options are set and a success message is printed. If any command fails
    (non-zero exit status or the executable cannot be started), the failures
    are listed and no success message is printed.

    Returns:
        None. Nothing is returned on purpose so that calling this function as
        the last statement of a cell does not display a value.
    """
    print("🔧 Installing required packages...")
    print("⏳ This may take 2-3 minutes...")

    steps = [
        # System dependencies
        ("apt-get update", ['apt-get', 'update', '-qq']),
        ("apt-get install", ['apt-get', 'install', '-y', 'ghostscript', 'python3-tk', 'poppler-utils', '-qq']),
        # Python packages
        ("pip install", [sys.executable, '-m', 'pip', 'install', 'camelot-py[cv]', 'tabula-py', 'pandas', 'openpyxl', 'easyocr', 'pdf2image', 'Pillow', 'opencv-python', '-q']),
    ]

    failures = []
    for label, command in steps:
        try:
            completed = subprocess.run(command, capture_output=True, text=True, errors='replace')
        except OSError as exc:
            # The executable itself could not be launched (e.g. apt-get is missing).
            failures.append((label, str(exc)))
            continue
        if completed.returncode != 0:
            # Output is captured, so surface it here instead of silently dropping it.
            detail = (completed.stderr or completed.stdout or '').strip()
            failures.append((label, detail))

    if failures:
        print("❌ Installation finished with errors:")
        for label, detail in failures:
            last_line = detail.splitlines()[-1] if detail else "no error output"
            print(f"   - {label}: {last_line}")
        print("💡 Re-run this cell; if the error persists, install the failing package manually")
        return

    # Set display options for better output
    import pandas as pd
    pd.set_option('display.max_rows', 20)
    pd.set_option('display.max_columns', 10)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', 50)

    print("✅ Installation complete!")
    print("👇 Scroll down to extract tables from PDFs")

# Run the installation as soon as this cell is executed.
install_packages()

🔧 Installing required packages...
⏳ This may take 1-2 minutes...
✅ Installation complete!
👇 Scroll down to extract tables from PDFs


In [6]:
#@title 📊 Extract Tables from PDF { display-mode: "form" }
#@markdown Click **Run** and then **Choose Files** to upload your PDF and extract tables.

# Import libraries (hidden)
import camelot
import tabula
import pandas as pd
import io
from google.colab import files
import tempfile
import os

def extract_tables_camelot(file_path, method='lattice'):
    """Read all pages of a PDF with Camelot and return the tables it finds.

    Args:
        file_path: Path of the PDF passed to ``camelot.read_pdf``.
        method: Camelot flavor to try first. When it is ``'lattice'`` and no
            table is found, the PDF is read a second time with ``'stream'``.

    Returns:
        The object returned by ``camelot.read_pdf``, or None when any
        exception is raised (the error message is printed).
    """
    try:
        found = camelot.read_pdf(file_path, flavor=method, pages='all')
        stream_retry_needed = len(found) == 0 and method == 'lattice'
        if not stream_retry_needed:
            return found
        # Lattice found nothing; stream is the second attempt.
        print("🔄 Trying stream method for borderless tables...")
        return camelot.read_pdf(file_path, flavor='stream', pages='all')
    except Exception as error:
        print(f"❌ Camelot error: {str(error)}")
        return None

def extract_tables_tabula(file_path):
    """Fallback extractor: read all pages of a PDF with Tabula.

    Args:
        file_path: Path of the PDF passed to ``tabula.read_pdf``.

    Returns:
        The value returned by ``tabula.read_pdf(..., multiple_tables=True)``,
        or None when any exception is raised (the error message is printed).
    """
    try:
        return tabula.read_pdf(file_path, pages='all', multiple_tables=True)
    except Exception as error:
        print(f"❌ Tabula error: {str(error)}")
    return None

def format_for_excel(tables, method='camelot'):
    """Combine extracted tables into one DataFrame with leading metadata columns.

    Args:
        tables: For ``method='camelot'``, a sequence of objects exposing
            ``df``, ``page`` and ``accuracy``. For any other method, a
            sequence of pandas DataFrames.
        method: ``'camelot'`` selects the table-object branch; every other
            value treats each item as a DataFrame.

    Returns:
        A DataFrame whose first columns are ``Table_#``, ``Page`` and
        ``Quality`` followed by the table's own columns, with all non-empty
        tables stacked; None when ``tables`` is empty or every table is empty.
        ``Table_#`` is the 1-based position in ``tables`` (empty tables keep
        their number, so numbering can have gaps).
    """
    if not tables:
        return None

    meta_names = ('Table_#', 'Page', 'Quality')

    def _with_meta(frame, meta_values):
        # DataFrame.insert raises ValueError when the label already exists, so a
        # PDF table that has its own "Page" header would abort the whole run.
        # Rename the clashing data column instead of failing.
        taken = set(frame.columns)
        renames = {}
        for name in meta_names:
            if name in taken:
                replacement = f"{name}_data"
                while replacement in taken:
                    replacement += "_"
                renames[name] = replacement
                taken.add(replacement)
        # rename() returns a new frame; copy() keeps the caller's frame untouched.
        result = frame.rename(columns=renames) if renames else frame.copy()
        for position, (name, value) in enumerate(zip(meta_names, meta_values)):
            result.insert(position, name, value)
        return result

    frames = []
    for number, table in enumerate(tables, start=1):
        data = table.df if method == 'camelot' else table
        if data.empty:
            continue
        if method == 'camelot':
            meta_values = (number, table.page, f"{table.accuracy:.1f}%")
        else:
            meta_values = (number, 'Auto', 'N/A')
        frames.append(_with_meta(data, meta_values))

    return pd.concat(frames, ignore_index=True) if frames else None

# Main processing: upload PDFs, extract tables from each one, download the result as XLSX
print("📁 Select your PDF file:")
uploaded = files.upload()

if uploaded:
    for filename, file_content in uploaded.items():
        print(f"\n🔄 Processing: {filename}")

        # The extractors take a file path, so write the uploaded bytes to a
        # temporary .pdf file (delete=False: it is removed in the finally below).
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
            tmp_file.write(file_content)
            tmp_file_path = tmp_file.name

        try:
            # Extract tables: Camelot first, Tabula as fallback
            print("🎯 Using Camelot for extraction...")
            camelot_tables = extract_tables_camelot(tmp_file_path, 'lattice')

            if camelot_tables and len(camelot_tables) > 0:
                print(f"✅ Found {len(camelot_tables)} tables with Camelot")
                for i, table in enumerate(camelot_tables):
                    print(f"   Table {i+1}: Page {table.page}, Quality: {table.accuracy:.1f}%")
                excel_df = format_for_excel(camelot_tables, 'camelot')
                method = 'Camelot'
            else:
                print("🔄 Trying Tabula...")
                tabula_tables = extract_tables_tabula(tmp_file_path)
                if tabula_tables and len(tabula_tables) > 0:
                    print(f"✅ Found {len(tabula_tables)} tables with Tabula")
                    excel_df = format_for_excel(tabula_tables, 'tabula')
                    method = 'Tabula'
                else:
                    excel_df = None
                    method = None

            # Create and download XLSX file
            if excel_df is not None and not excel_df.empty:
                table_count = excel_df['Table_#'].nunique()
                print(f"\n📈 Extracted {table_count} tables using {method}")

                # Strip only a trailing ".pdf" extension, in any letter case.
                # str.replace('.pdf', '') also removed ".pdf" from the middle of a
                # name and left upper-case ".PDF" extensions in place.
                base_name, extension = os.path.splitext(filename)
                if extension.lower() != '.pdf':
                    base_name = filename
                excel_filename = f"{base_name}_tables.xlsx"
                excel_df.to_excel(excel_filename, index=False)
                files.download(excel_filename)

                print(f"\n💾 Downloaded: {excel_filename}")
                print("✅ Processing complete!")

            else:
                print("❌ No tables found")
                print("💡 Ensure PDF contains structured tables")

        finally:
            # Always remove the temporary PDF, even if extraction raised.
            os.unlink(tmp_file_path)
else:
    print("❌ No file uploaded")

📁 Select your PDF file:


Saving Sample A_EN.pdf to Sample A_EN.pdf
Saving Sample B_EN.pdf to Sample B_EN.pdf
Saving Sample C_EN.pdf to Sample C_EN.pdf
Saving Sample D_EN.pdf to Sample D_EN.pdf

🔄 Processing: Sample A_EN.pdf
🎯 Using Camelot for extraction...
✅ Found 9 tables with Camelot
   Table 1: Page 1, Quality: 95.0%
   Table 2: Page 1, Quality: 93.3%
   Table 3: Page 2, Quality: 95.4%
   Table 4: Page 2, Quality: 93.0%
   Table 5: Page 3, Quality: 95.6%
   Table 6: Page 4, Quality: 95.8%
   Table 7: Page 5, Quality: 94.1%
   Table 8: Page 6, Quality: 94.8%
   Table 9: Page 7, Quality: 94.9%

📈 Extracted 9 tables using Camelot


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


💾 Downloaded: Sample A_EN_tables.xlsx
✅ Processing complete!

🔄 Processing: Sample B_EN.pdf
🎯 Using Camelot for extraction...
✅ Found 2 tables with Camelot
   Table 1: Page 1, Quality: 96.2%
   Table 2: Page 1, Quality: 67.9%

📈 Extracted 2 tables using Camelot


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


💾 Downloaded: Sample B_EN_tables.xlsx
✅ Processing complete!

🔄 Processing: Sample C_EN.pdf
🎯 Using Camelot for extraction...
✅ Found 1 tables with Camelot
   Table 1: Page 1, Quality: 95.5%

📈 Extracted 1 tables using Camelot


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


💾 Downloaded: Sample C_EN_tables.xlsx
✅ Processing complete!

🔄 Processing: Sample D_EN.pdf
🎯 Using Camelot for extraction...
✅ Found 1 tables with Camelot
   Table 1: Page 1, Quality: 91.0%

📈 Extracted 1 tables using Camelot


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


💾 Downloaded: Sample D_EN_tables.xlsx
✅ Processing complete!


In [None]:
#@title 📊 Extract Tables from PDF (with OCR) { display-mode: "form" }
#@markdown Click **Run** and then **Choose Files** to upload your PDF and extract tables.

# Import libraries (hidden)
import camelot
import tabula
import pandas as pd
import io
from google.colab import files
import tempfile
import os
import easyocr
from pdf2image import convert_from_path
from PIL import Image
import cv2
import numpy as np
import re
from typing import List, Dict, Any

# Initialize OCR reader (this will download models on first use).
# The reader is created once at cell level; extract_tables_ocr() reuses this
# same `ocr_reader` object for every page it processes.
print("🔍 Initializing OCR engine...")
ocr_reader = easyocr.Reader(['en'])  # Add more languages as needed: ['en', 'es', 'fr', etc.]
print("✅ OCR ready!")

def is_image_based_pdf(file_path):
    """Guess whether a PDF needs OCR by probing its first page with Camelot.

    Page 1 is read with the ``lattice`` flavor and, if that is inconclusive,
    with the ``stream`` flavor. The PDF counts as text-based as soon as the
    first table returned by a flavor is non-empty and its cells, joined with
    spaces, contain more than 50 characters.

    Args:
        file_path: Path of the PDF to probe.

    Returns:
        False when extractable text was found; True otherwise, including when
        the probe raises (OCR is the default when uncertain).
    """
    min_text_length = 50  # Reasonable text threshold
    try:
        for flavor in ('lattice', 'stream'):
            probe = camelot.read_pdf(file_path, pages='1', flavor=flavor)
            if len(probe) == 0 or probe[0].df.empty:
                continue
            # Judge by the amount of text in the first detected table only.
            first_df = probe[0].df
            joined_text = ' '.join(first_df.values.flatten().astype(str))
            if len(joined_text.strip()) > min_text_length:
                return False
    except Exception as error:
        print(f"⚠️ Error checking PDF type: {str(error)}")
    # Either nothing meaningful was extracted or the probe failed: assume image-based.
    return True

def extract_tables_camelot(file_path, method='lattice'):
    """Extract tables from every page of a PDF using Camelot.

    Args:
        file_path: Path of the PDF passed to ``camelot.read_pdf``.
        method: Camelot flavor for the first attempt. If it is ``'lattice'``
            and the result is empty, a second attempt uses ``'stream'``.

    Returns:
        Whatever ``camelot.read_pdf`` returned for the last attempt, or None
        if an exception was raised (the error message is printed).
    """
    try:
        result = camelot.read_pdf(file_path, flavor=method, pages='all')
        if len(result) == 0 and method == 'lattice':
            # No ruled tables detected; retry with the stream flavor.
            print("🔄 Trying stream method for borderless tables...")
            result = camelot.read_pdf(file_path, flavor='stream', pages='all')
    except Exception as error:
        print(f"❌ Camelot error: {str(error)}")
        return None
    else:
        return result

def extract_tables_tabula(file_path):
    """Backup extractor that reads every page of a PDF with Tabula.

    Args:
        file_path: Path of the PDF passed to ``tabula.read_pdf``.

    Returns:
        The result of ``tabula.read_pdf`` called with ``pages='all'`` and
        ``multiple_tables=True``, or None if an exception was raised (the
        error message is printed).
    """
    try:
        extracted = tabula.read_pdf(file_path, pages='all', multiple_tables=True)
    except Exception as error:
        print(f"❌ Tabula error: {str(error)}")
        return None
    else:
        return extracted

def detect_table_structure(text_blocks, row_tolerance=10, min_confidence=0.5):
    """Arrange OCR text blocks into rows ordered top-to-bottom, left-to-right.

    Blocks are sorted by the vertical centre of their bounding box and grouped
    into rows: a block joins the current row while its centre lies within
    ``row_tolerance`` pixels of the centre of the row's first (top-most) block.
    Grouping by fixed ``round(y / 10) * 10`` buckets was replaced because two
    blocks on the same line could straddle a bucket boundary (centres 24 and
    26 fell into buckets 20 and 30) and be split into separate rows.

    Args:
        text_blocks: Iterable of ``(bbox, text, confidence)`` tuples. Only
            ``bbox[0][0]`` (x), ``bbox[0][1]`` and ``bbox[2][1]`` (y of two
            opposite corners) are read.
        row_tolerance: Maximum vertical distance, in pixels, between a block's
            centre and the centre of the first block of its row.
        min_confidence: Blocks whose confidence is not strictly greater than
            this value are ignored.

    Returns:
        A list of rows; each row is a list of the stripped, non-empty texts of
        its blocks sorted by x. Rows without any text are omitted.
    """
    cells = []
    for bbox, text, confidence in text_blocks:
        if confidence > min_confidence:  # Filter low-confidence text
            cells.append({
                'text': text.strip(),
                'x': bbox[0][0],
                'y': (bbox[0][1] + bbox[2][1]) / 2,
            })

    # Walk the blocks from top to bottom and start a new row whenever a block
    # is further than row_tolerance below the block that opened the current row.
    cells.sort(key=lambda cell: cell['y'])
    grouped = []
    anchor_y = None
    for cell in cells:
        if anchor_y is None or cell['y'] - anchor_y > row_tolerance:
            grouped.append([])
            anchor_y = cell['y']
        grouped[-1].append(cell)

    rows = []
    for group in grouped:
        ordered = sorted(group, key=lambda cell: cell['x'])
        texts = [cell['text'] for cell in ordered if cell['text']]
        if texts:  # Remove empty rows
            rows.append(texts)
    return rows

def extract_tables_ocr(file_path):
    """Extract one table per page from a PDF by running OCR on page images.

    Each page is rendered at 300 dpi, passed to ``ocr_reader.readtext`` and the
    results are arranged into rows by ``detect_table_structure``. A page
    yields a table only when at least two rows are detected; shorter rows are
    padded with empty strings to the width of the longest row.

    Args:
        file_path: Path of the PDF passed to ``convert_from_path``.

    Returns:
        A list of table-like objects exposing ``df`` (DataFrame), ``page``
        (1-based page number) and ``accuracy`` (fixed at 85.0), or None when
        any exception is raised (the error message is printed).
    """
    try:
        print("🔍 Converting PDF to images...")
        page_images = convert_from_path(file_path, dpi=300)

        ocr_tables = []
        for page_num, page_image in enumerate(page_images, 1):
            print(f"📄 Processing page {page_num} with OCR...")

            # Convert the PIL page image to a NumPy array before running OCR
            ocr_results = ocr_reader.readtext(np.array(page_image))
            if not ocr_results:
                print(f"   ⚠️ No text detected on page {page_num}")
                continue

            rows = detect_table_structure(ocr_results)
            if not rows or len(rows) <= 1:  # At least 2 rows for a table
                print(f"   ⚠️ No structured table found on page {page_num}")
                continue

            # Pad rows so every row has the same number of columns
            max_cols = max(len(row) for row in rows)
            padded_rows = [row + [''] * (max_cols - len(row)) for row in rows]

            # Table-like object with the same attributes the Camelot path reads
            table_obj = type('Table', (), {
                'df': pd.DataFrame(padded_rows),
                'page': page_num,
                'accuracy': 85.0  # Estimated accuracy for OCR
            })
            ocr_tables.append(table_obj)
            print(f"   ✅ Found table with {len(rows)} rows, {max_cols} columns")

        return ocr_tables

    except Exception as error:
        print(f"❌ OCR error: {str(error)}")
        return None

def format_for_excel(tables, method='camelot'):
    """Combine extracted tables into one DataFrame with leading metadata columns.

    Args:
        tables: For ``method`` ``'camelot'`` or ``'ocr'``, a sequence of
            objects exposing ``df``, ``page`` and ``accuracy``. For any other
            method, a sequence of pandas DataFrames.
        method: Name of the extraction method. It selects the branch above and
            is written, upper-cased, into the ``Method`` column.

    Returns:
        A DataFrame whose first columns are ``Table_#``, ``Page``, ``Quality``
        and ``Method`` followed by the table's own columns, with all non-empty
        tables stacked; None when ``tables`` is empty or every table is empty.
        ``Table_#`` is the 1-based position in ``tables`` (empty tables keep
        their number, so numbering can have gaps).
    """
    if not tables:
        return None

    meta_names = ('Table_#', 'Page', 'Quality', 'Method')
    has_table_objects = method in ['camelot', 'ocr']

    def _with_meta(frame, meta_values):
        # DataFrame.insert raises ValueError when the label already exists, so a
        # PDF table that has its own "Page" or "Method" header would abort the
        # whole run. Rename the clashing data column instead of failing.
        taken = set(frame.columns)
        renames = {}
        for name in meta_names:
            if name in taken:
                replacement = f"{name}_data"
                while replacement in taken:
                    replacement += "_"
                renames[name] = replacement
                taken.add(replacement)
        # rename() returns a new frame; copy() keeps the caller's frame untouched.
        result = frame.rename(columns=renames) if renames else frame.copy()
        for position, (name, value) in enumerate(zip(meta_names, meta_values)):
            result.insert(position, name, value)
        return result

    frames = []
    for number, table in enumerate(tables, start=1):
        data = table.df if has_table_objects else table
        if data.empty:
            continue
        if has_table_objects:
            meta_values = (number, table.page, f"{table.accuracy:.1f}%", method.upper())
        else:
            meta_values = (number, 'Auto', 'N/A', method.upper())
        frames.append(_with_meta(data, meta_values))

    return pd.concat(frames, ignore_index=True) if frames else None

# Main processing: upload PDFs, pick OCR or standard extraction per file, download XLSX
print("📁 Select your PDF file:")
uploaded = files.upload()

if uploaded:
    for filename, file_content in uploaded.items():
        print(f"\n🔄 Processing: {filename}")

        # The extractors take a file path, so write the uploaded bytes to a
        # temporary .pdf file (delete=False: it is removed in the finally below).
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
            tmp_file.write(file_content)
            tmp_file_path = tmp_file.name

        try:
            # Check if PDF is image-based
            print("🔍 Analyzing PDF type...")
            is_image_pdf = is_image_based_pdf(tmp_file_path)

            if is_image_pdf:
                print("📷 Detected image-based PDF - using OCR")
                tables = extract_tables_ocr(tmp_file_path)
                method = 'OCR'

                if tables and len(tables) > 0:
                    print(f"✅ Found {len(tables)} tables with OCR")
                    for i, table in enumerate(tables):
                        print(f"   Table {i+1}: Page {table.page}, Quality: {table.accuracy:.1f}%")
                    excel_df = format_for_excel(tables, 'ocr')
                else:
                    excel_df = None
            else:
                print("📄 Detected text-based PDF - using standard extraction")
                # Extract tables using standard methods: Camelot, then Tabula, then OCR
                print("🎯 Using Camelot for extraction...")
                camelot_tables = extract_tables_camelot(tmp_file_path, 'lattice')

                if camelot_tables and len(camelot_tables) > 0:
                    print(f"✅ Found {len(camelot_tables)} tables with Camelot")
                    for i, table in enumerate(camelot_tables):
                        print(f"   Table {i+1}: Page {table.page}, Quality: {table.accuracy:.1f}%")
                    excel_df = format_for_excel(camelot_tables, 'camelot')
                    method = 'Camelot'
                else:
                    print("🔄 Trying Tabula...")
                    tabula_tables = extract_tables_tabula(tmp_file_path)
                    if tabula_tables and len(tabula_tables) > 0:
                        print(f"✅ Found {len(tabula_tables)} tables with Tabula")
                        excel_df = format_for_excel(tabula_tables, 'tabula')
                        method = 'Tabula'
                    else:
                        print("🔄 Fallback to OCR for text-based PDF...")
                        tables = extract_tables_ocr(tmp_file_path)
                        if tables and len(tables) > 0:
                            excel_df = format_for_excel(tables, 'ocr')
                            method = 'OCR (Fallback)'
                        else:
                            excel_df = None
                            method = None

            # Create and download XLSX file
            if excel_df is not None and not excel_df.empty:
                table_count = excel_df['Table_#'].nunique()
                print(f"\n📈 Extracted {table_count} tables using {method}")

                # Strip only a trailing ".pdf" extension, in any letter case.
                # str.replace('.pdf', '') also removed ".pdf" from the middle of a
                # name and left upper-case ".PDF" extensions in place.
                base_name, extension = os.path.splitext(filename)
                if extension.lower() != '.pdf':
                    base_name = filename
                excel_filename = f"{base_name}_tables.xlsx"
                excel_df.to_excel(excel_filename, index=False)
                files.download(excel_filename)

                print(f"\n💾 Downloaded: {excel_filename}")
                print("✅ Processing complete!")

            else:
                print("❌ No tables found")
                print("💡 Tips:")
                print("   - Ensure PDF contains structured tables")
                print("   - Try with higher quality scans for image-based PDFs")
                print("   - Tables should have clear row/column structure")

        finally:
            # Always remove the temporary PDF, even if extraction raised.
            os.unlink(tmp_file_path)
else:
    print("❌ No file uploaded")

In [None]:
# @title 🗑️ Clear Input & Output Directory

import shutil
from pathlib import Path
import os

# Define the directories to be cleared
directories_to_clear = ["/content/sample_data"]

# Also clear uploaded files in /content (but preserve system folders).
# "sample_data" appears in both lists: its contents are removed above via
# directories_to_clear, while the (re-created) folder itself is kept here.
content_dir = Path("/content")
system_folders = {".config", "sample_data", "__pycache__"}

# Folders that are never removed. "drive" is the conventional Google Drive
# mount point in Colab; running rmtree on it would delete the user's Drive files.
protected_folders = {"output", "sample_pdfs", "drive"}

# Set to True to require typing 'YES' before anything is deleted.
REQUIRE_CONFIRMATION = False

# Warning message
print("⚠️ WARNING: This will delete all contents of the following directories:")
for directory in directories_to_clear:
    print(f"- {directory}")
print("- Uploaded files in /content (excluding system folders)")

confirmed = True
if REQUIRE_CONFIRMATION:
    confirmed = input("Type 'YES' to confirm: ") == 'YES'

if confirmed:
    # Clear specified directories by deleting and re-creating them
    for directory in directories_to_clear:
        dir_path = Path(directory)
        if dir_path.exists() and dir_path.is_dir():
            shutil.rmtree(dir_path)
            dir_path.mkdir(parents=True, exist_ok=True)
            print(f"✅ '{directory}' has been cleared.")
        else:
            print(f"The '{directory}' directory does not exist.")

    # Clear uploaded files from /content
    if content_dir.exists():
        for item in content_dir.iterdir():
            if item.name in system_folders:
                continue
            if item.is_file():
                item.unlink()
                print(f"✅ Removed uploaded file: {item.name}")
            elif item.is_dir() and item.name not in protected_folders:
                # Never recurse into mounted file systems or symlinked folders:
                # their contents live outside this workspace.
                if item.is_symlink() or os.path.ismount(item):
                    print(f"⏭️ Skipped mounted or linked folder: {item.name}")
                    continue
                shutil.rmtree(item)
                print(f"✅ Removed uploaded folder: {item.name}")

else:
    print("Operation cancelled. No files were deleted.")

- /content/sample_data
- Uploaded files in /content (excluding system folders)
✅ '/content/sample_data' has been cleared.
