<a href="https://colab.research.google.com/github/smypmsa/pdf-to-table/blob/main/OCR_ed_PDF_to_Excel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📊 PDF Table Extractor for Excel

## Overview
This notebook extracts high-quality tables from PDF files and formats them for Excel. Perfect for processing OCR-ed documents with structured data.

## How to Use
1. Run the **"Install Dependencies"** cell once per session
2. Run the **"Extract Tables from PDF"** cell and upload each PDF you want to process
3. Open the downloaded `*_tables.xlsx` file in Excel
4. Run the **"Clear Input & Output Directory"** cell between files

---

In [1]:
#@title 🔧 Install Dependencies (Run Once) { display-mode: "form" }
#@markdown Click the **Run** button to install required packages. This takes 1-2 minutes.

import sys
import subprocess

def _run_quietly(command, description):
    """Run ``command`` with its output captured and report success.

    Args:
        command: Argument list passed to ``subprocess.run``.
        description: Short human-readable label used in warning messages.

    Returns:
        True when the command exited with status 0, False otherwise
        (including when the executable could not be started at all).
    """
    try:
        result = subprocess.run(command, capture_output=True, text=True, errors='replace')
    except OSError as exc:
        # The executable itself is missing or not runnable.
        print(f"⚠️ Could not run {description}: {exc}")
        return False

    if result.returncode != 0:
        print(f"⚠️ {description} failed (exit code {result.returncode})")
        # Output is captured, so surface the end of stderr to explain the failure.
        for line in (result.stderr or '').strip().splitlines()[-5:]:
            print(f"   {line}")
        return False
    return True


def install_packages():
    """Install the system and Python packages and set pandas display options.

    Runs ``apt-get update``, installs ``ghostscript`` and ``python3-tk`` with
    apt-get, then installs ``camelot-py[cv]``, ``tabula-py``, ``pandas`` and
    ``openpyxl`` with pip for the current interpreter. Every step is attempted
    even if an earlier one fails; the success message is printed only when all
    steps exited cleanly, otherwise a failure summary is printed instead.

    Returns:
        None. Progress and results are reported on stdout.
    """
    print("🔧 Installing required packages...")
    print("⏳ This may take 1-2 minutes...")

    # A list (not a generator) so that all steps run regardless of failures.
    step_results = [
        # Install system dependencies
        _run_quietly(['apt-get', 'update', '-qq'], 'apt-get update'),
        _run_quietly(
            ['apt-get', 'install', '-y', 'ghostscript', 'python3-tk', '-qq'],
            'apt-get install',
        ),
        # Install Python packages
        _run_quietly(
            [sys.executable, '-m', 'pip', 'install',
             'camelot-py[cv]', 'tabula-py', 'pandas', 'openpyxl', '-q'],
            'pip install',
        ),
    ]

    # Set display options for better output
    import pandas as pd
    pd.set_option('display.max_rows', 20)
    pd.set_option('display.max_columns', 10)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', 50)

    if all(step_results):
        print("✅ Installation complete!")
        print("👇 Scroll down to extract tables from PDFs")
    else:
        print("❌ Installation finished with errors - see the warnings above")

# Run the installation immediately when this cell is executed.
install_packages()


🔧 Installing required packages...
⏳ This may take 1-2 minutes...
✅ Installation complete!
👇 Scroll down to extract tables from PDFs


In [7]:
#@title 📊 Extract Tables from PDF { display-mode: "form" }
#@markdown Click **Run** and then **Choose Files** to upload your PDF and extract tables.

# Import libraries (hidden)
import camelot
import tabula
import pandas as pd
import io
from google.colab import files
import tempfile
import os

def extract_tables_camelot(file_path, method='lattice'):
    """Read all pages of a PDF with Camelot and return the detected tables.

    The requested flavor is tried first. If it was ``'lattice'`` and nothing
    was detected, the file is read a second time with the ``'stream'`` flavor.

    Args:
        file_path: Path of the PDF to read.
        method: Camelot flavor to try first.

    Returns:
        Whatever ``camelot.read_pdf`` returned, or ``None`` when an exception
        was raised (the error is printed, not propagated).
    """
    try:
        found = camelot.read_pdf(file_path, flavor=method, pages='all')
        retry_with_stream = len(found) == 0 and method == 'lattice'
        if not retry_with_stream:
            return found

        print("🔄 Trying stream method for borderless tables...")
        return camelot.read_pdf(file_path, flavor='stream', pages='all')
    except Exception as err:
        print(f"❌ Camelot error: {err}")
    return None

def extract_tables_tabula(file_path):
    """Fallback extractor: read all pages of a PDF with Tabula.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        The result of ``tabula.read_pdf`` called with ``pages='all'`` and
        ``multiple_tables=True``, or ``None`` when an exception was raised
        (the error is printed, not propagated).
    """
    try:
        return tabula.read_pdf(file_path, pages='all', multiple_tables=True)
    except Exception as err:
        print(f"❌ Tabula error: {err}")
    return None

def _prepend_metadata(frame, table_number, page, quality):
    """Return a copy of ``frame`` with Table_#/Page/Quality as its first columns.

    ``DataFrame.insert`` raises ``ValueError`` when the label already exists,
    which happens if an extracted table has its own column called e.g. "Page".
    Such data columns are renamed with an ``_extracted`` suffix first, so the
    metadata columns keep their fixed names and positions.
    """
    annotated = frame.copy()
    meta_labels = ('Table_#', 'Page', 'Quality')

    renames = {}
    for label in meta_labels:
        if label in annotated.columns:
            replacement = f"{label}_extracted"
            # Keep extending the suffix until the new label is unused.
            while replacement in annotated.columns or replacement in renames.values():
                replacement += "_"
            renames[label] = replacement
    if renames:
        annotated = annotated.rename(columns=renames)

    for position, (label, value) in enumerate(zip(meta_labels, (table_number, page, quality))):
        annotated.insert(position, label, value)
    return annotated


def format_for_excel(tables, method='camelot'):
    """Combine extracted tables into one DataFrame ready to be written to Excel.

    Each non-empty table gets three leading columns: ``Table_#`` (1-based
    position in ``tables``, counting empty tables too), ``Page`` and
    ``Quality``. The annotated tables are then stacked with a fresh index.

    Args:
        tables: For ``method='camelot'``, an iterable of objects exposing
            ``.df``, ``.page`` and ``.accuracy``; otherwise an iterable of
            DataFrames.
        method: ``'camelot'`` to read page/accuracy from each table; any other
            value fills ``Page`` with ``'Auto'`` and ``Quality`` with ``'N/A'``.

    Returns:
        The combined DataFrame, or ``None`` when ``tables`` is empty/None or
        every table is empty.
    """
    if not tables:
        return None

    from_camelot = method == 'camelot'
    annotated_frames = []

    for number, table in enumerate(tables, start=1):
        frame = table.df if from_camelot else table
        if frame.empty:
            continue
        if from_camelot:
            page, quality = table.page, f"{table.accuracy:.1f}%"
        else:
            page, quality = 'Auto', 'N/A'
        annotated_frames.append(_prepend_metadata(frame, number, page, quality))

    if not annotated_frames:
        return None
    return pd.concat(annotated_frames, ignore_index=True)

# Main processing: upload one or more PDFs, extract their tables (Camelot
# first, Tabula as fallback) and send each result back as an .xlsx download.
print("📁 Select your PDF file:")
uploaded = files.upload()

if uploaded:
    for filename, file_content in uploaded.items():
        print(f"\n🔄 Processing: {filename}")

        # The extractors take a file path, so write the uploaded content to a
        # temporary .pdf file. delete=False keeps it on disk after the `with`
        # block closes it; it is removed in the `finally` clause below.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
            tmp_file.write(file_content)
            tmp_file_path = tmp_file.name

        try:
            # Extract tables
            print("🎯 Using Camelot for extraction...")
            camelot_tables = extract_tables_camelot(tmp_file_path, 'lattice')

            if camelot_tables and len(camelot_tables) > 0:
                print(f"✅ Found {len(camelot_tables)} tables with Camelot")
                for i, table in enumerate(camelot_tables):
                    print(f"   Table {i+1}: Page {table.page}, Quality: {table.accuracy:.1f}%")
                excel_df = format_for_excel(camelot_tables, 'camelot')
                method = 'Camelot'
            else:
                # Camelot found nothing (or failed): fall back to Tabula.
                print("🔄 Trying Tabula...")
                tabula_tables = extract_tables_tabula(tmp_file_path)
                if tabula_tables and len(tabula_tables) > 0:
                    print(f"✅ Found {len(tabula_tables)} tables with Tabula")
                    excel_df = format_for_excel(tabula_tables, 'tabula')
                    method = 'Tabula'
                else:
                    excel_df = None
                    method = None

            # Create and download XLSX file
            if excel_df is not None and not excel_df.empty:
                table_count = excel_df['Table_#'].nunique()
                print(f"\n📈 Extracted {table_count} tables using {method}")

                # Create XLSX download.
                # Strip only a trailing ".pdf" extension, case-insensitively;
                # str.replace would also remove ".pdf" from the middle of the
                # name and would leave ".PDF" in place.
                stem, extension = os.path.splitext(filename)
                base_name = stem if extension.lower() == '.pdf' else filename
                excel_filename = f"{base_name}_tables.xlsx"
                excel_df.to_excel(excel_filename, index=False)
                files.download(excel_filename)

                print(f"\n💾 Downloaded: {excel_filename}")
                print("✅ Processing complete!")

            else:
                print("❌ No tables found")
                print("💡 Ensure PDF contains structured tables")

        finally:
            # Always remove the temporary copy, even if extraction raised.
            os.unlink(tmp_file_path)
else:
    print("❌ No file uploaded")

📁 Select your PDF file:


Saving ExtractPage1 1_EN.pdf to ExtractPage1 1_EN.pdf
Saving ExtractPage1 1.pdf to ExtractPage1 1.pdf

🔄 Processing: ExtractPage1 1_EN.pdf
🎯 Using Camelot for extraction...
✅ Found 1 tables with Camelot
   Table 1: Page 1, Quality: 93.7%

📈 Extracted 1 tables using Camelot


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>




💾 Downloaded: ExtractPage1 1_EN_tables.xlsx
✅ Processing complete!

🔄 Processing: ExtractPage1 1.pdf
🎯 Using Camelot for extraction...
🔄 Trying stream method for borderless tables...
🔄 Trying Tabula...
❌ No tables found
💡 Ensure PDF contains structured tables


In [6]:
# @title 🗑️ Clear Input & Output Directory

import shutil
from pathlib import Path
import os

# When True, the user must type YES before anything is deleted.
# The default (False) deletes immediately, as a one-click cleanup.
CONFIRM_BEFORE_DELETE = False  #@param {type:"boolean"}

# Define the directories to be cleared (emptied and recreated)
directories_to_clear = ["/content/sample_data"]

# Also clear uploaded files in /content (but preserve system folders)
content_dir = Path("/content")
# Names that are never touched, whether they are files or directories.
system_folders = {".config", "sample_data", "__pycache__"}
# Directory names that are never removed recursively. "drive" is included so
# that a Google Drive mounted at /content/drive is not wiped by this cell.
protected_dirs = {"output", "sample_pdfs", "drive"}

# Warning message
print("⚠️ WARNING: This will delete all contents of the following directories:")
for directory in directories_to_clear:
    print(f"- {directory}")
print("- Uploaded files in /content (excluding system folders)")

if CONFIRM_BEFORE_DELETE:
    confirmed = input("Type 'YES' to confirm: ") == 'YES'
else:
    confirmed = True

if confirmed:
    # Clear specified directories
    for directory in directories_to_clear:
        dir_path = Path(directory)
        if dir_path.exists() and dir_path.is_dir():
            # Remove the whole tree, then recreate the empty directory.
            shutil.rmtree(dir_path)
            dir_path.mkdir(parents=True, exist_ok=True)
            print(f"✅ '{directory}' has been cleared.")
        else:
            print(f"The '{directory}' directory does not exist.")

    # Clear uploaded files from /content
    if content_dir.exists():
        for item in content_dir.iterdir():
            if item.name in system_folders:
                continue

            if item.is_symlink() or item.is_file():
                # Symlinks are unlinked rather than followed: shutil.rmtree
                # refuses to operate on a symbolic link to a directory.
                item.unlink()
                print(f"✅ Removed uploaded file: {item.name}")
            elif item.is_dir() and item.name not in protected_dirs:
                if os.path.ismount(item):
                    # Never recurse into a mounted filesystem.
                    print(f"⏭️ Skipped mounted folder: {item.name}")
                    continue
                shutil.rmtree(item)
                print(f"✅ Removed uploaded folder: {item.name}")

else:
    print("Operation cancelled. No files were deleted.")

- /content/sample_data
- Uploaded files in /content (excluding system folders)
✅ '/content/sample_data' has been cleared.
