<a href="https://colab.research.google.com/github/smypmsa/pdf-to-table/blob/main/OCR_ed_PDF_to_Excel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📊 PDF Table Extractor for Excel

## Overview
This notebook extracts high-quality tables from PDF files and formats them for Excel. Perfect for processing OCR-ed documents with structured data.

## How to Use
1. Click the **"Install Dependencies"** button once
2. Click the **"Extract Tables from PDF"** button for each file
3. Copy the displayed table and paste it into Excel
4. Use the **"Clear Workspace"** button between files

---

In [None]:
#@title 🔧 Install Dependencies (Run Once) { display-mode: "form" }
#@markdown Click the **Run** button to install required packages. This takes 1-2 minutes.

import sys
import subprocess

def install_packages():
    """Install the system and Python packages used by the extraction cell.

    Runs ``apt-get`` for Ghostscript and Tk, then ``pip`` for camelot,
    tabula, pandas and openpyxl, and finally sets the pandas display options.
    All command output is captured rather than streamed to the cell.

    A failing ``apt-get`` step is reported as a warning and installation
    continues. A failing ``pip`` step prints the end of pip's error output
    and returns early, so the success message is only shown when the Python
    packages were actually installed.
    """
    print("🔧 Installing required packages...")
    print("⏳ This may take 1-2 minutes...")

    def run_step(command):
        """Run one command quietly and return ``(succeeded, error_text)``."""
        try:
            completed = subprocess.run(
                command, capture_output=True, text=True, errors='replace'
            )
        except OSError as exc:
            # The executable itself could not be started (e.g. no apt-get).
            return False, str(exc)
        return completed.returncode == 0, (completed.stderr or '').strip()

    # Install system dependencies
    system_steps = (
        ['apt-get', 'update', '-qq'],
        ['apt-get', 'install', '-y', 'ghostscript', 'python3-tk', '-qq'],
    )
    for command in system_steps:
        succeeded, error_text = run_step(command)
        if not succeeded:
            print(f"⚠️ System step failed: {' '.join(command)}")
            if error_text:
                print(f"   {error_text.splitlines()[-1]}")

    # Install Python packages
    pip_command = [
        sys.executable, '-m', 'pip', 'install',
        'camelot-py[cv]', 'tabula-py', 'pandas', 'openpyxl', '-q',
    ]
    succeeded, error_text = run_step(pip_command)
    if not succeeded:
        print("❌ Python package installation failed:")
        # Only the tail of pip's output is shown; that is where the error is.
        for line in error_text.splitlines()[-5:]:
            print(f"   {line}")
        return

    # Set display options for better output
    import pandas as pd
    pd.set_option('display.max_rows', 20)
    pd.set_option('display.max_columns', 10)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', 50)

    print("✅ Installation complete!")
    print("👇 Scroll down to extract tables from PDFs")

# Run the installer immediately when this cell is executed.
install_packages()


In [None]:
#@title 📊 Extract Tables from PDF { display-mode: "form" }
#@markdown Click **Run** and then **Choose Files** to upload your PDF and extract tables.

# Import libraries (hidden)
import camelot
import tabula
import pandas as pd
import io
from google.colab import files
from IPython.display import display, HTML
import tempfile
import os

# Limit how many rows/columns and how much cell text pandas renders
for _option_name, _option_value in (
    ('display.max_rows', 50),
    ('display.max_columns', 15),
    ('display.max_colwidth', 40),
):
    pd.set_option(_option_name, _option_value)

def extract_tables_camelot(file_path, method='lattice'):
    """Read every page of a PDF with Camelot and return the tables it finds.

    The requested flavor is tried first. When that flavor is 'lattice' and it
    yields nothing, the file is read again with the 'stream' flavor. Any
    exception is reported on stdout and turned into a ``None`` return.
    """
    try:
        found = camelot.read_pdf(file_path, flavor=method, pages='all')
        came_up_empty = len(found) == 0
        if came_up_empty and method == 'lattice':
            print("🔄 Trying stream method for borderless tables...")
            found = camelot.read_pdf(file_path, flavor='stream', pages='all')
    except Exception as exc:
        print(f"❌ Camelot error: {exc}")
        return None
    else:
        return found

def extract_tables_tabula(file_path):
    """Read every page of a PDF with Tabula as a fallback extractor.

    Returns whatever ``tabula.read_pdf`` produces with ``multiple_tables=True``.
    Any exception is reported on stdout and turned into a ``None`` return.
    """
    try:
        found = tabula.read_pdf(file_path, pages='all', multiple_tables=True)
    except Exception as exc:
        print(f"❌ Tabula error: {exc}")
        return None
    else:
        return found

def _annotate_table(df, table_number, page, quality):
    """Return a copy of ``df`` with Table_#/Page/Quality columns prepended.

    ``DataFrame.insert`` refuses to add a column whose name already exists,
    so a source column that is itself called 'Table_#', 'Page' or 'Quality'
    is first renamed to '<name>_orig' (with trailing underscores added if
    that name is taken too).
    """
    metadata = (('Table_#', table_number), ('Page', page), ('Quality', quality))
    reserved = {name for name, _ in metadata}

    annotated = df.copy()
    taken = set(annotated.columns)
    relabeled = []
    renamed_any = False
    for label in annotated.columns:
        if isinstance(label, str) and label in reserved:
            replacement = f"{label}_orig"
            while replacement in taken:
                replacement += "_"
            taken.add(replacement)
            label = replacement
            renamed_any = True
        relabeled.append(label)
    if renamed_any:
        annotated.columns = relabeled

    for position, (name, value) in enumerate(metadata):
        annotated.insert(position, name, value)
    return annotated


def format_for_excel(tables, method='camelot'):
    """Combine extracted tables into one DataFrame with metadata columns.

    Args:
        tables: For ``method='camelot'``, an iterable of objects exposing
            ``df``, ``page`` and ``accuracy``; for any other method, an
            iterable of DataFrames.
        method: 'camelot' to read page/accuracy from each table; anything
            else labels the page as 'Auto' and the quality as 'N/A'.

    Returns:
        A single DataFrame with all non-empty tables stacked and 'Table_#',
        'Page' and 'Quality' as the first three columns, or ``None`` when
        ``tables`` is empty/None or every table is empty. 'Table_#' is the
        1-based position in ``tables``, so skipped empty tables leave gaps.
    """
    if not tables:
        return None

    all_data = []

    if method == 'camelot':
        for i, table in enumerate(tables):
            df = table.df
            if not df.empty:
                all_data.append(
                    _annotate_table(df, i + 1, table.page, f"{table.accuracy:.1f}%")
                )
    else:
        for i, table in enumerate(tables):
            if not table.empty:
                all_data.append(_annotate_table(table, i + 1, 'Auto', 'N/A'))

    return pd.concat(all_data, ignore_index=True) if all_data else None

# Main processing: upload PDFs, extract their tables with Camelot (falling
# back to Tabula), show the combined table and download it as CSV and Excel.
print("📁 Select your PDF file:")
uploaded = files.upload()

if uploaded:
    for filename, file_content in uploaded.items():
        print(f"\n🔄 Processing: {filename}")

        # The extractors take a file path, so write the uploaded bytes to disk.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
            tmp_file.write(file_content)
            tmp_file_path = tmp_file.name

        try:
            excel_df = None
            method = None

            # Extract tables
            print("🎯 Using Camelot for extraction...")
            camelot_tables = extract_tables_camelot(tmp_file_path, 'lattice')

            if camelot_tables and len(camelot_tables) > 0:
                print(f"✅ Found {len(camelot_tables)} tables with Camelot")
                for i, table in enumerate(camelot_tables):
                    print(f"   Table {i+1}: Page {table.page}, Quality: {table.accuracy:.1f}%")
                excel_df = format_for_excel(camelot_tables, 'camelot')
                method = 'Camelot'

            # Fall back to Tabula when Camelot found nothing, failed, or only
            # produced empty tables (format_for_excel returns None for those).
            if excel_df is None or excel_df.empty:
                print("🔄 Trying Tabula...")
                tabula_tables = extract_tables_tabula(tmp_file_path)
                if tabula_tables and len(tabula_tables) > 0:
                    print(f"✅ Found {len(tabula_tables)} tables with Tabula")
                    excel_df = format_for_excel(tabula_tables, 'tabula')
                    method = 'Tabula'
                else:
                    excel_df = None
                    method = None

            # Display results
            if excel_df is not None and not excel_df.empty:
                table_count = excel_df['Table_#'].nunique()
                print(f"\n📈 Extracted {table_count} tables using {method}")

                print("\n📋 Results (scroll to see all data):")
                print("-" * 60)

                # Display with scrolling
                with pd.option_context('display.max_rows', None):
                    display(HTML(f"""
                    <div style="height: 400px; overflow-y: scroll; border: 1px solid #ddd; padding: 10px;">
                        {excel_df.to_html(index=False, classes='table table-striped')}
                    </div>
                    """))

                # Create downloads. Strip only a trailing ".pdf" extension
                # (any letter case); str.replace would also remove ".pdf"
                # from the middle of the name and would miss ".PDF".
                name_root, name_ext = os.path.splitext(filename)
                base_name = name_root if name_ext.lower() == '.pdf' else filename
                csv_filename = f"{base_name}_tables.csv"
                excel_filename = f"{base_name}_tables.xlsx"

                excel_df.to_csv(csv_filename, index=False)
                excel_df.to_excel(excel_filename, index=False)

                files.download(csv_filename)
                files.download(excel_filename)

                print(f"\n💾 Downloaded: {csv_filename} & {excel_filename}")
                print("\n📝 To copy to Excel:")
                print("   1. Select table data above")
                print("   2. Copy (Ctrl+C)")
                print("   3. Paste in Excel (Ctrl+V)")

            else:
                print("❌ No tables found")
                print("💡 Ensure PDF contains structured tables")

        finally:
            # delete=False above means the temp file must be removed by hand.
            os.unlink(tmp_file_path)
else:
    print("❌ No file uploaded")

In [None]:
#@title 🧹 Clear Workspace { display-mode: "form" }
#@markdown Click **Run** to clear extracted data from memory and the cell outputs for a fresh start. Exported `_tables` files left in the workspace are only counted, not deleted.

import os
import gc
from IPython.display import clear_output

# Forget the globals left behind by the extraction cell
variables_to_clear = [
    'uploaded', 'excel_df', 'camelot_tables', 'tabula_tables', 'file_content',
    'tmp_file_path', 'filename', 'base_name', 'method',
]
cleared_count = 0

for var in variables_to_clear:
    try:
        del globals()[var]
    except KeyError:
        # Not defined in this session, so there is nothing to count.
        continue
    cleared_count += 1

# Find exported result files still present in the working directory
temp_files = [
    entry for entry in os.listdir('.')
    if entry.endswith(('_tables.csv', '_tables.xlsx'))
]

# Force garbage collection
gc.collect()

# Wipe the cell output, then print the status summary in one go
clear_output(wait=True)

status_lines = [
    "🧹 Workspace Cleared!",
    "=" * 25,
    f"✅ Cleared {cleared_count} variables",
    "✅ Memory freed up",
    "✅ Outputs cleared",
]
if temp_files:
    status_lines.append(f"ℹ️  {len(temp_files)} temp files in workspace")
status_lines.append("\n🎯 Ready for new PDF files!")
print("\n".join(status_lines))

---
## 📝 Usage Instructions

### Step 1: Install (Once)
- Click the **"Install Dependencies"** button above
- Wait for "Installation complete!" message

### Step 2: Extract Tables
- Click **"Extract Tables from PDF"** button
- Upload your PDF file when prompted
- View results in scrollable table
- Download CSV/Excel files automatically

### Step 3: Copy to Excel
- Select the displayed table data
- Copy (Ctrl+C) and paste into Excel (Ctrl+V)
- Structure is preserved automatically

### Step 4: Clear Between Files
- Click **"Clear Workspace"** button
- Removes old data and outputs
- Ready for next PDF file

---