<a href="https://colab.research.google.com/github/alexfazio/pypdf_table_extraction/blob/main/examples/camelot_quick_start_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dependencies Overview

- [Camelot](https://github.com/camelot-dev/camelot)
- [pypdf](https://github.com/py-pdf/pypdf) 
- [GhostScript](https://www.ghostscript.com/index.html)

**Usage:** Run the cells from top to bottom. To choose which PDFs are processed, either upload your own files or provide a PDF URL in the corresponding optional cells below.

In [2]:
# @title 🛠️ Install Requirements
!pip install pypdf
!apt-get install -y ghostscript
!pip install "camelot-py[cv]"

Collecting pypdf
  Using cached pypdf-4.3.1-py3-none-any.whl.metadata (7.4 kB)
Downloading pypdf-4.3.1-py3-none-any.whl (295 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.8/295.8 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-4.3.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
E: Could not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)
E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[3

In [3]:
# @title 📂 Create necessary directories and delete `sample_data` if exists

import os
import shutil
from pathlib import Path

# Best-effort removal of a directory tree; the outcome is reported on stdout
def delete_directory(path):
    """Remove the directory at ``path`` together with everything inside it.

    Failures never propagate to the caller: a missing directory and any other
    error are reported with a printed message instead.

    Args:
        path: Location of the directory to remove (anything ``shutil.rmtree``
            accepts, e.g. a ``str`` or ``pathlib.Path``).

    Returns:
        None.
    """
    try:
        shutil.rmtree(path)
    except FileNotFoundError:
        print(f"Directory not found: {path}")
    except Exception as err:
        print(f"Error deleting directory {path}: {err}")
    else:
        print(f"Deleted directory: {path}")

# Remove /content/sample_data when it is present
sample_data_dir = Path('/content/sample_data')
if sample_data_dir.exists():
    print("Deleting /content/sample_data directory...")
    delete_directory(sample_data_dir)

# Make sure the working directories exist (no error if they already do)
for required_dir in ('/content/output', '/content/sample_pdfs'):
    os.makedirs(required_dir, exist_ok=True)

# Path objects for the same two directories: PDFs go in, results come out
input_dir = Path('/content/sample_pdfs')
output_dir = Path('/content/output')

# Summary of the resulting layout
print(
    "Directories set up complete.",
    f"Input directory: {input_dir}",
    f"Output directory: {output_dir}",
    sep="\n",
)

PermissionError: [Errno 13] Permission denied: '/content'

In [None]:
# @title 📤 Upload Files (Optional)

# Imported here so this cell does not depend on an earlier cell having run
import os
import shutil
from pathlib import Path

from google.colab import files

# This cell expects uploaded files to appear in /content and relocates them
# into /content/sample_pdfs.
content_dir = Path('/content')
upload_dir = content_dir / 'sample_pdfs'

# Create the destination up front: shutil.move fails if the target directory
# does not exist.
upload_dir.mkdir(parents=True, exist_ok=True)

print("\nPlease upload your PDF files. They will be saved in /content/sample_pdfs")

# Upload files (opens Colab's upload widget); the keys of the returned mapping
# are used as the uploaded file names
uploaded = files.upload()

# Move uploaded files to /content/sample_pdfs and remove from /content
for filename in uploaded.keys():
    src_path = content_dir / filename
    dst_path = upload_dir / filename
    if src_path.exists():
        shutil.move(str(src_path), str(dst_path))
        print(f"Moved {filename} to /content/sample_pdfs")
    else:
        print(f"Warning: {filename} not found in /content")

# Clean up any remaining PDF files in /content.
# NOTE: this deletes every top-level *.pdf in /content, not only files that
# were uploaded by this cell.
for file in content_dir.glob('*.pdf'):
    os.remove(file)
    print(f"Removed {file.name} from /content")

print("\nUpload and organization complete. Files are now only in /content/sample_pdfs")

# Verify contents of /content/sample_pdfs
print("\nContents of /content/sample_pdfs:")
print(os.listdir('/content/sample_pdfs'))

# Verify no PDF files in /content
print("\nChecking for PDF files in /content:")
content_pdfs = list(content_dir.glob('*.pdf'))
if content_pdfs:
    print("Warning: Found these PDF files in /content:")
    for pdf in content_pdfs:
        print(f" - {pdf.name}")
else:
    print("No PDF files found in /content")

In [None]:
# @title ⬇📕 Download Sample .PDF Document (Optional)

import os
import requests
from pathlib import Path

def convert_github_url_to_raw(url):
    """Convert a GitHub "blob" page URL into its raw-content URL.

    ``https://github.com/<owner>/<repo>/blob/<ref>/<path>`` becomes
    ``https://raw.githubusercontent.com/<owner>/<repo>/<ref>/<path>``.

    Only the host and the single ``blob`` path segment that follows the
    repository name are rewritten, so owners, repositories, or file paths that
    themselves contain ``github.com`` or ``blob`` are left intact. The scheme,
    query string, and fragment are carried over unchanged.

    Args:
        url: Absolute URL (scheme included) of a file page on github.com.

    Returns:
        The raw-content URL, or the sentinel string ``"Invalid GitHub URL"``
        when ``url`` is not a github.com blob URL. Callers compare against
        this sentinel, so it is returned rather than raised.
    """
    from urllib.parse import urlsplit, urlunsplit

    invalid = "Invalid GitHub URL"

    try:
        parts = urlsplit(url)
        host = parts.hostname
    except ValueError:
        # urlsplit rejects some malformed URLs (e.g. a broken IPv6 literal)
        return invalid

    # A blob page path splits into ['', owner, repo, 'blob', ref, *file_path]
    segments = parts.path.split("/")
    is_blob_page = (
        host in ("github.com", "www.github.com")
        and len(segments) >= 5
        and segments[3] == "blob"
    )
    if not is_blob_page:
        return invalid

    # Drop only the 'blob' marker; everything after the ref is the file path
    raw_path = "/".join(segments[:3] + segments[4:])
    return urlunsplit(
        (parts.scheme, "raw.githubusercontent.com", raw_path, parts.query, parts.fragment)
    )

from urllib.parse import urlsplit

# Sample .pdf data from GitHub
pdf_url = 'https://github.com/camelot-dev/camelot/blob/master/docs/_static/pdf/foo.pdf' # @param {type:"string"}

# Convert the GitHub URL to the raw content URL
pdf_url = convert_github_url_to_raw(pdf_url)

# The converter reports failure through a sentinel string, not an exception
if pdf_url == "Invalid GitHub URL":
    raise ValueError("The provided GitHub URL is invalid.")

# Create the /content/sample_pdfs directory if it doesn't exist
sample_pdfs_dir = Path('/content/sample_pdfs')
sample_pdfs_dir.mkdir(parents=True, exist_ok=True)

# Download the PDF. An explicit timeout is passed because requests otherwise
# waits indefinitely on an unresponsive server.
response = requests.get(pdf_url, timeout=30)
response.raise_for_status()  # Raise on HTTP error status codes (4xx/5xx)

# Extract the filename from the URL *path* only, so that a query string or
# fragment (e.g. "?raw=true") never ends up in the file name
filename = os.path.basename(urlsplit(pdf_url).path)
if not filename:
    raise ValueError(f"Could not determine a file name from URL: {pdf_url}")

# Specify the file path in the /content/sample_pdfs directory
pdf_file_path = sample_pdfs_dir / filename

# Save the file, overwriting if it already exists
pdf_file_path.write_bytes(response.content)

print(f"PDF file downloaded and saved to: {pdf_file_path}")

In [None]:
# @title ⚙️ Core - Simple Tables (Strict Parameters)

import camelot
import logging
from pathlib import Path

# Logging configuration: timestamped "time - LEVEL - message" records at DEBUG
# via basicConfig, plus an explicit DEBUG level on the "camelot" logger
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.DEBUG)
logging.getLogger("camelot").setLevel(logging.DEBUG)

def process_pdf(pdf_file, output_dir):
    """Extract the tables of one PDF with Camelot and save them as CSV files.

    The PDF is read with ``camelot.read_pdf`` using its default arguments.
    When tables are found, a sub-directory named after the PDF's stem is
    created under ``output_dir`` (including any missing parent directories),
    and the tables are written there twice: once through the bulk
    ``tables.export`` call and once per table as
    ``<stem>_table_<n>.csv`` (``n`` starting at 1). Progress and each table's
    parsing report are both printed and logged.

    Args:
        pdf_file: ``pathlib.Path`` of the PDF to process.
        output_dir: ``pathlib.Path`` of the directory that receives the
            per-PDF output sub-directory.

    Returns:
        None. Returns early, without creating any directory, when no tables
        are detected.
    """
    print(f"Processing {pdf_file.name}")
    logging.info(f"Processing {pdf_file.name}")

    # Read tables from the PDF
    tables = camelot.read_pdf(str(pdf_file))
    if len(tables) == 0:
        print(f"No tables detected in {pdf_file.name}")
        logging.warning(f"No tables detected in {pdf_file.name}")
        return

    # Create a subdirectory for this PDF's output. parents=True lets the
    # function work even when output_dir itself has not been created yet.
    pdf_output_dir = output_dir / pdf_file.stem
    pdf_output_dir.mkdir(parents=True, exist_ok=True)

    # Export all tables to CSV (without compression)
    tables.export(str(pdf_output_dir / f"{pdf_file.stem}.csv"), f='csv')

    # Process individual tables, numbering them from 1 for file names and logs
    for number, table in enumerate(tables, start=1):
        # Save individual table to CSV
        table.to_csv(str(pdf_output_dir / f"{pdf_file.stem}_table_{number}.csv"))
        # Report parsing quality for each table on stdout and in the log
        print(f"Table {number} Parsing Report:")
        logging.info(f"Table {number} Parsing Report:")
        print(table.parsing_report)
        logging.info(table.parsing_report)

# Where PDFs are read from and where results are written
input_dir = Path('/content/sample_pdfs')
output_dir = Path('/content/output')

print(f"Input directory: {input_dir}")
print(f"Output directory: {output_dir}")

# The output directory must exist before any PDF is processed
output_dir.mkdir(exist_ok=True)

# Collect the PDFs sitting directly in the input directory
pdf_files = [*input_dir.glob('*.pdf')]
print(f"Found {len(pdf_files)} PDF files")

if pdf_files:
    # Run the extraction for every PDF, one after another
    for pdf_file in pdf_files:
        process_pdf(pdf_file, output_dir)

    print("Processing complete. Check the 'output' folder for results.")
    logging.info("Processing complete. Check the 'output' folder for results.")
else:
    # Nothing to do: say so on stdout and in the log
    print("No PDF files found in the input directory.")
    logging.warning("No PDF files found in the input directory.")

print("Script execution finished.")

In [None]:
# @title ⚙️ Core - Complex Tables (Loose Parameters)

import camelot
import os
from pathlib import Path

# Input and output locations; the output directory is created on demand
input_dir = Path('/content/sample_pdfs')
output_dir = Path('/content/output')
output_dir.mkdir(parents=True, exist_ok=True)

# Walk over every PDF sitting directly in the input directory
for pdf_file in input_dir.glob('*.pdf'):
    print(f"Processing {pdf_file.name}")
    pdf_path = str(pdf_file)

    # First attempt: 'stream' flavor restricted to the given table area
    tables_stream = camelot.read_pdf(pdf_path, flavor='stream', table_areas=['50,750,500,50'])

    if len(tables_stream) > 0:
        tables = tables_stream
    else:
        # Stream found nothing: retry the same area with the 'lattice' flavor
        tables_lattice = camelot.read_pdf(pdf_path, flavor='lattice', table_areas=['50,750,500,50'])
        tables = tables_lattice if len(tables_lattice) > 0 else []

    # Neither flavor produced a table: report it and move on to the next PDF
    if len(tables) == 0:
        print(f"No tables found in {pdf_file.name}")
        continue

    # Output files share the PDF's stem as their base name
    output_base = output_dir / pdf_file.stem
    tables.export(f'{output_base}.csv', f='csv', compress=True)  # all tables, compressed export
    tables[0].to_csv(f'{output_base}_first_table.csv')  # first table on its own
    df = tables[0].df  # first table as a pandas DataFrame
    print(f"Tables found in {pdf_file.name}:")
    print(df)

print("Processing complete. Check the output directory for results.")

In [None]:
# @title 🗑️ Clear Input & Output Directory

import shutil
from pathlib import Path
import os

# Define the directories to be cleared (their contents are removed; the
# directories themselves are kept)
directories_to_clear = ['/content/output', '/content/sample_pdfs']

# Warning message
print("⚠️ WARNING: This will delete all contents of the following directories:")
for directory in directories_to_clear:
    print(f"- {directory}")

# Destructive operation: require the exact string 'YES' before doing anything
confirmation = input("Type 'YES' to confirm: ")

if confirmation == 'YES':
    for directory in directories_to_clear:
        dir_path = Path(directory)
        # is_dir() is False for a missing path, so no separate exists() check
        if dir_path.is_dir():
            # Remove all contents of the directory
            for item in dir_path.iterdir():
                # is_dir() follows symlinks, but shutil.rmtree refuses to run
                # on a symlink; a symlinked directory is therefore unlinked
                # (removing only the link) like a regular file.
                if item.is_dir() and not item.is_symlink():
                    shutil.rmtree(item)
                else:
                    item.unlink()
            print(f"✅ All contents of '{directory}' have been deleted.")
        else:
            print(f"The '{directory}' directory does not exist.")
else:
    print("Operation cancelled. No files were deleted.")
