# Document Ingestion with pdfplumber

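This notebook demonstrates PDF ingestion with the pdfplumber library: it checks that the sample PDF exists, extracts text from the first page and then from every page, and extracts any tables found on the first page.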

In [None]:
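# Install dependencies if needed: uncomment the line below and run this cell.
# (%pip installs into the environment of the running kernel.)
# %pip install pdfplumber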

In [None]:
import pdfplumber
import os

In [None]:
# Location of the sample PDF, relative to the notebook's working directory.
pdf_path = "../docs/DocLayNet.pdf"

# Report the target file and whether it is present on disk, one fact per line.
print(
    f"Processing: {pdf_path}",
    f"File exists: {os.path.exists(pdf_path)}",
    sep="\n",
)

In [None]:
# Open the PDF and preview the first 500 characters of its first page.
with pdfplumber.open(pdf_path) as pdf:
    first_page = pdf.pages[0]
    # The all-pages cell guards this same call with `if text:`, i.e. the result
    # may be falsy. Normalise a falsy result to "" so the slice below cannot
    # raise a TypeError on None.
    text = first_page.extract_text() or ""
    print("Text from first page:")
    print(text[:500])

In [None]:
# Extract text from every page and merge it into a single string, placing a
# "--- Page N ---" banner (1-based page number) in front of each page's text.
page_chunks = []
with pdfplumber.open(pdf_path) as pdf:
    for i, page in enumerate(pdf.pages):
        text = page.extract_text()
        # Pages that yield no text are skipped so they add no empty banner.
        if text:
            page_chunks.append(f"\n--- Page {i+1} ---\n{text}")

# Join once at the end instead of re-growing the string on every iteration.
all_text = "".join(page_chunks)

print(f"Total text length: {len(all_text)}")
print("First 1000 characters:")
print(all_text[:1000])

In [None]:
# Collect every table detected on the first page of the PDF.
with pdfplumber.open(pdf_path) as pdf:
    first_page = pdf.pages[0]
    tables = first_page.extract_tables()

# Summarise the result, then preview each table. Looping over an empty list
# does nothing, so no separate emptiness check is required.
print(f"Tables found on first page: {len(tables)}")
for i, table in enumerate(tables):
    print(f"Table {i+1}:")
    for row in table[:5]:  # preview is limited to the first 5 rows
        print(row)
    print()