# PDF text extraction

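This notebook documents experiments with several tools for extracting text and tables from a PDF: MarkItDown, Docling, pdfplumber, PyMuPDF and Azure Document Intelligence.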

In [None]:
# Source PDF used by every extraction experiment in this notebook. The path is
# relative, so it resolves against the directory the notebook kernel runs in.
pdf_path = "../../data/cao-pdfs/Cao Bouw en Infra 2025 - 2027.pdf"

## MarkItDown

Link: https://github.com/microsoft/markitdown

In [None]:
from markitdown import MarkItDown

md = MarkItDown(enable_plugins=True) # plugins are enabled here; pass False to turn them off
# Convert the PDF at pdf_path and keep the conversion result for the next cell.
# NOTE(review): the name `result` is reused further down for the Azure analysis result.
result = md.convert(pdf_path)

# Print the conversion result object itself (its string form).
print(result)

In [None]:
# Print the `markdown` attribute of the MarkItDown conversion result.
print(result.markdown)

## Docling

Link: https://github.com/docling-project/docling

In [None]:
from pathlib import Path

import pandas as pd

from docling.document_converter import DocumentConverter

# Docling converter with default settings.
# NOTE(review): `converter` is rebound to an explicitly configured DocumentConverter
# in the following cell before `convert` is called, so this default instance looks
# unused — confirm and drop if so.
converter = DocumentConverter()

In [None]:
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption

# Accelerator selection: CUDA with 8 threads.
# AcceleratorDevice.AUTO, AcceleratorDevice.CPU and AcceleratorDevice.MPS are the
# other devices that were tried here; swap the `device` argument to use one of them.
accelerator_options = AcceleratorOptions(device=AcceleratorDevice.CUDA, num_threads=8)

# PDF pipeline settings: OCR off, table-structure extraction on.
pipeline_options = PdfPipelineOptions(
    accelerator_options=accelerator_options,
    do_ocr=False,
    do_table_structure=True,
)
# Cell matching lives on the nested table-structure options object.
pipeline_options.table_structure_options.do_cell_matching = True

# Converter that applies the pipeline settings above to PDF inputs.
converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)

In [None]:
# Run the configured Docling converter on the PDF; the result is reused by the
# following cells (summary, Markdown export, table export).
conversion_result = converter.convert(pdf_path)

In [None]:
# Directory that receives the exported table CSVs. parents=True / exist_ok=True
# make this cell safe to re-run when the directory already exists.
output_dir = Path("scratch")
output_dir.mkdir(parents=True, exist_ok=True)

# Stem of the converted input file; used as the prefix of the CSV file names.
doc_filename = conversion_result.input.file.stem

In [None]:
# Summary line: number of pages and tables in the converted document.
print(f"Document has {len(conversion_result.document.pages)} pages and {len(conversion_result.document.tables)} tables.")
# Markdown export of the document.
# NOTE(review): the trailing "..." is a literal suffix — the export is printed in
# full, nothing is truncated here.
print(f"Document text content:\n{conversion_result.document.export_to_markdown()}...")

In [None]:
# Export every table Docling found: print it as Markdown and save it as CSV.
for table_ix, table in enumerate(conversion_result.document.tables):
    # Use the same 1-based number in the heading and in the file name so that a
    # printed "## Table N" can be matched to "<doc>-table-N.csv".
    table_number = table_ix + 1

    table_df: pd.DataFrame = table.export_to_dataframe(doc=conversion_result.document)
    print(f"## Table {table_number}")
    print(table_df.to_markdown())

    # Save the table as CSV
    element_csv_filename = output_dir / f"{doc_filename}-table-{table_number}.csv"
    table_df.to_csv(element_csv_filename)

## PDFPlumber

Link: https://github.com/jsvine/pdfplumber

In [None]:
import pdfplumber
# Record the installed pdfplumber version next to the results.
print(pdfplumber.__version__)

# Open the PDF; the handle is kept in `pdf` because the following cells read pages from it.
# NOTE(review): no visible cell closes `pdf`.
pdf = pdfplumber.open(pdf_path)

In [None]:
# Work on a single page: index 7 of pdf.pages (0-based, i.e. the eighth page).
page = pdf.pages[7]
# Render the page as an image; leaving `im` as the last expression displays it.
im = page.to_image()
im

In [None]:
# Extract the text of the selected pdfplumber page and print it.
text = page.extract_text()
print(text)

In [None]:
# Extract the tables of the selected page and print the raw return value.
# NOTE(review): despite the singular name, `table` holds whatever extract_tables()
# (plural) returns; the name is also reused by later cells.
table = page.extract_tables()
print(table)

## PyMuPDF

Link: https://github.com/pymupdf/PyMuPDF

In [None]:
import pymupdf # imports the pymupdf library
doc = pymupdf.open(pdf_path) # open a document
# for page in doc:  # (disabled) iterate over all document pages
# NOTE(review): this rebinds `page`, which the pdfplumber cells above also use;
# re-running those cells after this one would operate on the PyMuPDF page instead.
page = doc[7] # get page 7 (0-based)

In [None]:
# Plain-text extraction from the PyMuPDF page selected above.
text = page.get_text()
print(text)

In [None]:
# Detect tables on the page and print the first one as Markdown (via a DataFrame).
# NOTE(review): tables[0] presumably fails when no table is detected on the page — confirm.
tables = page.find_tables()
table = tables[0].to_pandas()
print(table.to_markdown())

## Azure Document Intelligence

Link: https://github.com/Azure-Samples/document-intelligence-code-samples

In [None]:
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentContentFormat

In [None]:
def analyze_layout_from_url():
    """Run the "prebuilt-layout" model on the PDF at ``pdf_path`` and return the result.

    Despite the name, no URL is involved: the file at the notebook-level
    ``pdf_path`` is read from disk and its bytes are sent in the request.

    The service endpoint and key are read from the environment variables
    ``DOCUMENTINTELLIGENCE_ENDPOINT`` and ``DOCUMENTINTELLIGENCE_API_KEY``.

    Returns:
        The value of ``poller.result()`` for the analyze operation.

    Raises:
        KeyError: If either environment variable is not set.
        OSError: If the PDF cannot be opened or read.
    """
    endpoint = os.environ["DOCUMENTINTELLIGENCE_ENDPOINT"]
    key = os.environ["DOCUMENTINTELLIGENCE_API_KEY"]

    document_intelligence_client = DocumentIntelligenceClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    # Read the whole PDF into memory; the context manager closes the file handle
    # even if reading fails.
    with open(pdf_path, "rb") as file:
        pdf_bytes = file.read()

    # Submit the document and block until the long-running analysis finishes.
    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-layout",
        AnalyzeDocumentRequest(bytes_source=pdf_bytes),
    )
    return poller.result()

In [None]:
def print_layout_analysis(result, first_table=4):
    """Print the tables of a layout-analysis result, one line per cell.

    Args:
        result: Object with a ``tables`` attribute. Each table must provide
            ``row_count``, ``column_count`` and ``cells``; each cell must provide
            ``row_index``, ``column_index`` and ``content``.
        first_table: Non-negative index of the first table to print. Tables
            before it are skipped. Defaults to 4.

    The "Table #N" label is the table's index in ``result.tables``, so it can be
    used directly to look the table up again (e.g. ``result.tables[N]``).
    Nothing but the closing separator is printed when ``result.tables`` is empty
    or ``None``.
    """
    # Analyze styles (e.g., whether the document contains handwritten content)
    # if result.styles:
    #     for idx, style in enumerate(result.styles):
    #         print(
    #             "Document contains {} content".format(
    #                 "handwritten" if style.is_handwritten else "no handwritten"
    #             )
    #         )

    # Analyze pages
    # for page in result.pages:
    #     print(f"----Analyzing layout from page #{page.page_number}----")

    #     # Analyze lines
    #     if page.lines:
    #         for line_idx, line in enumerate(page.lines):
    #             print(
    #                 f"...Line #{line_idx} has text content '{line.content}'"
    #             )

    # Analyze tables
    if result.tables:
        print(f"----Analyzing {len(result.tables)} tables found in the document----")
        # start=first_table keeps the printed number equal to the table's real
        # index instead of restarting at 0 for the first table after the skip.
        for table_idx, table in enumerate(result.tables[first_table:], start=first_table):
            print(
                f"Table #{table_idx} has {table.row_count} rows and {table.column_count} columns"
            )
            for cell in table.cells:
                print(
                    f"...Cell[{cell.row_index}][{cell.column_index}] has content '{cell.content}'"
                )

    print("----------------------------------------")

In [None]:
from azure.core.exceptions import HttpResponseError
from dotenv import find_dotenv, load_dotenv

# Load credentials from the nearest .env file, then run the analysis.
# Any HttpResponseError is described on stdout where recognised and always re-raised.
try:
    load_dotenv(find_dotenv())
    result = analyze_layout_from_url()
except HttpResponseError as error:
    if error.error is None:
        # No structured error payload: fall back to inspecting the message text.
        if "Invalid request".casefold() in error.message.casefold():
            print(f"Uh-oh! Seems there was an invalid request: {error}")
    elif error.error.code == "InvalidImage":
        print(f"Received an invalid image error: {error.error}")
    elif error.error.code == "InvalidRequest":
        print(f"Received an invalid request error: {error.error}")
    raise

In [None]:
# Print the table at index 5 of the Azure result as pipe-separated rows.
for table in result.tables[5:6]:
    print(f"----Analyzing table with {table.row_count} rows and {table.column_count} columns----")
    last_column = table.column_count - 1
    for cell in table.cells:
        # After the final column, close the row with "|" and a newline;
        # otherwise stay on the same output line.
        row_end = "|\n" if cell.column_index == last_column else ""
        print(f"| {cell.content} ", end=row_end)
In [None]:
# print_layout_analysis only prints from the already-fetched `result` and makes no
# service call, so no HttpResponseError handling is needed around it.
print_layout_analysis(result)