# PDF text extraction

This notebook covers the experiments done for PDF text extraction

In [1]:
import os
from dotenv import load_dotenv
load_dotenv()
pdf_path = "../../data/cao-pdfs/Cao Bouw en Infra 2025 - 2027.pdf"

## MarkItDown

Link: https://github.com/microsoft/markitdown

In [None]:
from markitdown import MarkItDown

md = MarkItDown(enable_plugins=True) # Set to True to enable plugins
# Load the PDF file
result = md.convert(pdf_path)

print(result)

In [None]:
print(result.markdown)

## Docling

Link: https://github.com/docling-project/docling

In [None]:
from pathlib import Path

import pandas as pd

from docling.document_converter import DocumentConverter

converter = DocumentConverter()

In [None]:
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption

# Explicitly set the accelerator
# accelerator_options = AcceleratorOptions(
#     num_threads=8, device=AcceleratorDevice.AUTO
# )
# accelerator_options = AcceleratorOptions(
#     num_threads=8, device=AcceleratorDevice.CPU
# )
# accelerator_options = AcceleratorOptions(
#     num_threads=8, device=AcceleratorDevice.MPS
# )
accelerator_options = AcceleratorOptions(
    num_threads=8, device=AcceleratorDevice.CUDA
)

pipeline_options = PdfPipelineOptions()
pipeline_options.accelerator_options = accelerator_options
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
        )
    }
)

In [None]:
# Convert the document
conversion_result = converter.convert(pdf_path)

In [None]:
output_dir = Path("scratch")
output_dir.mkdir(parents=True, exist_ok=True)

doc_filename = conversion_result.input.file.stem

In [None]:
print(f"Document has {len(conversion_result.document.pages)} pages and {len(conversion_result.document.tables)} tables.")
print(f"Document text content:\n{conversion_result.document.export_to_markdown()}...")

In [None]:
# Export tables
for table_ix, table in enumerate(conversion_result.document.tables):
    table_df: pd.DataFrame = table.export_to_dataframe(doc=conversion_result.document)
    print(f"## Table {table_ix}")
    print(table_df.to_markdown())

    # Save the table as CSV
    element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.csv"
    table_df.to_csv(element_csv_filename)

## PDFPlumber

Link: https://github.com/jsvine/pdfplumber

In [None]:
import pdfplumber
print(pdfplumber.__version__)

pdf = pdfplumber.open(pdf_path)

In [None]:
page = pdf.pages[7]
im = page.to_image()
im

In [None]:
text = page.extract_text()
print(text)

In [None]:
table = page.extract_tables()
print(table)

## PyMUPDF

Link: https://github.com/pymupdf/PyMuPDF

In [None]:
import pymupdf # imports the pymupdf library
doc = pymupdf.open(pdf_path) # open a document
#for page in doc: # iterate the document pages
page = doc[7] # get page 7 (0-based)

In [None]:
text = page.get_text() # get plain text encoded as UTF-8
print(text)

In [None]:
tables = page.find_tables()
table = tables[0].to_pandas()
print(table.to_markdown())

## Azure Document Intelligence

Link: https://github.com/Azure-Samples/document-intelligence-code-samples

In [1]:
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentContentFormat

In [2]:
def analyze_layout_from_url():
    # Set your endpoint and key from environment variables
    # For how to set them, see PREREQUISITES above.
    endpoint = os.environ["DOCUMENTINTELLIGENCE_ENDPOINT"]
    key = os.environ["DOCUMENTINTELLIGENCE_API_KEY"]

    document_intelligence_client = DocumentIntelligenceClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    file = open(pdf_path, "rb")

    # convert to bytes
    pdf_bytes = file.read()

    # Analyze a sample document layout    
    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-layout", 
        AnalyzeDocumentRequest(bytes_source=pdf_bytes),
    )
    result = poller.result()

    return result

In [3]:
def print_layout_analysis(result):
    # Analyze styles (e.g., whether the document contains handwritten content)
    # if result.styles:
    #     for idx, style in enumerate(result.styles):
    #         print(
    #             "Document contains {} content".format(
    #                 "handwritten" if style.is_handwritten else "no handwritten"
    #             )
    #         )

    # Analyze pages
    # for page in result.pages:
    #     print(f"----Analyzing layout from page #{page.page_number}----")

    #     # Analyze lines
    #     if page.lines:
    #         for line_idx, line in enumerate(page.lines):
    #             print(
    #                 f"...Line #{line_idx} has text content '{line.content}'"
    #             )

    # Analyze tables
    if result.tables:
        print(f"----Analyzing {len(result.tables)} tables found in the document----")
        for table_idx, table in enumerate(result.tables[4:]):
            print(
                f"Table #{table_idx} has {table.row_count} rows and {table.column_count} columns"
            )
            for cell in table.cells:
                print(
                    f"...Cell[{cell.row_index}][{cell.column_index}] has content '{cell.content}'"
                )

    print("----------------------------------------")

In [6]:
from azure.core.exceptions import HttpResponseError
from dotenv import find_dotenv, load_dotenv

try:
    load_dotenv(find_dotenv())
    result = analyze_layout_from_url()
except HttpResponseError as error:
    # Examples of how to check an HttpResponseError
    if error.error is not None:
        if error.error.code == "InvalidImage":
            print(f"Received an invalid image error: {error.error}")
        elif error.error.code == "InvalidRequest":
            print(f"Received an invalid request error: {error.error}")
        raise
    if "Invalid request".casefold() in error.message.casefold():
        print(f"Uh-oh! Seems there was an invalid request: {error}")
    raise

In [8]:
for table in result.tables[4:5]:
    print(f"----Analyzing table with {table.row_count} rows and {table.column_count} columns----")
    for cell in table.cells:
        # print the table like a markdown table
        print(f"| {cell.content} ", end="")
        if cell.column_index == table.column_count - 1:
            print("|")

----Analyzing table with 8 rows and 4 columns----
| duur arbeidsovereenkomst | specificatie | maximale proeftijd |  |  | bouwplaats | uta |
| onbepaalde tijd | n.v.t. | 2 maanden | 2 maanden |
| bepaalde tijd | 6 maanden of korter | geen proeftijd | geen proeftijd |
|  | langer dan 6 maanden, korter dan 1 jaar | 2 weken | 1 maand |
|  | 1 tot 2 jaar | 1 maand | 1 maand |
|  | 2 jaar of langer | 2 maanden | 2 maanden |
|  | niet eindigend op een specifieke datum* |  | 1 maand |


In [None]:
try:
    print_layout_analysis(result)
except HttpResponseError as error:
    # Examples of how to check an HttpResponseError
    if error.error is not None:
        if error.error.code == "InvalidImage":
            print(f"Received an invalid image error: {error.error}")
        elif error.error.code == "InvalidRequest":
            print(f"Received an invalid request error: {error.error}")
        raise
    if "Invalid request".casefold() in error.message.casefold():
        print(f"Uh-oh! Seems there was an invalid request: {error}")
    raise

# Azure Content understanding

Link: https://github.com/Azure-Samples/data-extraction-using-azure-content-understanding


In [2]:
from helpers.azure_content_understanding_client import AzureContentUnderstandingClient

In [3]:
# Initialize Azure Content Understanding client
endpoint = os.getenv("AZURE_CONTENT_UNDERSTANDING_ENDPOINT")
subscription_key = os.getenv("AZURE_CONTENT_UNDERSTANDING_KEY")

if not endpoint or not subscription_key:
    raise ValueError("Azure Content Understanding endpoint and subscription key must be set in environment variables.")

client = AzureContentUnderstandingClient(
    endpoint=endpoint,
    subscription_key=subscription_key
)

In [4]:
# List available analyzers
analyzers = client.get_all_analyzers()
print("Available analyzers:")
for analyzer in analyzers.get('value', []):
    print(f"- {analyzer['analyzerId']}: {analyzer.get('description', 'No description')}")

Available analyzers:
- prebuilt-audioAnalyzer: Transcribe conversations and extract summaries.
- prebuilt-callCenter: Analyze call center conversations to extract transcripts, summaries, sentiment, and more.
- prebuilt-contract: Extract contract document fields.
- prebuilt-documentAnalyzer: Extract various content and layout elements such as words, paragraphs, and tables from documents.
- prebuilt-imageAnalyzer: Analyze images to generate descriptions.
- prebuilt-invoice: Extract invoice document fields.
- prebuilt-videoAnalyzer: Analyze videos to extract transcript and description for each segment.


In [5]:
# Analyze the PDF using Azure Content Understanding
# Note: Replace 'prebuilt-documentAnalyzer' with an actual analyzer ID from the list above
analyzer_id = "prebuilt-documentAnalyzer"

response = client.begin_analyze_file(analyzer_id, pdf_path)
analysis_result = client.poll_result(response)

print(f"Analysis completed. Status: {analysis_result.get('status', 'Unknown')}")


Analysis completed. Status: Succeeded


In [6]:

# Extract and display results similar to Docling
if 'result' in analysis_result and 'contents' in analysis_result['result']:
    contents = analysis_result['result']['contents']
    print(f"Document has {len(contents)} content sections.")
    
    for i, content in enumerate(contents):
        if 'markdown' in content:
            print(f"Content section {i+1} markdown:\n{content['markdown']}...")
        
        if 'fields' in content:
            print(f"Extracted fields in section {i+1}:")
            for field_name, field_data in content['fields'].items():
                if 'valueString' in field_data:
                    print(f"- {field_name}: {field_data['valueString']} (confidence: {field_data.get('confidence', 'N/A')})")
                elif 'valueNumber' in field_data:
                    print(f"- {field_name}: {field_data['valueNumber']} (confidence: {field_data.get('confidence', 'N/A')})")
else:
    print("No contents found in the analysis result.")

Document has 1 content sections.
Content section 1 markdown:
# cao Bouw & Infra 2025 - 2027

<!-- PageFooter="cao Bouw & Infra 2025 - 2027, 19 juni 2025" -->
<!-- PageNumber="1" -->
<!-- PageBreak -->


## Inhoudsopgave


<table>
<tr>
<td colspan="2">1. In dienst - uit dienst</td>
<td>5</td>
</tr>
<tr>
<td>1.1</td>
<td>Intredekeuring</td>
<td>5</td>
</tr>
<tr>
<td colspan="2">1.2 Functie-indeling</td>
<td>6</td>
</tr>
<tr>
<td>1.3</td>
<td>Arbeidsovereenkomst</td>
<td>7</td>
</tr>
<tr>
<td>1.4</td>
<td>Tijdens de arbeidsovereenkomst: bouwplaatswerknemer</td>
<td>9</td>
</tr>
<tr>
<td>1.5</td>
<td>Tijdens de arbeidsovereenkomst: uta-werknemer</td>
<td>10</td>
</tr>
<tr>
<td colspan="2">1.6 Beëindiging arbeidsovereenkomst</td>
<td>11</td>
</tr>
<tr>
<td colspan="2">1.7 Afrekenen bij beëindiging arbeidsovereenkomst</td>
<td>12</td>
</tr>
<tr>
<td colspan="2">2. Arbeidsduur en werktijden</td>
<td>13</td>
</tr>
<tr>
<td>2.1</td>
<td>Werktijdregelingen</td>
<td>13</td>
</tr>
<tr>
<td>2.2</td

In [11]:
import pandas as pd

# Print tables extracted by Azure Content Understanding
if 'result' in analysis_result and 'contents' in analysis_result['result']:
    contents = analysis_result['result']['contents']
    print(f"Document has {len(contents)} content sections.")
    
    # Print tables extracted by Azure Content Understanding
    if 'result' in analysis_result and 'contents' in analysis_result['result']:
        contents = analysis_result['result']['contents']
        print(f"Document has {len(contents)} content sections.")
        
        for i, content in enumerate(contents):
            if 'tables' in content:
                tables = content['tables']
                print(f"Content section {i+1} has {len(tables)} tables.")
                for j, table in enumerate(tables):
                    print(f"Table {j+1}: {table['rowCount']} rows x {table['columnCount']} columns")
                    # Create a 2D list for the table
                    table_data = [['' for _ in range(table['columnCount'])] for _ in range(table['rowCount'])]
                    cells = table.get('cells') or []
                    for cell in cells:
                        row = cell['rowIndex']
                        col = cell['columnIndex']
                        row_span = cell.get('rowSpan', 1)
                        col_span = cell.get('columnSpan', 1)
                        content = cell['content']
                        for r in range(row, min(row + row_span, table['rowCount'])):
                            for c in range(col, min(col + col_span, table['columnCount'])):
                                table_data[r][c] = content
                    # Print as markdown table
                    df = pd.DataFrame(table_data)
                    print(df.to_markdown(index=False, headers=[]))
                                    
                    
    

Document has 1 content sections.
Document has 1 content sections.
Content section 1 has 65 tables.
Table 1: 46 rows x 3 columns
|:--------------------------------------------------|:--------------------------------------------------------|---:|
| 1. In dienst - uit dienst                         | 1. In dienst - uit dienst                               |  5 |
| 1.1                                               | Intredekeuring                                          |  5 |
| 1.2 Functie-indeling                              | 1.2 Functie-indeling                                    |  6 |
| 1.3                                               | Arbeidsovereenkomst                                     |  7 |
| 1.4                                               | Tijdens de arbeidsovereenkomst: bouwplaatswerknemer     |  9 |
| 1.5                                               | Tijdens de arbeidsovereenkomst: uta-werknemer           | 10 |
| 1.6 Beëindiging arbeidsovereenkomst               |