# Contract scanning

This notebook contains experiments in scanning contracts to check the integrity of their content.

In [2]:
import os
from dotenv import load_dotenv

In [3]:
# Load variables from a local .env file into the process environment
# (python-dotenv); the Azure section below reads its endpoint and key
# through os.getenv.
load_dotenv()

True

In [4]:
# Input contract shared by both experiments: it is passed to
# converter.convert(...) in the Docling section and to
# client.begin_analyze_file(...) in the Azure section.
# NOTE(review): relative path — presumably resolved against the notebook's
# working directory; confirm before running from elsewhere.
pdf_path = "../../data/contracts/Contract291936Van_Gobbel.pdf"

# Docling library

Link: https://docling-project.github.io/docling/

In [None]:
import torch

# Report the PyTorch build and the compute hardware visible to this kernel.
print(f"PyTorch version: {torch.__version__}")

if torch.cuda.is_available():
    print(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}: {torch.cuda.device_count()} GPU(s)")
else:
    print("CUDA is not available. Using CPU.")
    # torch.cpu.device_count() counts CPU *devices* (always 1), not cores,
    # so ask the operating system for the logical core count instead.
    print(f"{os.cpu_count()} CPU core(s) available")

In [None]:
from pathlib import Path
from docling.document_converter import DocumentConverter

# Converter built with library defaults. Note that the next cell rebinds
# `converter` to an explicitly configured instance before any conversion runs.
converter = DocumentConverter() 

In [None]:
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption

# Start from the default PDF pipeline settings and switch on/off the stages
# used in this experiment.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True

# Explicitly pin the accelerator: CUDA device, 8 threads.
accelerator_options = AcceleratorOptions(device=AcceleratorDevice.CUDA, num_threads=8)
pipeline_options.accelerator_options = accelerator_options

# Register the pipeline settings for PDF inputs only and build the converter.
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

In [None]:
# Convert the document
# Runs the current `converter` on the PDF at `pdf_path`. The following cells
# read `.input.file` and `.document` from the returned result.
conversion_result = converter.convert(pdf_path)

In [None]:
# Folder for this notebook's artefacts; created (with parents) if missing.
output_dir = Path("outputs") / "01-contract-scanning"
output_dir.mkdir(exist_ok=True, parents=True)

# Stem of the converted input file, taken from the conversion result.
doc_filename = conversion_result.input.file.stem

In [None]:
# Summarise the conversion: page/table counts plus a bounded text preview.
document = conversion_result.document
print(f"Document has {len(document.pages)} pages and {len(document.tables)} tables.")

# Show only the first PREVIEW_CHARS characters of the markdown export, and
# append "..." only when text was actually cut off, so the ellipsis never
# misrepresents a complete dump as truncated.
PREVIEW_CHARS = 1000
markdown_text = document.export_to_markdown()
preview_suffix = "..." if len(markdown_text) > PREVIEW_CHARS else ""
print(f"Document text content:\n{markdown_text[:PREVIEW_CHARS]}{preview_suffix}")

# Azure Content Understanding

Link: https://github.com/Azure-Samples/data-extraction-using-azure-content-understanding


In [None]:
from helpers.azure_content_understanding_client import AzureContentUnderstandingClient

In [7]:
# Initialize Azure Content Understanding client
# Both connection settings are read from environment variables.
endpoint = os.environ.get("AZURE_CONTENT_UNDERSTANDING_ENDPOINT")
subscription_key = os.environ.get("AZURE_CONTENT_UNDERSTANDING_KEY")

# Fail fast when either setting is missing or empty.
if not (endpoint and subscription_key):
    raise ValueError("Azure Content Understanding endpoint and subscription key must be set in environment variables.")

client = AzureContentUnderstandingClient(subscription_key=subscription_key, endpoint=endpoint)

In [8]:
# List available analyzers
# One "- <id>: <description>" line per entry under the response's 'value' key.
analyzers = client.get_all_analyzers()
print("Available analyzers:")
for entry in analyzers.get('value', []):
    identifier = entry['analyzerId']
    summary = entry.get('description', 'No description')
    print(f"- {identifier}: {summary}")

Available analyzers:
- prebuilt-audioAnalyzer: Transcribe conversations and extract summaries.
- prebuilt-callCenter: Analyze call center conversations to extract transcripts, summaries, sentiment, and more.
- prebuilt-contract: Extract contract document fields.
- prebuilt-documentAnalyzer: Extract various content and layout elements such as words, paragraphs, and tables from documents.
- prebuilt-imageAnalyzer: Analyze images to generate descriptions.
- prebuilt-invoice: Extract invoice document fields.
- prebuilt-videoAnalyzer: Analyze videos to extract transcript and description for each segment.


In [None]:
# Analyze the PDF using Azure Content Understanding
# Note: Replace 'prebuilt-documentAnalyzer' with an actual analyzer ID from the list above
analyzer_id = "prebuilt-documentAnalyzer"

# Start the analysis, then poll until the client hands back a result.
response = client.begin_analyze_file(analyzer_id, pdf_path)
analysis_result = client.poll_result(response)

final_status = analysis_result.get('status', 'Unknown')
print(f"Analysis completed. Status: {final_status}")


Analysis completed. Status: Succeeded
Document has 1 content sections.
Content section 1 markdown:
<figure>

pay for people

</figure>


<!-- PageHeader="a brisker company" -->
<!-- PageHeader="Payrollovereenkomst (contractnr. 291936)" -->

De ondergetekenden, te weten:

\- Ranjith Premalal Van Gobbel (wonende te SS 5, asd, geboren d.d. 2-5-05), hierna te noemen werknemer,
en

\- Pay for People flex, Parklaan 28, Rotterdam, vertegenwoordigd door Dennis Luyten, hierna te noemen
werkgever

overwegende dat

Werknemer door inlener is geworven en geselecteerd voor een opdracht bij inlener. Werknemer bereid is met
werkgever een payrollovereenkomst in de zin van artikel 7:692 BW aan te gaan op grond waarvan
werknemer exclusief ter beschikking wordt gesteld aan inlener om arbeid te verrichten onder leiding en
toezicht van de inlener.

komen het volgende overeen:


# 1 Dienstverband

Werknemer treedt voor bepaalde tijd met ingang van 25-1-25 in dienst van werkgever en wordt door
werkgever ter b

In [None]:

# Extract and display results similar to Docling
# For every content section: print a bounded markdown preview and each
# extracted field together with its confidence.
MARKDOWN_PREVIEW_CHARS = 1000

if 'result' in analysis_result and 'contents' in analysis_result['result']:
    contents = analysis_result['result']['contents']
    print(f"Document has {len(contents)} content sections.")

    for i, content in enumerate(contents):
        if 'markdown' in content:
            markdown_text = content['markdown']
            # Append "..." only when the preview really cuts the text short.
            preview_suffix = "..." if len(markdown_text) > MARKDOWN_PREVIEW_CHARS else ""
            print(f"Content section {i+1} markdown:\n{markdown_text[:MARKDOWN_PREVIEW_CHARS]}{preview_suffix}")

        if 'fields' in content:
            print(f"Extracted fields in section {i+1}:")
            for field_name, field_data in content['fields'].items():
                # Prefer the string value, then the numeric one; otherwise fall
                # back to any other "value*" key so fields of other types are
                # reported instead of being silently dropped.
                value_key = next(
                    (key for key in ('valueString', 'valueNumber') if key in field_data),
                    None,
                )
                if value_key is None:
                    value_key = next((key for key in field_data if key.startswith('value')), None)
                if value_key is not None:
                    print(f"- {field_name}: {field_data[value_key]} (confidence: {field_data.get('confidence', 'N/A')})")
else:
    print("No contents found in the analysis result.")

In [13]:
import pandas as pd

def _table_to_grid(table):
    """Expand one extracted table into a dense row-major grid of cell texts.

    Args:
        table: Mapping with 'rowCount', 'columnCount' and an optional 'cells'
            list. Each cell carries 'rowIndex', 'columnIndex', 'content' and
            optional 'rowSpan' / 'columnSpan' (both default to 1).

    Returns:
        A list of 'rowCount' rows, each a list of 'columnCount' strings.
        Positions no cell covers stay ''. A spanning cell's text is repeated
        in every position it covers, clipped to the table bounds.
    """
    row_count = table['rowCount']
    column_count = table['columnCount']
    grid = [['' for _ in range(column_count)] for _ in range(row_count)]
    for cell in table.get('cells') or []:
        row = cell['rowIndex']
        col = cell['columnIndex']
        row_span = cell.get('rowSpan', 1)
        col_span = cell.get('columnSpan', 1)
        cell_text = cell['content']
        for r in range(row, min(row + row_span, row_count)):
            for c in range(col, min(col + col_span, column_count)):
                grid[r][c] = cell_text
    return grid


# Print tables extracted by Azure Content Understanding
# The availability check and section-count print run once, and the cell text
# has its own name so the outer loop variable `content` is not overwritten.
if 'result' in analysis_result and 'contents' in analysis_result['result']:
    contents = analysis_result['result']['contents']
    print(f"Document has {len(contents)} content sections.")

    for i, content in enumerate(contents):
        if 'tables' in content:
            tables = content['tables']
            print(f"Content section {i+1} has {len(tables)} tables.")
            for j, table in enumerate(tables):
                print(f"Table {j+1}: {table['rowCount']} rows x {table['columnCount']} columns")
                # Print as markdown table
                df = pd.DataFrame(_table_to_grid(table))
                print(df.to_markdown(index=False, headers=[]))
                    
    

Document has 1 content sections.
Document has 1 content sections.
Content section 1 has 1 tables.
Table 1: 2 rows x 2 columns
|:----------------------------|:--------------|
| werknemer,                  | werkgever,    |
| Ranjith Premalal Van Gobbel | Dennis Luyten |
