# Docling tests



## Imports

In [None]:
from pathlib import Path

import os
import dotenv
import json
import openai
import random
import litellm
from litellm import completion
from tqdm.notebook import tqdm # Import tqdm for notebooks

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from pydantic import BaseModel, Field
from langchain_core.output_parsers import PydanticOutputParser
from joblib import Parallel, delayed
import time

from joblib import Parallel, delayed

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TesseractCliOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

import logging
from logging import getLogger

# Configure the root logger first: timestamped records at INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# Module-level logger used by the cells below.
logger = getLogger(__name__)



## Input documents

In [None]:
# Folder whose immediate sub-directories are scanned for PDFs.
data_folder = Path("..") / "data/Newspapers/"
# Directory listings come back in arbitrary, OS-dependent order. Sort them so
# that the seeded random sample below selects the same files on every run.
subdirs = sorted(p for p in data_folder.iterdir() if p.is_dir())

seed = 42
random.seed(seed)
random_pdfs = []  # one randomly chosen PDF per non-empty sub-directory
pdf_paths = []  # every PDF found in the sub-directories

for subdir in subdirs:
    # Sorted for the same reason as `subdirs`: glob order is not guaranteed.
    pdfs = sorted(subdir.glob("*.pdf"))
    if pdfs:
        chosen = random.choice(pdfs)
        random_pdfs.append(chosen)
    pdf_paths.extend(pdfs)

logger.info(f"Found {len(pdf_paths)} to process")
# Uncomment to process only the random sample (one PDF per sub-directory)
# pdf_paths = random_pdfs

## Processing

### Docling


In [None]:
# PDF pipeline configuration: OCR and table-structure recognition are switched
# on, and the table-structure stage has its cell-matching option enabled.
pipeline_options = PdfPipelineOptions(do_ocr=True, do_table_structure=True)
pipeline_options.table_structure_options.do_cell_matching = True

# Any of the OCR options can be used: EasyOcrOptions, TesseractOcrOptions,
# TesseractCliOcrOptions, OcrMacOptions (Mac only), RapidOcrOptions
# ocr_options = EasyOcrOptions(force_full_page_ocr=True)
# ocr_options = TesseractOcrOptions(force_full_page_ocr=True)
# ocr_options = OcrMacOptions(force_full_page_ocr=True)
# ocr_options = RapidOcrOptions(force_full_page_ocr=True)

# Tesseract CLI back-end with full-page OCR forced on.
# NOTE(review): "ell" is presumably the Tesseract language code for Greek -
# confirm the matching language data is installed.
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True, lang=["ell"])
pipeline_options.ocr_options = ocr_options

# Converter that applies the options above to PDF inputs.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
    },
)

# Tag embedded in output file names (e.g. "<stem>.docling.md").
converter_str = "docling"
json_docling_paths = []
markdown_docling_paths = []


def process_pdf(pdf_path: Path) -> tuple[Path, Path]:
    """Convert one PDF with the module-level ``converter`` and save the exports.

    The Markdown and JSON exports are written next to the source PDF as
    ``<stem>.<converter_str>.md`` and ``<stem>.<converter_str>.json``. If both
    files already exist, the conversion is skipped and the existing paths are
    returned, so re-running only processes PDFs that are not yet exported.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        A ``(json_path, markdown_path)`` tuple pointing at the export files.
    """
    markdown_path = pdf_path.parent / f"{pdf_path.stem}.{converter_str}.md"
    json_path = pdf_path.parent / f"{pdf_path.stem}.{converter_str}.json"
    if markdown_path.exists() and json_path.exists():
        return json_path, markdown_path

    doc = converter.convert(pdf_path).document

    json_result = doc.export_to_dict()
    md = doc.export_to_markdown()

    # Write explicitly as UTF-8. The exports may contain non-ASCII text (the
    # JSON is dumped with ensure_ascii=False), and the platform default
    # encoding is not guaranteed to be able to represent it.
    with open(markdown_path, "w", encoding="utf-8") as outf:
        outf.write(md)
    with open(json_path, "w", encoding="utf-8") as outf:
        json.dump(json_result, outf, ensure_ascii=False)
    return json_path, markdown_path


In [None]:
# Non-joblib (sequential) version: convert the PDFs one at a time, reporting
# each input path and the Markdown path it is exported to.
results = []
for pdf_path in tqdm(pdf_paths, desc="Processing PDFs"):
    source_display = pdf_path.parent / pdf_path.name
    print(f"Processing: {source_display}")
    res = process_pdf(pdf_path)
    exported_display = pdf_path.parent / (pdf_path.stem + f'.{converter_str}.md')
    print(f"Exported to: {exported_display}")
    results.append(res)

# Split the (json_path, markdown_path) pairs into two parallel lists,
# ignoring any falsy entries.
json_docling_paths = [pair[0] for pair in results if pair]
markdown_docling_paths = [pair[1] for pair in results if pair]



In [None]:
# A joblib version to experiment with

# Set the number of parallel jobs (adjust as needed)
# n_jobs = 4

# results = Parallel(n_jobs=n_jobs)(
#     delayed(lambda pdf_path: (
#         print(f"Processing: {pdf_path.parent / (pdf_path.name)}"),
#         res := process_pdf(pdf_path),
#         print(f"Exported to: {pdf_path.parent / (pdf_path.stem + f'.{converter_str}.md')}"),
#         res
#     )[-1])(pdf_path) for pdf_path in tqdm(pdf_paths, desc="Processing PDFs")
# )

# # Unpack results
# json_docling_paths = []
# markdown_docling_paths = []
# for res in results:
#     if res:
#         json_docling_paths.append(res[0])
#         markdown_docling_paths.append(res[1])

