In [70]:
import os
import gc
import json
import fitz
import time
import multiprocessing  # Changed back to multiprocessing
from pathlib import Path
from concurrent.futures import as_completed, TimeoutError
#from pdfplucker.utils import format_result, link_subtitles, get_safe_executor, logger, Data
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling_core.types.doc import ImageRefMode
from docling.exceptions import ConversionError
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
    EasyOcrOptions,
)
from docling_core.types.doc import (
    PictureItem,
    TableItem,
    TextItem,
    DocItemLabel,
)

In [76]:
def create_converter(device : str = 'CPU', num_threads : int = 4, ocr_lang: list = ['es', 'pt'], force_ocr: bool = False) -> DocumentConverter:
    """Create a DocumentConverter whose PDF pipeline has OCR, table structure,
    picture image generation, picture classification and formula enrichment enabled.

    Args:
        device: Accelerator name, compared case-insensitively. 'CUDA' and 'CPU'
            select the matching AcceleratorDevice; any other value selects AUTO.
        num_threads: Thread count passed to AcceleratorOptions.
        ocr_lang: OCR language codes. A single string is treated as one
            language and None as the default ['es', 'pt']. The value is copied,
            so neither the default list nor the caller's list is stored in the
            pipeline options.
        force_ocr: When True, the OCR options are replaced by EasyOcrOptions
            with full-page OCR forced.

    Returns:
        A DocumentConverter configured for InputFormat.PDF with these options.
    """
    # Normalise and copy the languages: the signature's default list is shared
    # between calls, so it must never be handed out by reference.
    if ocr_lang is None:
        languages = ['es', 'pt']
    elif isinstance(ocr_lang, str):
        languages = [ocr_lang]
    else:
        languages = list(ocr_lang)

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
    pipeline_options.ocr_options.lang = languages
    pipeline_options.generate_picture_images = True
    pipeline_options.do_picture_classification = True
    pipeline_options.do_formula_enrichment = True
    # not yet: pipeline_options.do_picture_description = True

    # Aggressive scaling for low memory mode
    pipeline_options.images_scale = 1

    if force_ocr:
        # Rapid OCR or Easy OCR
        pipeline_options.ocr_options = EasyOcrOptions(force_full_page_ocr=True, lang=list(languages))

    # Device acceleration: unknown names (including 'AUTO') resolve to AUTO.
    known_devices = {
        'CUDA': AcceleratorDevice.CUDA,
        'CPU': AcceleratorDevice.CPU,
    }
    device_type = known_devices.get(device.upper(), AcceleratorDevice.AUTO)
    pipeline_options.accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device_type)

    return DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

In [72]:
# Single-threaded CPU converter with full-page OCR forced.
converter = create_converter(device='CPU', num_threads=1, force_ocr=True)

In [None]:
# Local PDF converted in this notebook run.
source = "/home/rafael-dias/Downloads/legendado.pdf"
# convert() is given a plain string here rather than a Path object.
conv: ConversionResult = converter.convert(str(source))

In [None]:
from typing import TypedDict, List, Dict, Any
import docling
import traceback

# Shared annotation for the list-of-records fields of Data.
_RecordList = List[Dict[str, Any]]


class Data(TypedDict):
    """Result container for one converted document.

    Every field except ``metadata`` is a list of plain dictionaries.
    """
    metadata: Dict[str, Any]
    pages: _RecordList
    images: _RecordList
    tables: _RecordList
    captions: _RecordList

def format_results(conv: ConversionResult, data: Data, filename: str, image_path: str) -> bool:
    counter = 0
    try:
        for idx, (item, _) in enumerate(conv.document.iterate_items()):
            if isinstance(item, TextItem):
                page = item.prov[0].page_no
                label = item.label
                text = item.text

                page_found = False
                for page_dict in data['pages']:
                    if page_dict['page_number'] == page:
                        page_found = True
                        if 'content' not in page_dict:
                            page_dict['content'] = ""
                        match label:
                            case DocItemLabel.SECTION_HEADER:
                                page_dict['content'] += f"\n#{text}\n"
                            case DocItemLabel.FORMULA:
                                page_dict['content'] += f" Equation: {text}\n"
                            case DocItemLabel.REFERENCE:
                                page_dict['content'] += f"\nReference: {text}\n"
                            case DocItemLabel.LIST_ITEM:
                                page_dict['content'] += f"\n- {text}\n"
                            case DocItemLabel.CAPTION:
                                page_dict['content'] += f"_{text}_\n"
                                data['captions'].append({
                                    'self_ref' : item.self_ref,
                                    'cref' : item.parent.cref,
                                    'text' : text
                                })
                            case DocItemLabel.FOOTNOTE:
                                page_dict['content'] += f"\nFootnote: {text}\n"
                            case DocItemLabel.TITLE:
                                page_dict['content'] += f"\n##{text}_\n"
                            case DocItemLabel.TEXT:
                                page_dict['content'] += f" {text}"
                            case _:
                                page_dict['content'] += f" {text}"
                        break
                        
                if not page_found:
                    new_page = {'page_number': page, 'content': text}
                    data['pages'].append(new_page)
    
            elif isinstance(item, TableItem):
                table = item.export_to_markdown(doc=conv.document)
                self_ref = item.self_ref
                captions = item.captions
                references = item.references
                footnotes = item.footnotes
                page = item.prov[0].page_no
    
                page_found = False
                for page_dict in data['pages']:
                    if page_dict['page_number'] == page:
                        page_found = True
                        if 'content' not in page_dict:
                            page_dict['content'] = ""
                        page_dict['content'] += f" <{self_ref}> "
                if not page_found:
                    new_page = {'page_number': page, 'content': f" <{self_ref}> "}
                    data['pages'].append(new_page)
    
                data['tables'].append({
                    'self_ref' : self_ref,
                    'captions' : captions,
                    'caption' : "",
                    'references' : references,
                    'footnotes' : footnotes,
                    'page' : page, 
                    'table' : table
                })
    
            elif isinstance(item, PictureItem):
                self_ref = item.self_ref
                captions = item.captions
                references = item.references
                footnotes = item.footnotes
                page = item.prov[0].page_no
                classification = None
                confidence = None
                if item.annotations:
                    for annotation in item.annotations:
                        if annotation.kind == 'classification':
                            # Find the classification with the highest confidence
                            best_class = max(
                                annotation.predicted_classes,
                                key=lambda cls: cls.confidence
                            )
                            classification = best_class.class_name,
                            confidence = best_class.confidence
                            break
                image_filename = (image_path / f"{filename}_{counter}.png")
                placeholder = f"{filename}_{counter}.png"
                with image_filename.open('wb') as file:
                    item.get_image(conv.document).save(file, "PNG")
                data['images'].append({
                    'ref': placeholder,
                    'self_ref' : self_ref,
                    'captions' : captions,
                    'caption' : "",
                    'classification' : classification,
                    'confidence' : confidence,
                    'references' : references,
                    'footnotes' : footnotes,
                    'page' : page,
                })
                counter += 1
    
                page_found = False
                for page_dict in data['pages']:
                    if page_dict['page_number'] == page:
                        page_found = True
                        if 'content' not in page_dict:
                            page_dict['content'] = ""
                        page_dict['content'] += f" <{placeholder}> "
                if not page_found:
                    new_page = {'page_number': page, 'content': f" <{placeholder}> "}
                    data['pages'].append(new_page)


        caption_dict = {caption["cref"]: caption["text"] for caption in data.get("captions", [])}
    
        for image in data.get("images", []):
            self_ref = image.get("self_ref")
            if self_ref in caption_dict:
                caption_text = caption_dict[self_ref]
                if caption_text not in image.get("captions", []):
                    imagem["caption"] += caption_text

        
        return True
    except Exception as e:
        print(e)
        traceback.print_exc()


def json_serializable(obj):
    """Fallback encoder for json.dumps: return the object's attribute dict, or its string form.

    Objects exposing ``__dict__`` are returned as that dictionary (the same
    object, not a copy); everything else is converted with ``str``.
    """
    if not hasattr(obj, '__dict__'):
        return str(obj)
    return obj.__dict__

# Empty result container; format_results fills it in place.
data: Data = dict(
    metadata={},
    pages=[],
    images=[],
    tables=[],
    captions=[],
)
format_results(conv, data, Path("filename"), Path("./"))
# Fix 2: pass default=json_serializable so json.dumps can handle values it
# cannot encode natively.
print(
    json.dumps(
        data,
        indent=4,
        ensure_ascii=False,
        default=json_serializable,
    )
)