In [1]:
import json
import logging
from pathlib import Path
import time
import pandas as pd

In [2]:
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
    ExcelFormatOption,
    CsvFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from langchain_text_splitters import MarkdownHeaderTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
# Documents to convert. Each path is assembled from separate components so it
# resolves on any OS; a raw "Input_Files\..." string is only split into a
# directory and a file name on Windows.
input_paths = [
    Path("Input_Files", "Brief Digital-Reward-Solution-for-JIBY.pdf"),
    # Path("User_File", "Cost Tracker Design A.xlsx"),
    # Path("User_File", "Cost Tracker Design B.xlsx"),
    # Path("User_File", "IBERIA_Consumer_Monitoring_2024_Agency_Brief_February_5_2024.pptx"),
    # Path("User_File", "Result Iberia_Proposal_Monitoring_Consuming_2024.xlsx"),
]

In [23]:
# Build the converter. Everything passed below is optional; the converter has
# internal defaults for anything left out.
doc_converter = DocumentConverter(
    allowed_formats=[
        InputFormat.PDF,
        InputFormat.IMAGE,
        InputFormat.DOCX,
        InputFormat.HTML,
        InputFormat.PPTX,
        InputFormat.ASCIIDOC,
        InputFormat.CSV,
        InputFormat.MD,
        InputFormat.XLSX,
    ],  # whitelist formats, non-matching files are ignored.
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
        ),
        InputFormat.DOCX: WordFormatOption(
            pipeline_cls=SimplePipeline  # , backend=MsWordDocumentBackend
        ),
    },
)

# convert_all() hands back a lazy iterator that can be consumed only once.
# Materialise it so the cells that loop over `conv_results` can be re-run
# without silently iterating over an already exhausted iterator.
conv_results = list(doc_converter.convert_all(input_paths))

In [24]:
# (markdown heading marker, metadata key) pairs given to the header splitter.
headers_to_split_on = list(
    zip(
        ("#", "##", "###"),
        ("Header 1", "Header 2", "Header 3"),
    )
)

In [25]:
def md_chunk_strat(headers_to_split_on,markdown_document):
    """Split a markdown string into chunks along its headers.

    Args:
        headers_to_split_on: Header specification forwarded unchanged to
            ``MarkdownHeaderTextSplitter``.
        markdown_document: Markdown text to split.

    Returns:
        The result of ``MarkdownHeaderTextSplitter.split_text`` for the text.
    """
    # strip_headers=False is passed through so the splitter does not strip the
    # header lines out of the chunks it produces.
    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on,
        strip_headers=False,
    )
    return splitter.split_text(markdown_document)
    

In [26]:
def doc_converter_for_excel(conv_res,output_dir,conv_file_names,md_tables):
    """Export every table of a converted document to markdown files and chunks.

    Each table in ``conv_res.document.tables`` is exported to a DataFrame,
    rendered as markdown, written to
    ``<output_dir>/<input stem>-table-<n>.md`` (``n`` starts at 1) and split
    with ``md_chunk_strat`` using the notebook-level ``headers_to_split_on``.

    Args:
        conv_res: Conversion result exposing ``input.file.stem`` and
            ``document.tables``.
        output_dir: ``pathlib.Path`` of the directory to write into; it is
            created if it does not exist yet.
        conv_file_names: List that receives the name of every file written.
            Mutated in place.
        md_tables: List that receives every chunk produced. Mutated in place.

    Returns:
        The ``(conv_file_names, md_tables)`` pair, i.e. the same two list
        objects that were passed in.
    """
    doc_filename = conv_res.input.file.stem

    # Opening a file for writing inside a missing directory raises
    # FileNotFoundError, so make sure the target directory exists first.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Export tables
    for table_ix, table in enumerate(conv_res.document.tables, start=1):
        table_df: pd.DataFrame = table.export_to_dataframe()
        # Render once and reuse the text for both the file and the chunking.
        table_md = table_df.to_markdown()

        md_name = f"{doc_filename}-table-{table_ix}.md"
        with (output_dir / md_name).open("w", encoding="utf-8") as fp:
            fp.write(table_md)
        conv_file_names.append(md_name)

        md_tables.extend(md_chunk_strat(headers_to_split_on, table_md))

    return conv_file_names,md_tables

In [27]:
conv_file_names = []
md_docs = []

# Create the output directory once, up front: opening a file for writing
# inside a missing directory raises FileNotFoundError.
out_path = Path("scratch")
out_path.mkdir(parents=True, exist_ok=True)

for res in conv_results:
    source_file = res.input.file

    # Spreadsheets are exported table by table; every other format is exported
    # as a single markdown document. The suffix is compared case-insensitively
    # so that e.g. "REPORT.XLSX" is routed to the table export as well.
    if source_file.suffix.lower() == ".xlsx":
        conv_file_names, md_docs = doc_converter_for_excel(
            res, out_path, conv_file_names, md_docs
        )
    else:
        # Export once and reuse the text for both the file and the chunking.
        markdown_text = res.document.export_to_markdown()
        md_name = f"{source_file.stem}.md"
        with (out_path / md_name).open("w", encoding="utf-8") as fp:
            fp.write(markdown_text)
        conv_file_names.append(md_name)

        md_docs.extend(md_chunk_strat(headers_to_split_on, markdown_text))

    # Report only after the output has actually been written.
    print(
        f"Document {source_file.name} converted."
        f"\nSaved markdown output to: {out_path!s}"
    )
        

Document Brief Digital-Reward-Solution-for-JIBY.pdf converted.
Saved markdown output to: scratch


In [28]:
# Preview only the first few chunks; echoing the whole list floods the cell
# output. Use len(md_docs) for the total count.
md_docs[:5]

[Document(metadata={'Header 2': 'D IG ITA L R EW A R D S O LU T IO N FO R'}, page_content='## D IG ITA L R EW A R D S O LU T IO N FO R  \n<!-- image -->  \n<!-- image -->  \nThis presentation outlines the requirements for a digital reward solution to enhance the user experience and loyalty program of JIBY, our trade engagement platform.  \nMR  \npreencoded.png'),
 Document(metadata={'Header 2': 'Objective: Amplifying Engagement Through Rewards'}, page_content='## Objective: Amplifying Engagement Through Rewards'),
 Document(metadata={'Header 2': 'Enhanced User Experience'}, page_content='## Enhanced User Experience'),
 Document(metadata={'Header 2': 'Strengthened Loyalty Program'}, page_content='## Strengthened Loyalty Program  \nProvide engaging and valuable rewards that incentivize user participation and drive loyalty.  \nDevelop a comprehensive rewards system that fosters lasting partnerships and strengthens customer relationships.  \npreencoded.png  \n<!-- image -->'),
 Document(me

In [160]:
from langchain_community.document_loaders import UnstructuredExcelLoader

In [161]:
# Build the path from components so it is valid on every OS; the backslash in
# a raw "User_File\..." string is a path separator on Windows only.
excel_path = Path("User_File", "Cost Tracker Design A.xlsx")
loader = UnstructuredExcelLoader(str(excel_path), mode="elements")
docs = loader.load()

In [167]:
# Spot-check one loaded entry: its 'page_number' metadata, then its text.
sample_doc = docs[38]
print(sample_doc.metadata['page_number'])
print(sample_doc.page_content)

2
Une fois le fichier complété, merci le joindre en cliquant sur le lien "Charger la grille de cotation Excel" de l'onglet "Grille de cotation".


In [168]:
# Number of entries returned by loader.load().
len(docs)

245

In [147]:
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TesseractCliOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

In [150]:
def main():
    """Convert the "Cost Tracker Design A" workbook with docling and print it.

    A ``PdfPipelineOptions`` object (OCR on, table structure on, cell matching
    on, full-page Tesseract CLI OCR) is registered for ``InputFormat.PDF``.
    The document is then converted and its markdown export is printed.

    NOTE(review): the input is an .xlsx file and the options above are
    registered for PDF only; the log output recorded for this cell shows the
    workbook being handled by SimplePipeline and the msexcel backend. The
    PDF/OCR settings therefore do not appear to take part in this conversion.
    Confirm whether a PDF input was intended.

    Returns:
        None. The markdown is written to stdout.
    """
    # Built from components so the path is valid on non-Windows systems too.
    input_doc = Path("User_File", "Cost Tracker Design A.xlsx")

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    # Any of the OCR options can be used:EasyOcrOptions, TesseractOcrOptions, TesseractCliOcrOptions, OcrMacOptions(Mac only), RapidOcrOptions
    # ocr_options = EasyOcrOptions(force_full_page_ocr=True)
    # ocr_options = TesseractOcrOptions(force_full_page_ocr=True)
    # ocr_options = OcrMacOptions(force_full_page_ocr=True)
    # ocr_options = RapidOcrOptions(force_full_page_ocr=True)
    pipeline_options.ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
            )
        }
    )

    doc = converter.convert(input_doc).document
    print(doc.export_to_markdown())

In [151]:
# Runs main() when this code executes as the top-level module (__name__ is
# "__main__"). The call resolves to whichever main() was defined most recently.
if __name__ == "__main__":
    main()

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for SimplePipeline with options hash 4cc01982ae99b46a2a63fcda46c47c35
INFO:docling.pipeline.base_pipeline:Processing document Cost Tracker Design A.xlsx
INFO:docling.backend.msexcel_backend:Processing sheet: Instructions
INFO:docling.backend.msexcel_backend:Processing sheet: manual_prop
INFO:docling.backend.msexcel_backend:Processing sheet: manual_costdriver
INFO:docling.backend.msexcel_backend:Processing sheet: __base_label
INFO:docling.backend.msexcel_backend:Processing sheet: __base_code
INFO:docling.backend.msexcel_backend:Processing sheet: style_rfx
INFO:docling.backend.msexcel_backend:Processing sheet: style_prop
INFO:docling.backend.msexcel_backend:Processing sheet: style_costdriver
INFO:docling.backend.msexcel_backend:Processing sheet: Columns
INFO:docling.backend.msexcel_backend:Processing sheet: Grids
INFO:docling.backend.msexcel_backend:Processing sheet: R

| Instructions                                                                                                             |
|--------------------------------------------------------------------------------------------------------------------------|
| None                                                                                                                     |
| Vous trouverez dans ce fichier contient les données nécessaires à la création et la mise à jour  des items de votre RFx. |

| Le premier onglet, "Colonnes" contient le dictionnaire des colonnes utilisables dans les différentes grilles d'items.  Cet onglet est obligatoire, merci de ne pas le supprimer ni le renommer.                                                                                                                                                                                                             |
|--------------------------------------------------------------------------------------------------

In [1]:
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def main():
    """Convert the JIBY brief PDF with docling and save it as markdown.

    The active configuration enables OCR, table structure recognition and cell
    matching, sets the OCR language list to ``["es"]`` and requests four
    threads with automatic accelerator device selection. The conversion is
    timed, the elapsed time is printed, and the markdown export is written to
    ``scratch/<input stem>.md`` (the directory is created when missing).

    Returns:
        None. The result is written to disk.
    """
    #logging.basicConfig(level=logging.INFO)

    # Built from components so the path is valid on non-Windows systems too.
    input_doc_path = Path("Input_Files", "Brief Digital-Reward-Solution-for-JIBY.pdf")

    ###########################################################################

    # The following sections contain a combination of PipelineOptions
    # and PDF Backends for various configurations.
    # Uncomment one section at the time to see the differences in the output.

    # PyPdfium without EasyOCR
    # --------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = False
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = False

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(
    #             pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
    #         )
    #     }
    # )

    # PyPdfium with EasyOCR
    # -----------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(
    #             pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
    #         )
    #     }
    # )

    # Docling Parse without EasyOCR
    # -------------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = False
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    #     }
    # )

    # Docling Parse with EasyOCR
    # ----------------------
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
    pipeline_options.ocr_options.lang = ["es"]
    pipeline_options.accelerator_options = AcceleratorOptions(
        num_threads=4, device=AcceleratorDevice.AUTO
    )

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    # Docling Parse with EasyOCR (CPU only)
    # ----------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.ocr_options.use_gpu = False  # <-- set this.
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    #     }
    # )

    # Docling Parse with Tesseract
    # ----------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = TesseractOcrOptions()

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    #     }
    # )

    # Docling Parse with Tesseract CLI
    # ----------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = TesseractCliOcrOptions()

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    #     }
    # )

    # Docling Parse with ocrmac(Mac only)
    # ----------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = OcrMacOptions()

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    #     }
    # )

    ###########################################################################

    # perf_counter() is a monotonic clock intended for measuring intervals.
    start_time = time.perf_counter()
    conv_result = doc_converter.convert(input_doc_path)
    elapsed = time.perf_counter() - start_time
    # The duration used to be computed and then discarded; report it so the
    # configurations above can be compared.
    print(f"Document converted in {elapsed:.2f} seconds.")

    ## Export results
    output_dir = Path("scratch")
    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_result.input.file.stem

    # Export Markdown format:
    with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
        fp.write(conv_result.document.export_to_markdown())

In [7]:
# Runs main() when this code executes as the top-level module (__name__ is
# "__main__"). The call resolves to whichever main() was defined most recently.
if __name__ == "__main__":
    main()