In [None]:
!pip install docling

Collecting docling
  Downloading docling-2.14.0-py3-none-any.whl.metadata (7.7 kB)
Collecting deepsearch-glm<2.0.0,>=1.0.0 (from docling)
  Downloading deepsearch_glm-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting docling-core<3.0.0,>=2.12.1 (from docling-core[chunking]<3.0.0,>=2.12.1->docling)
  Downloading docling_core-2.12.1-py3-none-any.whl.metadata (5.7 kB)
Collecting docling-ibm-models<4.0.0,>=3.1.0 (from docling)
  Downloading docling_ibm_models-3.1.0-py3-none-any.whl.metadata (7.0 kB)
Collecting docling-parse<4.0.0,>=3.0.0 (from docling)
  Downloading docling_parse-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.5 kB)
Collecting easyocr<2.0,>=1.7 (from docling)
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting filetype<2.0.0,>=1.2.0 (from docling)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting marko<3.0.0,>=2.1.2 (from docling)
  Downloading marko-2.

# OCR + Layout Detection

In [None]:
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    OcrMacOptions,
    PdfPipelineOptions,
    RapidOcrOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
)

# Prefetch the pipeline's model artifacts up front and remember where they were
# stored, so the pipeline options below can point at that location.
# (Judging by the method name they are fetched from the Hugging Face Hub.)
artifacts_path = StandardPdfPipeline.download_models_hf()

# OCR engine configuration: EasyOCR with the Russian and English language packs.
ocr_options = EasyOcrOptions(lang=['ru', 'en'])

# PDF pipeline configuration: table-structure recognition enabled, models taken
# from the prefetched artifacts path.
pipeline_options = PdfPipelineOptions(do_table_structure=True, artifacts_path=artifacts_path)
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE  # use more accurate TableFormer model
pipeline_options.table_structure_options.do_cell_matching = True
# NOTE(review): the cropping code further down uses bbox coordinates directly as
# page-image pixel coordinates, which presumably only holds at scale 1.0 — confirm
# before changing this value.
pipeline_options.images_scale = 1.0
# Page images are needed later: each page's `image.pil_image` is saved to disk.
pipeline_options.generate_page_images = True
# Picture images are needed later: pictures are exported through `item.get_image(...)`.
pipeline_options.generate_picture_images = True
pipeline_options.ocr_options = ocr_options

# Converter that applies the options above to PDF inputs.
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from docling.datamodel.document import ConversionResult

# Run the configured converter on the local file "manual.pdf"; the cells below
# read the pages and items from `conv_result.document`.
conv_result: ConversionResult = doc_converter.convert("manual.pdf") # previously `convert_single`




In [None]:
import cv2
import numpy as np

# def get_thickness(img):
#     i = 0
#     while img.any():
#         i += 1
#         img = cv2.erode(img, kernel=np.ones((3, 3)))
#     return i

def get_thickness(binary):
    """Estimate the typical stroke thickness in a binarized image.

    For every pixel row, the number of foreground pixels is divided by the
    number of separate foreground runs (strokes) in that row. The result is
    the median of that ratio over all rows that contain at least one stroke.

    Args:
        binary: 2-D array; values greater than 128 are treated as foreground.

    Returns:
        The median number of foreground pixels per stroke, or NaN when the
        image contains no foreground pixels at all.
    """
    mask = np.asarray(binary) > 128
    if mask.shape[1] == 0:
        return np.nan

    width = mask.sum(axis=1)
    # A stroke starts wherever a foreground pixel follows a background pixel...
    stroke_starts = mask[:, 1:] & ~mask[:, :-1]
    # ...or where the row already begins with foreground: such a stroke has no
    # preceding background pixel and would otherwise be missed.
    count = stroke_starts.sum(axis=1) + mask[:, 0]

    has_strokes = count > 0
    if not has_strokes.any():
        # Nothing to measure; avoid taking the median of an empty array.
        return np.nan
    return np.median(width[has_strokes] / count[has_strokes])


def get_font_style(image, min_char_area=10):
    """Estimate font size and stroke thickness of the text in a cropped image.

    The image is binarized (dark text becomes foreground), the external
    contours are boxed, and boxes whose area does not exceed ``min_char_area``
    are discarded as noise. The font size is the median height of the
    remaining boxes; the thickness comes from ``get_thickness``.

    Args:
        image: PIL image or array. Colour data is expected in RGB(A) channel
            order, which is what ``np.array`` yields for a PIL image;
            single-channel (2-D) input is used as-is.
        min_char_area: Minimum bounding-box area, in pixels, for a contour to
            be counted as a character.

    Returns:
        ``(fontsize, thickness)``; both are ``None`` when fewer than two
        character boxes were found.
    """
    pixels = np.array(image)
    if pixels.ndim == 2:
        # Already single-channel: nothing to convert.
        gray = pixels
    elif pixels.shape[2] == 4:
        gray = cv2.cvtColor(pixels, cv2.COLOR_RGBA2GRAY)
    else:
        # RGB order, not BGR: using the BGR code here would swap the red and
        # blue luminance weights and shift the threshold for coloured text.
        gray = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)

    # Inverted threshold: dark glyphs become 255, light background becomes 0.
    _, binary = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY_INV)
    # Work on a copy so `binary` stays untouched for the thickness estimate.
    contours, _ = cv2.findContours(binary.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # boundingRect returns (x, y, w, h); index 2 * index 3 is the box area.
    bounding_boxes = [cv2.boundingRect(contour) for contour in contours]
    heights = [bbox[3] for bbox in bounding_boxes if bbox[2] * bbox[3] > min_char_area]

    if len(heights) > 1:
        fontsize = np.quantile(heights, 0.5)
        thickness = get_thickness(binary)
    else:
        fontsize = None
        thickness = None
    return fontsize, thickness


In [None]:
# Label strings used by the cells below to classify document items (they are
# compared against `item.label`).
# NOTE(review): these presumably mirror the label values defined by docling —
# confirm against the installed docling version.
CAPTION = "caption"
FOOTNOTE = "footnote"
FORMULA = "formula"
LIST_ITEM = "list_item"
PAGE_FOOTER = "page_footer"
PAGE_HEADER = "page_header"
PICTURE = "picture"
SECTION_HEADER = "section_header"
TABLE = "table"
TEXT = "text"
TITLE = "title"
DOCUMENT_INDEX = "document_index"
CODE = "code"
CHECKBOX_SELECTED = "checkbox_selected"
CHECKBOX_UNSELECTED = "checkbox_unselected"
FORM = "form"
KEY_VALUE_REGION = "key_value_region"


In [None]:
from pathlib import Path
# import pandas as pd
from docling_core.types.doc.document import (
    TextItem, PictureItem, SectionHeaderItem, TableItem, ListItem)
from PIL import Image, ImageDraw


def draw_bbox(image_pth, bbox, title):
    """Outline ``bbox`` on the image file at ``image_pth`` and label it.

    The outline colour depends on ``title``: green for section headers and
    titles, red for tables and pictures, orange for captions, blue otherwise.
    The label text itself is always drawn in red. The image file is
    overwritten in place.

    The vertical bbox coordinates are flipped against the image height before
    drawing (``bbox.t`` / ``bbox.b`` are subtracted from it).
    """
    outline_by_label = {
        SECTION_HEADER: 'green',
        TITLE: 'green',
        TABLE: 'red',
        PICTURE: 'red',
        CAPTION: 'orange',
    }
    outline_color = outline_by_label.get(title, 'blue')

    page = Image.open(image_pth)
    canvas = ImageDraw.Draw(page)
    page_height = page.size[1]
    left, right = bbox.l, bbox.r
    top, bottom = page_height - bbox.t, page_height - bbox.b
    canvas.rectangle((left, top, right, bottom), outline=outline_color, width=2)
    canvas.text((left, top), title, fill="red")
    page.save(image_pth)


# Output folders: extracted pictures go to "images", rendered pages to "pages".
images_dir, pages_dir = Path("images"), Path("pages")
images_dir.mkdir(parents=True, exist_ok=True)
pages_dir.mkdir(parents=True, exist_ok=True)

# Write every page of the converted document to pages/<page_no>.png.
for page in conv_result.document.pages.values():
    page_no = page.page_no
    page_image_filename = pages_dir / f"{page_no}.png"
    with page_image_filename.open("wb") as fp:
        page.image.pil_image.save(fp, format="PNG")

picture_counter = 0
annotations = []
idx = 1
# Iterate the elements in reading order, including hierarchy level.
# Each kept element becomes one dict in `annotations` with an id, a
# centre/size bbox, its page number and type-specific content.
for item, level in conv_result.document.iterate_items():
    if item.label in [PAGE_FOOTER, PAGE_HEADER]:
        continue
    if not item.prov:
        # Without provenance there is no page / bbox to crop from.
        continue
    prov_item = item.prov[0]
    # draw_bbox(pages_dir / f"{prov_item.page_no}.png", prov_item.bbox, item.label.lower())
    image_pth = pages_dir / f"{prov_item.page_no}.png"
    bbox = prov_item.bbox
    # Close the page file as soon as the crop has been taken.
    with Image.open(image_pth) as image:
        # NOTE(review): flips the vertical axis against the image height, i.e.
        # assumes a bottom-left bbox origin and a 1:1 point-to-pixel mapping
        # (images_scale == 1.0) — confirm if either changes.
        x1, y1, x2, y2 = bbox.l, image.size[1] - bbox.t, bbox.r, image.size[1] - bbox.b
        item_image = image.crop((x1, y1, x2, y2))
    if not all(item_image.size):
        # Degenerate (zero-width or zero-height) crop: nothing to analyse.
        continue
    item_data = {
        'id': idx,
        'bbox': [(x1 + x2) / 2, (y1 + y2) / 2, x2 - x1, y2 - y1],
        'page': prov_item.page_no
    }
    if isinstance(item, (ListItem, TextItem)):
        # ListItem is a TextItem subclass, so both are handled here and told
        # apart below; testing TextItem first in a separate branch would
        # swallow list items and emit them as plain paragraphs.
        print(item.text)
        fontsize, thickness = get_font_style(item_image, min_char_area=10)
        if isinstance(item, ListItem):
            item_type = 'li'
        elif item.label.lower() in [SECTION_HEADER, TITLE]:
            item_type = 'h'
        else:
            item_type = 'p'
        content = {
            'text': item.text,
            'type': item_type,
            'fontsize': fontsize,
            'thickness': thickness
        }
    elif isinstance(item, TableItem):
        content = {
            'html': item.export_to_html(),
            'type': 'table'
        }
    elif isinstance(item, PictureItem):
        picture_image = item.get_image(conv_result.document)
        if picture_image is None:
            # No image data available for this picture: skip it.
            continue
        picture_counter += 1
        element_image_filename = images_dir / f"picture-{picture_counter}.png"
        with element_image_filename.open("wb") as fp:
            picture_image.save(fp, "PNG")
        content = {
            'src': str(element_image_filename),
            'type': 'img'
        }
    else:
        # Unsupported item type: skip it rather than reuse the previous
        # iteration's `content`.
        continue
    item_data.update(content)
    annotations.append(item_data)
    idx += 1


КриптоПро PDF
Estimated font size: 25.0
Estimated thickness: 5.459595959595959
ВЕРСИЯ 1.6
Estimated font size: 11.0
Estimated thickness: 2.6948051948051948
Руководство по эксплуатации
Estimated font size: 10.0
Estimated thickness: 2.475
АННОТАЦИЯ
Estimated font size: 10.0
Estimated thickness: 1.8666666666666667
Настоящий документ содержит описание процесса установки и инструкции по использованию модуля создания и проверки электронной подписи «КриптоПро PDF» в программах Adobe Reader и Adobe Acrobat.
Estimated font size: 5.0
Estimated thickness: 1.3333333333333333
«КриптоПро PDF» представляет собой средство создания и проверки электронной подписи  для  файлов  в  формате  PDF,  предназначенное  для  использования  совместно  со  средством криптографической защиты информации КриптоПро CSP.
Estimated font size: 6.0
Estimated thickness: 1.275
Информация о разработчике «КриптоПро PDF»:
Estimated font size: 6.0
Estimated thickness: 2.7122015915119366
ООО «Крипто-Про»
Estimated font size: 7.0

In [None]:
import json

# Persist the collected annotations to annotations.json in the working directory.
with open("annotations.json", mode="w") as annotations_file:
    json.dump(annotations, fp=annotations_file)


# Postprocessing

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from collections import defaultdict
import json


def header_clusters(headers_info):
    """Group headers with a similar font size and thickness, largest first.

    Every dict in ``headers_info`` receives a ``'cluster'`` key (the input is
    mutated). Headers whose ``fontsize`` or ``thickness`` is missing, ``None``
    or not finite cannot be clustered; they keep the label ``-1`` and are left
    out of the result, exactly like headers DBSCAN marks as noise.

    Args:
        headers_info: list of dicts with ``'fontsize'`` and ``'thickness'``
            entries (other keys are carried through untouched).

    Returns:
        A list of dicts with ``'cluster_id'``, ``'mean_fontsize'``,
        ``'mean_thickness'`` and ``'items'``, sorted by mean font size and then
        mean thickness in descending order. Empty when nothing could be
        clustered.
    """
    def _is_finite_number(value):
        try:
            return value is not None and bool(np.isfinite(float(value)))
        except (TypeError, ValueError):
            return False

    # Start everything as "unclustered" (-1) and keep only headers whose
    # features can actually be fed to the scaler.
    usable = []
    for item in headers_info:
        item['cluster'] = -1
        if _is_finite_number(item.get('fontsize')) and _is_finite_number(item.get('thickness')):
            usable.append(item)
    if not usable:
        # Nothing to cluster; scaling an empty array would raise.
        return []

    # Extract features
    data = np.array([[item['fontsize'], item['thickness']] for item in usable], dtype=float)

    # Standardize the data so both features contribute on the same scale
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data)

    # DBSCAN
    dbscan = DBSCAN(eps=0.5, min_samples=2)
    labels = dbscan.fit_predict(data_scaled)

    # Assign cluster labels to headers (plain ints rather than numpy scalars)
    for item, label in zip(usable, labels):
        item['cluster'] = int(label)

    # Group headers by clusters, ignoring the unclustered (-1) ones
    clusters = defaultdict(list)
    for item in usable:
        if item['cluster'] != -1:
            clusters[item['cluster']].append(item)

    # Compute mean feature values per cluster
    cluster_stats = []
    for cluster_id, items in clusters.items():
        cluster_stats.append({
            'cluster_id': cluster_id,
            'mean_fontsize': np.mean([item['fontsize'] for item in items]),
            'mean_thickness': np.mean([item['thickness'] for item in items]),
            'items': items
        })

    # Sort clusters based on mean fontsize and thickness, biggest first
    return sorted(
        cluster_stats,
        key=lambda x: (x['mean_fontsize'], x['mean_thickness']),
        reverse=True
    )


def convert_annotations_to_markdown(annotations):
    """Render a list of annotation dicts as Markdown text.

    Supported ``'type'`` values: ``'h'`` (header), ``'p'`` (paragraph),
    ``'li'`` (list item), ``'img'`` (image link) and ``'table'`` (raw HTML).
    Items of any other type contribute only a blank line.

    Header levels are derived by clustering the headers on font size and
    thickness: the cluster with the largest mean font size becomes level 1,
    the next one level 2, and so on, capped at 6 (the deepest Markdown
    heading). Headers that cannot be clustered — no usable font size or
    thickness, or left unclustered by ``header_clusters`` — default to level 1.

    Args:
        annotations: list of dicts as produced by the annotation step; headers
            need an ``'id'`` and, to be clustered, numeric ``'fontsize'`` and
            ``'thickness'`` values.

    Returns:
        The Markdown text, with one blank line after every item.
    """
    max_markdown_level = 6
    items = annotations

    def _is_finite_number(value):
        try:
            return value is not None and bool(np.isfinite(float(value)))
        except (TypeError, ValueError):
            return False

    # Collect font sizes and thicknesses of headers. Headers without usable
    # values (e.g. None for single-glyph headers) are left out because they
    # cannot be scaled or clustered.
    headers_info = [
        {
            'id': item['id'],
            'fontsize': item['fontsize'],
            'thickness': item['thickness'],
        }
        for item in items
        if item.get('type') == 'h'
        and _is_finite_number(item.get('fontsize'))
        and _is_finite_number(item.get('thickness'))
    ]

    # Headers clustering (skipped when there is nothing to cluster)
    cluster_stats_sorted = header_clusters(headers_info) if headers_info else []

    # Map clusters order to MD level
    header_level_map = {}
    for level, stat in enumerate(cluster_stats_sorted, start=1):
        for clustered in stat['items']:
            header_level_map[clustered['id']] = min(level, max_markdown_level)

    # Build the Markdown text
    markdown_lines = []
    for item in items:
        item_type = item.get('type')
        if item_type == 'h':
            prefix = '#' * header_level_map.get(item['id'], 1)  # Markdown header prefix
            markdown_lines.append(f"{prefix} {item.get('text', '')}")
        elif item_type == 'p':
            markdown_lines.append(item.get('text', ''))
        elif item_type == 'li':
            markdown_lines.append(f"- {item.get('text', '')}")
        elif item_type == 'img':
            markdown_lines.append(f"![Image]({item.get('src', '')})")
        elif item_type == 'table':
            # Include HTML directly
            markdown_lines.append(item.get('html', ''))
        # Unrecognized types add no content of their own.

        # Add empty line after each item for Markdown formatting
        markdown_lines.append('')

    # Join all lines into the final Markdown text
    return '\n'.join(markdown_lines)


# Read the saved annotations back from disk and render them as Markdown.
with open('annotations.json') as f:
    annotations = json.load(f)

markdown_text = convert_annotations_to_markdown(annotations)

# print(markdown_text)


In [None]:
# Write the Markdown as UTF-8 explicitly: the text contains Cyrillic, and the
# platform default encoding is not guaranteed to be able to represent it.
with open('./manual_annotations.md', 'w', encoding='utf-8') as f:
    f.write(markdown_text)

In [None]:
# with open('./result.md', 'w', encoding='utf-16') as f:
#     f.write(conv_result.document.export_to_markdown())


In [None]:
# import shutil

# shutil.rmtree(pages_dir)


In [None]:
# %%time
# !docling tuo.pdf --to md

Fetching 9 files:   0% 0/9 [00:00<?, ?it/s]Fetching 9 files: 100% 9/9 [00:00<00:00, 107240.73it/s]
CPU times: user 403 ms, sys: 48.5 ms, total: 452 ms
Wall time: 1min 10s


## LibreOffice

In [None]:
# !apt-get install libreoffice

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  apparmor default-jre default-jre-headless dictionaries-common firebird3.0-common
  firebird3.0-common-doc firebird3.0-server-core firebird3.0-utils fonts-crosextra-caladea
  fonts-crosextra-carlito fonts-dejavu fonts-dejavu-core fonts-dejavu-extra fonts-liberation2
  fonts-linuxlibertine fonts-noto-core fonts-noto-extra fonts-noto-mono fonts-noto-ui-core
  fonts-opensymbol fonts-sil-gentium fonts-sil-gentium-basic gstreamer1.0-gl gstreamer1.0-gtk3
  hunspell-en-us libabsl20210324 libabw-0.1-1 libatk-wrapper-java libatk-wrapper-java-jni
  libbsh-java libcdr-0.1-1 libclucene-contribs1v5 libclucene-core1v5 libcolamd2 libe-book-0.1-1
  libel-api-java libeot0 libepubgen-0.1-1 libetonyek-0.1-1 libexttextcat-2.0-0 libexttextcat-data
  libfbclient2 libfontenc1 libfreehand-0.1-1 libgpgme11 libgpgmepp6 libgraphene-1.0-0
  libgstreamer-gl1.0-0 lib

In [None]:
# !soffice --headless --convert-to html:HTML st6.docx

convert /content/st6.docx -> /content/st6.html using filter : HTML
