In [None]:
from pandas import read_csv
from pathlib import Path
from huggingface_hub import hf_hub_download
from ultralytics import YOLO
from pdf2image import convert_from_path
from PIL import Image
from re import split
from collections import defaultdict
from numpy import array, asarray
from transformers import LayoutLMv3ForTokenClassification
import sys
sys.path.append('../../layoutreader')
from v3.helpers import boxes2inputs, prepare_inputs, parse_logits
from torch import no_grad
import pandas as pd
import pymupdf

In [None]:
# Root folder for everything this notebook reads and writes.
output_dir = Path("../../data/pdf_extraction_data")

# Locations derived from the root. `annotation_dir` and `metadata_path` are
# only referenced here; they are not created by this cell.
annotation_dir = output_dir / "annotation"
metadata_path = output_dir / "metadata.csv"
text_dir = output_dir / "txt"
img_dir = output_dir / "img"

# Create the text and image folders if they are missing.
for _out_dir in (text_dir, img_dir):
    _out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
def group_words(regions, words, margin=10):
    """
    Assign each word box to the smallest region that contains its centre.

    Parameters
    ----------
    regions : list of dict
        Each dict must provide ``'bbox'`` (x0, y0, x1, y1) and ``'label'``.
        The dicts are annotated *in place* with ``'id'`` (their position in
        ``regions``) and ``'area'``.
    words : iterable of sequences
        The first four items of every word are x0, y0, x1, y1; the word is
        otherwise passed through untouched.
    margin : number, default 10
        Tolerance added on every side of a region before testing whether a
        word centre lies inside it (same units as the boxes).

    Returns
    -------
    (result, unassigned)
        ``result`` has one dict per region, in the original region order, with
        keys ``region_id``, ``label``, ``bbox`` and ``words`` (ordered by
        ``sort_by_line``). ``unassigned`` lists the words whose centre fell in
        no region.
    """
    # Tag every region with its index and its area.
    for idx, region in enumerate(regions):
        region['id'] = idx
        x0, y0, x1, y1 = region['bbox']
        region['area'] = (x1 - x0) * (y1 - y0)

    # Smallest regions are tried first so nested regions win over their
    # parents. The margin-expanded bounds are computed once, not per word.
    candidates = []
    for region in sorted(regions, key=lambda r: r['area']):
        r_x0, r_y0, r_x1, r_y1 = region['bbox']
        candidates.append(
            (region['id'], r_x0 - margin, r_y0 - margin, r_x1 + margin, r_y1 + margin)
        )

    grouped_words = defaultdict(list)
    unassigned = []

    for word in words:
        x0, y0, x1, y1 = word[0:4]
        center_x, center_y = (x0 + x1) / 2, (y0 + y1) / 2

        for region_id, lo_x, lo_y, hi_x, hi_y in candidates:
            if lo_x <= center_x <= hi_x and lo_y <= center_y <= hi_y:
                grouped_words[region_id].append(word)
                break
        else:
            # No region claimed this word.
            unassigned.append(word)

    result = []
    for region in regions:
        words_in_region = grouped_words.get(region['id'], [])
        # Pre-sort by y0 only. sort_by_line then orders by band and x0 with
        # stable sorts, so this pre-sort decides ties between words that share
        # the same band and the same x0.
        words_sorted = sorted(words_in_region, key=lambda w: w[1])
        words_sorted = sort_by_line(words_sorted)
        result.append({
            "region_id": region['id'],
            "label": region['label'],
            "bbox": region['bbox'],
            "words": words_sorted
        })

    return result, unassigned

def sort_by_line(data, band_height=10):
    """
    Order boxes top-to-bottom by horizontal band, left-to-right within a band.

    Parameters
    ----------
    data : iterable of sequences
        Item ``[0]`` is x0 and item ``[1]`` is y0; the rest is carried along.
    band_height : number, default 10
        Height of one band. Items whose ``int(y0) // band_height`` is equal
        are treated as belonging to the same line.

    Returns
    -------
    list
        The items of ``data`` in reading order. Both sorts are stable, so
        items with the same band and the same x0 keep their input order.

    Raises
    ------
    ValueError
        If ``band_height`` is not strictly positive.
    """
    if band_height <= 0:
        raise ValueError("band_height must be positive")

    buckets = defaultdict(list)
    for item in data:
        buckets[int(item[1]) // band_height].append(item)

    flat = []
    for bucket in sorted(buckets):                                # top-to-bottom
        flat.extend(sorted(buckets[bucket], key=lambda t: t[0]))  # left-to-right
    return flat

def rescale_bboxes(bboxes_xyxy, src_size, dst_size):
    """
    Map axis-aligned boxes from a source grid onto a destination grid.

    Every x coordinate is multiplied by ``dst_size[0] / src_size[0]`` and
    every y coordinate by ``dst_size[1] / src_size[1]``.

    Parameters
    ----------
    bboxes_xyxy : array-like, shape (N, 4) or (4,)
        Boxes as x0, y0, x1, y1 in the source space.
    src_size : (w, h) tuple
        Extent of the source space.
    dst_size : (w, h) tuple
        Extent of the destination space.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as the input, in destination units.
    """
    ratio_x = dst_size[0] / src_size[0]
    ratio_y = dst_size[1] / src_size[1]
    # One factor per coordinate: x0, y0, x1, y1.
    factors = array((ratio_x, ratio_y) * 2)
    return asarray(bboxes_xyxy, dtype=float) * factors

In [None]:
# Local path of the YOLO checkpoint file pulled from the Hugging Face Hub;
# downloads are kept under the project-level ../../.hf_cache folder.
weight_path = hf_hub_download(
    repo_id="hantian/yolo-doclaynet",
    filename="yolov12s-doclaynet.pt",
    cache_dir="../../.hf_cache",
)

In [None]:
def _needs_text(value):
    """Return True when a metadata ``text`` cell is missing (NaN/None) or falsy."""
    # Empty CSV cells are read back as NaN, and NaN is truthy, so a plain
    # `not value` check would never select them.
    return bool(pd.isna(value)) or not value


def _to_1000(value, extent):
    """Express ``value`` on a 0-1000 integer grid relative to ``extent``."""
    return int(1000 * value / extent)


def _read_annotation_words(annotation_path):
    """Parse one annotation file into ``[x0, y0, x1, y1, token]`` lists.

    Every non-blank line is split on whitespace: field 0 is the token and
    fields 1-4 are integer box coordinates (per the original author's note,
    already on a 0-1000 grid). Lines whose token is ``##LTLine##`` are skipped.
    """
    words = []
    for line in annotation_path.read_text(encoding="utf-8").splitlines():
        if not line.strip():
            # A blank line has no coordinates to unpack.
            continue
        parts = split(r"\s+", line.rstrip())
        if parts[0] != "##LTLine##":
            x0, y0, x1, y1 = map(int, parts[1:5])
            words.append([x0, y0, x1, y1, parts[0]])
    return words


# NOTE(review): read_csv may parse purely numeric arxiv ids as floats - confirm
# the ids survive the round trip through metadata.csv.
df = pd.read_csv(metadata_path)
# Paths are written into this column below; keep it object-typed so storing a
# string in an all-NaN (float) or boolean column is not an implicit upcast.
df["text"] = df["text"].astype(object)

new_pdf = False
total_unassigned = 0
total_tokens = 0

pending_mask = df["text"].map(_needs_text).astype(bool)

if pending_mask.any():
    # Load both models once instead of once per document.
    layout_model = YOLO(weight_path)
    reader_model = LayoutLMv3ForTokenClassification.from_pretrained("hantian/layoutreader").to("cuda")

for _, row in df[pending_mask].iterrows():
    annotation_file = row["annotation"]
    pdf_file = row["pdf_path"]
    arxiv_id = row["arxiv_id"]

    # Page size of the first page; the document handle is closed right after.
    with pymupdf.open(pdf_file) as doc:
        mediabox = doc[0].mediabox
        w, h = mediabox[-2], mediabox[-1]

    # Only the first page is analysed, so only that page is rasterised.
    first_page_image = convert_from_path(pdf_file, dpi=300, first_page=1, last_page=1)[0]
    preds = layout_model(first_page_image)[0]

    # Keep a visual record of the detected layout (plot() output has its
    # channel axis reversed before being handed to PIL).
    annotated = preds.plot()
    Image.fromarray(annotated[..., ::-1]).save(img_dir / f"{Path(pdf_file).stem}.png")

    # orig_shape is (height, width) while rescale_bboxes expects (w, h) sizes.
    img_h, img_w = preds.orig_shape
    regions = []
    for xyxy, cls in zip(preds.boxes.xyxy, preds.boxes.cls):
        # Image pixels -> page units -> 0-1000 grid used by the annotations.
        x0, y0, x1, y1 = rescale_bboxes(xyxy.tolist(), (img_w, img_h), (w, h))
        regions.append({
            "label": preds.names[int(cls)],
            "bbox": [_to_1000(x0, w), _to_1000(y0, h), _to_1000(x1, w), _to_1000(y1, h)],
        })

    words = _read_annotation_words(annotation_dir / Path(annotation_file))
    total_tokens += len(words)

    grouped_data, unassigned = group_words(regions, words)

    # One box + text per non-empty region, then one per stray word.
    boxes = []
    tokens = []
    for g in grouped_data:
        if g["words"]:
            boxes.append((
                min(item[0] for item in g["words"]),
                min(item[1] for item in g["words"]),
                max(item[2] for item in g["words"]),
                max(item[3] for item in g["words"]),
            ))
            tokens.append(" ".join([item[4] for item in g["words"]]))
    for u in unassigned:
        boxes.append(tuple(u[:4]))
        tokens.append(u[4])
    total_unassigned += len(unassigned)

    inputs = boxes2inputs(boxes)
    inputs = prepare_inputs(inputs, reader_model)     # pads/truncates for the model

    with no_grad():
        logits = reader_model(**inputs).logits.squeeze(0)   # (n_boxes, n_classes)

    order = parse_logits(logits, len(boxes))         # e.g. [3,0,1,2,...]

    # Rearrange the text with the predicted reading order.
    ordered_text = " ".join(tokens[j].strip() for j in order)

    text_file_path = text_dir / Path(f"{arxiv_id}.txt")
    with open(text_file_path, "w", encoding="utf-8") as text_file:
        text_file.write(ordered_text)

    df.loc[df["arxiv_id"] == arxiv_id, "text"] = str(text_file_path)

    new_pdf = True

if new_pdf:
    df.to_csv(metadata_path, index=False)
    if total_tokens > 0:
        print(f"{round(total_unassigned/total_tokens * 100, 2)}% tokens unassigned")
else:
    print('Files already exist')