In [1]:
# Cell 1: Imports
import json
from pathlib import Path
import random

In [3]:
# Cell 2: Configs
SPAN_LOG_FILE = Path("logs/spans.jsonl")
TRAIN_OUTPUT_FILE = Path("sample_dataset/train/train.jsonl")
LABELS = ["H1", "H2", "NONE"]

In [5]:
# Cell 3: Load all spans
with open(SPAN_LOG_FILE, "r", encoding="utf-8") as f:
    all_spans = [json.loads(line) for line in f]

# Remove duplicates
seen = set()
unique_spans = []
for span in all_spans:
    key = (span["text"], span["page"])
    if key not in seen:
        seen.add(key)
        unique_spans.append(span)

# Shuffle for random labeling
random.shuffle(unique_spans)
print(f"Loaded {len(unique_spans)} unique spans for labeling.")

Loaded 409 unique spans for labeling.


In [7]:
# Cell 4: Define interactive labeling loop
from IPython.display import display, HTML
from ipywidgets import interact_manual, Dropdown

idx = 0
labeled_data = []

def label_span(label):
    global idx
    if idx >= len(unique_spans):
        print("✅ All spans labeled.")
        return
    span = unique_spans[idx]
    span["label"] = label
    labeled_data.append(span)
    idx += 1

@interact_manual(label=Dropdown(options=LABELS))
def start_labeling(label="NONE"):
    span = unique_spans[idx]
    display(HTML(f"""
        <h4>PDF Page: {span['page']}</h4>
        <pre style="font-size: 16px; background:#f4f4f4; padding:10px;">{span['text']}</pre>
        <p><b>Font Size:</b> {span['font_size']} | 
           <b>Bold:</b> {span['is_bold']} | 
           <b>All Caps:</b> {span['is_all_caps']} | 
           <b>Centered:</b> {span['x_centered']:.2f}
        </p>
    """))
    label_span(label)

interactive(children=(Dropdown(description='label', options=('H1', 'H2', 'NONE'), value='H1'), Button(descript…

In [9]:
# Cell 5: Save labeled data to JSONL
def save_labeled():
    TRAIN_OUTPUT_FILE.parent.mkdir(exist_ok=True, parents=True)
    with open(TRAIN_OUTPUT_FILE, "w", encoding="utf-8") as f:
        for row in labeled_data:
            json.dump(row, f)
            f.write("\n")
    print(f"✅ Saved {len(labeled_data)} labeled spans to: {TRAIN_OUTPUT_FILE}")

save_labeled()

✅ Saved 0 labeled spans to: sample_dataset\train\train.jsonl
