Install dependencies

In [None]:
!python -m spacy download fr_core_news_md
!pip install spacy spacy-layout

Test the model on a single PDF

In [None]:
import spacy
from spacy_layout import spaCyLayout

# ===========================
# 1. Paths and models
# ===========================
model_path = "./model"  # directory holding the text classification pipeline
pdf_path = "t.pdf"      # PDF document to analyse

# French pipeline handed to spaCyLayout, which is used below to parse the PDF.
nlp_layout = spacy.load("fr_core_news_md")
layout = spaCyLayout(nlp_layout)

# Text classification pipeline loaded from `model_path`.
nlp_classifier = spacy.load(model_path)

# ===========================
# 2. Process PDF
# ===========================
# Calling the layout object on a path returns a Doc whose "layout" span group
# is read in the next step.
doc = layout(pdf_path)

# ===========================
# 3. Extract items from layout spans
# ===========================
# Each layout span becomes a {"label", "text"} record: the label is lowercased,
# the text is stripped, and spans whose text is empty after stripping are skipped.
layout_spans = doc.spans.get("layout", [])
labelled_texts = (
    (layout_span.label_.lower(), layout_span.text.strip())
    for layout_span in layout_spans
)
items = [
    {"label": kind, "text": stripped}
    for kind, stripped in labelled_texts
    if stripped
]

# ===========================
# 4. Group headers + content
# ===========================
# Labels treated as headings and as body text. Items carrying any other label
# are ignored by the grouping loop.
HEADER_LABELS = ("section_header", "title", "heading", "section-header", "header")
CONTENT_LABELS = ("paragraph", "body", "text", "list_item", "section")


def _store_section(sections, header_parts, content_parts):
    """Record one header/content block in *sections*.

    The key is the header parts joined with newlines and the value is the
    content parts joined with newlines. If the same header text was already
    stored (for example a heading repeated later in the document), the new
    content is appended to the existing entry instead of overwriting it, so
    no text is lost.
    """
    header = "\n".join(header_parts)
    body = "\n".join(content_parts)
    if header in sections:
        sections[header] = sections[header] + "\n" + body
    else:
        sections[header] = body


structured_doc = {}
current_header_parts = []
current_content = []

for item in items:
    label = item["label"]
    text = item["text"]

    if label in HEADER_LABELS:
        if current_content:
            # A header that follows body text closes the previous block and
            # starts a new one.
            _store_section(structured_doc, current_header_parts, current_content)
            current_content = []
            current_header_parts = [text]
        else:
            # Consecutive headers with no body text in between are merged
            # into a single multi-line header.
            current_header_parts.append(text)

    elif label in CONTENT_LABELS:
        if not current_header_parts:
            # Body text that appears before any header gets a default heading.
            current_header_parts = ["Introduction"]
        current_content.append(text)

# Save the last block. Trailing headers with no body text are dropped.
if current_header_parts and current_content:
    _store_section(structured_doc, current_header_parts, current_content)

# ===========================
# 5. Classify each block
# ===========================
# Every section is classified on "<header>\n<content>"; the category with the
# highest score in `cats` is kept as the prediction.
results = []

for section_title, section_body in structured_doc.items():
    block_text = f"{section_title}\n{section_body}"
    scores = nlp_classifier(block_text).cats
    best_category = max(scores, key=lambda name: scores[name])
    results.append(
        dict(merged_text=block_text, predicted_category=best_category)
    )

# ===========================
# 6. Print results
# ===========================
# One entry per classified block, each followed by a 40-dash divider.
divider = "-" * 40
for entry in results:
    block_text = entry["merged_text"]
    category = entry["predicted_category"]
    print("üìå Merged Text:\n", block_text)
    print("Predicted Category:", category)
    print(divider)
