In [2]:
from __future__ import annotations

from pathlib import Path
from typing import Iterable
from IPython.display import Markdown, display
from unstructured.partition.pdf import partition_pdf

In [3]:
def extract_markdown_from_pdf_unstructured(
    pdf_path: str | Path,
    *,
    strategy: str = "auto",   # auto | fast | hi_res | ocr_only
) -> str:
    """
    Extract text from a PDF using Unstructured and return Markdown-formatted text.
    Suitable for chunking & embedding (RAG).

    Every element that carries non-blank text is mapped by its ``category``:

    * ``"Title"``    -> ``# <text>`` (always a level-1 heading)
    * ``"Table"``    -> the element's plain text inside a fenced ``text`` code block
    * ``"ListItem"`` -> ``- <text>``
    * anything else  -> the text unchanged

    The resulting blocks are separated by exactly one blank line.

    Args:
        pdf_path: Location of the PDF file (string or ``Path``).
        strategy: Forwarded unchanged to ``partition_pdf``.

    Returns:
        The Markdown document as one stripped string; ``""`` when no element
        has any text.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not point to an existing file.
    """
    pdf_path = Path(pdf_path)
    # Fail early with a clear message instead of an error raised from deep
    # inside the partitioning library.
    if not pdf_path.is_file():
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    elements = partition_pdf(
        # partition_pdf types `filename` as a string, so do not hand it a Path.
        filename=str(pdf_path),
        strategy=strategy,
        infer_table_structure=True,
        extract_images=False,
    )

    blocks: list[str] = []

    for element in elements:
        # Elements without text (or with whitespace only) contribute nothing.
        text = (getattr(element, "text", None) or "").strip()
        if not text:
            continue

        category = getattr(element, "category", None)

        # ---------- Titles ----------
        if category == "Title":
            blocks.append(f"# {text}")

        # ---------- Tables ----------
        elif category == "Table":
            # Emit the fence as ONE block: appending the opening fence, the
            # body and the closing fence as separate items would make the
            # final join insert blank lines inside the fence.
            blocks.append(f"```text\n{text}\n```")

        # ---------- Lists ----------
        elif category == "ListItem":
            blocks.append(f"- {text}")

        # ---------- Normal paragraphs ----------
        else:
            blocks.append(text)

    # One blank line between blocks; no leading/trailing whitespace.
    return "\n\n".join(blocks).strip()


In [10]:
# Example usage: run the extractor on one sample PDF with the default strategy ("auto").
# The path is relative, so it is resolved against the kernel's current working directory.
pdf_file = r"knowledge-base/raw/English Certificate - Bangkit.pdf"
text = extract_markdown_from_pdf_unstructured(pdf_file)



In [11]:
# Bare expression: the cell output is the repr of the first 500 characters,
# so paragraph breaks show up as literal \n\n escapes.
text[:500]

'TBI * 8 bangEit The British Institute British Institute Google traveloka速 TBI-DAGO/CORP/2490\n\n* 8\n\nTBI\n\nThe British Institute British Institute TBI-DAGO/CORP/2490\n\nThis is to certify that Kharisma Rizki Wijanarko\n\nThis is to certify that\n\n# Kharisma Rizki Wijanarko\n\nhas completed a short course of 4.5 hours entitled English for Business Communication\n\nhas completed a short course of 4.5 hours entitled\n\n# English for Business Communication\n\nand achieved an overall score of 92% 18 January 2024\n\nan'

In [12]:
# Render the same 500-character prefix as Markdown; the fixed-length slice
# can cut the last element off mid-word.
display(Markdown(text[:500]))

TBI * 8 bangEit The British Institute British Institute Google traveloka速 TBI-DAGO/CORP/2490

* 8

TBI

The British Institute British Institute TBI-DAGO/CORP/2490

This is to certify that Kharisma Rizki Wijanarko

This is to certify that

# Kharisma Rizki Wijanarko

has completed a short course of 4.5 hours entitled English for Business Communication

has completed a short course of 4.5 hours entitled

# English for Business Communication

and achieved an overall score of 92% 18 January 2024

an

In [13]:
def merge_label_value(lines: list[str], *, max_label_len: int = 40) -> list[str]:
    """
    Join a short "label" line with the ":"-prefixed "value" line that follows it.

    A line is treated as a label when its stripped length is below
    ``max_label_len`` and the next line, ignoring surrounding whitespace,
    starts with ``":"`` or ``"- :"``. The two are concatenated without a
    separator (``"Name"`` + ``": John"`` -> ``"Name: John"``); a leading
    ``"- "`` marker on the value is dropped first. All other lines are
    returned stripped and otherwise unchanged.

    Args:
        lines: Lines to process, in document order.
        max_label_len: Exclusive upper bound on the stripped label length.

    Returns:
        A new list with label/value pairs merged.
    """
    merged: list[str] = []
    i = 0

    while i < len(lines):
        line = lines[i].strip()

        if i + 1 < len(lines) and len(line) < max_label_len:
            value = lines[i + 1].strip()
            if value.startswith("- :"):
                # Remove only the leading list marker. A global
                # str.replace("- :", ":") would also rewrite any "- :"
                # that happens to occur inside the value itself.
                value = value[2:]
            if value.startswith(":"):
                merged.append(f"{line}{value}")
                i += 2
                continue

        merged.append(line)
        i += 1

    return merged

def dedupe_consecutive(lines: list[str]) -> list[str]:
    """Collapse each run of identical adjacent lines into a single line.

    Only neighbours are compared, so a line that reappears later, after a
    different line, is kept.
    """
    result = []
    for current in lines:
        # The last kept line is always equal to the previous input line.
        if result and result[-1] == current:
            continue
        result.append(current)
    return result

def clean_lines(md: str) -> list[str]:
    """Split ``md`` into lines, strip each one, and drop lines that end up empty."""
    stripped = (raw.strip() for raw in md.splitlines())
    return [entry for entry in stripped if entry]

def normalize_markdown(md: str) -> str:
    """
    Normalize extracted Markdown text.

    Runs three steps in order: ``clean_lines`` (strip lines, drop blank
    ones), ``dedupe_consecutive`` (collapse adjacent identical lines) and
    ``merge_label_value`` (join label lines with their ":" value lines).
    The surviving lines are joined with one blank line between them.

    Args:
        md: Markdown text to normalize.

    Returns:
        The normalized text as a single string.
    """
    lines = clean_lines(md)
    lines = dedupe_consecutive(lines)
    lines = merge_label_value(lines)
    return "\n\n".join(lines)

In [16]:
# Run the clean -> dedupe -> merge pipeline on the raw extraction and print the full result.
normalized_text = normalize_markdown(text)
print(normalized_text)

TBI * 8 bangEit The British Institute British Institute Google traveloka速 TBI-DAGO/CORP/2490

* 8

TBI

The British Institute British Institute TBI-DAGO/CORP/2490

This is to certify that Kharisma Rizki Wijanarko

This is to certify that

# Kharisma Rizki Wijanarko

has completed a short course of 4.5 hours entitled English for Business Communication

has completed a short course of 4.5 hours entitled

# English for Business Communication

and achieved an overall score of 92% 18 January 2024

and achieved an overall score of 92%

18 January 2024


In [15]:
# Render the first 500 characters of the normalized text as Markdown.
# NOTE(review): this cell's execution count (15) is lower than the cell above it (16),
# so the cells were run out of order — re-run the notebook top to bottom before sharing.
display(Markdown(normalized_text[:500]))

TBI * 8 bangEit The British Institute British Institute Google traveloka速 TBI-DAGO/CORP/2490

* 8

TBI

The British Institute British Institute TBI-DAGO/CORP/2490

This is to certify that Kharisma Rizki Wijanarko

This is to certify that

# Kharisma Rizki Wijanarko

has completed a short course of 4.5 hours entitled English for Business Communication

has completed a short course of 4.5 hours entitled

# English for Business Communication

and achieved an overall score of 92% 18 January 2024

an