In [1]:
from __future__ import annotations

from pathlib import Path
from typing import Iterable
from IPython.display import Markdown, display
from unstructured.partition.pdf import partition_pdf
import chromadb
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.manifold import TSNE

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def extract_markdown_from_pdf_unstructured(
    pdf_path: str | Path,
    *,
    strategy: str = "auto",   # auto | fast | hi_res | ocr_only
) -> str:
    """
    Extract text from a PDF using Unstructured and return Markdown-formatted text.
    Suitable for chunking & embedding (RAG).

    Parameters
    ----------
    pdf_path:
        Location of the PDF, given as a string or ``Path``.
    strategy:
        Forwarded unchanged to ``partition_pdf``.

    Returns
    -------
    str
        One Markdown block per element that has non-blank text, with blocks
        separated by a single blank line:

        * ``Title``    -> ``# <text>``
        * ``Table``    -> the text wrapped in a fenced ``text`` code block
        * ``ListItem`` -> ``- <text>``
        * anything else -> the text as a plain paragraph

        Returns an empty string when no element carries text.
    """
    elements = partition_pdf(
        # Hand the library a plain string path regardless of what the caller passed.
        filename=str(Path(pdf_path)),
        strategy=strategy,
        infer_table_structure=True,
        extract_images=False,
    )

    blocks: list[str] = []

    for element in elements:
        # Skip elements whose text is missing, empty or whitespace-only.
        text = (getattr(element, "text", None) or "").strip()
        if not text:
            continue

        # Read defensively, the same way `text` is read; an element without a
        # category falls through to the plain-paragraph branch.
        category = getattr(element, "category", None)

        # ---------- Titles ----------
        if category == "Title":
            blocks.append(f"# {text}")

        # ---------- Tables ----------
        elif category == "Table":
            # Emit the fence as ONE block. Appending the opening fence, the text
            # and the closing fence separately would let the "\n\n" join below
            # insert blank lines inside the fence.
            blocks.append(f"```text\n{text}\n```")

        # ---------- Lists ----------
        elif category == "ListItem":
            blocks.append(f"- {text}")

        # ---------- Normal paragraphs ----------
        else:
            blocks.append(text)

    # Exactly one blank line between blocks.
    return "\n\n".join(blocks).strip()


In [None]:
# Example usage: extract one PDF into Markdown-style text.
# The path is relative, so it is resolved against the kernel's working directory.
pdf_file = r"knowledge-base/raw/English Certificate - Bangkit.pdf"
# `text` holds the raw extraction result that the following cells preview and normalize.
text = extract_markdown_from_pdf_unstructured(pdf_file)

In [None]:
# Peek at the first 500 characters of the extracted string (raw repr, newlines escaped).
text[:500]

In [None]:
# Render the first 500 characters of the extraction as Markdown.
display(Markdown(text[:500]))

In [None]:
def merge_label_value(lines: list[str], *, max_label_len: int = 40) -> list[str]:
    """
    Join short "label" lines with the value line that follows them.

    A line counts as a label when, after stripping, it is shorter than
    ``max_label_len`` characters and the next line (ignoring leading
    whitespace) starts with ``":"`` or ``"- :"``. The pair is emitted as a
    single line, e.g. ``["Name", ": Alice"]`` -> ``["Name: Alice"]`` and
    ``["Score", "- : 10"]`` -> ``["Score: 10"]``.

    Parameters
    ----------
    lines:
        Input lines; each one is stripped before being emitted.
    max_label_len:
        Exclusive upper bound on the stripped label length (default 40).

    Returns
    -------
    list[str]
        A new list with label/value pairs merged and all other lines stripped.
    """
    bullet_marker = "- :"
    merged: list[str] = []
    i = 0

    while i < len(lines):
        line = lines[i].strip()
        # Empty string when there is no next line, so the prefix test fails.
        following = lines[i + 1].strip() if i + 1 < len(lines) else ""

        if len(line) < max_label_len and following.startswith((":", bullet_marker)):
            if following.startswith(bullet_marker):
                # Drop only the LEADING "- " bullet. A blanket
                # str.replace("- :", ":") would also rewrite any later "- :"
                # that is part of the value itself.
                following = following[2:]
            merged.append(f"{line}{following}")
            i += 2  # the value line has been consumed
        else:
            merged.append(line)
            i += 1

    return merged

def dedupe_consecutive(lines: list[str]) -> list[str]:
    """
    Collapse runs of identical neighbouring lines into a single line.

    Only adjacent repeats are removed; a line that reappears later, after a
    different line, is kept. Returns a new list.
    """
    items = list(lines)
    # Pair every item with its predecessor; the first item is compared to None.
    predecessors = [None] + items[:-1]
    return [
        current
        for previous, current in zip(predecessors, items)
        if current != previous
    ]

def clean_lines(md: str) -> list[str]:
    """
    Split ``md`` into lines, strip surrounding whitespace from each one and
    discard the lines that end up empty.
    """
    stripped = (raw.strip() for raw in md.splitlines())
    return [entry for entry in stripped if entry]

def normalize_markdown(md: str) -> str:
    """
    Normalize extracted Markdown text for downstream use.

    Steps, in order:

    1. ``clean_lines``        - split into stripped, non-empty lines
    2. ``dedupe_consecutive`` - drop adjacent duplicate lines
    3. ``merge_label_value``  - join label lines with their ``":"`` value line

    Returns the surviving lines joined with a blank line between each.
    """
    # Defined exactly once: a second identical definition in the same cell
    # would silently shadow this one.
    lines = clean_lines(md)
    lines = dedupe_consecutive(lines)
    lines = merge_label_value(lines)
    return "\n\n".join(lines)

In [None]:
# Run the cleanup pipeline on the raw extraction and keep the result for the preview cell.
normalized_text = normalize_markdown(text)
# print() shows the real line breaks, which a bare repr would escape.
print(normalized_text)

In [None]:
# Render the first 500 characters of the normalized text as Markdown.
display(Markdown(normalized_text[:500]))

In [2]:
from implementation.visualize import visualize_chroma

In [3]:
# Call the visualization helper imported from implementation.visualize.
# The recorded output shows it running t-SNE twice: once for 2D and once for 3D.
visualize_chroma()

Running t-SNE for 2D... (n_samples=15, perplexity=4)
Running t-SNE for 3D... (n_samples=15, perplexity=4)
