In [26]:
from __future__ import annotations

from pathlib import Path
from typing import Iterable
from IPython.display import Markdown, display
from unstructured.partition.pdf import partition_pdf

In [31]:
def extract_markdown_from_pdf_unstructured(
    pdf_path: str | Path,
    *,
    strategy: str = "auto",   # auto | fast | hi_res | ocr_only
) -> str:
    """
    Extract text from a PDF using Unstructured and return Markdown-formatted text.
    Suitable for chunking & embedding (RAG).
    """
    pdf_path = Path(pdf_path)

    elements = partition_pdf(
        filename=pdf_path,
        strategy=strategy,
        infer_table_structure=True,
        extract_images=False,
    )

    md_lines: list[str] = []

    for e in elements:
        text = getattr(e, "text", None)
        if not text:
            continue

        text = text.strip()
        if not text:
            continue

        category = e.category

        # ---------- Titles ----------
        if category == "Title":
            md_lines.append(f"# {text}")

        # ---------- Tables ----------
        elif category == "Table":
            md_lines.append("\n```text")
            md_lines.append(text)
            md_lines.append("```\n")

        # ---------- Lists ----------
        elif category == "ListItem":
            md_lines.append(f"- {text}")

        # ---------- Normal paragraphs ----------
        else:
            md_lines.append(text)

    # Clean spacing
    markdown = "\n\n".join(md_lines)
    return markdown.strip()


In [36]:
# Example usage
pdf_file = r"knowledge-base/raw/[Bangkit 2023 Batch 2] Final Transcript - M283BSY0985.pdf"
text = extract_markdown_from_pdf_unstructured(pdf_file)



In [37]:
text

"AN bangE!t\n\nBangkit 2023 Final Transcript\n\nus Kor Geka D INDONESIA SAYA\n\nGenerated\n\nGenerated\n\n: 10 January 2024\n\nBangkit ID\n\n- : M283BSY0985\n\nTranscript Status\n\n: Final\n\nName\n\n- : Kharisma Rizki Wijanarko\n\nBangkit Completion\n\n: Full Graduate\n\nUniversity\n\nUniversity\n\n- : Universitas Negeri Semarang\n\nLearning Path\n\n: Machine Learning\n\nNIM\n\n- : 4611421124\n\nCapstone Team\n\n: CH2-PS053\n\nSupervisor\n\n- : Abas Setiawan, S.Kom., M.Cs.\n\nCapstone Status\n\n: Finished\n\nCourses/Specialization/Activities\n\nCourse Codes\n\nHours\n\nSuggested SKS\n\nScore (0-100)\n\n\n```text\n\nIT Automation with Python B23MLCR01 48 1 Data Analytics Structuring Machine Learning Projects B23MLCR06 B23MLCR04 95 6 2 Mathematics for Machine Learning and Data Science TensorFlow Developer Professional Certificate B23MLCR02 B23MLCR03 74 79 3 Machine Learning Specialization by Andrew Ng B23MLCR07 94 2 Tensorflow Data and Deployment B23MLCR05 51 1 Preparing for Tensorflow 

In [42]:
display(Markdown(text))

AN bangE!t

Bangkit 2023 Final Transcript

us Kor Geka D INDONESIA SAYA

Generated

Generated

: 10 January 2024

Bangkit ID

- : M283BSY0985

Transcript Status

: Final

Name

- : Kharisma Rizki Wijanarko

Bangkit Completion

: Full Graduate

University

University

- : Universitas Negeri Semarang

Learning Path

: Machine Learning

NIM

- : 4611421124

Capstone Team

: CH2-PS053

Supervisor

- : Abas Setiawan, S.Kom., M.Cs.

Capstone Status

: Finished

Courses/Specialization/Activities

Course Codes

Hours

Suggested SKS

Score (0-100)


```text

IT Automation with Python B23MLCR01 48 1 Data Analytics Structuring Machine Learning Projects B23MLCR06 B23MLCR04 95 6 2 Mathematics for Machine Learning and Data Science TensorFlow Developer Professional Certificate B23MLCR02 B23MLCR03 74 79 3 Machine Learning Specialization by Andrew Ng B23MLCR07 94 2 Tensorflow Data and Deployment B23MLCR05 51 1 Preparing for Tensorflow Developer Certification B23TDCP01 25 1 Capstone / Final Project B23CAPP01 200 5 Soft skill & Career Development B23SSCE01 249 5 Total (Hours, SKS) / Average (Score) 921 20 95.0 A 93.2 A 91.7 A 94.7 A 95.9 A 95.9 A 95.8 A 96.1 A 89.3 A 93.6 A 93.23 A

```


Score (A-E)

Student's Attendance (Mandatory Meeting)

100.00%

Student's Attendance (All Meeting)

97.62%

1. This is Bangkit-system-generated transcript and valid without signature

2. This Transcript acts as a recommendation. Final Decision on conversion is strictly Academic Counselor / Study Programme Prerogative.

# Grade conversion:

A : 85 - 100 | B : 75 - 84 | C : 60 - 74 | D : 50 - 59 | E : 0 - 49

In [39]:
def merge_label_value(lines: list[str]) -> list[str]:
    merged = []
    i = 0

    while i < len(lines):
        line = lines[i].strip()

        if (
            i + 1 < len(lines)
            and len(line) < 40
            and lines[i + 1].lstrip().startswith((":","- :"))
        ):
            value = lines[i + 1].replace("- :", ":").strip()
            merged.append(f"{line}{value}")
            i += 2
        else:
            merged.append(line)
            i += 1

    return merged

def dedupe_consecutive(lines: list[str]) -> list[str]:
    out = []
    prev = None
    for line in lines:
        if line != prev:
            out.append(line)
        prev = line
    return out

def clean_lines(md: str) -> list[str]:
    return [l.strip() for l in md.splitlines() if l.strip()]

def normalize_markdown(md: str) -> str:
    lines = clean_lines(md)
    lines = dedupe_consecutive(lines)
    lines = merge_label_value(lines)
    return "\n\n".join(lines)

def normalize_markdown(md: str) -> str:
    lines = clean_lines(md)
    lines = dedupe_consecutive(lines)
    lines = merge_label_value(lines)
    return "\n\n".join(lines)

In [40]:
normalized_text = normalize_markdown(text)

In [41]:
display(Markdown(normalized_text))

AN bangE!t

Bangkit 2023 Final Transcript

us Kor Geka D INDONESIA SAYA

Generated: 10 January 2024

Bangkit ID: M283BSY0985

Transcript Status: Final

Name: Kharisma Rizki Wijanarko

Bangkit Completion: Full Graduate

University: Universitas Negeri Semarang

Learning Path: Machine Learning

NIM: 4611421124

Capstone Team: CH2-PS053

Supervisor: Abas Setiawan, S.Kom., M.Cs.

Capstone Status: Finished

Courses/Specialization/Activities

Course Codes

Hours

Suggested SKS

Score (0-100)

```text

IT Automation with Python B23MLCR01 48 1 Data Analytics Structuring Machine Learning Projects B23MLCR06 B23MLCR04 95 6 2 Mathematics for Machine Learning and Data Science TensorFlow Developer Professional Certificate B23MLCR02 B23MLCR03 74 79 3 Machine Learning Specialization by Andrew Ng B23MLCR07 94 2 Tensorflow Data and Deployment B23MLCR05 51 1 Preparing for Tensorflow Developer Certification B23TDCP01 25 1 Capstone / Final Project B23CAPP01 200 5 Soft skill & Career Development B23SSCE01 249 5 Total (Hours, SKS) / Average (Score) 921 20 95.0 A 93.2 A 91.7 A 94.7 A 95.9 A 95.9 A 95.8 A 96.1 A 89.3 A 93.6 A 93.23 A

```

Score (A-E)

Student's Attendance (Mandatory Meeting)

100.00%

Student's Attendance (All Meeting)

97.62%

1. This is Bangkit-system-generated transcript and valid without signature

2. This Transcript acts as a recommendation. Final Decision on conversion is strictly Academic Counselor / Study Programme Prerogative.

# Grade conversion:

A : 85 - 100 | B : 75 - 84 | C : 60 - 74 | D : 50 - 59 | E : 0 - 49