In [3]:
!pip install markitdown[all]


Collecting azure-ai-documentintelligence (from markitdown[all])
  Downloading azure_ai_documentintelligence-1.0.2-py3-none-any.whl.metadata (53 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 53.0/53.0 kB 3.6 MB/s eta 0:00:00
Collecting azure-identity (from markitdown[all])
  Downloading azure_identity-1.23.1-py3-none-any.whl.metadata (82 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 82.4/82.4 kB 5.7 MB/s eta 0:00:00
Collecting mammoth (from markitdown[all])
  Downloading mammoth-1.9.1-py2.py3-none-any.whl.metadata (24 kB)
Collecting olefile (from markitdown[all])
  Downloading olefile-0.47-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pdfminer-six (from markitdown[all])
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting python-pptx (from markitdown

In [4]:
# short chunk test.
import os
import re
from pathlib import Path
from markitdown import MarkItDown

# Cell 2: Source PDF and the file the converted Markdown is written to
pdf_path = "/content/BAnz AT 02.04.2024 B3.pdf"
output_md_path = "/content/example.md"

# Cell 3: Run MarkItDown on the PDF and keep the Markdown text of the result
converter = MarkItDown()
result = converter.convert(pdf_path)
markdown = result.markdown

# Write the Markdown to disk as UTF-8 so it can be inspected (optional)
Path(output_md_path).write_text(markdown, encoding="utf-8")

print("✅ Markdown conversion complete.")

# Cell 4: Paragraph-aware splitter
def split_paragraphwise(text, max_chars=2000):
    """
    Split text into chunks at paragraph boundaries (never inside a paragraph).

    Paragraphs are the pieces of ``text`` separated by one blank line. They
    are packed greedily, in order, into chunks whose joined length does not
    exceed ``max_chars``.

    Args:
        text: The text to split.
        max_chars: Upper bound for the length of a chunk built from several
            paragraphs.

    Returns:
        A list of non-empty, whitespace-stripped chunks in document order.
        A single paragraph longer than ``max_chars`` is not cut; it is
        returned as one oversized chunk.
    """
    separator = "\n\n"
    chunks = []
    pending = []      # paragraphs collected for the chunk being built
    pending_len = 0   # always equals len(separator.join(pending))

    for para in text.split(separator):
        # Close the current chunk when this paragraph would push it over budget.
        if pending and pending_len + len(separator) + len(para) > max_chars:
            chunk = separator.join(pending).strip()
            if chunk:
                chunks.append(chunk)
            pending, pending_len = [], 0

        if pending:
            pending.append(para)
            pending_len += len(separator) + len(para)
        elif para.strip():
            # A chunk never starts with a blank paragraph: it would be stripped
            # from the output anyway and would only distort the size accounting.
            pending.append(para)
            pending_len = len(para)

    chunk = separator.join(pending).strip()
    if chunk:
        chunks.append(chunk)
    return chunks

# Cell 5: Chunking Markdown by Page, with paragraph-aware splits
def chunk_markdown_by_page(md_text, max_chars=2000):
    """
    Chunk Markdown page by page, splitting oversized pages paragraph-wise.

    Pages are delimited by marker comments of the form ``<!-- Page: N -->``.
    Each marker is kept at the top of the page it introduces. Text before the
    first marker (or the whole text when no marker is present) is treated as
    a page of its own.
    NOTE(review): whether the converter actually emits such markers is not
    visible here - confirm against the generated Markdown.

    Args:
        md_text: The Markdown text to chunk.
        max_chars: Pages up to this length become one chunk; longer pages are
            handed to ``split_paragraphwise`` with the same limit.

    Returns:
        A list of non-empty chunks in document order.
    """
    # With a capturing group, re.split returns
    # [preamble, marker_1, body_1, marker_2, body_2, ...] (always odd length).
    pieces = re.split(r"(<!-- Page: \d+ -->)", md_text)

    pages = [pieces[0].strip()]
    for idx in range(1, len(pieces), 2):
        marker = pieces[idx]
        body = pieces[idx + 1].strip()
        pages.append(f"{marker}\n{body}".strip())

    chunks = []
    for page in pages:
        if not page:
            # Nothing before the first marker, or empty input.
            continue
        if len(page) <= max_chars:
            chunks.append(page)
        else:
            chunks.extend(split_paragraphwise(page, max_chars=max_chars))

    return chunks

# Cell 6: Run Chunking
chunks = chunk_markdown_by_page(markdown, max_chars=2000)

# Print a summary and a preview (first 500 characters) of the first three chunks
print(f"✅ Total chunks created: {len(chunks)}")
for i, chunk in enumerate(chunks[:3]):
    print(f"\n--- Chunk {i + 1} ---\n{chunk[:500]}...")

# Cell 7: Save chunks to disk
output_dir = Path("/content/markdown_chunks")
output_dir.mkdir(parents=True, exist_ok=True)

# The directory is reused between runs: remove chunk files left over from a
# previous (possibly longer) run so chunks of different documents never mix.
for stale_file in list(output_dir.glob("chunk_*.md")):
    stale_file.unlink()

# One file per chunk, numbered from 001
for i, chunk in enumerate(chunks):
    with open(output_dir / f"chunk_{i+1:03d}.md", "w", encoding="utf-8") as f:
        f.write(chunk)

print(f"✅ All chunks saved to: {output_dir}")


✅ Markdown conversion complete.
✅ Total chunks created: 20

--- Chunk 1 ---
Bekanntmachung
Veröffentlicht am Dienstag, 2. April 2024
BAnz AT 02.04.2024 B3
Seite 1 von 7

Bundesministerium
für Umwelt, Naturschutz, nukleare Sicherheit und Verbraucherschutz

Förderrichtlinie
für Maßnahmen der Künstlichen Intelligenz
„KI-Leuchttürme für den Natürlichen Klimaschutz“

Vom 19. März 2024

1   Förderziel, Zuwendungszweck, Rechtsgrundlage

1.1  Förderziel und Zuwendungszweck

Der Natürliche Klimaschutz spielt eine zentrale Rolle bei der Bewältigung der Klimakrise und ihrer Folgen...

--- Chunk 2 ---
Das Bundesministerium für Umwelt, Naturschutz, nukleare Sicherheit und Verbraucherschutz (BMUV) setzt sich mit
dem  5-Punkte-Programm  „Künstliche  Intelligenz  für  Umwelt  und  Klima“  für  eine  umweltpolitische  Gestaltung  von
Künstlicher  Intelligenz  ein.  Ein  maßgebliches  Instrument  zur  Erreichung  dieser  Ziele  ist  eine  gezielte  Innovations-
förderung  für  eine  Technologieentwicklu

In [None]:
# Cell 5B: Evaluate chunk quality heuristically
# Reads the `chunks` list produced by the chunking cell and prints a few
# simple indicators; nothing is modified.

# 1. Average length (reported as 0 when there are no chunks, instead of
#    failing with ZeroDivisionError)
avg_len = sum(len(c) for c in chunks) / len(chunks) if chunks else 0
print(f"\n📏 Average chunk length: {avg_len:.0f} characters")

# 2. Short chunk warning (<300 chars)
short_chunks = [i for i, c in enumerate(chunks) if len(c) < 300]
print(f"⚠️ Chunks under 300 characters: {len(short_chunks)} → Indexes: {short_chunks[:10]}")

# 3. Table integrity check (split tables): flags chunks that contain only one
#    or two '|' characters
possible_split_tables = [i for i, c in enumerate(chunks) if '|' in c and c.count('|') < 3]
print(f"🧩 Possibly split table chunks: {len(possible_split_tables)} → {possible_split_tables[:10]}")

# 4. Page boundary check: number of chunks that contain a page marker
pages_detected = sum('<!-- Page:' in c for c in chunks)
print(f"📄 Page markers detected in chunks: {pages_detected}/{len(chunks)}")

# 5. Print one short chunk for inspection
if short_chunks:
    print("\n🕵️ Example short chunk:")
    print(f"\n--- Chunk {short_chunks[0]+1} ---\n{chunks[short_chunks[0]]}")

In [6]:
# Table chunk test
import os
import re
from pathlib import Path
from markitdown import MarkItDown

# Cell 2: Source PDF and the file the converted Markdown is written to
pdf_path = "/content/bw_budget_02_01_Epl.pdf"
output_md_path = "/content/example.md"

# Cell 3: Run MarkItDown on the PDF and keep the Markdown text of the result
converter = MarkItDown()
result = converter.convert(pdf_path)
markdown = result.markdown

# Write the Markdown to disk as UTF-8 so it can be inspected (optional)
Path(output_md_path).write_text(markdown, encoding="utf-8")

print("✅ Markdown conversion complete.")

# Cell 4: Table-safe paragraph-aware splitter
def split_paragraphwise_table_safe(text, max_chars=2000):
    """
    Split text into chunks at line boundaries without cutting markdown tables.

    Lines are accumulated until adding the next one would exceed
    ``max_chars``; the chunk is then closed and the line starts a new chunk.
    A chunk is never closed between two lines of the same table, so a table
    may make its chunk longer than ``max_chars``. A chunk may be closed right
    before the first line of a table.

    Table detection is a heuristic: any line containing ``|`` that is not a
    heading (does not start with ``#``) opens a table block, and the block
    lasts until the next blank line.

    Args:
        text: The text to split.
        max_chars: Size budget per chunk, counting one newline per line.

    Returns:
        A list of non-empty, whitespace-stripped chunks in document order.
    """
    chunks = []
    buffer = []       # lines of the chunk being built
    buffer_len = 0    # sum of len(line) + 1 over the buffered lines
    in_table = False

    def flush():
        # Close the current chunk; buffers holding only blank lines are dropped.
        chunk = "\n".join(buffer).strip()
        if chunk:
            chunks.append(chunk)
        buffer.clear()

    for line in text.splitlines():
        stripped = line.strip()
        was_in_table = in_table

        if "|" in line and not stripped.startswith("#"):
            in_table = True
        elif in_table and stripped == "":
            # Blank line ends a table block
            in_table = False

        line_len = len(line) + 1  # including newline
        # Only a line that continues an already-open table is glued to the
        # current chunk regardless of size; the first line of a table may
        # still start a fresh chunk.
        continues_table = in_table and was_in_table

        if buffer_len + line_len > max_chars and not continues_table:
            flush()
            buffer_len = 0

        buffer.append(line)
        buffer_len += line_len

    flush()
    return chunks

# Cell 5: Chunking Markdown by Page with table-safe paragraph splits
def chunk_markdown_by_page(md_text, max_chars=2000):
    """
    Chunk Markdown page by page, splitting oversized pages table-safely.

    Pages are delimited by marker comments of the form ``<!-- Page: N -->``.
    Each marker is kept at the top of the page it introduces. Text before the
    first marker (or the whole text when no marker is present) is treated as
    a page of its own.
    NOTE(review): whether the converter actually emits such markers is not
    visible here - confirm against the generated Markdown.

    Args:
        md_text: The Markdown text to chunk.
        max_chars: Pages up to this length become one chunk; longer pages are
            handed to ``split_paragraphwise_table_safe`` with the same limit.

    Returns:
        A list of non-empty chunks in document order.
    """
    # With a capturing group, re.split returns
    # [preamble, marker_1, body_1, marker_2, body_2, ...] (always odd length).
    pieces = re.split(r"(<!-- Page: \d+ -->)", md_text)

    pages = [pieces[0].strip()]
    for idx in range(1, len(pieces), 2):
        marker = pieces[idx]
        body = pieces[idx + 1].strip()
        pages.append(f"{marker}\n{body}".strip())

    chunks = []
    for page in pages:
        if not page:
            # Nothing before the first marker, or empty input.
            continue
        if len(page) <= max_chars:
            chunks.append(page)
        else:
            # Use table-safe splitter to avoid breaking tables
            chunks.extend(split_paragraphwise_table_safe(page, max_chars=max_chars))

    return chunks

# Cell 6: Run Chunking
chunks = chunk_markdown_by_page(markdown, max_chars=2000)

# Print a summary and a preview (first 500 characters) of the first three chunks
print(f"✅ Total chunks created: {len(chunks)}")
for i, chunk in enumerate(chunks[:3]):
    print(f"\n--- Chunk {i + 1} ---\n{chunk[:500]}...")

# Cell 7: Save chunks to disk
output_dir = Path("/content/markdown_chunks")
output_dir.mkdir(parents=True, exist_ok=True)

# The directory is reused between runs: remove chunk files left over from a
# previous (possibly longer) run so chunks of different documents never mix.
for stale_file in list(output_dir.glob("chunk_*.md")):
    stale_file.unlink()

# One file per chunk, numbered from 001
for i, chunk in enumerate(chunks):
    with open(output_dir / f"chunk_{i+1:03d}.md", "w", encoding="utf-8") as f:
        f.write(chunk)

print(f"✅ All chunks saved to: {output_dir}")


✅ Markdown conversion complete.
✅ Total chunks created: 66

--- Chunk 1 ---
Staatshaushaltsplan

für 2025/2026

Einzelplan 01

Landtag

Für den Druck wurde klimaneutral produziertes, weißes Papier verwendet


Inhalt

Betragsteil
Seite

Stellenteil
Seite

Vorwort .................................................................................................................................................................................................

Ziele und Kennzahlen (Produktorientierte Informationen) .................................................................

--- Chunk 2 ---
Die Aufgaben des Landtags sind in der Landesverfassung festgelegt. Artikel 27 Abs. 2 der Verfassung lautet: „Der Landtag übt die gesetzgebende Gewalt aus und über-
wacht die Ausübung der vollziehenden Gewalt nach Maßgabe dieser Verfassung.“

Das Kapitel 0101 enthält die Ausgaben für die Mitglieder des Landtags sowie die zu erwartenden Einnahmen und Ausgaben, die bei der Erfüllung der dem Landtag
obli