Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions python-markitdown/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Python MarkItDown: Convert Documents Into LLM-Ready Markdown

This folder provides the code examples for the Real Python tutorial [Python MarkItDown: Convert Documents Into LLM-Ready Markdown](https://realpython.com/python-markitdown/).
30 changes: 30 additions & 0 deletions python-markitdown/batch_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from pathlib import Path

from markitdown import MarkItDown


def main(
    input_dir,
    output_dir="output",
    target_formats=(".docx", ".xlsx", ".pdf"),
):
    """Convert the matching files of a directory to Markdown files.

    Every direct child of ``input_dir`` whose suffix is listed in
    ``target_formats`` is passed to ``MarkItDown.convert()``, and the
    resulting Markdown is written to ``output_dir`` as ``<stem>.md``.
    A file that fails to convert is reported and skipped; the remaining
    files are still processed.

    Args:
        input_dir: Directory to scan (not recursive).
        output_dir: Directory receiving the ``.md`` files. It is created,
            including missing parents, if it does not exist.
        target_formats: Suffixes (with the leading dot) to convert. The
            comparison against ``Path.suffix`` is exact.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    md = MarkItDown()

    for file_path in input_path.glob("*"):
        if file_path.suffix not in target_formats:
            continue

        try:
            result = md.convert(file_path)
        except Exception as e:
            # Best-effort batch: report the failure and move on to the next
            # file instead of aborting the whole run.
            print(f"✗ Error converting {file_path.name}: {e}")
        else:
            # Write only after a successful conversion. Otherwise ``result``
            # is either unbound or still holds the previous file's output.
            output_file = output_path / f"{file_path.stem}.md"
            output_file.write_text(result.markdown, encoding="utf-8")
            print(f"✓ Converted {file_path.name} → {output_file.name}")


# Run the batch conversion only when this file is executed as a script:
# read from the "data" directory and write the Markdown files to "output".
if __name__ == "__main__":
    main("data", "output")
5 changes: 5 additions & 0 deletions python-markitdown/convert_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from markitdown import MarkItDown

# Convert the sample Word document with a default MarkItDown instance and
# print the object returned by convert().
converter = MarkItDown()
print(converter.convert("./data/sample_DOCX.docx"))
Binary file added python-markitdown/data/pep8.docx
Binary file not shown.
Binary file added python-markitdown/data/real-python.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
9 changes: 9 additions & 0 deletions python-markitdown/data/sample_CSV.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
First Name,Last Name,Department,Position,Start Date
Alice,Johnson,Marketing,Marketing Coordinator,1/15/2022
Bob,Williams,Human Resources,HR Generalist,6/1/2021
Carol,Davis,Engineering,Software Engineer,3/20/2023
David,Brown,Sales,Sales Representative,9/10/2022
Eve,Miller,Finance,Financial Analyst,11/5/2021
Frank,Garcia,Customer Service,Customer Support Specialist,7/1/2023
Grace,Rodriguez,Research & Development,Research Scientist,4/25/2022
Henry,Martinez,Operations,Operations Manager,2/14/2021
Binary file added python-markitdown/data/sample_DOCX.docx
Binary file not shown.
Binary file added python-markitdown/data/sample_PDF.pdf
Binary file not shown.
Binary file added python-markitdown/data/sample_XLSX.xlsx
Binary file not shown.
Binary file added python-markitdown/data/zen-of-python.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
15 changes: 15 additions & 0 deletions python-markitdown/img_description.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import os

from markitdown import MarkItDown
from openai import OpenAI

# Build the OpenAI client from the OPENAI_API_KEY environment variable
# (os.getenv yields None when the variable is not set).
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Hand the client and the model name to MarkItDown.
converter = MarkItDown(llm_client=openai_client, llm_model="gpt-4o")

# Convert the image and print the Markdown of the result.
conversion = converter.convert("./data/real-python.png")
print(conversion.markdown)
16 changes: 16 additions & 0 deletions python-markitdown/ocr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import os

from markitdown import MarkItDown
from openai import OpenAI

# Build the OpenAI client from the OPENAI_API_KEY environment variable
# (os.getenv yields None when the variable is not set).
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Configure MarkItDown with the client, the model name, and a custom prompt
# asking for OCR output as Markdown.
ocr_prompt = "Extract text from image with OCR and return Markdown."
converter = MarkItDown(
    llm_client=openai_client,
    llm_model="gpt-4o",
    llm_prompt=ocr_prompt,
)

# Convert the image and print the Markdown of the result.
conversion = converter.convert("./data/zen-of-python.png")
print(conversion.markdown)