# PDF to Markdown
pip install pypdf pymupdf langchain-community langchain-text-splitters langchain-openai

In [1]:
# pdf_to_markdown.py
import os, sys, pathlib
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [6]:
# Absolute locations of the source PDF and the Markdown file to produce.
# Relative names are resolved against the current working directory.
in_path, out_path = (
    pathlib.Path(filename).expanduser().resolve()
    for filename in ("mypdf.pdf", "mypdf.md")
)
print(f"in_path: {in_path} out_path: {out_path}")

in_path: /home/santy/programming/python/langchain/mypdf.pdf out_path: /home/santy/programming/python/langchain/mypdf.md


In [7]:
    # 1) Read the PDF into LangChain Documents using the PyMuPDF-backed loader.
    #    The loader is given the path as a plain string; the result is expected
    #    to hold one Document per page, with the page number in its metadata.
    loader = PyMuPDFLoader(os.fspath(in_path))
    docs = loader.load()

In [8]:
    # 2) Cut the page documents into pieces small enough to send to the LLM.
    #    Splitting tries paragraph breaks first, then line breaks, then spaces,
    #    and finally falls back to splitting between characters. Neighbouring
    #    chunks share up to 300 characters so text cut at a boundary keeps some
    #    of its surrounding context.
    #    NOTE(review): each chunk is converted and appended to the output
    #    independently, so the shared overlap may show up twice in the merged
    #    Markdown - confirm that is acceptable.
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        chunk_overlap=300,
        chunk_size=4000,
    )
    chunks = splitter.split_documents(docs)

In [9]:
    # 3) Prompt for the Markdown conversion chain.
    #    The system message holds the formatting rules, one per entry, joined
    #    with newlines. The user message passes the text in through the
    #    {chunk} placeholder and asks for Markdown only.
    prompt = ChatPromptTemplate.from_messages([
        (
            "system",
            "\n".join((
                "You convert raw PDF-extracted text into clean, readable Markdown.",
                "Rules:",
                "- Keep headings (#, ##, ###) based on apparent structure.",
                "- Preserve lists, bullet points, and numbering.",
                "- Keep code blocks and inline code when obvious.",
                "- For tables, reconstruct using Markdown tables when possible; otherwise use list blocks.",
                "- Remove page headers/footers, page numbers, and junk artifacts.",
                "- Never invent content; only reformat what’s present.",
            )),
        ),
        (
            "user",
            "\n\n".join((
                "Source text (partial):",
                "{chunk}",
                "Return ONLY Markdown for that text, no preface, no explanations.",
            )),
        ),
    ])

In [10]:
import os

from langchain_openai import ChatOpenAI

# Chat model used for the conversion.
# The API key is read from the OPEN_AI_SECRET_KEY environment variable; if the
# variable is not set, the lookup raises KeyError before the model is built.
llm = ChatOpenAI(
    api_key=os.environ["OPEN_AI_SECRET_KEY"],
    model="gpt-4o-mini",
    temperature=0,  # keep randomness in the output to a minimum
)

In [11]:
# Pipeline: fill the prompt, send it to the chat model, then reduce the model's
# reply to a plain string.
chain = prompt.pipe(llm, StrOutputParser())

In [13]:
# 4) Convert each chunk → Markdown and merge
#    Chunks are sent through the chain one at a time, in order, and each result
#    is appended to the output file as soon as it comes back. The output
#    directory is created first if it does not exist, and any existing file at
#    out_path is overwritten.
out_path.parent.mkdir(parents=True, exist_ok=True)
with out_path.open("w", encoding="utf-8") as f:
    # Iterate the chunks directly: the loop never used the enumerate index.
    for d in chunks:
        md = chain.invoke({"chunk": d.page_content})
        # Drop trailing whitespace from the reply, then leave one blank line
        # so consecutive chunks stay separated in the merged document.
        f.write(md.rstrip() + "\n\n")

print(f"Wrote Markdown to: {out_path}")

Wrote Markdown to: /home/santy/programming/python/langchain/mypdf.md
