## Extract text from PDFs

In [1]:
from pypdf import PdfReader
import re

def extract_pdf(pdf_path):
    """Extract whitespace-normalised text from each page of a PDF.

    Args:
        pdf_path: Location of the PDF; handed unchanged to ``PdfReader``.

    Returns:
        A list of ``{"page": int, "text": str}`` dicts in document order.
        ``page`` is 1-based. Pages that yield no text, or only whitespace,
        are left out, so page numbers in the result may have gaps.
    """
    reader = PdfReader(pdf_path)
    pages = []

    for page_no, page in enumerate(reader.pages, start=1):
        raw = page.extract_text()
        if not raw:
            continue
        # Collapse every whitespace run (newlines included) to one space so
        # downstream character-based chunking is not skewed by layout gaps.
        text = re.sub(r"\s+", " ", raw).strip()
        if not text:
            # The page contained only whitespace; an empty record is useless.
            continue
        pages.append({"page": page_no, "text": text})
    return pages


## Chunk text & process all the PDFs

In [2]:
from pathlib import Path
import json

# Directory that process_all_pdfs() scans for "*.pdf" input files.
RAW_DIR = Path("data/raw")
# JSON file that the generated chunk records are written to.
OUT_FILE = Path("data/processed/chunks.json")

def chunk_text(text, size=900, overlap=180):
    """Split ``text`` into overlapping fixed-size character windows.

    Consecutive chunks start ``size - overlap`` characters apart, so each
    chunk repeats the last ``overlap`` characters of the previous one. The
    final chunk may be shorter than ``size``.

    Args:
        text: The string to split.
        size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between neighbouring chunks;
            must satisfy ``0 <= overlap < size``.

    Returns:
        A list of chunk strings in order. Empty ``text`` gives ``[]``.

    Raises:
        ValueError: If ``size`` is not positive or ``overlap`` is outside
            ``[0, size)``. Without this check the window would never move
            forward and the loop would not terminate on long input.
    """
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    if not 0 <= overlap < size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < size, got overlap={overlap}, size={size}"
        )

    step = size - overlap  # strictly positive thanks to the checks above
    chunks = []

    for start in range(0, len(text), step):
        end = start + size
        chunks.append(text[start:end])
        if end >= len(text):
            # This window already reached the end of the text; a further
            # window would only repeat the tail.
            break

    return chunks

def process_all_pdfs():
    """Extract and chunk every ``*.pdf`` file found directly in ``RAW_DIR``.

    Files are processed in sorted filename order so that the running chunk
    counter, and therefore every ``chunk_id``, is the same on every run and
    platform (directory listing order is otherwise filesystem-dependent).

    Returns:
        A list of dicts, one per chunk, with keys:
        ``chunk_id`` (``"<topic>_<n>"``, where ``n`` counts across all
        files), ``text``, ``source``, ``topic`` (lower-cased file stem) and
        ``page`` (1-based page number the chunk came from).
    """
    all_chunks = []
    chunk_id = 0

    # Sort case-insensitively first, then by exact name as a tie-breaker.
    pdf_paths = sorted(
        RAW_DIR.glob("*.pdf"),
        key=lambda path: (path.name.lower(), path.name),
    )

    for pdf_path in pdf_paths:
        topic = pdf_path.stem.lower()   # lower-cased file stem, e.g. "abortioncare"
        source = f"WHO {topic.capitalize()} Guideline"

        print(f"Processing {pdf_path.name}...")

        for page in extract_pdf(pdf_path):
            page_no = page["page"]
            for chunk in chunk_text(page["text"]):
                all_chunks.append({
                    "chunk_id": f"{topic}_{chunk_id}",
                    "text": chunk,
                    "source": source,
                    "topic": topic,
                    "page": page_no
                })
                # The counter is global, not per file, so ids stay unique
                # even if two files share a stem after lower-casing.
                chunk_id += 1

    return all_chunks

if __name__ == "__main__":
    # Build every chunk record before touching the filesystem.
    chunks = process_all_pdfs()

    # Create the output folder (and any missing parents) if needed.
    OUT_FILE.parent.mkdir(parents=True, exist_ok=True)

    # Serialise as pretty-printed UTF-8 JSON, keeping non-ASCII text readable.
    with OUT_FILE.open("w", encoding="utf-8") as f:
        f.write(json.dumps(chunks, indent=2, ensure_ascii=False))

    print(f"Saved {len(chunks)} chunks → {OUT_FILE}")

Processing AbortionCare.pdf...
Processing AdvancedHIV.pdf...
Processing HIVService.pdf...
Processing Infertility.pdf...
Saved 2684 chunks → data\processed\chunks.json


In [3]:
import json
# Sanity check: reload the saved chunks. The file was written as UTF-8 with
# ensure_ascii=False, so it must be read back as UTF-8 explicitly; the
# platform default codec (e.g. on Windows) may differ. The context manager
# also closes the file handle, which a bare open() would leave dangling.
with open("data/processed/chunks.json", encoding="utf-8") as f:
    data = json.load(f)

print(len(data))
if data:
    # Preview the start of the first chunk; skipped when no chunks exist.
    print(data[0]["text"][:300])

2684
Abortion care guideline, second edition
