In [1]:
# Install into the environment of the running kernel (%pip rather than !pip).
# tenacity, pydantic and langchain-community are imported by later cells, so they are
# listed explicitly instead of relying on transitive dependencies.
%pip install PyMuPDF pillow openai pandas openpyxl pydantic tenacity langchain langchain-community langchain-openai faiss-cpu

Collecting langchain-openai
  Downloading langchain_openai-0.3.32-py3-none-any.whl.metadata (2.4 kB)
Collecting tiktoken<1,>=0.7 (from langchain-openai)
  Downloading tiktoken-0.11.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading langchain_openai-0.3.32-py3-none-any.whl (74 kB)
Downloading tiktoken-0.11.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m7.3 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: tiktoken, langchain-openai
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [langchain-openai]
[1A[2KSuccessfully installed langchain-openai-0.3.32 tiktoken-0.11.0


In [31]:
import os, io, time, base64, json, glob
import fitz  # PyMuPDF
from PIL import Image
import pandas as pd
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
from pydantic import BaseModel, field_validator
from typing import Optional, Union
from openai import OpenAI


# The API key must come from the environment (export OPENAI_API_KEY before starting
# Jupyter, or use a secrets manager). It is deliberately NOT assigned here: a literal in
# the notebook leaks the secret, and a blank placeholder overwrites a key that was
# already exported, which makes the check below fail on every run.
# Fail fast, before the client is constructed.
assert os.environ.get("OPENAI_API_KEY"), "OPENAI_API_KEY not set. Please set it before proceeding."
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [44]:
# Root locations; every path below is derived from one of these two.
_CARDS_ROOT = "/home/rijul/Gitlaboratory/Context_Engineering_LLM/cards"
_STUDY_ROOT = "/home/rijul/Academic/Atopic Eczema"

# Card folders that feed the RAG index, with the files the author listed for each:
#   abbr    -> dermatology_core.v1.json
#   lexicon -> meds_observed.v1.json
#   policy  -> notation.v1.json, units.v1.json, date.v1.json
#   range   -> scorad.json, labs.json, anthro.json
CARDS_DIRS = [f"{_CARDS_ROOT}/{sub}" for sub in ("abbr", "lexicon", "policy", "range")]

# Optional field schema in JSON Lines form (one object per line); it may be missing.
FIELD_SCHEMA_JSONL = f"{_CARDS_ROOT}/field_cards.jsonl"

# Directory where the FAISS index is persisted.
VSTORE_DIR = f"{_STUDY_ROOT}/rag_faiss"

# Model identifiers: embedding model (or another allowed one) and the chat model.
EMBED_MODEL = "text-embedding-3-large"
MODEL_NAME = "gpt-5-2025-08-07"

# True: rasterise whole pages; False: pull the images embedded in the PDF.
USE_PAGE_RENDER = True


In [61]:
# Type of the SCORAD score fields: an int, a free-text string (e.g. "2/3") or null.
# The "before" validator below reduces whatever arrives to an int or None.
Intish = Union[int, str, None]

# Target schema for one patient record. Its field names are listed to the model as the
# exact keys to fill, and it is passed as `response_format` for structured parsing.
# Documented with comments rather than a class docstring so the JSON schema generated
# from the model is unchanged.
class MedicalDataExtraction(BaseModel):
    Duration: Optional[str] = None
    Site_of_Onset: Optional[str] = None
    Mode_of_Spread: Optional[str] = None
    Symptoms: Optional[str] = None
    Treatment_History: Optional[str] = None
    Personal_History: Optional[str] = None
    Birth_Preterm_Postterm: Optional[str] = None
    Birth_Weight: Optional[str] = None
    Mile_Stones: Optional[str] = None
    Socio_Economic_Status: Optional[str] = None
    Vaccination: Optional[str] = None
    Number_of_Members_in_Household: Optional[int] = None
    Family_History: Optional[str] = None
    Family_Tree: Optional[str] = None
    Past_History: Optional[str] = None
    Similar_Ailments: Optional[str] = None
    Pulse_Examination: Optional[str] = None
    BP_Examination: Optional[str] = None
    Pallor_Examination: Optional[str] = None
    Cyanosis_Examination: Optional[str] = None
    Jaundice_Examination: Optional[str] = None
    Lymph_Nodes_Examination: Optional[str] = None
    Weight_Examination: Optional[str] = None
    Height_Examination: Optional[str] = None
    Chest_Systemic_Examination: Optional[str] = None
    CVS_Systemic_Examination: Optional[str] = None
    Abdomen_Systemic_Examination: Optional[str] = None
    CNS_Systemic_Examination: Optional[str] = None
    Musculoskeletal_Systemic_Examination: Optional[str] = None
    Face_Cutaneous_Examination: Optional[str] = None
    Extremities_Cutaneous_Examination: Optional[str] = None
    Palms_Cutaneous_Examination: Optional[str] = None
    Flexures_Cutaneous_Examination: Optional[str] = None
    Predominant_Site_Cutaneous_Examination: Optional[str] = None
    Hairs_Cutaneous_Examination: Optional[str] = None
    Muccous_Membrane_Cutaneous_Examination: Optional[str] = None
    Oral_Cutaneous_Examination: Optional[str] = None
    Genital_Cutaneous_Examination: Optional[str] = None
    Scalp_Cutaneous_Examination: Optional[str] = None
    Trunk_Cutaneous_Examination: Optional[str] = None
    Soles_Cutaneous_Examination: Optional[str] = None
    Nails_Cutaneous_Examination: Optional[str] = None
    Nail_Folds_Cutaneous_Examination: Optional[str] = None
    Nasal_Cutaneous_Examination: Optional[str] = None
    Peri_anal_Cutaneous_Examination: Optional[str] = None
    Extent_of_body_surface_area_involved: Optional[str] = None
    Erythema_Intensity_Score: Intish = None
    Edema_Intensity_Score: Intish = None
    Excoriations_Intensity_Score: Intish = None
    Oozing_Intensity_Score: Intish = None
    Dryness_Intensity_Score: Intish = None
    Lichenification_Intensity_Score: Intish = None
    Total_Intensity_Score: Intish = None
    Itchiness_Subjective_Score: Intish = None
    Sleeplessness_Subjective_Score: Intish = None
    Total_C_Subjective_Score: Intish = None
    Final_SCORAD: Intish = None
    Provisional_Diagnosis: Optional[str] = None
    Hb_Investigations: Optional[str] = None
    Na_Investigations: Optional[str] = None
    ASL_Investigations: Optional[str] = None
    TLC_Investigations: Optional[str] = None
    DLC_Investigations: Optional[str] = None
    ESR_Investigations: Optional[str] = None
    Platelet_Investigations: Optional[str] = None
    K_Investigations: Optional[str] = None
    Urea_Investigations: Optional[str] = None
    Creatinine_Investigations: Optional[str] = None
    ALT_Investigations: Optional[str] = None
    FBS_Investigations: Optional[str] = None
    Serum_Billirubin_Investigations: Optional[str] = None
    Serum_Proteins_Investigations: Optional[str] = None
    Serum_IgE: Optional[str] = None
    Ana_Investigations: Optional[str] = None
    G6PD_Investigations: Optional[str] = None
    Chest_Xray_Investigations: Optional[str] = None
    Urine_RE_ME: Optional[str] = None
    ECG_Investigations: Optional[str] = None
    Echo_Cardiography_Investigations: Optional[str] = None
    Biopsy_Histopathology: Optional[str] = None
    Immunofluorescence: Optional[str] = None
    Ultrasound: Optional[str] = None
    MRI: Optional[str] = None
    Other_Investigations: Optional[str] = None
    Final_Diagnosis: Optional[str] = None
    Treatment_Followup: Optional[str] = None
    Followup_2: Optional[str] = None
    Followup_3: Optional[str] = None
    Followup_4: Optional[str] = None
    Followup_5: Optional[str] = None

    @field_validator(
        "Erythema_Intensity_Score","Edema_Intensity_Score","Excoriations_Intensity_Score",
        "Oozing_Intensity_Score","Dryness_Intensity_Score","Lichenification_Intensity_Score",
        "Total_Intensity_Score","Itchiness_Subjective_Score","Sleeplessness_Subjective_Score",
        "Total_C_Subjective_Score","Final_SCORAD", mode="before"
    )
    @classmethod
    def coerce_intish(cls, v):
        """Reduce a raw score value to an ``int``, or ``None`` when it is not a whole number.

        Args:
            v: Raw value supplied for the field (int, float, str or None).

        Returns:
            ``None`` for ``None``; ints unchanged; whole-valued floats as ints. Strings are
            cut at the first range/fraction separator ("2/3" -> 2, "3-4" -> 3,
            "3 to 4" -> 3) and the leading part is converted. Anything that is not a
            whole number (e.g. "Not specified", "45.5") becomes ``None``.
        """
        if v is None:
            return None
        if isinstance(v, int):
            return v
        if isinstance(v, float):
            # A JSON number such as 7.0 is a valid integer score; keep it instead of
            # dropping it. NaN/inf report is_integer() == False and become None.
            return int(v) if v.is_integer() else None
        s = str(v).strip()
        for sep in ["/", "-", "–", "—", "to"]:
            if sep in s:
                s = s.split(sep)[0].strip()
        try:
            return int(s)
        except Exception:
            pass
        # Accept whole numbers written with a decimal point ("7.0").
        try:
            as_float = float(s)
        except Exception:
            return None
        # NOTE(review): fractional values such as "45.5" are still dropped because the
        # result is integer-typed; a fractional Final_SCORAD would need a float field.
        return int(as_float) if as_float.is_integer() else None

    @field_validator(
        "Birth_Preterm_Postterm", mode="before"
    )
    @classmethod
    def normalize_birth_status(cls, v):
        """Map free-text birth status onto ``Preterm`` / ``Term`` / ``Postterm``.

        Args:
            v: Raw value supplied for the field.

        Returns:
            ``None`` for ``None``; one of the three canonical labels when the text is
            recognised; otherwise the original value unchanged (e.g. 'Not specified').
        """
        if v is None:
            return None
        s = str(v).strip().lower()

        # coarse cleanup: every non-alphanumeric character (hyphen, underscore, dot,
        # slash, brackets, ...) becomes a separator and runs of whitespace collapse to
        # one space. Without this, "post  term" or "post\nterm" was never joined into
        # "postterm" and fell through to the plain "term" synonym.
        s = " ".join("".join(ch if ch.isalnum() else " " for ch in s).split())
        s = s.replace("post term", "postterm").replace("pre term", "preterm")

        # synonyms (checked in this order; "post term"/"post-term" cannot occur any
        # more after the cleanup above, so they are not listed)
        synonyms = {
            "pt": "preterm", "pre": "preterm", "preterm": "preterm",
            "term": "term", "full term": "term", "fullterm": "term",
            "post": "postterm", "postterm": "postterm",
        }
        tokens = s.split()
        for k, val in synonyms.items():
            if s == k or k in tokens:
                s = val
                break

        # hard map to final labels
        if "preterm" in s:
            return "Preterm"
        if s == "term":
            return "Term"
        if "postterm" in s:
            return "Postterm"

        # otherwise keep original (or return Not specified at your preference)
        return v


In [34]:
def encode_image(image: Image.Image) -> str:
    """Serialise ``image`` as PNG and return the bytes base64-encoded as a str."""
    with io.BytesIO() as png_buffer:
        image.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()
    return base64.b64encode(png_bytes).decode("utf-8")

def render_pages_to_images(pdf_path: str, dpi: int = 240):
    """Rasterise every page of the PDF at ``dpi`` and return them as RGB PIL images.

    Each page is rendered to a pixmap, encoded as PNG bytes and reopened with PIL.
    """
    with fitz.open(pdf_path) as pdf:
        return [
            Image.open(io.BytesIO(page.get_pixmap(dpi=dpi).tobytes("png"))).convert("RGB")
            for page in pdf
        ]

def extract_images_from_pdf(pdf_path: str):
    """Return every image embedded in the PDF, in page order, as RGB PIL images."""
    extracted = []
    with fitz.open(pdf_path) as doc:
        for page_index in range(len(doc)):
            page = doc.load_page(page_index)
            for image_info in page.get_images(full=True):
                # The first entry of each image tuple is the xref used for extraction.
                raw_bytes = doc.extract_image(image_info[0])["image"]
                extracted.append(Image.open(io.BytesIO(raw_bytes)).convert("RGB"))
    return extracted

In [51]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_json_file(path: str):
    """Read the UTF-8 file at ``path`` and return its parsed JSON content."""
    with open(path, mode="r", encoding="utf-8") as handle:
        return json.loads(handle.read())

def flatten_json_to_text(data, card_id: str):
    """Render a parsed JSON card as plain text suitable for embedding.

    Args:
        data: Parsed JSON content (dict or list; other types yield only the header).
        card_id: Identifier written on the first line as ``[CARD_ID]: <card_id>``.

    Returns:
        A newline-joined string. For a dict, a fixed set of well-known keys is emitted
        first in a fixed order (scalars as ``key: value``, dicts as ``k -> v`` bullets,
        lists as bullets, each capped at 200 entries), followed by every remaining
        top-level scalar. For a list, up to 300 items are emitted as bullets.
        Dict items inside lists show only their first four keys.
    """
    lines = [f"[CARD_ID]: {card_id}"]

    def format_item(item):
        # Dict items are summarised by their first four key=value pairs.
        if isinstance(item, dict):
            return ", ".join(f"{k}={item[k]}" for k in list(item.keys())[:4])
        return str(item)

    def add_map(name, m):
        if isinstance(m, dict) and m:
            lines.append(f"{name}:")
            for k, v in list(m.items())[:200]:
                v_str = v if isinstance(v, str) else json.dumps(v, ensure_ascii=False)
                lines.append(f"  - {k} -> {v_str}")

    if isinstance(data, dict):
        # Keys whose scalar value has already been written. The trailing loop skips
        # them, so a scalar under any known key (e.g. "units") is not repeated.
        emitted = set()
        for key in [
            "description", "policy", "rules", "ranges", "synonyms", "map",
            "frequency_map", "route_map", "dose_regex", "abbreviations",
            "units", "canonicalization", "examples", "fields"
        ]:
            if key not in data:
                continue
            val = data[key]
            if isinstance(val, (str, int, float)):
                lines.append(f"{key}: {val}")
                emitted.add(key)
            elif isinstance(val, dict):
                add_map(key, val)
            elif isinstance(val, list):
                lines.append(f"{key}:")
                lines.extend(f"  - {format_item(item)}" for item in val[:200])
        for k, v in data.items():
            if isinstance(v, (str, int, float)) and k not in emitted:
                lines.append(f"{k}: {v}")

    elif isinstance(data, list):
        lines.extend(f"- {format_item(item)}" for item in data[:300])

    return "\n".join(lines)

def iter_card_docs():
    """Yield one ``Document`` per JSON card file and per line of the field-schema JSONL.

    JSON files are discovered recursively under each existing directory in
    ``CARDS_DIRS``; unreadable files are reported and skipped. Blank JSONL lines are
    ignored, and lines that are not valid JSON are wrapped as ``{"raw": <line>}``.
    """
    json_paths = [
        found
        for card_dir in CARDS_DIRS
        if os.path.isdir(card_dir)
        for found in glob.glob(os.path.join(card_dir, "**", "*.json"), recursive=True)
    ]
    for json_path in json_paths:
        parent_dir, file_name = os.path.split(json_path)
        card_id = f"{os.path.basename(parent_dir)}/{file_name}"
        try:
            payload = load_json_file(json_path)
        except Exception as e:
            print(f"Skipping {json_path}: {e}")
            continue
        yield Document(
            page_content=flatten_json_to_text(payload, card_id),
            metadata={"card_id": card_id, "source": json_path},
        )

    # The field schema is optional: stop here when it is unset or absent.
    if not (FIELD_SCHEMA_JSONL and os.path.exists(FIELD_SCHEMA_JSONL)):
        return

    with open(FIELD_SCHEMA_JSONL, "r", encoding="utf-8") as schema_file:
        for line_no, raw_line in enumerate(schema_file):
            stripped = raw_line.strip()
            if not stripped:
                continue
            try:
                record = json.loads(stripped)
            except Exception:
                record = {"raw": stripped}
            # The line index counts blank lines too, so ids match file positions.
            card_id = f"schema/field_cards.jsonl#{line_no}"
            yield Document(
                page_content=flatten_json_to_text(record, card_id),
                metadata={"card_id": card_id, "source": FIELD_SCHEMA_JSONL},
            )

def build_or_load_vectorstore():
    """Return the FAISS store for the RAG cards, loading it from disk when possible.

    If ``VSTORE_DIR`` exists, the persisted index is loaded. When loading fails, the
    reason is reported and the index is rebuilt from the card documents, split into
    800-character chunks with 120 characters of overlap, then saved to ``VSTORE_DIR``.

    Returns:
        The loaded or freshly built FAISS vector store.

    Raises:
        RuntimeError: If no card documents are found when a build is required.
    """
    emb = OpenAIEmbeddings(model=EMBED_MODEL)  # picks up OPENAI_API_KEY from env
    if os.path.isdir(VSTORE_DIR):
        try:
            # SECURITY NOTE: load_local unpickles the stored docstore, which can execute
            # arbitrary code. Only point VSTORE_DIR at an index this notebook wrote.
            return FAISS.load_local(VSTORE_DIR, emb, allow_dangerous_deserialization=True)
        except Exception as exc:
            # Still best-effort, but say why the cached index was discarded.
            print(f"Could not load vector store from {VSTORE_DIR} ({exc}); rebuilding.")

    docs = list(iter_card_docs())
    if not docs:
        raise RuntimeError("No RAG cards found. Check CARDS_DIRS paths.")

    splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=120)
    chunks = splitter.split_documents(docs)

    vstore = FAISS.from_documents(chunks, emb)
    vstore.save_local(VSTORE_DIR)
    return vstore

# Build once
# Build (or load) the index once, then expose a similarity retriever returning 6 hits.
vstore = build_or_load_vectorstore()
retriever = vstore.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=6),
)
print("RAG vector store ready.")

RAG vector store ready.


In [55]:
# Field names of the extraction schema, in declaration order.
SCHEMA_KEYS = [*MedicalDataExtraction.model_fields.keys()]

def schema_keys_block():
    """Return the prompt section listing every schema key as a bullet, one per line."""
    heading = (
        "SCHEMA KEYS (use EXACT names; fill string values or integers; "
        "if absent/illegible use 'Not specified'):"
    )
    bullet_separator = "\n- "
    return f"{heading}{bullet_separator}{bullet_separator.join(SCHEMA_KEYS)}"

In [62]:
def birth_policy_block():
    """Return the prompt section explaining how to read the circled birth-term field."""
    policy_lines = [
        "BIRTH FIELD POLICY:",
        "- The form shows: 'Birth: Pre term / Term / Post term'. Clinician usually CIRCLES one.",
        "- Choose exactly one of {Preterm, Term, Postterm} based on visual cues (circle, tick, underline, emphasis).",
        "- Normalize variants: 'pre term', 'pre-term', 'PT' -> Preterm; 'post term', 'post-term', 'post' -> Postterm.",
        "- If no clear mark, return 'Not specified'.",
    ]
    return "\n".join(policy_lines)

In [36]:
def make_rag_prefix(query: str, snippets):
    """Build the prompt prefix that embeds the retrieved cards.

    Args:
        query: Retrieval query; accepted for interface symmetry but not used in the text.
        snippets: Iterable of objects exposing ``page_content``.

    Returns:
        The instruction header, the card texts separated by ``---`` rules, and the
        ``END OF CARDS.`` terminator.
    """
    card_separator = "\n\n---\n\n"
    cards_text = card_separator.join(snippet.page_content for snippet in snippets)
    instructions = "".join([
        "USE THE FOLLOWING CARDS TO INTERPRET HANDWRITTEN CLINICAL NOTES. ",
        "Follow policies for abbreviations, dates, units, medication notation, SCORAD, labs, and ranges. ",
        "If a field is absent or illegible, set it to 'Not specified'. ",
        "Return ONLY the fields defined by the schema keys (exact names).\n\n",
    ])
    return f"{instructions}{cards_text}\n\nEND OF CARDS.\n"

In [63]:
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=8), retry=retry_if_exception_type(Exception))
def _call_model(message_content):
    """Send the multimodal prompt to the model and return a ``MedicalDataExtraction``.

    Three strategies are tried in order within one attempt:
      1. structured parsing with the Pydantic schema as ``response_format``;
      2. JSON mode (``response_format={"type": "json_object"}``);
      3. a plain completion whose text is parsed as JSON, falling back to the
         outermost ``{...}`` span when the whole text does not parse.
    The decorator retries the whole function up to 3 times with exponential backoff
    when the last strategy raises.

    Args:
        message_content: List of content parts (text / image_url dicts) for the single
            user message. It must be a list, because extra instructions are appended
            to it with ``+``.

    Returns:
        A validated ``MedicalDataExtraction`` instance.

    Raises:
        RuntimeError: If the last-resort response contains no parseable JSON object.

    NOTE(review): ``_json_loads`` is not defined in the cells visible here. It is
    assumed to be a ``json.loads``-style helper defined elsewhere; confirm it exists
    on a fresh kernel, otherwise strategies 2 and 3 cannot succeed.
    """
    # 1) Preferred: structured parse path (Pydantic)
    try:
        resp = client.beta.chat.completions.parse(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": message_content}],
            response_format=MedicalDataExtraction,
            # NOTE: no temperature parameter (model enforces default)
        )
        message = resp.choices[0].message
        if message.parsed is not None:
            return message.parsed
        # `parsed` can be None (e.g. a refusal). Returning it would hand None to the
        # caller, so fall through to the JSON fallbacks instead.
        print("[Parse path returned no parsed object]", getattr(message, "refusal", None))
    except Exception as e_parse:
        print("[Parse path failed]", e_parse)

    # 2) Fallback: JSON mode
    try:
        resp = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{
                "role": "user",
                "content": message_content + [
                    {"type": "text", "text": "Return a single JSON object strictly matching the target schema."}
                ]
            }],
            response_format={"type": "json_object"},
            # NOTE: no temperature parameter
        )
        data = _json_loads(resp.choices[0].message.content)
        return MedicalDataExtraction(**data)
    except Exception as e_json:
        print("[JSON mode failed]", e_json)

    # 3) Last-resort: plain text response we parse as JSON
    resp = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{
            "role": "user",
            "content": message_content + [
                {"type": "text", "text": "Respond ONLY with a JSON object matching the target schema (no prose)."}
            ]
        }],
        # NOTE: no response_format, no temperature
    )
    # The content can be None; treat that as empty text so the failure surfaces as
    # the RuntimeError below rather than an AttributeError on .strip().
    txt = (resp.choices[0].message.content or "").strip()

    # Try to extract a JSON object even if the model wrapped it in prose or backticks
    try:
        data = _json_loads(txt)
    except Exception:
        s, e = txt.find("{"), txt.rfind("}")
        if s != -1 and e != -1 and e > s:
            data = _json_loads(txt[s:e+1])
        else:
            raise RuntimeError("Could not parse model output as JSON.")
    return MedicalDataExtraction(**data)
def extract_structured_data_from_images(images):
    """Run RAG-augmented extraction over the page images of one document.

    Args:
        images: Non-empty list of PIL images (one per page or embedded image).

    Returns:
        A ``(parsed, rag_cards)`` tuple: the ``MedicalDataExtraction`` returned by
        ``_call_model`` and the list of card ids (strings) of the retrieved snippets.

    Raises:
        ValueError: If ``images`` is empty. Without this check the model would be
            called with text only and produce a row with no evidence behind it.
    """
    if not images:
        raise ValueError("No images to extract from (PDF has no pages or embedded images).")

    # RAG retrieval (static query; you can customize per PDF if desired)
    query = (
        "Dermatology intake for atopic eczema; meds parsing, labs synonyms & units, "
        "SCORAD rules, date normalization; strict schema extraction."
    )
    snippets = retriever.invoke(query)
    rag_text = make_rag_prefix(query, snippets)
    # Callers join these ids with ";", so a missing card_id must not yield None.
    rag_cards = [str(d.metadata.get("card_id") or "unknown") for d in snippets]

    base64_images = [encode_image(im) for im in images]
    message_content = [
        {"type": "text", "text": schema_keys_block()},    # exact keys visible to the model
        {"type": "text", "text": birth_policy_block()},   # circled-choice instruction
        {"type": "text", "text": rag_text},               # pinned + retrieved cards
        {"type": "text", "text":
            ("Act as a clinician. Extract ONLY the schema keys (exact names). "
             "If any field is absent or illegible, write 'Not specified'. "
             "Do not invent values. Do not add extra keys.")
        }
    ]

    # Append every page as an inline PNG data URL after the text instructions.
    for b64 in base64_images:
        message_content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{b64}"}
        })

    parsed = _call_model(message_content)
    return parsed, rag_cards

In [65]:
# One output row per PDF: either the extracted fields or an "error" entry.
# NOTE(review): `pdf_folder` is not defined in the cells visible here; it must be set
# (to the directory holding the scanned PDFs) before this cell runs.
structured_data = []
i = 0

pdf_files = [name for name in sorted(os.listdir(pdf_folder)) if name.lower().endswith(".pdf")]

for i, pdf_file in enumerate(pdf_files, start=1):
    pdf_path = os.path.join(pdf_folder, pdf_file)
    print(f"[{i}] Extracting: {pdf_file}")
    start_time = time.time()
    try:
        if USE_PAGE_RENDER:
            images = render_pages_to_images(pdf_path, dpi=240)
        else:
            images = extract_images_from_pdf(pdf_path)
        parsed, rag_cards = extract_structured_data_from_images(images)
        elapsed = time.time() - start_time

        # Prefer the Pydantic v2 dump; fall back to older or plain-object shapes.
        try:
            row = parsed.model_dump()
        except Exception:
            row = parsed.dict() if hasattr(parsed, "dict") else parsed.__dict__

        structured_data.append({
            "Patient_ID": os.path.splitext(pdf_file)[0],
            "Extraction_Time": elapsed,
            **row,
            "rag_cards": ";".join(rag_cards),
        })
    except Exception as e:
        # Keep going: record the failure so the patient still appears in the sheet.
        print(f"   !! Failed on {pdf_file}: {e}")
        structured_data.append({
            "Patient_ID": os.path.splitext(pdf_file)[0],
            "Extraction_Time": time.time() - start_time,
            "error": str(e),
        })


# Build the frame FIRST. The column ordering below inspects `df.columns`, so `df` must
# exist before it is referenced (otherwise a fresh kernel raises NameError), and it
# must not be rebuilt afterwards or the ordering is thrown away before export.
df = pd.DataFrame(structured_data)

# Canonical column order from the Pydantic schema. `model_fields` is the Pydantic v2
# API, which this notebook already requires (it imports `field_validator`).
SCHEMA_KEYS = list(MedicalDataExtraction.model_fields.keys())

ordered_cols = (
    ["Patient_ID", "Extraction_Time"]
    + SCHEMA_KEYS
    + (["rag_cards"] if "rag_cards" in df.columns else [])
    + (["error"] if "error" in df.columns else [])
)

# Reindex to the exact schema order. reindex also creates any column that is missing
# (e.g. schema fields absent from failed rows) and fills it with NaN.
df = df.reindex(columns=ordered_cols)

# NOTE(review): `output_excel` is not defined in the cells visible here; it must be
# set to the target .xlsx path before this cell runs.
df.to_excel(output_excel, index=False)
print(f"Data extraction complete. Results saved to {output_excel}.")

[1] Extracting: 1050.pdf
[2] Extracting: 1619.pdf
[3] Extracting: 2507.pdf
[4] Extracting: 2664.pdf
[5] Extracting: 2745.pdf
[6] Extracting: 2799.pdf
[7] Extracting: 2823.pdf
[8] Extracting: 3226.pdf
[9] Extracting: 3398.pdf
[10] Extracting: 3460.pdf
[11] Extracting: 3622.pdf
[12] Extracting: 3731.pdf
[13] Extracting: 3792.pdf
[14] Extracting: 3850.pdf
[15] Extracting: 3855.pdf
[16] Extracting: 3867.pdf
[17] Extracting: 3873.pdf
[18] Extracting: 3886.pdf
[19] Extracting: 3903.pdf
[20] Extracting: 716.pdf
Data extraction complete. Results saved to /home/rijul/Academic/Atopic Eczema/gpt5output_rag.xlsx.
