In [None]:
import os
import csv
import re
import fitz  
import spacy
from pathlib import Path
from textwrap import wrap
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq

# Load the spaCy pipeline named "en_core_web_sm" once for the whole run.
nlp = spacy.load("en_core_web_sm")

# Insert your path here
# A raw string is required: in a normal literal "\T", "\Y", "\P" and "\F" are
# invalid escape sequences (SyntaxWarning on Python 3.12+), and a real Windows
# path may contain valid ones such as "\t" or "\n" that would silently corrupt
# the path.
pdf_folder = Path(r"PATH\TO\YOUR\PDF\FOLDER")
# CSV files are written to a "csv_outputs" directory next to the PDF folder.
output_folder = pdf_folder.parent / "csv_outputs"
output_folder.mkdir(parents=True, exist_ok=True)

# LLM setup
# NOTE(review): GROQ_API_KEY is not defined in the visible code; it must be
# bound (e.g. in an earlier notebook cell) before this line runs -- confirm.
os.environ["GROQ_API_KEY"] = GROQ_API_KEY
llm = ChatGroq(model_name="llama3-8b-8192")

# Prompt 
# Prompt sent to the model for every chunk of extracted H&P text.
# The single template variable `note` receives the chunk text.
# The two quoting/header rules under "Rules:" are needed because the response
# parser only accepts lines in which all four fields are double-quoted and
# treats the first accepted line of each response as the header row.
# Do not add literal curly braces to the template: they would be interpreted
# as template variables.
prompt = PromptTemplate(
    input_variables=['note'],
    template="""
You are a clinical document parser.  Your job is to take the full text of an Adult History & Physical PDF (attached) and output a CSV table suitable for Excel.  The CSV must have exactly these columns:

Date,Category,Section,Description

1. Date  
   - Use full calendar dates when given (e.g. "January 8 2018")  
   - Otherwise use relative descriptors ("Past 2 months", "3 days ago", "Ongoing", "Plan")  
   - If you are using a relative descriptor, identify the first explicit calendar date in the document, then subtract the interval. For example, “Past 2 months” from January 8 2018 becomes “November 8 2017”.

2. Category  
   Must be one of:  
   - Diagnosis: a formal medical condition or disease identified by a clinician  
   - MedicalHistory: the patient’s prior or chronic health conditions and relevant past events  
   - PhysicalExam: objective findings noted during the physical examination  
   - Medication: medications the patient is currently taking or has taken in the past  
   - Allergy: documented adverse reactions to drugs, foods, or environmental substances  
   - FamilyHistory: significant health conditions present in first degree relatives  
   - SocialHistory: lifestyle factors such as occupation, habits, living situation, and social supports  
   - LabResult: quantitative or qualitative findings from laboratory tests  
   - Imaging: interpreted results from radiologic or other imaging studies  
   - Procedure: medical or surgical interventions performed on the patient  
   - Event: key clinical events like hospital admissions, procedures, or symptom onset  

3. Section  
   The H&P section where the item appears (e.g. “History of Present Illness”, “Past Medical History”, “Physical Exam”, etc.)

4. Description  
   A concise but complete narrative of the finding. If multiple items share the same Date + Category, combine them into one row and separate individual findings with semicolons—ensuring all relevant details are captured.

Rules:  
- Do NOT include rows with blank or "N/A" fields  
- Time stamps must be clearly mentioned in the Date column  
- If the same timestamp applies to multiple items, expand the Description field rather than repeating rows  
- Wrap EVERY field of EVERY row in double quotes, with no spaces around the separating commas; write a double quote inside a field as two double quotes  
- The first line of the output must be the quoted header row: "Date","Category","Section","Description"  
- Output ONLY the CSV (no commentary or extra text)  

Full Document:
{note}
"""
)

# Extract relevant sections
def extract_sections(text):
    """Collect the known H&P sections found in *text* into one string.

    Each title in the fixed list below is searched for (case-insensitively)
    as ``<title>:``. The captured body runs up to the next blank line that is
    followed by a line containing a colon, or to the end of the text. Every
    section found is emitted as ``"\\n\\n# <title>\\n<body>"``, in the order
    of the title list rather than the order of appearance in the document.

    Returns an empty string when no section is found.
    """
    headings = (
        "Chief Complaint",
        "History of Present Illness",
        "Past Medical History",
        "Surgical History",
        "Medications",
        "Allergies",
        "Family History",
        "Social History",
        "Review of Systems",
        "Physical Exam",
        "Pertinent Diagnostic Tests",
        "Problem List",
        "Summary Statement",
        "Assessment and Plan",
    )
    search_flags = re.DOTALL | re.IGNORECASE

    pieces = []
    for title in headings:
        found = re.search(
            rf"{title}\s*:\s*(.*?)(?=\n\n[A-Z][^\n]*:|\Z)", text, search_flags
        )
        if found is None:
            continue
        body = found.group(1).strip()
        pieces.append(f"\n\n# {title}\n{body}")
    return "".join(pieces)

# Process each PDF and write corresponding CSV
# Columns the prompt asks the model to produce. The header row is written from
# this list rather than taken from the model's output, so a missing, unquoted
# or repeated header in a response can neither swallow a data row nor end up
# duplicated in the file.
CSV_HEADER = ["Date", "Category", "Section", "Description"]
_HEADER_KEY = [name.lower() for name in CSV_HEADER]

# Build the prompt -> model pipeline once instead of once per chunk.
chain = prompt | llm

for idx, pdf_path in enumerate(sorted(pdf_folder.glob("*.pdf")), start=1):
    print(f"Processing {pdf_path.name}")

    # Read PDF text; the context manager closes the document even if text
    # extraction raises.
    with fitz.open(pdf_path) as doc:
        pdf_text = "\n".join(page.get_text() for page in doc)

    # Combine sections and chunk. replace_whitespace=False keeps newlines
    # inside each chunk; with the default, wrap() turns every newline into a
    # space and the "# <Section>" markers added by extract_sections are
    # flattened into the running text.
    content = extract_sections(pdf_text)
    chunks = wrap(content, width=7000, replace_whitespace=False)

    data_rows = []
    for i, chunk in enumerate(chunks, start=1):
        print(f"  Chunk {i}/{len(chunks)}")
        resp = chain.invoke({"note": chunk})
        for raw_line in resp.content.splitlines():
            line = raw_line.strip()
            if not line:
                continue
            # Parse each line with the csv module so that both quoted and
            # unquoted rows are accepted; skipinitialspace tolerates a space
            # after the separating comma.
            try:
                fields = next(csv.reader([line], skipinitialspace=True))
            except csv.Error:
                continue
            fields = [field.strip() for field in fields]
            # Commentary, code fences and malformed rows do not have exactly
            # four fields and are dropped.
            if len(fields) != len(CSV_HEADER):
                continue
            # The model may repeat the header in every chunk; skip it.
            if [field.lower() for field in fields] == _HEADER_KEY:
                continue
            data_rows.append(fields)

    if not data_rows:
        print("  No valid rows extracted; CSV will be empty")

    csv_path = output_folder / f"{pdf_path.stem}.csv"
    with open(csv_path, "w", newline="", encoding="utf-8") as f_out:
        writer = csv.writer(f_out)
        if data_rows:
            writer.writerow(["Patient Id", *CSV_HEADER])
            # The patient id is the 1-based position of the PDF in the sorted
            # file list, zero-padded to three digits.
            patient_id = f"{idx:03d}"
            writer.writerows([patient_id, *row] for row in data_rows)

    print(f" → Saved CSV: {csv_path}")


Processing dummy_hnp.pdf
  Chunk 1/2
  Chunk 2/2
 → Saved CSV: C:\Users\Lenovo\OneDrive\Desktop\Saher_project\pdfReportGeneration\csv_outputs\dummy_hnp.csv
Processing Sample-Adult-History-And-Physical-By-M2-Student.pdf
  Chunk 1/12
  Chunk 2/12
  Chunk 3/12
  Chunk 4/12
  Chunk 5/12
  Chunk 6/12
  Chunk 7/12
  Chunk 8/12
  Chunk 9/12
  Chunk 10/12
  Chunk 11/12
  Chunk 12/12
 → Saved CSV: C:\Users\Lenovo\OneDrive\Desktop\Saher_project\pdfReportGeneration\csv_outputs\Sample-Adult-History-And-Physical-By-M2-Student.csv
