Task:  PDF to Structured format
by Prof He

In [16]:
#  Step 1: Installing necessary dependencies

# !pip install groq langchain-core langchain-community langchain-openai spacy matplotlib seaborn PyMuPDF langchain_groq
# !python -m spacy download en_core_web_sm

In [17]:
# Step 2: Import Libraries

import csv
import os
import re
from datetime import datetime
from getpass import getpass
from io import StringIO
from pathlib import Path
from textwrap import wrap

import fitz  # PyMuPDF for PDF parsing
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import spacy
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain_groq import ChatGroq

In [18]:
# Step 3: Load NLP Model
# Loads the small English spaCy pipeline; the model has to be downloaded first
# (see the commented-out `python -m spacy download en_core_web_sm` in Step 1).
nlp = spacy.load("en_core_web_sm")

In [19]:
# Step 4: Extract Text from PDF


# Folder that holds the input PDFs. Set the PDF_FOLDER environment variable to
# point somewhere else; the original location is kept as the fallback so
# existing setups keep working.
pdf_folder = Path(os.environ.get(
    "PDF_FOLDER",
    "C:\\Users\\Lenovo\\OneDrive\\Desktop\\Saher_project\\pdfReportGeneration\\pdfs",
))

pdf_files = list(pdf_folder.glob("*.pdf"))
if not pdf_files:
    print(f"⚠️ No PDF files found in {pdf_folder}")

# Text of every successfully parsed PDF, keyed by file name, so that no
# document is lost when several PDFs are processed.
pdf_texts = {}
# Text of the most recently parsed PDF. Later cells read this name; it starts
# empty so they do not fail with a NameError when nothing could be parsed.
pdf_text = ""

# Loop over all PDFs
for pdf_path in pdf_files:
    print(f"\n📄 Processing: {pdf_path.name}")

    try:
        # The context manager closes the document even if text extraction fails.
        with fitz.open(pdf_path) as doc:
            pdf_text = "\n".join(page.get_text() for page in doc)
        pdf_texts[pdf_path.name] = pdf_text

    except Exception as e:
        # Best effort: report the failing file and continue with the next one.
        print(f"❌ Failed to process {pdf_path.name}: {e}")


📄 Processing: dummy_hnp.pdf

📄 Processing: Sample-Adult-History-And-Physical-By-M2-Student.pdf


In [20]:
# Step 5: Preprocess and Analyze

# Run the spaCy pipeline over the extracted PDF text and list every named
# entity it found together with its label.
print("\n--- Named Entities Extracted ---")
spacy_doc = nlp(pdf_text)
for ent in spacy_doc.ents:
    entity_label = ent.label_
    print("{} ({})".format(ent.text, entity_label))


--- Named Entities Extracted ---
M2 Student (PERSON)
Day 2 (DATE)
Patient (PERSON)
48 year-old (DATE)
Hispanic (NORP)
2-month (DATE)
Rheumatoid Arthritis (ORG)
4 months ago (DATE)
November 2017 (DATE)
Rheumatoid Arthritis (ORG)
2 weeks (DATE)
the past two months (DATE)
2 
months ago (DATE)
about half (CARDINAL)
the last month (DATE)
almost daily (DATE)
the day (DATE)
NSAIDS (ORG)
the last two months (DATE)
the last month (DATE)
night (TIME)
almost every night (TIME)
more 
than (CARDINAL)
4 consecutive hours (TIME)
one month (DATE)
Three days ago (DATE)
that day (DATE)
last three days (DATE)
10/10 (CARDINAL)
the past 3 days (DATE)
two days (DATE)
H&P (ORG)
2020 (DATE)
the end of COP‐2 (DATE)
COP (ORG)
the past four months (DATE)
36 pound (QUANTITY)
January 2018 (DATE)
2 weeks (DATE)
seasonal (DATE)
72 (CARDINAL)
44 (DATE)
40 (DATE)
2 (CARDINAL)
27 and (DATE)
24 (CARDINAL)
25 years (DATE)
2 (CARDINAL)
daily (DATE)
Patient (PERSON)
2 (CARDINAL)
1 (CARDINAL)
4 months ago (DATE)
every 
day

In [21]:
# Step 6: Extract All Recognized Sections

# Section headings searched for in the extracted PDF text. Each one is passed to
# extract_section in turn, and the order of this list is the order in which the
# sections appear in the combined output.
section_titles = [
    "Chief Complaint", "History of Present Illness", "Past Medical History",
    "Surgical History", "Medications", "Allergies", "Family History",
    "Social History", "Review of Systems", "Physical Exam",
    "Pertinent Diagnostic Tests", "Problem List", "Summary Statement",
    "Assessment and Plan"
]

def extract_section(text, section_name, stop_titles=None):
    """Return the body of the section introduced by ``section_name:`` in ``text``.

    The heading is matched case-insensitively and literally (regex
    metacharacters in ``section_name`` are escaped). The body runs from the
    heading's colon up to the first of:

    * a known heading from ``stop_titles`` that starts a line (only when
      ``stop_titles`` is given),
    * a blank line followed by a capitalised line that contains a colon,
    * a line made up solely of upper-case letters and spaces (4+ characters),
    * the end of the text.

    Args:
        text: Full document text to search. Empty or ``None`` yields ``""``.
        section_name: Heading to look for, without the trailing colon.
        stop_titles: Optional iterable of other section headings. When given,
            the body also ends where one of them starts a line followed by a
            colon, which separates sections divided by a single newline.

    Returns:
        The stripped section body, or ``""`` when the heading is not found.
    """
    if not text or not section_name:
        return ""

    # These terminators are deliberately case-sensitive: with a global
    # IGNORECASE flag, "[A-Z ]{4,}" would match any lowercase line that
    # contains only letters and spaces and cut the section short.
    terminators = [r"\n\n[A-Z][^\n]*:", r"\n[A-Z ]{4,}\n", r"\Z"]

    known_headings = [re.escape(title) for title in (stop_titles or []) if title]
    if known_headings:
        # "(?<=\n)" covers an empty section, where the whitespace after the
        # colon has already consumed the newline in front of the next heading.
        terminators.insert(
            0,
            r"(?:\n|(?<=\n))[ \t]*(?i:" + "|".join(known_headings) + r")[ \t]*:",
        )

    # Case-insensitivity is scoped to the heading only.
    pattern = (
        r"(?i:" + re.escape(section_name) + r")\s*:\s*(.*?)(?="
        + "|".join(terminators)
        + r")"
    )
    match = re.search(pattern, text, re.DOTALL)
    return match.group(1).strip() if match else ""

# Build one "# <title>" block per heading for which extract_section returned
# text, in the order given by section_titles, and join them into one string.
section_blocks = []
for title in section_titles:
    extracted = extract_section(pdf_text, title)
    if not extracted:
        continue  # nothing was captured for this heading
    section_blocks.append("\n\n# " + title + "\n" + extracted.strip())
combined_content = "".join(section_blocks)

# Show only the first 1000 characters here.
preview_text = combined_content[:1000]
print("\n--- Combined Extracted Content (Preview) ---\n")
print(preview_text)


--- Combined Extracted Content (Preview) ---



# Chief Complaint
“I got lightheadedness and felt too weak to walk” 
Source and Setting: Patient reported in an in-patient setting on Day 2 of his hospitalization.   
History of Present Illness:  Patient is a 48 year-old well-nourished Hispanic male with a 2-month

# History of Present Illness
Patient is a 48 year-old well-nourished Hispanic male with a 2-month

# Past Medical History
- Rheumatoid Arthritis, diagnosed January 2018. Patient was diagnosed when he presented to the 
emergency room with joint pain in the hands and knees.  He was treated with corticosteroids and 
methotrexate.  The patient reported that the corticosteroids helped his symptoms significantly.  He 
only continued on the methotrexate for 2 weeks, as he did not feel it helped with his symptoms.  
- Up to date with vaccinations, including yearly influenza vaccine  
Surgical History:  
-Nasal artery cauterization and clip placement - 2011 
Medications: 
-  Ibuprofen 

In [22]:
# Print the complete combined content. The label now says "Full" because this
# cell prints everything, not just a preview.
print("\n--- Combined Extracted Content (Full) ---\n")
print(combined_content)


--- Combined Extracted Content (Preview) ---



# Chief Complaint
“I got lightheadedness and felt too weak to walk” 
Source and Setting: Patient reported in an in-patient setting on Day 2 of his hospitalization.   
History of Present Illness:  Patient is a 48 year-old well-nourished Hispanic male with a 2-month

# History of Present Illness
Patient is a 48 year-old well-nourished Hispanic male with a 2-month

# Past Medical History
- Rheumatoid Arthritis, diagnosed January 2018. Patient was diagnosed when he presented to the 
emergency room with joint pain in the hands and knees.  He was treated with corticosteroids and 
methotrexate.  The patient reported that the corticosteroids helped his symptoms significantly.  He 
only continued on the methotrexate for 2 weeks, as he did not feel it helped with his symptoms.  
- Up to date with vaccinations, including yearly influenza vaccine  
Surgical History:  
-Nasal artery cauterization and clip placement - 2011 
Medications: 
-  Ibuprofen 

In [23]:
# Step 7: Split into manageable chunks for Groq

# Maximum chunk size in CHARACTERS (textwrap counts characters, not tokens).
CHUNK_WIDTH = 7000

# replace_whitespace=False keeps the newlines inside each chunk. With the
# default, wrap() turns every newline into a space, which would flatten the
# "# <section>" structure built in the previous step.
chunks = wrap(combined_content, width=CHUNK_WIDTH, replace_whitespace=False)

# Never store credentials in the notebook. Use the key from the environment if
# it is set, otherwise ask for it interactively without echoing it.
# Any key that was ever saved in this notebook should be treated as leaked and
# revoked.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")

llm = ChatGroq(model_name="llama3-8b-8192")

# Prompt sent to the LLM for every chunk. It asks for a four-column CSV
# (Date, Category, Section, Description) and nothing else; the chunk text is
# substituted for the {note} placeholder when the chain is invoked.
prompt = PromptTemplate(
input_variables=['note'],
template="""
You are a clinical document parser.  Your job is to take the full text of an Adult History & Physical PDF (attached) and output a CSV table suitable for Excel.  The CSV must have exactly these columns:

Date,Category,Section,Description

1. Date  
   - Use full calendar dates when given (e.g. "January 8 2018")  
   - Otherwise use relative descriptors ("Past 2 months", "3 days ago", "Ongoing", "Plan")  
   - If you are using a relative descriptor, identify the first explicit calendar date in the document, then subtract the interval. For example, “Past 2 months” from January 8 2018 becomes “November 8 2017”.

2. Category  
   Must be one of:  
   - Diagnosis: a formal medical condition or disease identified by a clinician  
   - MedicalHistory: the patient’s prior or chronic health conditions and relevant past events  
   - PhysicalExam: objective findings noted during the physical examination  
   - Medication: medications the patient is currently taking or has taken in the past  
   - Allergy: documented adverse reactions to drugs, foods, or environmental substances  
   - FamilyHistory: significant health conditions present in first‑degree relatives  
   - SocialHistory: lifestyle factors such as occupation, habits, living situation, and social supports  
   - LabResult: quantitative or qualitative findings from laboratory tests  
   - Imaging: interpreted results from radiologic or other imaging studies  
   - Procedure: medical or surgical interventions performed on the patient  
   - Event: key clinical events like hospital admissions, procedures, or symptom onset  

3. Section  
   The H&P section where the item appears (e.g. “History of Present Illness”, “Past Medical History”, “Physical Exam”, etc.)

4. Description  
   A concise but complete narrative of the finding. If multiple items share the same Date + Category, combine them into one row and separate individual findings with semicolons—ensuring all relevant details are captured.

Rules:  
- Do NOT include rows with blank or "N/A" fields  
- Time stamps must be clearly mentioned in the Date column  
- If the same timestamp applies to multiple items, expand the Description field rather than repeating rows  
- Output ONLY the CSV (no commentary or extra text)  

Full Document:
{note}
"""
)

# Column names the prompt asks the model to produce, in order.
EXPECTED_COLUMNS = ["Date", "Category", "Section", "Description"]


def _clean_csv_lines(raw_csv):
    """Return the data lines of an LLM response that are valid 4-column CSV rows.

    Every line is parsed with the csv module, so commas inside quoted fields
    are handled correctly. A line is dropped when it does not parse into
    exactly ``len(EXPECTED_COLUMNS)`` fields (blank lines, commentary such as
    "Here is the output CSV table:"), when it is the header row, or when any
    field is blank or "N/A".

    Args:
        raw_csv: Raw text returned by the model for one chunk.

    Returns:
        List of the kept lines (stripped, otherwise as returned by the model).
    """
    header = [name.lower() for name in EXPECTED_COLUMNS]
    kept_lines = []
    for raw_line in raw_csv.strip().splitlines():
        line = raw_line.strip()
        if not line:
            continue
        try:
            fields = next(csv.reader([line], skipinitialspace=True))
        except (csv.Error, StopIteration):
            continue
        fields = [field.strip() for field in fields]
        if len(fields) != len(EXPECTED_COLUMNS):
            continue
        if [field.lower() for field in fields] == header:
            continue
        if any(field == "" or field.upper() == "N/A" for field in fields):
            continue
        kept_lines.append(line)
    return kept_lines


all_cleaned_rows = []

# The chain does not depend on the chunk, so build it once.
chain = prompt | llm

for i, chunk in enumerate(chunks):
    print(f"\n--- Sending Chunk {i+1}/{len(chunks)} to Groq ---")
    response = chain.invoke({"note": chunk})
    summary_csv = response.content

    # Raw lines of the current response; after the loop this holds the last chunk only.
    csv_lines = summary_csv.strip().splitlines()
    all_cleaned_rows.extend(_clean_csv_lines(summary_csv))



--- Sending Chunk 1/5 to Groq ---

--- Sending Chunk 2/5 to Groq ---

--- Sending Chunk 3/5 to Groq ---

--- Sending Chunk 4/5 to Groq ---

--- Sending Chunk 5/5 to Groq ---


In [24]:
# Inspect the raw response lines; csv_lines is reassigned for every chunk in the
# loop, so this shows the response to the last chunk only.
csv_lines

['Here is the output CSV table:',
 '',
 '"Date","Category","Section","Description"',
 '"January 8 2018","Diagnosis","History of Present Illness","Positive for fatigue, lightheadedness, headaches, enlarged non-tender lymph nodes."',
 '"January 8 2018","MedicalHistory","History of Present Illness","Positive for fatigue, lightheadedness, headaches, enlarged non-tender lymph nodes."',
 '"January 8 2018","LabResult","Pertinent Diagnostic Tests","Positive ANA (>1:640)"',
 '"January 8 2018","LabResult","Pertinent Diagnostic Tests","Positive Rheumatoid Factor (70, Normal <20)"',
 '"Past 2 months","MedicalHistory","Past Medical History","2 month history of Rheumatoid Arthritis"',
 '"Past 2 months","MedicalHistory","Past Medical History","Strong family history of autoimmune disease"',
 '"Past 2 months","MedicalHistory","Past Medical History","2 month history of joint pain and weakness"',
 '"Past month","MedicalHistory","Past Medical History","1 month history of subjective fever, and night sweats

In [25]:
# Step 8: Write the parsed rows to ans.csv, prefixing every row with a patient id

PATIENT_ID = "001"
OUTPUT_COLUMNS = ["Date", "Category", "Section", "Description"]


def _parse_quoted_csv_line(line):
    """Parse one CSV line into a list of fields; return None if it cannot be parsed.

    The csv module is used instead of eval(): the text comes from an LLM, and
    eval() would execute whatever expression a line happens to contain.
    """
    try:
        return next(csv.reader([line.strip()], skipinitialspace=True))
    except (csv.Error, StopIteration):
        return None


# Keep only lines that look like quoted CSV rows (drops commentary and blanks).
# NOTE(review): csv_lines is reassigned for every chunk in the Groq loop, so
# only the response to the last chunk is written here. Confirm this is intended.
csv_data = [line for line in csv_lines if line.strip().startswith('"')]

header_row = [name.lower() for name in OUTPUT_COLUMNS]
data_rows = []
skipped_lines = []
for line in csv_data:
    row = _parse_quoted_csv_line(line)
    if row is None or len(row) != len(OUTPUT_COLUMNS):
        skipped_lines.append(line)
        continue
    if [field.strip().lower() for field in row] == header_row:
        # Header rows are recognised by content, not position; the header is
        # written exactly once below.
        continue
    data_rows.append(row)

if skipped_lines:
    print(f"⚠️ Skipped {len(skipped_lines)} malformed line(s)")

# Write to a CSV file
with open("ans.csv", mode="w", newline='', encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Patient Id"] + OUTPUT_COLUMNS)
    writer.writerows([PATIENT_ID] + row for row in data_rows)

