In [4]:
from pdf2image import convert_from_path
import pytesseract
from PIL import Image

# Step 1: Convert PDF to image
images = convert_from_path("/workspaces/jaya/patient_10785.pdf")  # replace with your PDF file name
image = images[0]  # Assuming single-page PDF

# Step 2: Use Tesseract to extract text from the image
text = pytesseract.image_to_string(image)

print("🔍 Extracted Text:\n")
print(text)


🔍 Extracted Text:

Origin Hospital

 

Patient ID: 10785 Patient Age: 33 years
Patient Name: Nipzu GA: 43 weeks 1 day
Gender : Female BMI: 28

Examination Findings
Head : Normal skull apperance

Brain : No choroid plexus cyst seen

Heart : Normal 4 chamber view
Spine: No spina bifida
Abdominal wall: Normal
Urinary tract: Normal

Extremities: Hands and feet appear normal

Conclusion

There is no structural defects and normal flow patterns no fetal abnormalities detected in this scan



In [6]:
from pdf2image import convert_from_path
import pytesseract
import os
import json
import re

def extract_text_from_pdf(pdf_path):
    images = convert_from_path(pdf_path)
    image = images[0]  # Single-page assumption
    return pytesseract.image_to_string(image)

def parse_text(text):
    ga = re.search(r'GA:\s*(\d+)', text)
    age = re.search(r'Age:\s*(\d+)', text)
    bmi = re.search(r'BMI:\s*(\d+)', text)

    findings = []
    if "Examination Findings" in text:
        findings_section = text.split("Examination Findings", 1)[-1]
        conclusion_split = findings_section.split("Conclusion")
        lines = conclusion_split[0].strip().split("\n")
        findings += [line.split(":", 1)[-1].strip() if ":" in line else line.strip() for line in lines if line.strip()]
        if len(conclusion_split) > 1:
            findings.append(conclusion_split[1].strip())

    return {
        "patient_id": "anonymous-patient-ID",
        "gestaional_age": ga.group(1) if ga else "unknown",
        "demographic_age": age.group(1) if age else "unknown",
        "BMI": bmi.group(1) if bmi else "unknown",
        "examination_findings": findings
    }

def process_pdfs_in_folder(folder_path):
    result = {"dataResources": []}
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, filename)
            print(f"📄 Processing {filename}")
            try:
                text = extract_text_from_pdf(pdf_path)
                data = parse_text(text)
                result["dataResources"].append(data)
            except Exception as e:
                print(f"⚠️ Failed to process {filename}: {e}")
    return result

# Run it
if __name__ == "__main__":
    folder_path = "/workspaces/jaya/pdfs"
    final_data = process_pdfs_in_folder(folder_path)
    with open("result.json", "w") as f:
        json.dump(final_data, f, indent=2)
    print("✅ Done! Output saved to result.json")


📄 Processing patient_30475.pdf
📄 Processing patient_87514.pdf
📄 Processing patient_51236.pdf
📄 Processing patient_38957.pdf
📄 Processing patient_76845.pdf
📄 Processing patient_23581.pdf
📄 Processing patient_67981.pdf
📄 Processing patient_48729.pdf
📄 Processing patient_34210.pdf
📄 Processing patient_90473.pdf
📄 Processing patient_87346.pdf
📄 Processing patient_21670.pdf
📄 Processing patient_46098.pdf
📄 Processing patient_15973.pdf
📄 Processing patient_78103.pdf
📄 Processing patient_93501.pdf
📄 Processing patient_72854.pdf
📄 Processing patient_95124.pdf
📄 Processing patient_68924.pdf
📄 Processing patient_25106.pdf
📄 Processing patient_19362.pdf
📄 Processing patient_74628.pdf
📄 Processing patient_94168.pdf
📄 Processing patient_59680.pdf
📄 Processing patient_63218.pdf
📄 Processing patient_14392.pdf
📄 Processing patient_82041.pdf
📄 Processing patient_63490.pdf
📄 Processing patient_18453.pdf
📄 Processing patient_10785.pdf
📄 Processing patient_40375.pdf
📄 Processing patient_43218.pdf
📄 Proces

In [None]:
import json

# Load your existing JSON file
with open("result.json") as f:
    data = json.load(f)

# Assign unique anonymized IDs
for i, entry in enumerate(data["dataResources"], start=1):
    entry["patient_id"] = f"patient_{i:03d}"

# Save updated JSON
with open("anonymized_data.json", "w") as f:
    json.dump(data, f, indent=2)

In [11]:
import pandas as pd
import json

# Load the JSON data (assuming you have saved it as a file)
with open("anonymized_data.json") as f:
    data = json.load(f)["dataResources"]

# Convert findings list into a single string for easy tabular representation
for patient in data:
    patient["examination_findings"] = "; ".join(patient["examination_findings"])

# Create DataFrame
df = pd.DataFrame(data)

df.head()

Unnamed: 0,patient_id,gestaional_age,demographic_age,BMI,examination_findings
0,patient_001,32,31,38,Normal skull apperance; No choroid plexus cyst...
1,patient_002,25,28,36,Normal skull apperance; No choroid plexus cyst...
2,patient_003,22,25,22,Normal skull apperance; No choroid plexus cyst...
3,patient_004,33,23,33,Normal skull apperance; No choroid plexus cyst...
4,patient_005,unknown,37,35,Normal skull apperance; Choroid plexus cyst se...


In [9]:
import matplotlib.pyplot as plt
import json

# Load data
with open("anonymized_data.json") as f:
    patients = json.load(f)["dataResources"]

def safe_int(value):
    try:
        return int(value)
    except ValueError:
        return None

ages = [safe_int(p["demographic_age"]) for p in patients]
gestational_ages = [safe_int(p["gestaional_age"]) for p in patients]
bmis = [safe_int(p["BMI"]) for p in patients]

# Remove None values
ages = [a for a in ages if a is not None]
gestational_ages = [g for g in gestational_ages if g is not None]
bmis = [b for b in bmis if b is not None]

# Plot histograms
plt.figure(figsize=(12, 4))

plt.subplot(1, 3, 1)
plt.hist(ages, bins=8, color="skyblue", edgecolor="black")
plt.title("Patient Age Distribution")

plt.subplot(1, 3, 2)
plt.hist(gestational_ages, bins=8, color="lightgreen", edgecolor="black")
plt.title("Gestational Age Distribution")

plt.subplot(1, 3, 3)
plt.hist(bmis, bins=8, color="salmon", edgecolor="black")
plt.title("BMI Distribution")

plt.tight_layout()
plt.savefig("report_charts.png")
plt.close()
