In [7]:
import pandas as pd
import os

main_folder = 'MIMIC -III (10000 patients)'
no_suffix_folders = [
    'D_ICD_DIAGNOSES', 'D_ICD_PROCEDURES', 'D_ITEMS', 
    'CHARTEVENTS', 'D_LABITEMS', 'D_CPT'
]
file_suffix = '_random.csv'
all_folders = [
    'ADMISSIONS', 'PATIENTS', 'ICUSTAYS', 'TRANSFERS', 'SERVICES', 
    'D_ICD_DIAGNOSES', 'DIAGNOSES_ICD', 'D_ICD_PROCEDURES', 'PROCEDURES_ICD',
    'D_ITEMS', 'CHARTEVENTS', 'D_LABITEMS', 'LABEVENTS', 'PRESCRIPTIONS',
    'INPUTEVENTS_CV', 'INPUTEVENTS_MV', 'OUTPUTEVENTS', 'PROCEDUREEVENTS_MV',
    'MICROBIOLOGYEVENTS', 'NOTEEVENTS', 'D_CPT', 'CPTEVENTS', 'DRGCODES'
]
large_folders = ['CHARTEVENTS', 'LABEVENTS', 'NOTEEVENTS']


chunk_size = 1000000
dataframes = {}
print(f"Starting hybrid load from base folder: '{main_folder}'\n")
for folder in all_folders:
    try:
        if folder in no_suffix_folders:
            filename = f"{folder}.csv"
        else:
            filename = f"{folder}{file_suffix}"
            
        path = os.path.join(main_folder, folder, filename)
        key_name = f"df_{folder.lower()}"
        
        if folder in large_folders:
            print(f"⚙️  Loading '{filename}' in chunks...")
            chunk_iterator = pd.read_csv(path, chunksize=chunk_size, low_memory=False)
            chunk_list = [chunk for chunk in chunk_iterator]
            df = pd.concat(chunk_list, ignore_index=True)
            print(f"✅ Success: Finished loading all chunks for '{filename}'")
        else:
            df = pd.read_csv(path, low_memory=False)
            print(f"✅ Success: Loaded '{filename}'")

        dataframes[key_name] = df
        
    except FileNotFoundError:
        print(f"❌ SKIPPING: Could not find '{filename}' inside folder '{folder}'.")
    except Exception as e:
        print(f"An error occurred with folder '{folder}': {e}")
print("\n--- Load Summary ---")
print(f"Successfully loaded {len(dataframes)} out of {len(all_folders)} targeted folders.")
if 'df_patients' in dataframes:
    print("\nPreview of the Patients DataFrame:")
    display(dataframes['df_patients'].head())
df_notes = dataframes['df_noteevents']
first_note_text = df_notes.iloc[0]['TEXT']
print(first_note_text)
# Check if the 'dataframes' dictionary exists and has data in it
if 'dataframes' in locals() and dataframes:
    print(f"--- Displaying the first 4 rows of all {len(dataframes)} loaded DataFrames ---\n")
    
    # Loop through each key-value pair in the dictionary
    # The 'name' will be something like 'df_patients'
    # The 'df' will be the actual DataFrame object
    for name, df in dataframes.items():
        print("--------------------------------------------------")
        print(f"DataFrame: {name}")
        print("--------------------------------------------------")
        
        # Use display() to print the DataFrame in a nice format
        # .head(4) gets the first 4 rows
        display(df.head(4))
        print("\n") # Add a little space between tables

else:
    print("The 'dataframes' dictionary is empty or does not exist.")
    print("Please make sure you have run the data loading script successfully first.")



Starting hybrid load from base folder: 'MIMIC -III (10000 patients)'

✅ Success: Loaded 'ADMISSIONS_random.csv'
✅ Success: Loaded 'PATIENTS_random.csv'
✅ Success: Loaded 'ICUSTAYS_random.csv'
✅ Success: Loaded 'TRANSFERS_random.csv'
✅ Success: Loaded 'SERVICES_random.csv'
✅ Success: Loaded 'D_ICD_DIAGNOSES.csv'
✅ Success: Loaded 'DIAGNOSES_ICD_random.csv'
✅ Success: Loaded 'D_ICD_PROCEDURES.csv'
✅ Success: Loaded 'PROCEDURES_ICD_random.csv'
✅ Success: Loaded 'D_ITEMS.csv'
⚙️  Loading 'CHARTEVENTS.csv' in chunks...
❌ SKIPPING: Could not find 'CHARTEVENTS.csv' inside folder 'CHARTEVENTS'.
✅ Success: Loaded 'D_LABITEMS.csv'
⚙️  Loading 'LABEVENTS_random.csv' in chunks...
✅ Success: Finished loading all chunks for 'LABEVENTS_random.csv'
✅ Success: Loaded 'PRESCRIPTIONS_random.csv'
✅ Success: Loaded 'INPUTEVENTS_CV_random.csv'
✅ Success: Loaded 'INPUTEVENTS_MV_random.csv'
✅ Success: Loaded 'OUTPUTEVENTS_random.csv'
✅ Success: Loaded 'PROCEDUREEVENTS_MV_random.csv'
✅ Success: Loaded 'MICROBI

Unnamed: 0,ROW_ID,SUBJECT_ID,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG
0,8455,8934,F,2032-12-13 00:00:00,2108-12-30 00:00:00,2108-12-30 00:00:00,2108-12-30 00:00:00,1
1,13889,14685,F,1845-09-04 00:00:00,2145-09-07 00:00:00,2145-09-07 00:00:00,2145-09-07 00:00:00,1
2,15732,16630,M,2029-02-22 00:00:00,2108-05-11 00:00:00,,2108-05-11 00:00:00,1
3,18273,19348,M,1886-10-08 00:00:00,2186-11-01 00:00:00,,2186-11-01 00:00:00,1
4,17506,18524,M,2092-10-18 00:00:00,2160-02-13 00:00:00,,2160-02-13 00:00:00,1


Admission Date:  [**2118-6-2**]       Discharge Date:  [**2118-6-14**]

Date of Birth:                    Sex:  F

Service:  MICU and then to [**Doctor Last Name **] Medicine

HISTORY OF PRESENT ILLNESS:  This is an 81-year-old female
with a history of emphysema (not on home O2), who presents
with three days of shortness of breath thought by her primary
care doctor to be a COPD flare.  Two days prior to admission,
she was started on a prednisone taper and one day prior to
admission she required oxygen at home in order to maintain
oxygen saturation greater than 90%.  She has also been on
levofloxacin and nebulizers, and was not getting better, and
presented to the [**Hospital1 18**] Emergency Room.

In the [**Hospital3 **] Emergency Room, her oxygen saturation was
100% on CPAP.  She was not able to be weaned off of this
despite nebulizer treatment and Solu-Medrol 125 mg IV x2.

Review of systems is negative for the following:  Fevers,
chills, nausea, vomiting, night sweats, change in we

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1
1,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,,,BRAIN MASS,0,1
2,26,26,197661,2126-05-06 15:16:00,2126-05-13 15:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Medicare,,CATHOLIC,SINGLE,UNKNOWN/NOT SPECIFIED,,,V-TACH,0,1
3,29,30,104557,2172-10-14 14:17:00,2172-10-19 14:37:00,,URGENT,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,UNKNOWN/NOT SPECIFIED,,,UNSTABLE ANGINA\CATH,0,1




--------------------------------------------------
DataFrame: df_patients
--------------------------------------------------


Unnamed: 0,ROW_ID,SUBJECT_ID,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG
0,8455,8934,F,2032-12-13 00:00:00,2108-12-30 00:00:00,2108-12-30 00:00:00,2108-12-30 00:00:00,1
1,13889,14685,F,1845-09-04 00:00:00,2145-09-07 00:00:00,2145-09-07 00:00:00,2145-09-07 00:00:00,1
2,15732,16630,M,2029-02-22 00:00:00,2108-05-11 00:00:00,,2108-05-11 00:00:00,1
3,18273,19348,M,1886-10-08 00:00:00,2186-11-01 00:00:00,,2186-11-01 00:00:00,1




--------------------------------------------------
DataFrame: df_icustays
--------------------------------------------------


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,DBSOURCE,FIRST_CAREUNIT,LAST_CAREUNIT,FIRST_WARDID,LAST_WARDID,INTIME,OUTTIME,LOS
0,366,269,106296,206613,carevue,MICU,MICU,52,52,2170-11-05 11:05:29,2170-11-08 17:46:57,3.2788
1,369,272,164716,210407,carevue,CCU,CCU,57,57,2186-12-25 21:08:04,2186-12-27 12:01:13,1.6202
2,372,275,129886,219649,carevue,CCU,CCU,7,7,2170-10-07 11:28:53,2170-10-14 14:38:07,7.1314
3,376,280,123506,247496,carevue,NICU,NICU,56,56,2155-12-08 18:22:09,2155-12-22 17:10:45,13.9504




--------------------------------------------------
DataFrame: df_transfers
--------------------------------------------------


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,DBSOURCE,EVENTTYPE,PREV_CAREUNIT,CURR_CAREUNIT,PREV_WARDID,CURR_WARDID,INTIME,OUTTIME,LOS
0,666,112,174105,289222.0,carevue,admit,,MICU,,12.0,2194-06-13 18:41:27,2194-06-14 14:51:17,20.16
1,667,112,174105,,carevue,transfer,MICU,,12.0,3.0,2194-06-14 14:51:17,2194-06-14 15:12:56,0.36
2,668,112,174105,,carevue,transfer,,,3.0,17.0,2194-06-14 15:12:56,2194-06-18 16:57:29,97.74
3,669,112,174105,,carevue,discharge,,,17.0,,2194-06-18 16:57:29,,




--------------------------------------------------
DataFrame: df_services
--------------------------------------------------


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,TRANSFERTIME,PREV_SERVICE,CURR_SERVICE
0,760,472,173064,2172-09-28 19:22:15,,CMED
1,764,475,139351,2131-09-16 18:44:04,,NB
2,767,478,137370,2194-07-15 13:55:21,,NB
3,772,482,145066,2184-12-31 02:29:00,,CMED




--------------------------------------------------
DataFrame: df_d_icd_diagnoses
--------------------------------------------------


Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,174,1166,TB pneumonia-oth test,"Tuberculous pneumonia [any form], tubercle bac..."
1,175,1170,TB pneumothorax-unspec,"Tuberculous pneumothorax, unspecified"
2,176,1171,TB pneumothorax-no exam,"Tuberculous pneumothorax, bacteriological or h..."
3,177,1172,TB pneumothorx-exam unkn,"Tuberculous pneumothorax, bacteriological or h..."




--------------------------------------------------
DataFrame: df_diagnoses_icd
--------------------------------------------------


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,1488,112,174105,1.0,53100
1,1489,112,174105,2.0,41071
2,1490,112,174105,3.0,2859
3,1491,112,174105,4.0,41401




--------------------------------------------------
DataFrame: df_d_icd_procedures
--------------------------------------------------


Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,264,851,Canthotomy,Canthotomy
1,265,852,Blepharorrhaphy,Blepharorrhaphy
2,266,859,Adjust lid position NEC,Other adjustment of lid position
3,267,861,Lid reconst w skin graft,Reconstruction of eyelid with skin flap or graft




--------------------------------------------------
DataFrame: df_procedures_icd
--------------------------------------------------


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,945,2592,130856,1,9671
1,946,2592,130856,2,3893
2,951,28600,189217,1,3613
3,952,28600,189217,2,3615




--------------------------------------------------
DataFrame: df_d_items
--------------------------------------------------


Unnamed: 0,ROW_ID,ITEMID,LABEL,ABBREVIATION,DBSOURCE,LINKSTO,CATEGORY,UNITNAME,PARAM_TYPE,CONCEPTID
0,457,497,Patient controlled analgesia (PCA) [Inject],,carevue,chartevents,,,,
1,458,498,PCA Lockout (Min),,carevue,chartevents,,,,
2,459,499,PCA Medication,,carevue,chartevents,,,,
3,460,500,PCA Total Dose,,carevue,chartevents,,,,




--------------------------------------------------
DataFrame: df_d_labitems
--------------------------------------------------


Unnamed: 0,ROW_ID,ITEMID,LABEL,FLUID,CATEGORY,LOINC_CODE
0,546,51346,Blasts,Cerebrospinal Fluid (CSF),Hematology,26447-3
1,547,51347,Eosinophils,Cerebrospinal Fluid (CSF),Hematology,26451-5
2,548,51348,"Hematocrit, CSF",Cerebrospinal Fluid (CSF),Hematology,30398-2
3,549,51349,Hypersegmented Neutrophils,Cerebrospinal Fluid (CSF),Hematology,26506-6




--------------------------------------------------
DataFrame: df_labevents
--------------------------------------------------


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,FLAG
0,1,2,163353.0,51143,2138-07-17 20:48:00,0,0.0,%,
1,2,2,163353.0,51144,2138-07-17 20:48:00,0,0.0,%,
2,3,2,163353.0,51146,2138-07-17 20:48:00,0,0.0,%,
3,4,2,163353.0,51200,2138-07-17 20:48:00,0,0.0,%,




--------------------------------------------------
DataFrame: df_prescriptions
--------------------------------------------------


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STARTDATE,ENDDATE,DRUG_TYPE,DRUG,DRUG_NAME_POE,DRUG_NAME_GENERIC,FORMULARY_DRUG_CD,GSN,NDC,PROD_STRENGTH,DOSE_VAL_RX,DOSE_UNIT_RX,FORM_VAL_DISP,FORM_UNIT_DISP,ROUTE
0,2968761,2,163353,243653.0,2138-07-18 00:00:00,2138-07-20 00:00:00,MAIN,NEO*IV*Gentamicin,,,GENT10I,9298.0,63323020000.0,10mg/mL-2mL,15.5,mg,0.775,VIAL,IV
1,2968759,2,163353,243653.0,2138-07-18 00:00:00,2138-07-20 00:00:00,BASE,Syringe (Neonatal) *D5W*,,,NEOSYRD5W,,0.0,1 Syringe,2.4,ml,2.4,ml,IV
2,2968762,2,163353,243653.0,2138-07-18 00:00:00,2138-07-21 00:00:00,MAIN,Ampicillin Sodium,,,AMP500I,8937.0,63323040000.0,500mg Vial,500.0,mg,1.0,VIAL,IV
3,2968760,2,163353,243653.0,2138-07-18 00:00:00,2138-07-21 00:00:00,BASE,Send 500mg Vial,,,AMPVL,,0.0,Send 500mg Vial,1.0,VIAL,1.0,VIAL,IV




--------------------------------------------------
DataFrame: df_inputevents_cv
--------------------------------------------------


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,CHARTTIME,ITEMID,AMOUNT,AMOUNTUOM,RATE,RATEUOM,...,ORDERID,LINKORDERID,STOPPED,NEWBOTTLE,ORIGINALAMOUNT,ORIGINALAMOUNTUOM,ORIGINALROUTE,ORIGINALRATE,ORIGINALRATEUOM,ORIGINALSITE
0,592,24457,184834.0,205776.0,2193-09-11 09:00:00,30056,100.0,ml,,,...,756654,9359133,,,,ml,Oral,,,
1,593,24457,184834.0,205776.0,2193-09-11 12:00:00,30056,200.0,ml,,,...,3564075,9359133,,,,ml,Oral,,,
2,594,24457,184834.0,205776.0,2193-09-11 16:00:00,30056,160.0,ml,,,...,422646,9359133,,,,ml,Oral,,,
3,595,24457,184834.0,205776.0,2193-09-11 19:00:00,30056,240.0,ml,,,...,5137889,9359133,,,,ml,Oral,,,




--------------------------------------------------
DataFrame: df_inputevents_mv
--------------------------------------------------


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STARTTIME,ENDTIME,ITEMID,AMOUNT,AMOUNTUOM,RATE,...,TOTALAMOUNTUOM,ISOPENBAG,CONTINUEINNEXTDEPT,CANCELREASON,STATUSDESCRIPTION,COMMENTS_EDITEDBY,COMMENTS_CANCELEDBY,COMMENTS_DATE,ORIGINALAMOUNT,ORIGINALRATE
0,241,27063,139787,223259.0,2133-02-05 06:29:00,2133-02-05 08:45:00,225166,6.774532,mEq,,...,ml,0,0,1,Rewritten,,RN,2133-02-05 12:52:00,10.0,0.05
1,242,27063,139787,223259.0,2133-02-05 05:34:00,2133-02-05 06:30:00,225944,28.132997,ml,30.142497,...,ml,0,0,0,FinishedRunning,,,,28.132998,30.255817
2,243,27063,139787,223259.0,2133-02-05 05:34:00,2133-02-05 06:30:00,225166,2.8133,mEq,,...,ml,0,0,0,FinishedRunning,,,,2.8133,0.050426
3,244,27063,139787,223259.0,2133-02-03 12:00:00,2133-02-03 12:01:00,225893,1.0,dose,,...,ml,0,0,2,Rewritten,RN,,2133-02-03 17:06:00,1.0,1.0




--------------------------------------------------
DataFrame: df_outputevents
--------------------------------------------------


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,CHARTTIME,ITEMID,VALUE,VALUEUOM,STORETIME,CGID,STOPPED,NEWBOTTLE,ISERROR
0,1,24730,143418.0,225089.0,2144-07-09 19:15:00,40069,80.0,ml,2144-07-09 19:20:00,14990,,,
1,2,24730,143418.0,225089.0,2144-07-10 00:00:00,40069,100.0,ml,2144-07-09 23:51:00,20675,,,
2,3,24730,143418.0,225089.0,2144-07-10 02:00:00,40069,45.0,ml,2144-07-10 01:50:00,20675,,,
3,4,24730,143418.0,225089.0,2144-07-10 05:00:00,40069,45.0,ml,2144-07-10 04:58:00,20675,,,




--------------------------------------------------
DataFrame: df_procedureevents_mv
--------------------------------------------------


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STARTTIME,ENDTIME,ITEMID,VALUE,VALUEUOM,LOCATION,...,ORDERCATEGORYNAME,SECONDARYORDERCATEGORYNAME,ORDERCATEGORYDESCRIPTION,ISOPENBAG,CONTINUEINNEXTDEPT,CANCELREASON,STATUSDESCRIPTION,COMMENTS_EDITEDBY,COMMENTS_CANCELEDBY,COMMENTS_DATE
0,379,29070,115071,232563.0,2145-03-12 23:04:00,2145-03-12 23:05:00,225401,1.0,,,...,Procedures,,Electrolytes,0,0,0,FinishedRunning,,,
1,380,29070,115071,232563.0,2145-03-12 23:04:00,2145-03-12 23:05:00,225454,1.0,,,...,Procedures,,Electrolytes,0,0,0,FinishedRunning,,,
2,381,29070,115071,232563.0,2145-03-12 23:05:00,2145-03-18 20:01:00,225792,8456.0,hour,,...,Ventilation,,Task,1,0,0,FinishedRunning,,,
3,382,29070,115071,232563.0,2145-03-12 23:36:00,2145-03-12 23:37:00,225402,1.0,,,...,Procedures,,Electrolytes,0,0,0,FinishedRunning,,,




--------------------------------------------------
DataFrame: df_microbiologyevents
--------------------------------------------------


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,SPEC_ITEMID,SPEC_TYPE_DESC,ORG_ITEMID,ORG_NAME,ISOLATE_NUM,AB_ITEMID,AB_NAME,DILUTION_TEXT,DILUTION_COMPARISON,DILUTION_VALUE,INTERPRETATION
0,766,104,164025,2184-10-17 00:00:00,2184-10-17 08:15:00,70014.0,BLOOD CULTURE - NEONATE,,,,,,,,,
1,771,106,145167,2192-08-09 00:00:00,2192-08-09 21:11:00,70062.0,SPUTUM,80023.0,STAPH AUREUS COAG +,1.0,90012.0,GENTAMICIN,<=0.5,<=,1.0,S
2,772,106,145167,2192-08-09 00:00:00,2192-08-09 21:11:00,70062.0,SPUTUM,80023.0,STAPH AUREUS COAG +,1.0,90016.0,OXACILLIN,0.5,=,1.0,S
3,773,106,145167,2192-08-09 00:00:00,2192-08-09 21:11:00,70062.0,SPUTUM,80023.0,STAPH AUREUS COAG +,1.0,90006.0,ERYTHROMYCIN,=>8,=>,8.0,R




--------------------------------------------------
DataFrame: df_noteevents
--------------------------------------------------


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...
1,176,13702,167118.0,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...
2,177,13702,196489.0,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...
3,180,20646,134727.0,2112-12-10,,,Discharge summary,Report,,,Admission Date: [**2112-12-8**] ...




--------------------------------------------------
DataFrame: df_d_cpt
--------------------------------------------------


Unnamed: 0,ROW_ID,CATEGORY,SECTIONRANGE,SECTIONHEADER,SUBSECTIONRANGE,SUBSECTIONHEADER,CODESUFFIX,MINCODEINSUBSECTION,MAXCODEINSUBSECTION
0,1,1,99201-99499,Evaluation and management,99201-99216,Office/other outpatient services,,99201,99216
1,2,1,99201-99499,Evaluation and management,99217-99220,Hospital observation services,,99217,99220
2,3,1,99201-99499,Evaluation and management,99221-99239,Hospital inpatient services,,99221,99239
3,4,1,99201-99499,Evaluation and management,99241-99255,Consultations,,99241,99255




--------------------------------------------------
DataFrame: df_cptevents
--------------------------------------------------


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,COSTCENTER,CHARTDATE,CPT_CD,CPT_NUMBER,CPT_SUFFIX,TICKET_ID_SEQ,SECTIONHEADER,SUBSECTIONHEADER,DESCRIPTION
0,317,11743,129545,ICU,,99232,99232.0,,6.0,Evaluation and management,Hospital inpatient services,
1,318,11743,129545,ICU,,99232,99232.0,,7.0,Evaluation and management,Hospital inpatient services,
2,319,11743,129545,ICU,,99232,99232.0,,8.0,Evaluation and management,Hospital inpatient services,
3,320,11743,129545,ICU,,99232,99232.0,,9.0,Evaluation and management,Hospital inpatient services,




--------------------------------------------------
DataFrame: df_drgcodes
--------------------------------------------------


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,DRG_TYPE,DRG_CODE,DESCRIPTION,DRG_SEVERITY,DRG_MORTALITY
0,343,24958,162910,HCFA,110,MAJOR CARDIOVASCULAR PROCEDURES WITH COMPLICAT...,,
1,346,11113,157980,HCFA,390,NEONATE WITH OTHER SIGNIFICANT PROBLEMS,,
2,349,13933,194891,HCFA,101,OTHER RESPIRATORY SYSTEM DIAGNOSES WITH COMPLI...,,
3,353,21397,100792,HCFA,78,PULMONARY EMBOLISM,,






In [2]:
import torch
# Check if a CUDA-enabled GPU is available
if torch.cuda.is_available():
    # Get the name of the GPU
    gpu_name = torch.cuda.get_device_name(0)
    print(f"✅ Success! CUDA is available.")
    print(f"Device: {gpu_name}")
    # Set the device to the GPU for training
    device = torch.device("cuda")
else:
    print("❌ CUDA not available. The model will train on the CPU (this will be very slow).")
    device = torch.device("cpu")
# When you create or load your model, you would then move it to this device
# For example:
# model.to(device)

✅ Success! CUDA is available.
Device: NVIDIA GeForce RTX 3050 Ti Laptop GPU


In [3]:
import torch
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
print("Device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU only")

Torch version: 2.5.1+cu121
CUDA available: True
CUDA device count: 1
Device name: NVIDIA GeForce RTX 3050 Ti Laptop GPU


In [3]:
# =============================================================================
# CELL 1: Installations & Imports
# =============================================================================
# This first cell handles all necessary installations and imports.
# The '!' command runs shell commands directly from the notebook.

print("Installing required packages...")
# Using '-q' for a quieter installation
!pip install -q "fhir.resources>=7.1.0" weasyprint scispacy spacy

import os
import spacy
import scispacy # Note: scispacy must be imported after spacy
import subprocess
import sys
from datetime import date

# Import FHIR resource models and other necessary components
from fhir.resources.bundle import Bundle
from fhir.resources.patient import Patient
from fhir.resources.condition import Condition
from fhir.resources.medicationrequest import MedicationRequest
from fhir.resources.observation import Observation
from fhir.resources.practitioner import Practitioner
from fhir.resources.humanname import HumanName
# REMOVED the unused FHIRDate import that was causing the error.
from weasyprint import HTML

print("\n✅ All libraries installed and imported successfully.")

Installing required packages...

✅ All libraries installed and imported successfully.


In [4]:
# =============================================================================
# CELL 2: Configuration and Model Download Function
# =============================================================================
# This cell sets up the configuration for the spaCy model and includes a
# helper function to download it if it's not already installed.

# --- Configuration: scispaCy Model ---
# This should match the name of the model inside the tar.gz file
SCISPACY_MODEL = "en_core_sci_lg"
# UPDATED to point to your local file.
# Make sure this filename is exactly correct and in the same directory.
LOCAL_MODEL_PATH = "en_core_sci_lg-0.5.0.tar.gz"

def install_spacy_model_if_not_exists(model_name, local_path):
    """Checks if a spaCy model is installed and installs it from a local file if not."""
    try:
        spacy.load(model_name)
        print(f"✅ Model '{model_name}' is already installed.")
    except OSError:
        print(f"Model '{model_name}' not found. Attempting to install from local file: '{local_path}'...")
        if not os.path.exists(local_path):
            print(f"❌ ERROR: The file '{local_path}' was not found in the same directory as the notebook.")
            print("Please make sure the .tar.gz file is placed correctly.")
            return
            
        # In a notebook, we run pip directly to install from the local package
        !pip install -q {local_path}
        print(f"✅ Successfully installed model from '{local_path}'.")

# Run the check/install function immediately using the local file path
install_spacy_model_if_not_exists(SCISPACY_MODEL, LOCAL_MODEL_PATH)

# Load the NLP model into a global variable for other cells to use
print("\nLoading spaCy model...")
try:
    nlp = spacy.load(SCISPACY_MODEL)
    print("✅ Model loaded successfully.")
except Exception as e:
    print(f"❌ Failed to load the spaCy model. Error: {e}")
    print("This might happen if the installation failed or the model name is incorrect.")



  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


✅ Model 'en_core_sci_lg' is already installed.

Loading spaCy model...
✅ Model loaded successfully.


In [7]:
# Make sure the 'dataframes' dictionary and the 'df_noteevents' key exist
if 'dataframes' in locals() and 'df_noteevents' in dataframes:
    df_notes = dataframes['df_noteevents']
    
    # Select the first note from the DataFrame. You can change the index to select a different note.
    # For example, use iloc[10] for the eleventh note.
    if not df_notes.empty:
        target_note_index = 0
        ehr_note_text = df_notes.iloc[target_note_index]['TEXT']
        
        print(f"--- Successfully selected note at index {target_note_index} ---")
        print("\nPreview of the note text:")
        print("--------------------------------------------------")
        print(ehr_note_text[:1000] + "...") # Print the first 1000 characters
        print("--------------------------------------------------")

    else:
        print("❌ ERROR: The 'df_noteevents' DataFrame is empty.")
else:
    print("❌ ERROR: The 'dataframes' dictionary or 'df_noteevents' is not available. Please run the data loading cell first.")


--- Successfully selected note at index 0 ---

Preview of the note text:
--------------------------------------------------
Admission Date:  [**2118-6-2**]       Discharge Date:  [**2118-6-14**]

Date of Birth:                    Sex:  F

Service:  MICU and then to [**Doctor Last Name **] Medicine

HISTORY OF PRESENT ILLNESS:  This is an 81-year-old female
with a history of emphysema (not on home O2), who presents
with three days of shortness of breath thought by her primary
care doctor to be a COPD flare.  Two days prior to admission,
she was started on a prednisone taper and one day prior to
admission she required oxygen at home in order to maintain
oxygen saturation greater than 90%.  She has also been on
levofloxacin and nebulizers, and was not getting better, and
presented to the [**Hospital1 18**] Emergency Room.

In the [**Hospital3 **] Emergency Room, her oxygen saturation was
100% on CPAP.  She was not able to be weaned off of this
despite nebulizer treatment and Solu-Medrol 1

In [8]:
import re

def extract_entities_from_text(text, nlp_model):
    """
    Uses a scispaCy model for general NER and custom regex for specific details.
    """
    print("\n--- Running Step 1: Extracting Entities from Note ---")
    
    # Initialize a dictionary to hold the extracted data
    extracted = {
        "patient_name": {"given": "Unknown", "family": "Patient"},
        "gender": "unknown",
        "admission_date": None,
        "lab_results": [],
        "general_entities": {}
    }

    # --- Part A: Rule-based extraction for demographics (using regex) ---
    # Note: MIMIC-III notes are de-identified, so real names/dates are placeholders.
    # These patterns are examples for a typical EHR note format.
    
    # Example pattern for a name like "Last, First"
    name_match = re.search(r"Patient Name:\s*([A-Za-z\-]+),\s*([A-Za-z\-]+)", text, re.IGNORECASE)
    if name_match:
        extracted["patient_name"]['family'] = name_match.group(1).strip()
        extracted["patient_name"]['given'] = name_match.group(2).strip()
        print(f"Found Patient Name: {extracted['patient_name']['given']} {extracted['patient_name']['family']}")

    # Example pattern for gender
    gender_match = re.search(r"\d{1,3}-year-old\s+(female|male|woman|man)", text, re.IGNORECASE)
    if gender_match:
        gender = gender_match.group(1).lower()
        extracted["gender"] = "female" if gender in ["female", "woman"] else "male"
        print(f"Found Gender: {extracted['gender']}")

    # Example pattern for a date like "[**YYYY-MM-DD**]"
    adm_date_match = re.search(r"Admission Date:\s*\[\*\*(.*?)\*\*\]", text, re.IGNORECASE)
    if adm_date_match:
        date_str = adm_date_match.group(1).strip().replace('-', '/')
        extracted["admission_date"] = date_str
        print(f"Found Admission Date: {extracted['admission_date']}")

    # --- Part B: Model-based extraction for general medical entities ---
    doc = nlp_model(text)
    for ent in doc.ents:
        label = ent.label_
        if label not in extracted["general_entities"]:
            extracted["general_entities"][label] = []
        # Add entity only if it's not already in the list to avoid duplicates
        if ent.text.strip() not in extracted["general_entities"][label]:
            extracted["general_entities"][label].append(ent.text.strip())

    # --- Part C: Pattern-based extraction for common Lab Values ---
    lab_pattern = re.compile(r"(White count|hematocrit|platelets|Troponin|CKs|sodium|potassium|chloride|bicarbonate|BUN|creatinine)\s+of\s+([0-9]+\.?[0-9]*)", re.IGNORECASE)
    for match in lab_pattern.finditer(text):
        lab_name = match.group(1).strip().title()
        lab_value = match.group(2).strip()
        extracted["lab_results"].append({"name": lab_name, "value": lab_value})
        print(f"Found Lab Result: {lab_name} = {lab_value}")
            
    print(f"✅ Finished Extraction: Found {len(doc.ents)} total general entities.")
    return extracted


In [69]:
from fhir.resources.bundle import Bundle
from fhir.resources.patient import Patient
from fhir.resources.condition import Condition
from fhir.resources.medicationrequest import MedicationRequest
from fhir.resources.observation import Observation
from fhir.resources.humanname import HumanName
from fhir.resources.encounter import Encounter
from fhir.resources.coding import Coding
from fhir.resources.codeableconcept import CodeableConcept
import json

def structure_as_fhir_bundle(extracted_data):
    """
    Maps the detailed extracted data to FHIR resources and bundles them.
    DEFINITIVE REWRITE: This version constructs all resources as raw Python dictionaries 
    first, mirroring the final FHIR JSON structure. The entire bundle is then validated 
    in a single, robust step to bypass the persistent validation bugs in the library.
    """
    print("\n--- Running Step 2: Structuring Data into FHIR Format ---")
    
    # --- Step A: Build all resources as raw Python dictionaries ---

    # Patient Dictionary
    patient_dict = {
        "resourceType": "Patient",
        "id": "patient-1",
        "name": [{
            "family": extracted_data.get("patient_name", {}).get("family", "Unknown"),
            "given": [extracted_data.get("patient_name", {}).get("given", "Patient")]
        }],
        "gender": extracted_data.get("gender", "unknown")
    }
    
    # Encounter Dictionary
    encounter_dict = {
        "resourceType": "Encounter",
        "id": "encounter-1",
        "status": "finished",
        "subject": {"reference": f"Patient/{patient_dict['id']}"},
        # DEFINITIVE FIX: The traceback confirms the validator expects the 'class' field 
        # to be a LIST containing a coding dictionary. This resolves the error.
        "class": [{
            "system": "http://terminology.hl7.org/CodeSystem/v3-ActCode",
            "code": "IMP",
            "display": "inpatient encounter"
        }]
    }
    if extracted_data.get("admission_date"):
        try:
            parts = extracted_data["admission_date"].split('/')
            fhir_date = f"{parts[0]}-{int(parts[1]):02d}-{int(parts[2]):02d}"
            encounter_dict["period"] = {"start": fhir_date}
        except (IndexError, ValueError) as e:
            print(f"⚠️ Warning: Could not parse admission date '{extracted_data.get('admission_date')}'. Error: {e}")

    all_resource_dicts = [patient_dict, encounter_dict]
    resource_counter = 0

    # Condition Dictionaries
    entities = extracted_data.get("general_entities", {})
    disease_keys = ['DISEASE', 'ENTITY'] 
    for key in disease_keys:
        if key in entities:
            for disease_name in entities[key]:
                condition_dict = {
                    "resourceType": "Condition",
                    "id": f"condition-{resource_counter}",
                    "subject": {"reference": f"Patient/{patient_dict['id']}"},
                    "encounter": {"reference": f"Encounter/{encounter_dict['id']}"},
                    "code": {"text": disease_name},
                    "clinicalStatus": { "coding": [{"system": "http://terminology.hl7.org/CodeSystem/condition-clinical", "code": "active"}] }
                }
                all_resource_dicts.append(condition_dict)
                resource_counter += 1

    # MedicationRequest Dictionaries
    if 'CHEMICAL' in entities:
        for drug_name in entities['CHEMICAL']:
            med_req_dict = {
                "resourceType": "MedicationRequest",
                "id": f"medreq-{resource_counter}",
                "subject": {"reference": f"Patient/{patient_dict['id']}"},
                "encounter": {"reference": f"Encounter/{encounter_dict['id']}"},
                "medicationCodeableConcept": {"text": drug_name},
                "status": "active",
                "intent": "order"
            }
            all_resource_dicts.append(med_req_dict)
            resource_counter += 1
            
    # Observation Dictionaries
    for lab in extracted_data.get("lab_results", []):
        try:
            observation_dict = {
                "resourceType": "Observation",
                "id": f"obs-{resource_counter}",
                "status": "final",
                "subject": {"reference": f"Patient/{patient_dict['id']}"},
                "encounter": {"reference": f"Encounter/{encounter_dict['id']}"},
                "code": {"text": lab.get("name")},
                "valueQuantity": {"value": float(lab.get("value"))}
            }
            all_resource_dicts.append(observation_dict)
            resource_counter += 1
        except (ValueError, TypeError) as e:
            print(f"⚠️ Warning: Skipping lab result due to invalid value: {lab}. Error: {e}")

    # --- Step B: Assemble the final bundle dictionary ---
    bundle_entries = [
        {"fullUrl": f"urn:uuid:{res['id']}", "resource": res} for res in all_resource_dicts
    ]
    final_bundle_dict = {
        "resourceType": "Bundle",
        "type": "collection",
        "entry": bundle_entries
    }

    # --- Step C: Validate the entire structure in one go and return the object ---
    try:
        fhir_bundle = Bundle.model_validate(final_bundle_dict)
        print(f"✅ FHIR Bundle created and validated with {len(all_resource_dicts)} resources.")
        return fhir_bundle
    except Exception as e:
        print("❌ A validation error occurred even with the new approach.")
        print("--- Generated Bundle Dictionary (for debugging) ---")
        print(json.dumps(final_bundle_dict, indent=2))
        print("--- Validation Error ---")
        raise e



In [70]:
from weasyprint import HTML
from datetime import date

def generate_pdf_report(bundle, output_filename="FHIR_Clinical_Report.pdf"):
    """
    Generates a PDF report from a FHIR Bundle using an HTML template.
    """
    print("\n--- Running Step 3: Generating PDF Report ---")
    
    data = {
        "patient_name": "N/A", "patient_gender": "N/A", "admission_date": "N/A",
        "report_date": date.today().strftime("%B %d, %Y"),
        "conditions": [], "medications": [], "lab_results": []
    }

    for entry in bundle.entry:
        resource = entry.get('resource', {})
        resource_type = resource.get('resourceType')
        
        if resource_type == 'Patient':
            if 'name' in resource and resource['name']:
                name_data = resource['name'][0]
                data['patient_name'] = f"{' '.join(name_data.get('given', []))} {name_data.get('family', '')}"
            if 'gender' in resource:
                data['patient_gender'] = resource.get('gender', 'N/A').capitalize()
                
        elif resource_type == 'Encounter':
            if 'period' in resource and 'start' in resource['period']:
                data['admission_date'] = resource['period']['start']

        elif resource_type == 'Condition':
            data['conditions'].append(resource.get('code', {}).get('text', 'N/A'))
                
        elif resource_type == 'MedicationRequest':
            data['medications'].append(resource.get('medicationCodeableConcept', {}).get('text', 'N/A'))

        elif resource_type == 'Observation':
            lab_name = resource.get('code', {}).get('text', 'N/A')
            lab_value = resource.get('valueQuantity', {}).get('value', 'N/A')
            data['lab_results'].append(f"{lab_name}: {lab_value}")

    # Use an f-string to create the HTML content
    html_template = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <title>FHIR Clinical Report</title>
        <style>
            body {{ font-family: sans-serif; margin: 2em; color: #333; }}
            h1 {{ color: #0b3d91; border-bottom: 2px solid #0b3d91; padding-bottom: 5px;}}
            h2 {{ color: #0b3d91; border-bottom: 1px solid #ccc; padding-bottom: 3px; margin-top: 25px;}}
            .header {{ margin-bottom: 30px; }} .section {{ margin-bottom: 20px; }}
            .info-grid {{ display: grid; grid-template-columns: 1fr 1fr; gap: 0px 20px;}}
            .info-grid p {{ margin: 5px 0; }}
            ul {{ list-style-type: disc; padding-left: 20px; }}
            li {{ margin-bottom: 8px; font-size: 1.1em; }}
            .footer {{ position: fixed; bottom: 0; width: 100%; text-align: center; font-size: 0.8em; color: #888;}}
        </style>
    </head>
    <body>
        <div class="header">
            <h1>Clinical Summary Report</h1>
            <div class="info-grid">
                <div><p><strong>Patient:</strong> {data['patient_name']}</p></div>
                <div><p><strong>Gender:</strong> {data['patient_gender']}</p></div>
                <div><p><strong>Admission Date:</strong> {data['admission_date']}</p></div>
                <div><p><strong>Report Date:</strong> {data['report_date']}</p></div>
            </div>
        </div>
        <div class="section">
            <h2>Conditions & Diagnoses</h2>
            <ul>{''.join(f'<li>{c}</li>' for c in sorted(set(data["conditions"]))) or '<li>No conditions identified.</li>'}</ul>
        </div>
        <div class="section">
            <h2>Medications</h2>
            <ul>{''.join(f'<li>{m}</li>' for m in sorted(set(data["medications"]))) or '<li>No medications identified.</li>'}</ul>
        </div>
        <div class="section">
            <h2>Laboratory Results</h2>
            <ul>{''.join(f'<li>{l}</li>' for l in data['lab_results']) or '<li>No lab results identified.</li>'}</ul>
        </div>
        <div class="footer">Generated by Clinical NLP Pipeline at {date.today().strftime('%Y-%m-%d')}</div>
    </body>
    </html>
    """
    
    HTML(string=html_template).write_pdf(output_filename)
    print(f"✅ Successfully created PDF report: '{output_filename}'")


In [71]:
# --- Run the Full Pipeline ---

# Ensure the note text was loaded successfully in the first cell
if 'ehr_note_text' in locals() and ehr_note_text and 'nlp' in locals():
    print("🚀 Starting EHR to FHIR Processing Pipeline...")

    # Step 1: Extract entities from the selected note
    extracted_data = extract_entities_from_text(ehr_note_text, nlp)

    # Step 2: Structure the extracted data into a FHIR Bundle
    fhir_bundle = structure_as_fhir_bundle(extracted_data)

    # Optional: Save the FHIR JSON bundle to a file for inspection or debugging
    output_json_path = "fhir_bundle_output.json"
    with open(output_json_path, "w") as f:
        f.write(fhir_bundle.json(indent=2))
        print(f"ℹ️  FHIR Bundle saved to '{output_json_path}' for review.")

    # Step 3: Generate the final PDF report from the bundle
    output_pdf_path = "Patient_Clinical_Report.pdf"
    generate_pdf_report(fhir_bundle, output_pdf_path)

    print(f"\n🎉 Pipeline finished successfully! Check for the generated files:")
    print(f"   - {output_json_path}")
    print(f"   - {output_pdf_path}")
else:
    print("❌ Cannot run pipeline. Please ensure the note selection cell (Cell 1) and model loading ran successfully.")


🚀 Starting EHR to FHIR Processing Pipeline...

--- Running Step 1: Extracting Entities from Note ---
Found Gender: female
Found Admission Date: 2118/6/2
✅ Finished Extraction: Found 585 total general entities.

--- Running Step 2: Structuring Data into FHIR Format ---
❌ A validation error occurred even with the new approach.
--- Generated Bundle Dictionary (for debugging) ---
{
  "resourceType": "Bundle",
  "type": "collection",
  "entry": [
    {
      "fullUrl": "urn:uuid:patient-1",
      "resource": {
        "resourceType": "Patient",
        "id": "patient-1",
        "name": [
          {
            "family": "Patient",
            "given": [
              "Unknown"
            ]
          }
        ],
        "gender": "female"
      }
    },
    {
      "fullUrl": "urn:uuid:encounter-1",
      "resource": {
        "resourceType": "Encounter",
        "id": "encounter-1",
        "status": "finished",
        "subject": {
          "reference": "Patient/patient-1"
        },


ValidationError: 4 validation errors for Bundle
entry.1.resource.class.0.system
  Extra inputs are not permitted [type=extra_forbidden, input_value='http://terminology.hl7.org/CodeSystem/v3-ActCode', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/extra_forbidden
entry.1.resource.class.0.code
  Extra inputs are not permitted [type=extra_forbidden, input_value='IMP', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/extra_forbidden
entry.1.resource.class.0.display
  Extra inputs are not permitted [type=extra_forbidden, input_value='inpatient encounter', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/extra_forbidden
entry.1.resource.period
  Extra inputs are not permitted [type=extra_forbidden, input_value={'start': '2118-06-02'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/extra_forbidden

## ------------------------------------USing BERT------------------------------------------------

In [2]:
# =============================================================================
# CELL 1: Installations & Imports (BERT Version)
# =============================================================================
print("Installing required packages for BERT, FHIR, and PDF generation...")
# Using '-q' for a quieter installation
%pip install -q "transformers>=4.21.0" "torch>=1.9.0" "sentencepiece"
%pip install -q "fhir.resources>=7.1.0" weasyprint ipywidgets

import os
import re
import json
from datetime import date

# Import the Hugging Face pipeline for easy model inference
from transformers import pipeline

# Import FHIR resource models
from fhir.resources.bundle import Bundle
from fhir.resources.patient import Patient
from fhir.resources.condition import Condition
from fhir.resources.medicationrequest import MedicationRequest
from fhir.resources.observation import Observation
from fhir.resources.encounter import Encounter

# Import for PDF generation
from weasyprint import HTML

print("\n✅ All libraries installed and imported successfully.")



Installing required packages for BERT, FHIR, and PDF generation...

✅ All libraries installed and imported successfully.


In [16]:
# =============================================================================
# CELL 2: Load BioBERT NER Model
# =============================================================================
print("Loading BioBERT NER model from Hugging Face...")
print("This may take a few minutes on the first run as the model is downloaded.")

try:
    # Use a pre-trained Biomedical NER model.
    # "aggregation_strategy='simple'" groups word pieces (e.g., "hyper" and "##tension") into a single entity.
    ner_pipeline = pipeline(
        "ner", 
        model="d4data/biomedical-ner-all", 
        aggregation_strategy="simple"
    )
    print("✅ BioBERT NER model loaded successfully.")
    
except Exception as e:
    print(f"❌ Failed to load the BioBERT model. Error: {e}")
    ner_pipeline = None


Loading BioBERT NER model from Hugging Face...
This may take a few minutes on the first run as the model is downloaded.


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 1d8907bc-ff17-4411-9be6-b81706603bf0)')' thrown while requesting HEAD https://huggingface.co/d4data/biomedical-ner-all/resolve/main/config.json
Retrying in 1s [Retry 1/5].
Device set to use cuda:0


✅ BioBERT NER model loaded successfully.


In [12]:
# =============================================================================
# CELL 3: Extraction & FHIR Conversion (Fixed for Strings)
# =============================================================================
import re
from datetime import datetime

def safe_search(pattern, text, group=1):
    """Helper function to safely perform a regex search and return None if not found."""
    match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
    if match:
        return match.group(group).strip()
    return None

def extract_entities_from_any_report(report_text, ner_pipeline=None):
    """
    Extract patient info, lab results, conditions, medications, and entities using BioBERT.
    Returns a dictionary compatible with structure_as_fhir_bundle().
    """
    print("\n--- Running Step 1: Extracting Entities from Report (BERT) ---")

    extracted_data = {
        "patient_name": None,
        "gender": None,
        "admission_date": None,
        "general_entities": {},
        "lab_results": []
    }

    if "Sunrise Multispeciality Hospital" in report_text:
        print("Detected 'Sunrise Hospital' format.")
        # --- Extract Demographics ---
        extracted_data["patient_name"] = safe_search(r"Name:\s*(.*?)(?:\s*Patient ID:|\n)", report_text)
        extracted_data["gender"] = safe_search(r"Gender:\s*(.*?)(?:\s*Age:|\n)", report_text).lower()
        extracted_data["admission_date"] = safe_search(r"Sample Collected:\s*(.*?)(?:\s*Reported On:|\n)", report_text)

        # --- Lab Results ---
        results_section = safe_search(r"Laboratory Results:\s*(.*?)(?:Conditions & Diagnoses|\Z)", report_text)
        if results_section:
            lab_pattern = re.compile(r"([^:]+):\s*([\d.]+)\s*([a-zA-Z/µLdL%]+)\s*\(([\d.\s–-]+)\)")
            for line in results_section.strip().split('\n'):
                match = lab_pattern.match(line.strip())
                if match:
                    ref_range = [val.strip() for val in match.group(4).split('–')]
                    extracted_data["lab_results"].append({
                        "name": match.group(1).strip(),
                        "value": float(match.group(2)),
                        "unit": match.group(3).strip(),
                        "low": float(ref_range[0]),
                        "high": float(ref_range[1])
                    })

        # --- Conditions ---
        cond_section = safe_search(r"Conditions & Diagnoses \(from Interpretation\):\s*(.*?)(?:Medications|\Z)", report_text)
        if cond_section and "none identified" not in cond_section.lower():
            extracted_data["general_entities"]["Disease_disorder"] = [c.strip() for c in cond_section.split(',')]

        # --- Medications ---
        med_section = safe_search(r"Medications \(from Interpretation\):\s*(.*?)(?:Interpretation|\Z)", report_text)
        if med_section and "none identified" not in med_section.lower():
            extracted_data["general_entities"]["Drug_or_compound"] = [m.strip() for m in med_section.split(',')]

    else:
        print("⚠️ Warning: Unknown report format. Skipping extraction.")
        return extracted_data

    # --- Optional: Extract entities from Interpretation using BioBERT ---
    if ner_pipeline:
        interp_text = safe_search(r"Interpretation:\s*(.*)", report_text)
        if interp_text:
            ner_results = ner_pipeline(interp_text)
            for res in ner_results:
                label = res['entity_group']
                if label not in extracted_data["general_entities"]:
                    extracted_data["general_entities"][label] = []
                extracted_data["general_entities"][label].append(res['word'])
            print(f"✅ Finished Extraction: Found {len(ner_results)} entities in the interpretation.")
        else:
            print("✅ Finished Extraction: No interpretation section found.")
    else:
        print("✅ Finished Extraction: BioBERT pipeline not provided, skipped NER.")

    print(f"Found Patient Name: {extracted_data['patient_name']} "
          f"Gender: {extracted_data['gender']} "
          f"Report Date: {extracted_data['admission_date']}")

    return extracted_data

In [17]:
def structure_as_fhir_bundle(extracted_data):
    """
    Maps extracted data to FHIR resources by building a pure Python dictionary
    that mirrors the FHIR JSON structure.
    """
    print("\n--- Running Step 2: Structuring Data into FHIR Format (JSON-First) ---")
    
    # Extract patient name parts
    full_name = extracted_data.get("patient_name", "Unknown Patient")
    name_parts = full_name.split()
    given_names = name_parts[:-1] if len(name_parts) > 1 else name_parts
    family_name = name_parts[-1] if len(name_parts) > 1 else "Patient"
    
    # Patient resource
    patient_dict = {
        "resourceType": "Patient",
        "id": extracted_data.get("patient_id", "patient-1"),
        "name": [{"family": family_name, "given": given_names}],
        "gender": extracted_data.get("gender", "unknown")
    }
    
    # Encounter resource
    encounter_dict = {
        "resourceType": "Encounter",
        "id": "encounter-1",
        "status": "finished",
        "subject": {"reference": f"Patient/{patient_dict['id']}"},
        "class": [{"system": "http://terminology.hl7.org/CodeSystem/v3-ActCode",
                   "code": "IMP", "display": "inpatient encounter"}]
    }
    if extracted_data.get("admission_date"):
        try:
            parts = extracted_data["admission_date"].split('-')
            fhir_date = f"{parts[0]}-{int(parts[1]):02d}-{int(parts[2]):02d}"
            encounter_dict["period"] = {"start": fhir_date}
        except (IndexError, ValueError) as e:
            print(f"⚠️ Warning: Could not parse admission date '{extracted_data.get('admission_date')}'. Error: {e}")

    # Aggregate all resources
    all_resource_dicts = [patient_dict, encounter_dict]
    resource_counter = 0
    
    # Add Conditions
    disease_keys = ['Disease_disorder', 'Symptom', 'Biological_structure']
    chemical_keys = ['Chemical', 'Drug_or_compound']
    entities = extracted_data.get("general_entities", {})
    
    for key in disease_keys:
        if key in entities:
            for name in entities[key]:
                condition_dict = {
                    "resourceType": "Condition",
                    "id": f"condition-{resource_counter}",
                    "subject": {"reference": f"Patient/{patient_dict['id']}"},
                    "encounter": {"reference": f"Encounter/{encounter_dict['id']}"},
                    "code": {"text": name},
                    "clinicalStatus": { "coding": [{"system": "http://terminology.hl7.org/CodeSystem/condition-clinical", "code": "active"}]}
                }
                all_resource_dicts.append(condition_dict)
                resource_counter += 1

    # Add Medications
    for key in chemical_keys:
        if key in entities:
            for name in entities[key]:
                med_req_dict = {
                    "resourceType": "MedicationRequest",
                    "id": f"medreq-{resource_counter}",
                    "subject": {"reference": f"Patient/{patient_dict['id']}"},
                    "encounter": {"reference": f"Encounter/{encounter_dict['id']}"},
                    "medicationCodeableConcept": {"text": name},
                    "status": "active", "intent": "order"
                }
                all_resource_dicts.append(med_req_dict)
                resource_counter += 1
            
    # Add Lab Observations
    for lab in extracted_data.get("lab_results", []):
        try:
            observation_dict = {
                "resourceType": "Observation",
                "id": f"obs-{resource_counter}",
                "status": "final",
                "subject": {"reference": f"Patient/{patient_dict['id']}"},
                "encounter": {"reference": f"Encounter/{encounter_dict['id']}"},
                "code": {"text": lab.get("name")},
                "valueQuantity": {"value": float(lab.get("value"))}
            }
            all_resource_dicts.append(observation_dict)
            resource_counter += 1
        except (ValueError, TypeError) as e:
            print(f"⚠️ Warning: Skipping lab result due to invalid value: {lab}. Error: {e}")

    # Build final FHIR Bundle
    bundle_entries = [{"fullUrl": f"urn:uuid:{res['id']}", "resource": res} for res in all_resource_dicts]
    final_bundle_dict = {"resourceType": "Bundle", "type": "collection", "entry": bundle_entries}
    
    print(f"✅ FHIR Bundle dictionary created with {len(all_resource_dicts)} resources.")
    return final_bundle_dict


In [19]:
# =============================================================================
# CELL 5: PDF Generation & Main Execution (Updated for Multi-Format)
# =============================================================================
from weasyprint import HTML
from datetime import date
import json

def generate_pdf_report_from_dict(bundle_dict, output_filename="FHIR_Report_BERT_Combined.pdf"):
    """
    Generates a PDF report from a FHIR Bundle dictionary.
    """
    print("\n--- Running Step 3: Generating Final PDF Report ---")
    
    data = { "patient_name": "N/A", "patient_gender": "N/A", "admission_date": "N/A",
             "report_date": date.today().strftime("%B %d, %Y"),
             "conditions": [], "medications": [], "lab_results": [] }
    
    if not bundle_dict or not bundle_dict.get('entry'):
        print("❌ Bundle dictionary is empty, cannot generate report.")
        return

    # Find the first patient and encounter to extract demographic data
    for entry in bundle_dict.get('entry', []):
        resource = entry.get('resource', {})
        if resource.get('resourceType') == 'Patient':
            name_data = resource.get('name', [{}])[0]
            data['patient_name'] = f"{' '.join(name_data.get('given', []))} {name_data.get('family', '')}"
            data['patient_gender'] = resource.get('gender', 'N/A').capitalize()
            break 
            
    for entry in bundle_dict.get('entry', []):
        resource = entry.get('resource', {})
        if resource.get('resourceType') == 'Encounter':
             period = resource.get('period', {})
             data['admission_date'] = period.get('start', 'N/A')
             break

    # Aggregate all conditions, medications, and labs from the entire bundle
    for entry in bundle_dict['entry']:
        resource = entry.get('resource', {})
        resource_type = resource.get('resourceType')
        
        if resource_type == 'Condition':
            code = resource.get('code', {})
            if code.get('text'):
                data['conditions'].append(code['text'])
        elif resource_type == 'MedicationRequest':
            med_code = resource.get('medicationCodeableConcept', {})
            if med_code.get('text'):
                data['medications'].append(med_code['text'])
        elif resource_type == 'Observation':
            lab_name = resource.get('code', {}).get('text', 'N/A')
            lab_value = resource.get('valueQuantity', {}).get('value', 'N/A')
            data['lab_results'].append(f"{lab_name}: {lab_value}")

    html_template = f"""
    <!DOCTYPE html><html><head><title>FHIR Clinical Report</title>
    <style> body {{ font-family: sans-serif; margin: 2em; }} h1 {{ color: #333; }}
    h2 {{ color: #557; border-bottom: 1px solid #ccc; }} .info-grid {{ display: grid; grid-template-columns: 1fr 1fr; }}
    li {{ margin-bottom: 8px; }} </style></head><body>
    <h1>Consolidated Clinical Summary Report (Generated by BioBERT)</h1>
    <div class="info-grid">
        <div><p><strong>Patient:</strong> {data['patient_name']}</p></div>
        <div><p><strong>Gender:</strong> {data['patient_gender']}</p></div>
        <div><p><strong>First Report Date Found:</strong> {data['admission_date']}</p></div>
        <div><p><strong>Report Date:</strong> {data['report_date']}</p></div>
    </div>
    <h2>Conditions & Diagnoses (from Interpretation)</h2>
    <ul>{''.join(f'<li>{c}</li>' for c in sorted(list(set(data["conditions"])))) or '<li>None identified.</li>'}</ul>
    <h2>Medications (from Interpretation)</h2>
    <ul>{''.join(f'<li>{m}</li>' for m in sorted(list(set(data["medications"])))) or '<li>None identified.</li>'}</ul>
    <h2>Laboratory Results</h2>
    <ul>{''.join(f'<li>{l}</li>' for l in sorted(list(set(data["lab_results"])))) or '<li>None identified.</li>'}</ul>
    </body></html>
    """
    
    HTML(string=html_template).write_pdf(output_filename)
    print(f"✅ Successfully created combined PDF report: '{output_filename}'")


# =============================================================================
# MAIN EXECUTION (FOR YOUR CUSTOM REPORTS)
# =============================================================================
print("🚀 Starting Custom Report Processing Pipeline (BERT Version)...")

if 'ner_pipeline' in locals() and ner_pipeline:
    
    # --- CONFIGURATION: PASTE YOUR REPORTS HERE ---
    my_reports = [
        """🏥 Sunrise Multispeciality Hospital

Patient Clinical Laboratory Report

Name: Aditya veer Singh
Patient ID: 0459
Gender: Male
Age: 34 years
Date of Birth: 2006-04-11
Address: Sikariganj, Gorakhpur, UP, 273211
Phone: +91-98230-12345

Report ID: LAB-BIO-98732
Department: Biochemistry
Test Name: Liver Function Test (LFT)
Sample Type: Serum
Sample Collected: 2025-10-21 08:35 AM
Reported On: 2025-10-21 10:20 AM
Referring Doctor: Dr. Kavita Rao (MD Pathology)

Laboratory Results:

Total Bilirubin: 0.8 mg/dL (0.2 – 1.2)

Direct Bilirubin: 0.2 mg/dL (0.0 – 0.3)

Indirect Bilirubin: 0.6 mg/dL (0.1 – 1.0)

SGOT (AST): 28 U/L (15 – 40)

SGPT (ALT): 32 U/L (10 – 45)

Alkaline Phosphatase: 94 U/L (40 – 120)

Total Protein: 7.1 g/dL (6.4 – 8.3)

Albumin: 4.3 g/dL (3.5 – 5.0)

Globulin: 2.8 g/dL (2.0 – 3.5)

A/G Ratio: 1.5 (1.0 – 2.0)

Conditions & Diagnoses (from Interpretation):

Mild hepatic dysfunction

Fatty liver (NAFLD suspected)

Medications (from Interpretation):

Ursodeoxycholic Acid 300 mg, once daily

Vitamin E 400 IU, once daily

Interpretation:

Liver function parameters are mostly within normal limits. Mild hepatic dysfunction noted. Clinical correlation recommended.
        """,
    ]
    print(f"Found {len(my_reports)} custom reports to process.\n")

    # This dictionary will store the combined data from all reports
    aggregated_data = {
        "patient_name": None, "gender": None, "admission_date": None,
        "general_entities": {}, "lab_results": []
    }

    # --- Step 1: Loop through each report and aggregate extracted data ---
    for index, report_text in enumerate(my_reports):
        print(f"--- Processing Report {index+1}/{len(my_reports)} ---")
        # DEFINITIVE FIX: Call the correct new function for the new report format
        extracted_data = extract_entities_from_any_report(report_text, ner_pipeline)

        # On the first note, capture the patient's demographic info
        if index == 0:
            aggregated_data["patient_name"] = extracted_data.get("patient_name")
            aggregated_data["gender"] = extracted_data.get("gender")
            aggregated_data["admission_date"] = extracted_data.get("admission_date")

        # Append entities and labs from the current report to the aggregate lists
        for key, value in extracted_data.get("general_entities", {}).items():
            if key not in aggregated_data["general_entities"]:
                aggregated_data["general_entities"][key] = []
            aggregated_data["general_entities"][key].extend(value)
        
        aggregated_data["lab_results"].extend(extracted_data.get("lab_results", []))

    print("\n✅ Finished processing all reports.")

    # --- Step 2: Structure the single, aggregated data into one FHIR Bundle ---
    fhir_bundle_dict = structure_as_fhir_bundle(aggregated_data)

    # --- Step 3: Generate one consolidated PDF report ---
    if fhir_bundle_dict:
        generate_pdf_report_from_dict(fhir_bundle_dict)
        
        # Optional: Save the final, combined FHIR JSON bundle
        output_json_path = "my_fhir_bundle_combined.json"
        with open(output_json_path, "w") as f:
            json.dump(fhir_bundle_dict, f, indent=2)
            print(f"ℹ️  Combined FHIR Bundle dictionary saved to '{output_json_path}'")
            
        print(f"\n🎉 Batch pipeline finished successfully!")
    else:
        print("\n❌ Pipeline stopped: The final FHIR Bundle dictionary could not be created.")
else:
    print("\n❌ ERROR: BioBERT model ('ner_pipeline') failed to initialize.")
    print("Please ensure the model loading cell has run successfully.")



🚀 Starting Custom Report Processing Pipeline (BERT Version)...
Found 1 custom reports to process.

--- Processing Report 1/1 ---

--- Running Step 1: Extracting Entities from Report (BERT) ---
Detected 'Sunrise Hospital' format.
✅ Finished Extraction: Found 6 entities in the interpretation.
Found Patient Name: Aditya veer Singh Gender: male Report Date: 2025-10-21 08:35 AM

✅ Finished processing all reports.

--- Running Step 2: Structuring Data into FHIR Format (JSON-First) ---
✅ FHIR Bundle dictionary created with 17 resources.

--- Running Step 3: Generating Final PDF Report ---
✅ Successfully created combined PDF report: 'FHIR_Report_BERT_Combined.pdf'
ℹ️  Combined FHIR Bundle dictionary saved to 'my_fhir_bundle_combined.json'

🎉 Batch pipeline finished successfully!


In [166]:
# =============================================================================
# CELL: BioBERT Model Evaluation on Custom Reports
# =============================================================================
import re
from sklearn.metrics import precision_score, recall_score, f1_score

print("🚀 Starting BioBERT Evaluation on Custom Reports...")

# --- CONFIGURATION: PASTE YOUR REPORTS AND GROUND TRUTH HERE ---
# Example: one report
reports = [
    """🏥 Sunrise Multispeciality Hospital
Patient Clinical Laboratory Report
Name: Aditya Singh
Patient ID: 0459
Gender: Male
Age: 34 years
Date of Birth: 2006-04-11
Address: Sikariganj, Gorakhpur, UP, 273211
Phone: +91-98230-12345
Report ID: LAB-BIO-98732
Department: Biochemistry
Test Name: Liver Function Test (LFT)
Sample Type: Serum
Sample Collected: 2025-10-21 08:35 AM
Reported On: 2025-10-21 10:20 AM
Referring Doctor: Dr. Kavita Rao (MD Pathology)
Laboratory Results:
Total Bilirubin: 0.8 mg/dL (0.2 – 1.2)
Direct Bilirubin: 0.2 mg/dL (0.0 – 0.3)
Indirect Bilirubin: 0.6 mg/dL (0.1 – 1.0)
SGOT (AST): 28 U/L (15 – 40)
SGPT (ALT): 32 U/L (10 – 45)
Alkaline Phosphatase: 94 U/L (40 – 120)
Total Protein: 7.1 g/dL (6.4 – 8.3)
Albumin: 4.3 g/dL (3.5 – 5.0)
Globulin: 2.8 g/dL (2.0 – 3.5)
A/G Ratio: 1.5 (1.0 – 2.0)
Conditions & Diagnoses (from Interpretation):
Mild hepatic dysfunction
Fatty liver (NAFLD suspected)
Medications (from Interpretation):
Ursodeoxycholic Acid 300 mg, once daily
Vitamin E 400 IU, once daily
Interpretation:
Liver function parameters are mostly within normal limits. Mild hepatic dysfunction noted. Clinical correlation recommended."""
]

# Example ground truth entities for evaluation: list of (start, end, label)
ground_truths = [
    [(34, 45, "Patient"), (271, 295, "Disease_disorder"), (296, 315, "Disease_disorder"),
     (316, 346, "Drug_or_compound"), (347, 356, "Drug_or_compound")]
]

# --- LABEL MAPPING: Map BioBERT model output to your desired categories ---
label_map = {
    "Medication": "Drug_or_compound",
    "Dosage": "Drug_or_compound",
    "Severity": "Disease_disorder",
    "Biological_structure": "Disease_disorder",
    "Diagnostic_procedure": "Procedure",
    # Add more mappings if necessary
}

# --- EVALUATION ---
all_precisions = []
all_recalls = []
all_f1s = []

for i, report in enumerate(reports):
    print(f"\n--- Evaluating Report {i+1} ---")
    
    # Run BioBERT NER
    predictions = ner_pipeline(report)  # list of dicts with start, end, entity_group
    print("Predicted Entities:")
    for p in predictions:
        print(f"({p['start']}, {p['end']}, '{p['entity_group']}')")

    # Map labels to match ground truth categories
    pred_tuples = [(p['start'], p['end'], label_map.get(p['entity_group'], p['entity_group'])) for p in predictions]
    
    # Flatten for evaluation: only labels per character index
    max_len = len(report)
    y_true = ["O"] * max_len
    y_pred = ["O"] * max_len
    
    # Apply ground truth labels
    for (start, end, label) in ground_truths[i]:
        for idx in range(start, end):
            y_true[idx] = label
    
    # Apply predicted labels
    for (start, end, label) in pred_tuples:
        for idx in range(start, end):
            y_pred[idx] = label
    
    # Compute precision, recall, f1
    precision = precision_score(y_true, y_pred, average='micro', zero_division=0)
    recall = recall_score(y_true, y_pred, average='micro', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='micro', zero_division=0)
    
    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-score: {f1:.2f}")
    
    all_precisions.append(precision)
    all_recalls.append(recall)
    all_f1s.append(f1)

# --- AGGREGATED METRICS ---
avg_precision = sum(all_precisions) / len(all_precisions)
avg_recall = sum(all_recalls) / len(all_recalls)
avg_f1 = sum(all_f1s) / len(all_f1s)

print("\n=== Aggregated Metrics Across All Reports ===")
print(f"Average Precision: {avg_precision:.2f}")
print(f"Average Recall: {avg_recall:.2f}")
print(f"Average F1-score: {avg_f1:.2f}")


🚀 Starting BioBERT Evaluation on Custom Reports...

--- Evaluating Report 1 ---
Predicted Entities:
(0, 1, 'Detailed_description')
(2, 9, 'Detailed_description')
(52, 62, 'Diagnostic_procedure')
(76, 79, 'Diagnostic_procedure')
(83, 88, 'Diagnostic_procedure')
(101, 103, 'Detailed_description')
(103, 105, 'Lab_value')
(106, 112, 'Diagnostic_procedure')
(114, 118, 'Age')
(119, 122, 'Diagnostic_procedure')
(124, 126, 'Date')
(127, 132, 'Age')
(148, 158, 'Date')
(174, 178, 'Detailed_description')
(184, 189, 'Detailed_description')
(195, 201, 'Lab_value')
(202, 207, 'Diagnostic_procedure')
(218, 219, 'Time')
(262, 279, 'Diagnostic_procedure')
(286, 305, 'Diagnostic_procedure')
(325, 337, 'Diagnostic_procedure')
(349, 352, 'Detailed_description')
(352, 359, 'Time')
(360, 368, 'Time')
(385, 392, 'Time')
(393, 401, 'Time')
(470, 485, 'Diagnostic_procedure')
(487, 496, 'Lab_value')
(509, 525, 'Diagnostic_procedure')
(527, 536, 'Lab_value')
(549, 557, 'Diagnostic_procedure')
(558, 567, 'Diagnos

## ----------hybrid(tesserat+easy)-OCR → BioBERT → FHIR pipeline--------------

In [1]:
# CELL 1: Minimal Setup with Essential Dependencies
print("--- CELL 1: Essential Setup ---")

# Restart and clear runtime if needed
import IPython
# Uncomment this line if you need to restart runtime first time around
# IPython.Application.instance().kernel.do_shutdown(True)

# Install only minimal essential packages
!pip install -q torch torchvision
!pip install -q transformers pdf2image Pillow
!pip install -q fhir.resources

# Check GPU availability
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

# Basic imports that don't depend on problematic packages
import os
import re
import json
import logging
import time
from datetime import datetime
from PIL import Image
from pdf2image import convert_from_path
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Configure Logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

print("✅ CELL 1 Complete: Essential setup finished.")

--- CELL 1: Essential Setup ---


  import pynvml  # type: ignore[import]


PyTorch version: 2.7.1+cu118
CUDA available: False
✅ CELL 1 Complete: Essential setup finished.


In [4]:
# CELL 2: Enhanced Medical Document OCR Setup
print("\n--- CELL 2: Setting up Enhanced OCR for Medical Documents ---")

# Install necessary packages (uncomment if not installed)
# !pip install easyocr pytesseract opencv-python-headless

import cv2
import numpy as np
import os
from PIL import Image, ImageEnhance

# For Windows users, set the path to Tesseract executable if needed
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Ensure temp folder exists
temp_folder = "temp"
os.makedirs(temp_folder, exist_ok=True)

def enhance_image(img_path, output_path=None):
    """Enhance image for better OCR results"""
    # Read image
    img = cv2.imread(img_path)

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Apply adaptive threshold
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                  cv2.THRESH_BINARY, 11, 2)

    # Noise removal (optional)
    kernel = np.ones((1, 1), np.uint8)
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel)

    if output_path:
        cv2.imwrite(output_path, opening)
        return output_path
    return opening

def run_medical_ocr(image_path):
    """Multi-engine OCR approach optimized for medical documents"""
    print(f"Processing image: {image_path}")

    # Create enhanced version of the image
    enhanced_path = os.path.join(temp_folder, "enhanced_medical_image.png")
    enhance_image(image_path, enhanced_path)

    # 1. EasyOCR
    try:
        print("Running EasyOCR...")
        import easyocr
        reader = easyocr.Reader(['en'])
        easy_result = reader.readtext(enhanced_path, detail=0, paragraph=True)
        easy_text = "\n".join(easy_result)
        print(f"EasyOCR extracted {len(easy_text)} characters")
    except Exception as e:
        print(f"EasyOCR failed: {e}")
        easy_text = ""

    # 2. Tesseract OCR
    try:
        print("Running Tesseract OCR...")
        custom_config = r'--oem 3 --psm 6 -l eng'
        tess_text = pytesseract.image_to_string(Image.open(enhanced_path), config=custom_config)
        print(f"Tesseract extracted {len(tess_text)} characters")
    except Exception as e:
        print(f"Tesseract failed: {e}")
        tess_text = ""

    # 3. Alternate Tesseract mode
    try:
        custom_config2 = r'--oem 3 --psm 4 -l eng'
        tess_text2 = pytesseract.image_to_string(Image.open(image_path), config=custom_config2)
    except:
        tess_text2 = ""

    # Combine results
    all_texts = [easy_text, tess_text, tess_text2]
    all_texts.sort(key=lambda x: len(x), reverse=True)
    combined_text = "\n".join(all_texts)

    # Save OCR results
    results_path = os.path.join(temp_folder, "all_ocr_results.txt")
    with open(results_path, "w", encoding="utf-8") as f:
        f.write("=== EASY OCR ===\n")
        f.write(easy_text)
        f.write("\n\n=== TESSERACT OCR 1 ===\n")
        f.write(tess_text)
        f.write("\n\n=== TESSERACT OCR 2 ===\n")
        f.write(tess_text2)

    print(f"OCR results saved to: {results_path}")
    return combined_text

print("✅ CELL 2 Complete: Enhanced Medical OCR setup finished.")



--- CELL 2: Setting up Enhanced OCR for Medical Documents ---
✅ CELL 2 Complete: Enhanced Medical OCR setup finished.


In [5]:
%pip install transformers torch

Note: you may need to restart the kernel to use updated packages.


In [6]:
# CELL 3: BioBERT NER Setup (IMPROVED)
print("\n--- CELL 3: Setting up BioBERT for Medical Entity Recognition ---")

# Install necessary packages (uncomment if not installed)
# !pip install transformers==4.31.0 torch
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline


# Load BioBERT model for medical NER
try:
    print("Loading BioBERT model for medical entity recognition...")

    # Use a specialized biomedical NER model
    model_name = "d4data/biomedical-ner-all"  # Comprehensive biomedical NER model

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    print("Loading model...")
    model = AutoModelForTokenClassification.from_pretrained(model_name)

    # Create NER pipeline with GPU acceleration if available
    device = 0 if torch.cuda.is_available() else -1
    print(f"Creating NER pipeline (device: {'GPU' if device==0 else 'CPU'})...")
    biobert_ner = pipeline("ner", model=model, tokenizer=tokenizer, device=device)

    # Get the list of entity labels this model can detect
    id2label = model.config.id2label
    entity_types = set(label.replace("B-", "").replace("I-", "") 
                       for label in id2label.values() if label not in ["O", "X", "PAD"])

    print(f"BioBERT loaded successfully! Model can detect these entity types: {', '.join(sorted(entity_types))}")

except Exception as e:
    print(f"❌ Error loading BioBERT model: {e}")
    biobert_ner = None

print("✅ CELL 3 Complete: BioBERT setup finished.")



--- CELL 3: Setting up BioBERT for Medical Entity Recognition ---
Loading BioBERT model for medical entity recognition...
Loading tokenizer...
Loading model...


Device set to use cpu


Creating NER pipeline (device: CPU)...
BioBERT loaded successfully! Model can detect these entity types: Activity, Administration, Age, Area, Biological_attribute, Biological_structure, Clinical_event, Color, Coreference, Date, Detailed_description, Diagnostic_procedure, Disease_disorder, Distance, Dosage, Duration, Family_history, Frequency, Height, History, Lab_value, Mass, Medication, Non[biological](Detailed_description, Nonbiological_location, Occupation, Other_entity, Other_event, Outcome, Personal_[back](Biological_structure, Personal_background, Qualitative_concept, Quantitative_concept, Severity, Sex, Shape, Sign_symptom, Subject, Texture, Therapeutic_procedure, Time, Volume, Weight
✅ CELL 3 Complete: BioBERT setup finished.


In [5]:
%pip uninstall torch -y

Found existing installation: torch 2.5.1+cu121
Uninstalling torch-2.5.1+cu121:
  Successfully uninstalled torch-2.5.1+cu121
Note: you may need to restart the kernel to use updated packages.


You can safely remove it manually.


In [None]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [7]:
import torch
print(f"Is CUDA (GPU) available?  {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")

Is CUDA (GPU) available?  True
GPU Name: NVIDIA GeForce RTX 3050 Ti Laptop GPU


In [5]:
# CELL 4 (UNIFIED): Fine-Tuned Medical Document Extraction System (V3)
print("\n--- CELL 4: Defining Fine-Tuned Medical Document Extraction System (V3) ---")

import re
import json
import torch
from transformers import pipeline

def detect_document_type(text):
    """Determine document type based on content patterns"""
    if re.search(r"DISCHARGE\s+MEDICAL\s+REPORT|DISCHARGE\s+SUMMARY", text, re.IGNORECASE):
        if re.search(r"MAIN\s+COMPLAINT|DATE\s+OF\s+BIRTH|EXPLANATION\s+IN\s+DETAILS", text, re.IGNORECASE):
            return "discharge_summary"

    if re.search(r"COVID-19|PCR|TEST\s+NAME|RESULT|MOLECULAR\s+DIAGNOSTIC", text, re.IGNORECASE):
        if re.search(r"DETECTED|NOT\s+DETECTED|POSITIVE|NEGATIVE", text, re.IGNORECASE):
            return "lab_report"

    if re.search(r"HOSPITAL\s+COURSE|ICU|MEDICAL\s+REPORT", text, re.IGNORECASE):
        if re.search(r"DIAGNOSIS|MEDICATION|TREATMENT|ADMISSION", text, re.IGNORECASE):
            return "clinical_report"

    return "general_medical"

def extract_clinical_report_data(report_text, ner_pipeline=None):
    """Extract data from clinical/hospital reports"""
    print("Processing Clinical/Hospital Report...")

    extracted = {
        "document_type": "clinical_report",
        "patient_info": {},
        "report_meta": {},
        "conditions": [],
        "medications": [],
        "vitals": [],
        "investigations": [],
        "lab_results": [],
        "interpretation_text": ""
    }

    # --- Patient info with multiple patterns ---
    patient_patterns = {
        'name': [r"PATIENT\s*NAME\s*:?\s*([A-Za-z\s.-]+)(?:\n|$)",
                 r"NAME\s*:?\s*([A-Za-z\s.-]+)(?:\n|,|$)"],
        'id': [r"MRN\s*:?\s*([0-9]+)",
               r"MEDICAL RECORD\s*:?\s*([A-Za-z0-9-]+)"],
        'gender': [r"SEX\s*:?\s*([A-Za-z]+)",
                   r"GENDER\s*:?\s*([A-Za-z]+)",
                   r"(?:SEX|GENDER)\s*:?\s*(female|male|F|M)",
                   r"female\s+patient"],
        'age': [r"AGE\s*:?\s*(\d+\s*[YyOo]/[OoYy])",
                r"AGE\s*:?\s*(\d+\s*years?)",
                r"(\d+)\s*[Yy]/[Oo]"],
        'birthDate': [r"(?:DOB|Date of Birth)\s*:?\s*(\d{2}/\d{2}/\d{4})"], # FINE-TUNE V3: Specific format
        'nationality': [r"NATIONALITY\s*:?\s*([\w\s]+)"]
    }

    # Try each pattern for each field
    for field, patterns in patient_patterns.items():
        for pattern in patterns:
            value = safe_search(pattern, report_text)
            if value:
                value = re.sub(r'[^\w\s/.-]', '', value).strip()
                if field == 'gender' and pattern == r"female\s+patient" and value:
                    extracted["patient_info"][field] = "female"
                else:
                    extracted["patient_info"][field] = value
                break

    # --- Report Meta ---
    report_patterns = {
        'id': [r"REPORT ID\s*:?\s*([A-Za-z0-9-]+)"],
        'date': [r"DATE\s*:?\s*([\d/.-]+)"],
        'examination_date': [r"EXAMINATION DATE\s*:?\s*([\d/.-]+)"],
        'hospital': [r"(Blue\s+Island\s+Medical\s+Clinic)", # FINE-TUNE V3: Be specific
                     r"(.*HOSPITAL.*|.*CLINIC.*|.*MEDICAL CENTER.*)"],
        'doctor': [r"(?:DOCTOR|PHYSICIAN|DR\.)\s*:?\s*([A-Za-z\s.-]+)"]
    }

    for field, patterns in report_patterns.items():
        for pattern in patterns:
            value = safe_search(pattern, report_text)
            if value:
                value = re.sub(r'[^\w\s/.-]', '', value).strip()
                extracted["report_meta"][field] = value
                break

    # --- Extract vitals with precise patterns (FINE-TUNED V3) ---
    vital_patterns = {
        'BP': [r"(?:BP)\s*=\s*(\d+/\d+)"],
        'HR': [r"(?:HR)\s*=\s*(\d+\s*x/m[ni]t)"],
        'RR': [r"(?:RR)\s*=\s*(\d+\s*x/m[ni]t)"],
        # FINE-TUNE V3: Allow comma OR dot for decimal
        'TEMP': [r"(?:T)\s*=\s*([3-4][0-9][.,]\d)"],
        'GCS': [r"(?:GCS)\s*:\s*([A-Z0-9V]+)"],
    }

    for vital, patterns in vital_patterns.items():
        for pattern in patterns:
            # FINE-TUNE V3: Search the whole report_text, not a subsection
            matches = re.findall(pattern, report_text, re.IGNORECASE)
            for value in matches:
                # Use a set to avoid duplicate vitals
                if "vitals_set" not in extracted: extracted["vitals_set"] = set()
                if (vital, value) not in extracted["vitals_set"]:
                    extracted["vitals"].append({"name": vital, "value": value})
                    extracted["vitals_set"].add((vital, value))

    # --- DIAGNOSES extraction ---
    diagnosis_section = extract_section(report_text, ["DIAGNOSIS", "DIAGNOSES", "ASSESSMENT"],
                                        ["HOSPITAL COURSE", "PLAN", "TREATMENT"])
    if diagnosis_section:
        for line in diagnosis_section.split('\n'):
            line = line.strip()
            if line and len(line) > 3:
                clean_line = re.sub(r'^[-–•]+\s*', '', line).strip()
                if clean_line:
                    extracted["conditions"].append(clean_line)

    common_diagnoses = [
        "STROKE", "HEMORRHAGE", "INFARCT", "PNEUMONIA",
        "HYPERTENSION", "HTN", "DM", "DIABETES", "SEPSIS"
    ]

    for diagnosis in common_diagnoses:
        if re.search(rf'\b{re.escape(diagnosis)}\b', report_text, re.IGNORECASE):
            if not any(re.search(re.escape(diagnosis), cond, re.IGNORECASE) for cond in extracted["conditions"]):
                extracted["conditions"].append(diagnosis)

    # --- Extract medications (FINE-TUNED V3) ---
    # We will rely *only* on NER for medications, as the rule-based
    # method was conflicting with NER.

    # --- BIOBERT NER for additional entities ---
    if ner_pipeline:
        print("Running BioBERT NER for additional entities...")
        try:
            text_for_ner = report_text[:10000]
            ner_results = ner_pipeline(text_for_ner)
            biobert_entities = process_biobert_results(ner_results)

            for entity in biobert_entities:
                if entity["score"] < 0.7:
                    continue

                entity_type = entity["type"]
                entity_text = entity["text"]

                # FINE-TUNE V3: Increase length filter to avoid 'tam', 'para'
                if not entity_text or len(entity_text.strip()) < 5:
                    continue

                if entity_type in ["DISEASE", "PROBLEM", "Disease_disorder"]:
                    if entity_text.lower() not in [c.lower() for c in extracted["conditions"]]:
                        extracted["conditions"].append(entity_text)

                elif entity_type in ["DRUG", "CHEMICAL", "Medication"]:
                    if entity_text.lower() not in [m.lower() for m in extracted["medications"]]:
                        extracted["medications"].append(entity_text)

        except Exception as e:
            print(f"Error during BioBERT NER: {e}")

    # --- Extract hospital course as interpretation text ---
    hospital_course = extract_section(report_text, ["HOSPITAL COURSE", "COURSE", "SUMMARY"],
                                      ["END", "SIGNATURE", "PHYSICIAN"])
    if hospital_course:
        extracted["interpretation_text"] = hospital_course.strip()

    # --- Extract CT scan or other investigations ---
    ct_scan_match = re.search(r'CT\s+(?:brain|scan).*?(?:hemorrhage|infarct)',
                              report_text, re.IGNORECASE | re.DOTALL)
    if ct_scan_match:
        extracted["investigations"].append({
            "type": "CT Brain",
            "finding": ct_scan_match.group(0).strip()
        })
    
    # Clean up vitals set
    if "vitals_set" in extracted:
        del extracted["vitals_set"]

    return extracted

def extract_lab_report_data(report_text, ner_pipeline=None):
    """Extract data from laboratory test reports"""
    print("Processing Laboratory Test Report...")

    extracted = {
        "document_type": "lab_report",
        "patient_info": {},
        "lab_info": {},
        "sample_info": {},
        "test_results": []
    }

    # --- Extract patient information ---
    patient_patterns = {
        'name': [r"(?:Patient|Name)\s*:?\s*([^\n:]+)(?=\n|$|:)",
                 r"Name\s*:\s*([^\n]+)"],
        'age': [r"Age\s*:?\s*(\d+)\s*(?:Y|Yr|Yrs|Years)",
                r"Age\s*:?\s*(\d+)"],
        'gender': [r"(?:Gender|Sex)\s*:?\s*(Male|Female|M|F)",
                   r"Sex\s*:\s*([^\n]+)"],
        'id': [r"(?:ID|Patient ID|P\.?\s*ID)\s*\.?\s*(?:No\.?)?:?\s*([A-Za-z0-9]+)",
               r"P\. ID No\.\s*:\s*([^\n]+)"],
        'dob': [r"(?:DOB|Date of Birth)\s*:?\s*(\d{2}/\d{2}/\d{4})"] # FINE-TUNE V3: Specific format
    }

    for field, patterns in patient_patterns.items():
        for pattern in patterns:
            match = re.search(pattern, report_text, re.IGNORECASE)
            if match:
                extracted["patient_info"][field] = match.group(1).strip()
                break

    # --- Lab information ---
    lab_patterns = {
        'name': [r"(?:Lab|Laboratory)\s*:?\s*([^\n]+)(?=\n|$)",
                 r"(?:Processed by|Reported by)\s*:?\s*([^\n]+)(?=\n|$)"],
        'address': [r"(?:Address|Location)\s*:?\s*([^\n]+)(?=\n|$)"],
        'report_id': [r"(?:Report ID|Accession No|Accession Number)\s*:?\s*([A-Za-z0-9]+)"]
    }

    for field, patterns in lab_patterns.items():
        for pattern in patterns:
            match = re.search(pattern, report_text, re.IGNORECASE)
            if match:
                extracted["lab_info"][field] = match.group(1).strip()
                break

    # --- Sample information ---
    sample_patterns = {
        'collected_date': [r"(?:Sample|Specimen)\s*Collected\s*(?:on|date)\s*:?\s*([\d/.-]+\s*[\d:]+\s*(?:AM|PM)?)",
                           r"(?:Sample|Specimen)\s*Collected\s*(?:on|date)\s*:?\s*([\d/.-]+)"],
        'received_date': [r"(?:Sample|Specimen)\s*Received\s*(?:on|date)\s*:?\s*([\d/.-]+\s*[\d:]+\s*(?:AM|PM)?)",
                          r"(?:Sample|Specimen)\s*Received\s*(?:on|date)\s*:?\s*([\d/.-]+)"],
        'reported_date': [r"(?:Report|Result)\s*(?:Released|Reported|Date)\s*(?:on|date)\s*:?\s*([\d/.-]+\s*[\d:]+\s*(?:AM|PM)?)",
                          r"(?:Report|Result)\s*(?:Released|Reported|Date)\s*(?:on|date)\s*:?\s*([\d/.-]+)"]
    }

    for field, patterns in sample_patterns.items():
        for pattern in patterns:
            match = re.search(pattern, report_text, re.IGNORECASE)
            if match:
                extracted["sample_info"][field] = match.group(1).strip()
                break

    # --- Test Results - COVID-19 specific ---
    covid_patterns = [
        r"(?:COVID-19|SARS-CoV-2).*?(?:Result|Interpretation)\s*:?\s*(?:is)?\s*(Positive|Negative|Detected|Not\s*Detected)",
        r"(?:COVID|SARS).{0,30}(Detected|Not\s*Detected|Positive|Negative)",
        r"(Detected|Not\s*Detected|Positive|Negative).{0,30}(?:COVID|SARS)"
    ]

    for pattern in covid_patterns:
        match = re.search(pattern, report_text, re.IGNORECASE)
        if match:
            extracted["test_results"].append({
                "test_name": "COVID-19 RT-PCR",
                "result": match.group(1).strip(),
                "reference_range": "Not Detected",
                "interpretation": "RNA specific to SARS-CoV-2"
            })
            break

    # --- General Test Results ---
    result_section = extract_section(report_text,
                                    ["TEST NAME", "TEST RESULT", "RESULT", "MOLECULAR DIAGNOSTIC"],
                                    ["NOTE", "DISCLAIMER", "SIGNATURE", "END"])

    if result_section:
        test_rows = re.findall(r"([A-Za-z\s\-0-9]+)\s+(?::|]\s+)((?:Positive|Negative|Detected|Not Detected|Reactive|Non-Reactive)[^\n]*)",
                               result_section, re.IGNORECASE)

        for test_name, result in test_rows:
            if not any(test["test_name"].lower() == test_name.lower().strip() for test in extracted["test_results"]):
                extracted["test_results"].append({
                    "test_name": test_name.strip(),
                    "result": result.strip(),
                    "reference_range": "N/A"
                })

    if not extracted["test_results"] and re.search(r"COVID|SARS|PCR|Molecular", report_text, re.IGNORECASE):
        result = "Unknown"
        if re.search(r"Positive|Detected", report_text, re.IGNORECASE):
            result = "Detected/Positive"
        elif re.search(r"Negative|Not Detected", report_text, re.IGNORECASE):
            result = "Not Detected/Negative"

        extracted["test_results"].append({
            "test_name": "COVID-19 Test",
            "result": result,
            "reference_range": "Not Detected",
            "method": "RT-PCR"
        })

    return extracted

def extract_discharge_summary_data(report_text, ner_pipeline=None):
    """Extract data from discharge summaries (FINE-TUNED V3)"""
    print("Processing Discharge Summary Report...")

    extracted = {
        "document_type": "discharge_summary",
        "patient_info": {},
        "facility_info": {},
        "admission_info": {},
        "discharge_info": {},
        "diagnoses": [],
        "medications": [],
        "vitals": [],
        "physical_exam": {},
        "complaints": [],
        "treatments": [],
        "follow_up": "",
        "clinical_course": ""
    }

    # --- Extract patient information (FINE-TUNED V3) ---
    patient_patterns = {
        'name': [r"Full Name\s*:?\s*([A-Za-z\s]+?)(?=\s*Sex|\s*Nationality|\n)"],
        'gender': [r"Sex\s*:?\s*([A-Za-z]+?)(?=\s*Nationality|\n)"],
        # FINE-TUNE V3: Use a specific regex for DD/MM/YYYY format
        'dob': [r"(?:DOB|Date of Birth)\s*:?\s*(\d{2}/\d{2}/\d{4})"],
        'age': [r"Age\s*:?\s*(\d+)"],
        'nationality': [r"Nationality\s*:?\s*([A-Za-z]+?)(?=\n|$)"],
        'address': [r"Address in \w+\s*:?\s*([^\n:]+)(?=\n|$)"]
    }

    for field, patterns in patient_patterns.items():
        for pattern in patterns:
            match = re.search(pattern, report_text, re.IGNORECASE)
            if match:
                extracted["patient_info"][field] = match.group(1).strip()
                break

    # --- Facility information (FINE-TUNED V3) ---
    # FINE-TUNE V3: Search for the specific clinic name anywhere in the document
    facility_match = re.search(r"(Blue\s+Island\s+Medical\s+Clinic)", report_text, re.IGNORECASE)
    if facility_match:
        extracted["facility_info"]["name"] = facility_match.group(1).strip()

    # --- Extract main complaint ---
    complaint_patterns = [
        r"Main Complaint\s*:?\s*([^\n]+)(?=\n|$)",
        r"Presenting Complaint\s*:?\s*([^\n]+)(?=\n|$)"
    ]

    for pattern in complaint_patterns:
        match = re.search(pattern, report_text, re.IGNORECASE)
        if match:
            extracted["complaints"].append(match.group(1).strip())
            break

    # --- Extract vitals (FINE-TUNED V3) ---
    vital_patterns = {
        # Match "BP = 110/70"
        'BP': [r"(?:BP)\s*=\s*(\d+/\d+)"],
        # Match "HR = 96x/mnt"
        'HR': [r"(?:HR)\s*=\s*(\d+\s*x/m[ni]t)"],
        # Match "RR = 24 x/mnt"
        'RR': [r"(?:RR)\s*=\s*(\d+\s*x/m[ni]t)"],
        # Match "T = 36,2" (allow comma or dot)
        'TEMP': [r"(?:T)\s*=\s*([3-4][0-9][.,]\d)"],
        # Match "GCS : E4VSM6"
        'GCS': [r"(?:GCS)\s*:\s*([A-Z0-9V]+)"],
    }
    
    # FINE-TUNE V3: Search the *entire text* for vitals, not just one line.
    for vital, patterns in vital_patterns.items():
        for pattern in patterns:
            matches = re.findall(pattern, report_text, re.IGNORECASE)
            for value in matches:
                # Use a set to avoid duplicate vitals
                if "vitals_set" not in extracted: extracted["vitals_set"] = set()
                if (vital, value) not in extracted["vitals_set"]:
                    extracted["vitals"].append({"name": vital, "value": value})
                    extracted["vitals_set"].add((vital, value))

    # --- Extract clinical course ---
    course_section = extract_section(report_text,
                                     ["Explanation in Details", "Clinical Course", "Hospital Course"],
                                     ["Past Medical", "Physical Examination", "Discharge"])

    if course_section:
        extracted["clinical_course"] = course_section.strip()

    # --- Extract medications from text (FINE-TUNED V3) ---
    # We will rely *only* on NER to avoid duplicates and fragments.
    
    # --- Use BioBERT for entity extraction if available ---
    if ner_pipeline: # This block will now handle all medications
        try:
            # Use the clinical course section, which is rich in entity names
            text_for_ner = course_section if course_section else report_text
            
            ner_results = ner_pipeline(text_for_ner)
            biobert_entities = process_biobert_results(ner_results)

            for entity in biobert_entities:
                if entity["score"] < 0.7:
                    continue

                entity_type = entity["type"]
                entity_text = entity["text"]

                # FINE-TUNE V3: Increase length filter to avoid 'tam', 'para'
                if not entity_text or len(entity_text.strip()) < 5:
                    continue

                if entity_type in ["DISEASE", "PROBLEM", "Disease_disorder"]:
                    if entity_text.lower() not in [d.lower() for d in extracted["diagnoses"]]:
                        extracted["diagnoses"].append(entity_text)

                elif entity_type in ["DRUG", "CHEMICAL", "Medication"]:
                    if entity_text.lower() not in [m.lower() for m in extracted["medications"]]:
                        extracted["medications"].append(entity_text)
        except Exception as e:
            print(f"Error during BioBERT NER for discharge summary: {e}")

    # --- Extract physical examination (FINE-TUNED) ---
    exam_section = extract_section(report_text,
                                   ["Physical Examination"],
                                   ["Past Medical History", "Past Traveling History"])

    if exam_section:
        # FINE-TUNE: Use a more robust regex to capture multi-line values
        system_matches = re.findall(r"(\w+)\s*:\s*([^:\n]+(?:\n(?!\s*[A-Za-z\s]+:)[^:\n]+)*)", exam_section, re.DOTALL)
        
        for system_name, system_finding in system_matches:
            # Skip the vitals we already extracted
            if system_name.upper() in ['BP', 'HR', 'RR', 'T', 'GCS']:
                continue
            
            clean_finding = re.sub(r'\s+', ' ', system_finding).strip()
            
            # Handle nested findings like in Thorax
            if system_name == "Thorax":
                 nested_matches = re.findall(r'(\w+)\s*:\s*([^:]+)(?=\s*\w+:|$)', clean_finding, re.DOTALL)
                 if nested_matches:
                     for sub_name, sub_finding in nested_matches:
                         extracted["physical_exam"][sub_name.strip()] = sub_finding.strip()
                 else:
                     extracted["physical_exam"][system_name] = clean_finding
            else:
                extracted["physical_exam"][system_name] = clean_finding

    # Clean up vitals set
    if "vitals_set" in extracted:
        del extracted["vitals_set"]

    return extracted

def extract_medical_data(report_text, ner_pipeline=None):
    """Main function to detect document type and extract data accordingly"""
    document_type = detect_document_type(report_text)
    print(f"Detected document type: {document_type}")

    if document_type == "clinical_report":
        return extract_clinical_report_data(report_text, ner_pipeline)
    elif document_type == "lab_report":
        return extract_lab_report_data(report_text, ner_pipeline)
    elif document_type == "discharge_summary":
        return extract_discharge_summary_data(report_text, ner_pipeline)
    else:
        # For unknown document types, try clinical report extraction as default
        print("Using general medical extraction for unknown document type")
        return extract_clinical_report_data(report_text, ner_pipeline)

def process_biobert_results(ner_results):
    """Process BioBERT NER results to extract complete entities"""
    entities = []
    current_entity = {"text": "", "type": None, "score": 0}

    for token in ner_results:
        # For B- tokens (beginning of entity)
        if token["entity"].startswith("B-"):
            if current_entity["type"] and current_entity["text"]:
                entities.append(current_entity.copy())

            entity_type = token["entity"].replace("B-", "")
            current_entity = {
                "text": token["word"].replace("##", ""),
                "type": entity_type,
                "score": token["score"]
            }

        # For I- tokens (inside entity)
        elif token["entity"].startswith("I-"):
            entity_type = token["entity"].replace("I-", "")
            if current_entity["type"] == entity_type:
                current_entity["text"] += token["word"].replace("##", "")
                current_entity["score"] = (current_entity["score"] + token["score"]) / 2

        # For O tokens or different entity types, save and reset
        elif current_entity["type"] and current_entity["text"]:
            entities.append(current_entity.copy())
            current_entity = {"text": "", "type": None, "score": 0}

    # Add final entity if it exists
    if current_entity["type"] and current_entity["text"]:
        entities.append(current_entity)

    # Clean entity text
    for entity in entities:
        entity["text"] = entity["text"].strip()

    return entities

def extract_section(text, start_headers, end_headers):
    """Extract a section from text based on start and end headers"""
    if not text:
        return None

    start_pattern = '|'.join(re.escape(header) for header in start_headers)
    end_pattern = '|'.join(re.escape(header) for header in end_headers)

    pattern = rf"(?:{start_pattern})[^a-zA-Z]*(.*?)(?=(?:{end_pattern})|$)"
    match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)

    if match:
        return match.group(1).strip()

    return None

def safe_search(pattern, text, group=1):
    """Helper function to safely perform regex search."""
    if not text:
        return None

    match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
    if match and len(match.groups()) >= group:
        result = match.group(group).strip()
        return result if result else None
    return None

print("✅ CELL 4 Complete: Fine-Tuned (V3) Medical Document Extraction System defined.")


--- CELL 4: Defining Fine-Tuned Medical Document Extraction System (V3) ---
✅ CELL 4 Complete: Fine-Tuned (V3) Medical Document Extraction System defined.


In [1]:
%pip install ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [None]:
# --- CELL 5: File Upload & Medical Document Processing (VS Code Friendly - ENHANCED) ---

print("\n--- CELL 5: File Upload & Medical Document Processing (ENHANCED) ---")

# Install required packages (if not installed)
%pip install -q reportlab pdf2image pillow ipywidgets python-dateutil

# --- Imports ---
import os
import json
from datetime import datetime
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
from reportlab.lib.units import inch
from pdf2image import convert_from_path
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML
import traceback

# --- Utility Functions ---
def get_timestamp():
    """Generate current timestamp in ISO format"""
    return datetime.utcnow().isoformat() + "Z"

def get_readable_date():
    """Generate human-readable date and time"""
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S UTC")

def log_info(message, output_area=None):
    """Log informational messages"""
    timestamp = get_readable_date()
    log_msg = f"[{timestamp}] ℹ️ {message}"
    print(log_msg)
    if output_area:
        with output_area:
            print(log_msg)

def log_success(message, output_area=None):
    """Log success messages"""
    timestamp = get_readable_date()
    log_msg = f"[{timestamp}] ✅ {message}"
    print(log_msg)
    if output_area:
        with output_area:
            print(log_msg)

def log_error(message, error=None, output_area=None):
    """Log error messages with traceback"""
    timestamp = get_readable_date()
    log_msg = f"[{timestamp}] ❌ ERROR: {message}"
    if error:
        log_msg += f"\n    Details: {str(error)}"
        log_msg += f"\n    Traceback: {traceback.format_exc()}"
    print(log_msg)
    if output_area:
        with output_area:
            print(log_msg)

def log_warning(message, output_area=None):
    """Log warning messages"""
    timestamp = get_readable_date()
    log_msg = f"[{timestamp}] ⚠️ WARNING: {message}"
    print(log_msg)
    if output_area:
        with output_area:
            print(log_msg)

# --- Enhanced PDF Report Creation Function ---
def create_medical_report_pdf(data, output_path):
    """
    Create a comprehensive medical report PDF with all data from JSON.
    
    Args:
        data (dict): Extracted medical data
        output_path (str): Path to save PDF
        
    Returns:
        str: Path to created PDF or None if failed
    """
    try:
        doc = SimpleDocTemplate(output_path, pagesize=letter, topMargin=0.5*inch, bottomMargin=0.5*inch)
        styles = getSampleStyleSheet()
        
        # Define custom styles
        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Heading1'],
            fontSize=16,
            textColor=colors.HexColor('#1a3a52'),
            spaceAfter=12,
            alignment=1  # Center
        )
        
        section_style = ParagraphStyle(
            'CustomSection',
            parent=styles['Heading2'],
            fontSize=12,
            textColor=colors.HexColor('#2c5aa0'),
            spaceAfter=10,
            spaceBefore=10,
            borderPadding=5,
            borderColor=colors.HexColor('#2c5aa0'),
            borderWidth=1
        )
        
        normal_style = styles['Normal']
        medical_data_style = ParagraphStyle(
            'MedicalData',
            parent=normal_style,
            fontSize=9,
            leading=12
        )
        
        small_style = ParagraphStyle(
            'Small',
            parent=normal_style,
            fontSize=8,
            textColor=colors.grey,
            leading=10
        )

        content = []
        doc_type = data.get('document_type', 'Medical Document')
        
        # --- HEADER WITH TIMESTAMP ---
        content.append(Paragraph(
            f"{doc_type.upper().replace('_', ' ')} REPORT",
            title_style
        ))
        
        timestamp = data.get('timestamp', get_timestamp())
        readable_date = data.get('readable_date', get_readable_date())
        content.append(Paragraph(
            f"Generated on: {readable_date}",
            small_style
        ))
        content.append(Spacer(1, 0.15*inch))

        # --- PATIENT INFORMATION ---
        pt_info = data.get('patient_info', {})
        if pt_info:
            content.append(Paragraph("👤 PATIENT INFORMATION", section_style))
            patient_data = [["Field", "Value"]]
            
            for key, value in pt_info.items():
                if value:
                    field_name = key.replace('_', ' ').title()
                    patient_data.append([field_name + ":", str(value)])
            
            if len(patient_data) > 1:
                patient_table = Table(patient_data, colWidths=[1.5*inch, 3.5*inch])
                patient_table.setStyle(TableStyle([
                    ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#e8f4f8')),
                    ('TEXTCOLOR', (0, 0), (-1, 0), colors.HexColor('#1a3a52')),
                    ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
                    ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
                    ('FONTSIZE', (0, 0), (-1, 0), 10),
                    ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
                    ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
                    ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
                    ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.HexColor('#f9f9f9')]),
                ]))
                content.append(patient_table)
        
        content.append(Spacer(1, 0.15*inch))

        # --- FACILITY INFORMATION ---
        facility_info = data.get('facility_info', {})
        if facility_info and any(facility_info.values()):
            content.append(Paragraph("🏥 FACILITY INFORMATION", section_style))
            facility_data = [["Field", "Value"]]
            
            for key, value in facility_info.items():
                if value:
                    field_name = key.replace('_', ' ').title()
                    facility_data.append([field_name + ":", str(value)])
            
            if len(facility_data) > 1:
                facility_table = Table(facility_data, colWidths=[1.5*inch, 3.5*inch])
                facility_table.setStyle(TableStyle([
                    ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#e8f4f8')),
                    ('TEXTCOLOR', (0, 0), (-1, 0), colors.HexColor('#1a3a52')),
                    ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
                    ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
                    ('FONTSIZE', (0, 0), (-1, 0), 10),
                    ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
                    ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
                    ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.HexColor('#f9f9f9')]),
                ]))
                content.append(facility_table)
            
            content.append(Spacer(1, 0.15*inch))

        # --- ADMISSION INFORMATION ---
        admission_info = data.get('admission_info', {})
        if admission_info and any(admission_info.values()):
            content.append(Paragraph("📋 ADMISSION INFORMATION", section_style))
            admission_data = [["Field", "Value"]]
            
            for key, value in admission_info.items():
                if value:
                    field_name = key.replace('_', ' ').title()
                    admission_data.append([field_name + ":", str(value)])
            
            if len(admission_data) > 1:
                admission_table = Table(admission_data, colWidths=[1.5*inch, 3.5*inch])
                admission_table.setStyle(TableStyle([
                    ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#e8f4f8')),
                    ('TEXTCOLOR', (0, 0), (-1, 0), colors.HexColor('#1a3a52')),
                    ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
                    ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
                    ('FONTSIZE', (0, 0), (-1, 0), 10),
                    ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
                    ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
                    ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.HexColor('#f9f9f9')]),
                ]))
                content.append(admission_table)
            
            content.append(Spacer(1, 0.15*inch))

        # --- DISCHARGE INFORMATION ---
        discharge_info = data.get('discharge_info', {})
        if discharge_info and any(discharge_info.values()):
            content.append(Paragraph("🚪 DISCHARGE INFORMATION", section_style))
            discharge_data = [["Field", "Value"]]
            
            for key, value in discharge_info.items():
                if value:
                    field_name = key.replace('_', ' ').title()
                    discharge_data.append([field_name + ":", str(value)])
            
            if len(discharge_data) > 1:
                discharge_table = Table(discharge_data, colWidths=[1.5*inch, 3.5*inch])
                discharge_table.setStyle(TableStyle([
                    ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#e8f4f8')),
                    ('TEXTCOLOR', (0, 0), (-1, 0), colors.HexColor('#1a3a52')),
                    ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
                    ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
                    ('FONTSIZE', (0, 0), (-1, 0), 10),
                    ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
                    ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
                    ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.HexColor('#f9f9f9')]),
                ]))
                content.append(discharge_table)
            
            content.append(Spacer(1, 0.15*inch))

        # --- CHIEF COMPLAINTS ---
        complaints = data.get('complaints', [])
        if complaints:
            content.append(Paragraph("🔴 CHIEF COMPLAINTS", section_style))
            for complaint in complaints:
                content.append(Paragraph(f"• {complaint}", medical_data_style))
            content.append(Spacer(1, 0.15*inch))

        # --- DIAGNOSES ---
        diagnoses = data.get('diagnoses', [])
        if diagnoses:
            content.append(Paragraph("🩺 DIAGNOSES", section_style))
            for diagnosis in diagnoses:
                content.append(Paragraph(f"• {diagnosis}", medical_data_style))
            content.append(Spacer(1, 0.15*inch))

        # --- VITAL SIGNS ---
        vitals = data.get('vitals', [])
        if vitals:
            content.append(Paragraph("❤️ VITAL SIGNS", section_style))
            vitals_data = [["Measurement", "Value"]]
            for vital in vitals:
                vitals_data.append([vital.get('name', 'N/A'), vital.get('value', 'N/A')])
            
            vitals_table = Table(vitals_data, colWidths=[2.5*inch, 2.5*inch])
            vitals_table.setStyle(TableStyle([
                ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#ffe8e8')),
                ('TEXTCOLOR', (0, 0), (-1, 0), colors.HexColor('#cc0000')),
                ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
                ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
                ('FONTSIZE', (0, 0), (-1, 0), 10),
                ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
                ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
                ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.HexColor('#fff5f5')]),
            ]))
            content.append(vitals_table)
            content.append(Spacer(1, 0.15*inch))

        # --- PHYSICAL EXAMINATION ---
        physical_exam = data.get('physical_exam', {})
        if physical_exam and any(physical_exam.values()):
            content.append(Paragraph("👁️ PHYSICAL EXAMINATION", section_style))
            for key, value in physical_exam.items():
                if value:
                    field_name = key.replace('_', ' ').title()
                    content.append(Paragraph(
                        f"<b>{field_name}:</b> {value}",
                        medical_data_style
                    ))
            content.append(Spacer(1, 0.15*inch))

        # --- MEDICATIONS ---
        medications = data.get('medications', [])
        if medications:
            content.append(Paragraph("💊 MEDICATIONS", section_style))
            for medication in medications:
                content.append(Paragraph(f"• {medication}", medical_data_style))
            content.append(Spacer(1, 0.15*inch))

        # --- TREATMENTS ---
        treatments = data.get('treatments', [])
        if treatments:
            content.append(Paragraph("🏥 TREATMENTS", section_style))
            for treatment in treatments:
                content.append(Paragraph(f"• {treatment}", medical_data_style))
            content.append(Spacer(1, 0.15*inch))

        # --- CLINICAL COURSE ---
        clinical_course = data.get('clinical_course', '')
        if clinical_course:
            content.append(Paragraph("📖 CLINICAL COURSE", section_style))
            # Split clinical course into paragraphs for better readability
            paragraphs = clinical_course.split('\n\n')
            for para in paragraphs:
                if para.strip():
                    content.append(Paragraph(para.strip(), medical_data_style))
                    content.append(Spacer(1, 0.08*inch))
            content.append(Spacer(1, 0.15*inch))

        # --- FOLLOW-UP ---
        follow_up = data.get('follow_up', '')
        if follow_up:
            content.append(Paragraph("📅 FOLLOW-UP", section_style))
            content.append(Paragraph(follow_up, medical_data_style))
            content.append(Spacer(1, 0.15*inch))

        # --- FOOTER WITH METADATA ---
        content.append(Spacer(1, 0.25*inch))
        footer_text = f"Document Type: {doc_type.replace('_', ' ').title()} | " \
                      f"Generated: {readable_date} | " \
                      f"Report ID: {data.get('report_id', 'N/A')}"
        content.append(Paragraph(footer_text, small_style))

        # Build PDF
        doc.build(content)
        return output_path

    except Exception as e:
        log_error(f"Failed to create PDF report: {str(e)}", e)
        return None

# --- File Upload Widget ---
print("\n📁 Upload a medical document (PDF, PNG, JPG, JPEG, WEBP):")
upload_widget = widgets.FileUpload(accept=".pdf,.png,.jpg,.jpeg,.webp", multiple=False)
display(upload_widget)

# --- Output Area ---
output_area = widgets.Output()
display(output_area)

# --- Process Button ---
def process_uploaded_file(btn):
    """Process uploaded medical document"""
    with output_area:
        clear_output()
        
        try:
            log_info("=== MEDICAL DOCUMENT PROCESSING STARTED ===", output_area)
            
            if not upload_widget.value:
                log_warning("No file uploaded yet. Please select a file.", output_area)
                return

            uploaded_file = upload_widget.value[0]
            file_name = uploaded_file.name
            content = uploaded_file.content

            log_info(f"Processing file: {file_name}", output_area)

            # Save file locally
            try:
                with open(file_name, "wb") as f:
                    f.write(content)
                log_success(f"File saved locally: {file_name}", output_area)
            except Exception as e:
                log_error(f"Failed to save file locally", e, output_area)
                return

            # Convert PDF to image if needed
            image_path = file_name
            if file_name.lower().endswith(".pdf"):
                log_info("PDF detected, converting to image...", output_area)
                output_folder = "pdf_images"
                try:
                    os.makedirs(output_folder, exist_ok=True)
                    images = convert_from_path(file_name, dpi=300, fmt="png", output_folder=output_folder)
                    
                    if images:
                        image_path = images[0] if isinstance(images[0], str) else images[0].filename
                        log_success(f"PDF converted successfully. Using first page: {image_path}", output_area)
                    else:
                        log_error("PDF conversion returned no images", None, output_area)
                        return
                        
                except Exception as e:
                    log_error(f"PDF conversion failed", e, output_area)
                    image_path = file_name

            # Run OCR & extraction
            try:
                log_info("Running OCR on document...", output_area)
                ocr_text = run_medical_ocr(image_path)
                log_success("OCR completed successfully", output_area)

                log_info("Extracting medical data from OCR text...", output_area)
                extracted = extract_medical_data(ocr_text)
                
                # Add timestamps to extracted data
                extracted['timestamp'] = get_timestamp()
                extracted['readable_date'] = get_readable_date()
                extracted['report_id'] = f"MR-{datetime.now().strftime('%Y%m%d%H%M%S')}"
                
                log_success("Medical data extracted successfully", output_area)

                # Save OCR text
                ocr_text_path = f"ocr_text_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
                try:
                    with open(ocr_text_path, "w", encoding="utf-8") as f:
                        f.write(ocr_text)
                    log_success(f"OCR text saved: {ocr_text_path}", output_area)
                except Exception as e:
                    log_error(f"Failed to save OCR text", e, output_area)

                # Save extracted JSON with timestamp
                json_path = f"extracted_medical_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
                try:
                    with open(json_path, "w", encoding="utf-8") as f:
                        json.dump(extracted, f, indent=2, ensure_ascii=False)
                    log_success(f"Extracted JSON saved: {json_path}", output_area)
                    
                    # Display JSON content in output
                    log_info("\n📊 EXTRACTED JSON DATA:", output_area)
                    with output_area:
                        print(json.dumps(extracted, indent=2, ensure_ascii=False))
                except Exception as e:
                    log_error(f"Failed to save extracted JSON", e, output_area)

                # Generate PDF report
                pdf_report_path = f"medical_report_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pdf"
                try:
                    result = create_medical_report_pdf(extracted, pdf_report_path)
                    if result:
                        log_success(f"PDF report generated: {pdf_report_path}", output_area)
                    else:
                        log_error("PDF report generation returned None", None, output_area)
                except Exception as e:
                    log_error(f"Failed to generate PDF report", e, output_area)

                log_info("=== PROCESSING COMPLETED SUCCESSFULLY ===", output_area)
                log_info(f"✨ All files generated at: {get_readable_date()}", output_area)

            except Exception as e:
                log_error(f"OCR or extraction process failed", e, output_area)
                return

        except Exception as e:
            log_error(f"Unexpected error during processing", e, output_area)
            return

process_btn = widgets.Button(description="Process Uploaded File", button_style='success', tooltip='Click to process the uploaded medical document')
process_btn.on_click(process_uploaded_file)
display(process_btn)

print("\n✅ Medical document processor initialized successfully!")
print("📋 Ready to process medical documents with timestamps and complete data extraction.")


--- CELL 5: File Upload & Medical Document Processing (ENHANCED) ---
Note: you may need to restart the kernel to use updated packages.

📁 Upload a medical document (PDF, PNG, JPG, JPEG, WEBP):


FileUpload(value=(), accept='.pdf,.png,.jpg,.jpeg,.webp', description='Upload')

Output()

Button(button_style='success', description='Process Uploaded File', style=ButtonStyle(), tooltip='Click to pro…


✅ Medical document processor initialized successfully!
📋 Ready to process medical documents with timestamps and complete data extraction.
