In [36]:
import pandas as pd
import os
import argparse
import pandas as pd
import yaml
import random
random.seed(49297)
from tqdm import tqdm


In [37]:
# Phenotype label names, one per line, kept in alphabetical order (25 entries).
# NOTE(review): presumably these correspond to the group names in the phenotype
# definitions YAML loaded below — confirm against that file.
CLASSES = [
    'Acute and unspecified renal failure',
    'Acute cerebrovascular disease',
    'Acute myocardial infarction',
    'Cardiac dysrhythmias',
    'Chronic kidney disease',
    'Chronic obstructive pulmonary disease and bronchiectasis',
    'Complications of surgical procedures or medical care',
    'Conduction disorders',
    'Congestive heart failure; nonhypertensive',
    'Coronary atherosclerosis and other heart disease',
    'Diabetes mellitus with complications',
    'Diabetes mellitus without complication',
    'Disorders of lipid metabolism',
    'Essential hypertension',
    'Fluid and electrolyte disorders',
    'Gastrointestinal hemorrhage',
    'Hypertension with complications and secondary hypertension',
    'Other liver diseases',
    'Other lower respiratory disease',
    'Other upper respiratory disease',
    'Pleurisy; pneumothorax; pulmonary collapse',
    'Pneumonia (except that caused by tuberculosis or sexually transmitted disease)',
    'Respiratory failure; insufficiency; arrest (adult)',
    'Septicemia (except in labor)',
    'Shock',
]


# YAML file describing the phenotype groups and the diagnosis codes belonging
# to each group (ICD-9/ICD-10, judging by the file name).
phenotype_definitions = '/scratch/fs999/shamoutlab/Farah/MedFuse/mimic4extract/mimic3benchmark/resources/icd_9_10_definitions_2.yaml'

# NOTE(review): yaml.Loader can construct arbitrary Python objects. That is
# acceptable for a trusted, project-owned resource file; if the file can ever
# come from an untrusted source, switch to yaml.safe_load.
with open(phenotype_definitions) as definitions_file:
    definitions = yaml.load(definitions_file, Loader=yaml.Loader)

# Invert the definitions into a code -> group lookup. A code listed under two
# different groups trips the assertion; repeats within the same group are fine.
code_to_group = {}
for group, spec in definitions.items():
    codes = spec['codes']
    for code in codes:
        # setdefault stores the group the first time a code is seen and
        # otherwise returns the group already recorded for it.
        owner = code_to_group.setdefault(code, group)
        assert owner == group

# Stable integer ids for the groups: position in the alphabetically sorted names.
id_to_group = sorted(definitions)
group_to_id = {name: idx for idx, name in enumerate(id_to_group)}

In [38]:
# NOTE(review): `cxr_metadata` is only created by the pd.read_csv call in the
# next code cell, and the preview stored under this cell already contains the
# StudyDateTime column that the CXR/admission linking cell adds. This cell was
# therefore executed out of order; on a fresh kernel it raises NameError until
# the loading cell has run.
cxr_metadata

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,StudyDateTime
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,CHEST (PA AND LAT),PA,3056,2544,21800506,213014,CHEST (PA AND LAT),postero-anterior,Erect,2180-05-06 21:30:14
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,10000032,50414267,CHEST (PA AND LAT),LATERAL,3056,2544,21800506,213014,CHEST (PA AND LAT),lateral,Erect,2180-05-06 21:30:14
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,CHEST (PA AND LAT),PA,3056,2544,21800626,165500,CHEST (PA AND LAT),postero-anterior,Erect,2180-06-26 16:55:00
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,10000032,53189527,CHEST (PA AND LAT),LATERAL,3056,2544,21800626,165500,CHEST (PA AND LAT),lateral,Erect,2180-06-26 16:55:00
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,CHEST (PORTABLE AP),AP,2705,2539,21800723,080556,CHEST (PORTABLE AP),antero-posterior,,2180-07-23 08:05:56
...,...,...,...,...,...,...,...,...,...,...,...,...,...
377105,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,19999733,57132437,CHEST (PA AND LAT),PA,3056,2544,21520708,224550,CHEST (PA AND LAT),postero-anterior,Erect,2152-07-08 22:45:50
377106,58c403aa-35ff8bd9-73e39f54-8dc9cc5d-e0ec3fa9,19999733,57132437,CHEST (PA AND LAT),LATERAL,3056,2544,21520708,224550,CHEST (PA AND LAT),lateral,Erect,2152-07-08 22:45:50
377107,58766883-376a15ce-3b323a28-6af950a0-16b793bd,19999987,55368167,CHEST (PORTABLE AP),AP,2544,3056,21451104,051448,CHEST (PORTABLE AP),antero-posterior,Erect,2145-11-04 05:14:48
377108,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,19999987,58621812,CHEST (PORTABLE AP),AP,3056,2544,21451102,202809,CHEST (PORTABLE AP),antero-posterior,Erect,2145-11-02 20:28:09


In [39]:
# Source tables: two from MIMIC-CXR-JPG 2.0.0 and two from MIMIC-IV 0.4.
chexpert_labels_csv = '/scratch/fs999/shamoutlab/Nasir/data/shamoutlab/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-chexpert.csv'
cxr_metadata_csv = '/scratch/fs999/shamoutlab/Nasir/data/shamoutlab/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-metadata.csv'
admissions_csv = '/scratch/fs999/shamoutlab/Nasir/data/shamoutlab/physionet.org/files/mimiciv/0.4/core/admissions.csv'
diagnoses_csv = '/scratch/fs999/shamoutlab/Nasir/data/shamoutlab/physionet.org/files/mimiciv/0.4/hosp/diagnoses_icd.csv'

# Read each table with pandas defaults, so timestamp columns stay as strings.
structure = pd.read_csv(chexpert_labels_csv)
cxr_metadata = pd.read_csv(cxr_metadata_csv)
adm = pd.read_csv(admissions_csv)
diag = pd.read_csv(diagnoses_csv)


In [40]:
# Previously extracted ICU stay table (one row per stay_id, as shown in the lookup below).
all_stays_csv = '/scratch/fs999/shamoutlab/data/mimic-iv-extracted/root/all_stays.csv'
icu_stays = pd.read_csv(all_stays_csv)

In [41]:
# Show the ICU stay(s) of subject 17332963, the patient who owns the DICOM
# that the CXR/admission linking cell prints for debugging.
icu_stays[icu_stays['subject_id'] == 17332963]

Unnamed: 0,subject_id,hadm_id,stay_id,last_careunit,intime,outtime,los,admittime,dischtime,deathtime,ethnicity,gender,anchor_age,dod,age,mortality_inunit,mortality,mortality_inhospital
2187,17332963,25958837,31798293,Medical Intensive Care Unit (MICU),2205-07-27 21:03:00,2205-07-29 03:30:10,1.268866,2205-07-27 19:16:00,2205-07-28 00:00:00,2205-07-28 23:15:00,WHITE,M,91,2205-07-28 00:00:00,91,1,1,1


In [44]:
# Link each CXR image to the hospital admission during which it was acquired.
print(len(cxr_metadata))

# Combine StudyDate (YYYYMMDD) and StudyTime into a single timestamp.
# StudyTime is truncated to whole seconds and left-padded with zeros to six
# digits (HHMMSS). The conversion is vectorised and safe to re-run, because the
# already-formatted strings parse back to the same values.
cxr_metadata['StudyTime'] = (
    cxr_metadata['StudyTime'].astype(float).astype(int).astype(str).str.zfill(6)
)
cxr_metadata['StudyDateTime'] = pd.to_datetime(
    cxr_metadata['StudyDate'].astype(str) + ' ' + cxr_metadata['StudyTime'],
    format="%Y%m%d %H%M%S",
)

# The admissions table is read with pandas defaults, so its time columns are
# plain strings. Parse them explicitly (on a copy, leaving `adm` untouched) so
# that the window filter below and any later comparison of these columns
# operate on real timestamps rather than on implicit coercion or string order.
admission_times = adm[['subject_id', 'hadm_id', 'admittime', 'dischtime', 'deathtime']].copy()
for time_col in ('admittime', 'dischtime', 'deathtime'):
    admission_times[time_col] = pd.to_datetime(admission_times[time_col])

# Pair every image with every admission of the same subject; deathtime is
# carried along so in-hospital mortality can be derived afterwards (Alex).
cxr_merged = cxr_metadata.merge(admission_times, on='subject_id', how='left')
print(len(cxr_merged))

# Debug probe: one image that is followed through the filter.
# NOTE(review): in the stored output this study (00:57:35) falls after the
# recorded dischtime (00:00:00 the same day) but before deathtime, so the
# filter drops it — confirm that excluding such images is intended.
debug_dicom_id = '46c472ab-a5398f59-80ea7ac9-1f1bc81e-de63f39d'
print(cxr_merged.loc[cxr_merged.dicom_id == debug_dicom_id])

# Keep only the pairs whose study time lies inside the admission window
# (inclusive on both ends). Rows without a matching admission have missing
# times and are dropped here as well.
within_admission = (
    (cxr_merged.StudyDateTime >= cxr_merged.admittime)
    & (cxr_merged.StudyDateTime <= cxr_merged.dischtime)
)
cxr_merged = cxr_merged.loc[within_admission]
print(len(cxr_merged))
print(cxr_merged.loc[cxr_merged.dicom_id == debug_dicom_id])

# An image matching more than one admission window keeps its first match only.
cxr_merged = cxr_merged.drop_duplicates('dicom_id')
print(len(cxr_merged))


377110
2666533
                                             dicom_id  subject_id  study_id  \
1950489  46c472ab-a5398f59-80ea7ac9-1f1bc81e-de63f39d    17332963  52326595   

        PerformedProcedureStepDescription ViewPosition  Rows  Columns  \
1950489               CHEST (PORTABLE AP)           AP  3050     2539   

         StudyDate StudyTime ProcedureCodeSequence_CodeMeaning  \
1950489   22050728    005735               CHEST (PORTABLE AP)   

        ViewCodeSequence_CodeMeaning  \
1950489             antero-posterior   

        PatientOrientationCodeSequence_CodeMeaning       StudyDateTime  \
1950489                                        NaN 2205-07-28 00:57:35   

            hadm_id            admittime            dischtime  \
1950489  25958837.0  2205-07-27 19:16:00  2205-07-28 00:00:00   

                   deathtime  
1950489  2205-07-28 23:15:00  
154303
Empty DataFrame
Columns: [dicom_id, subject_id, study_id, PerformedProcedureStepDescription, ViewPosition, Rows, Col

In [45]:
# In-hospital mortality mask: a death time is recorded and it falls inside the
# linked admission window [admittime, dischtime], inclusive on both ends.
# The previous version OR-ed this mask with a byte-identical copy of itself,
# which cannot change the result, so the condition is now evaluated once.
# NOTE(review): the duplicated term may have been meant to test a different
# death column (e.g. `dod`, present in all_stays.csv but not merged here) — confirm.
mortality = cxr_merged.deathtime.notnull() & (
    (cxr_merged.admittime <= cxr_merged.deathtime)
    & (cxr_merged.dischtime >= cxr_merged.deathtime)
)

In [46]:
# Store the boolean mortality mask on the linked table as a 0/1 integer label.
cxr_merged['mortality_inhospital'] = mortality.astype(int)

In [48]:
# Class balance of the in-hospital mortality label (count of images per label value).
cxr_merged['mortality_inhospital'].value_counts()

0    139764
1     14536
Name: mortality_inhospital, dtype: int64

In [50]:
# Display the linked CXR/admission table, including the mortality_inhospital column.
cxr_merged

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,StudyDateTime,hadm_id,admittime,dischtime,deathtime,mortality_inhospital
27,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,10000032,56699142,CHEST (PORTABLE AP),AP,3056,2544,21800805,234424,CHEST (PORTABLE AP),antero-posterior,,2180-08-05 23:44:24,25742920.0,2180-08-05 23:44:00,2180-08-07 17:50:00,,0
28,096052b7-d256dc40-453a102b-fa7d01c6-1b22c6b4,10000764,57375967,CHEST (PA AND LAT),AP,2544,3056,21321015,084047,CHEST (PA AND LAT),antero-posterior,Erect,2132-10-15 08:40:47,27897940.0,2132-10-14 23:31:00,2132-10-19 16:30:00,,0
29,b79e55c3-735ce5ac-64412506-cdc9ea79-f1af521f,10000764,57375967,CHEST (PA AND LAT),LATERAL,3056,2544,21321015,084047,CHEST (PA AND LAT),lateral,Erect,2132-10-15 08:40:47,27897940.0,2132-10-14 23:31:00,2132-10-19 16:30:00,,0
30,dcfeeac4-1597e318-d0e6736a-8b2c2238-47ac3f1b,10000764,57375967,CHEST (PA AND LAT),LATERAL,3056,2544,21321015,084047,CHEST (PA AND LAT),lateral,Erect,2132-10-15 08:40:47,27897940.0,2132-10-14 23:31:00,2132-10-19 16:30:00,,0
40,d0b71acc-b5a62046-bbb5f6b8-7b173b85-65cdf738,10000935,50578979,CHEST (PORTABLE AP),AP,2870,2402,21871016,123945,CHEST (PORTABLE AP),antero-posterior,Erect,2187-10-16 12:39:45,25849114.0,2187-10-10 19:09:00,2187-10-26 17:00:00,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2666519,2eb70dfe-52fa728e-a36e09be-ec0ed3cf-0a2ea7f0,19999287,58938059,CHEST (PORTABLE AP),AP,3056,2544,21970805,093746,CHEST (PORTABLE AP),antero-posterior,Erect,2197-08-05 09:37:46,20175828.0,2197-08-03 20:58:00,2197-08-18 15:37:00,,0
2666524,ee9155f3-944c056b-c76c73d0-3f792f2c-92ae461e,19999442,58497551,CHEST (PORTABLE AP),AP,2544,3056,21481128,133244,CHEST (PORTABLE AP),antero-posterior,,2148-11-28 13:32:44,26785317.0,2148-11-19 10:00:00,2148-12-04 16:25:00,,0
2666526,16b6c70f-6d36bd77-89d2fef4-9c4b8b0a-79c69135,19999442,58708861,CHEST (PORTABLE AP),AP,2544,3056,21481119,224703,CHEST (PORTABLE AP),antero-posterior,Erect,2148-11-19 22:47:03,26785317.0,2148-11-19 10:00:00,2148-12-04 16:25:00,,0
2666530,58766883-376a15ce-3b323a28-6af950a0-16b793bd,19999987,55368167,CHEST (PORTABLE AP),AP,2544,3056,21451104,051448,CHEST (PORTABLE AP),antero-posterior,Erect,2145-11-04 05:14:48,23865745.0,2145-11-02 21:38:00,2145-11-11 12:57:00,,0


In [53]:
# Export of the labelled table is currently commented out; uncomment to write the CSV.
# NOTE(review): the target directory (/scratch/fs999/shamoutlab/data/...) differs from
# the directory the MIMIC-CXR tables were read from (/scratch/fs999/shamoutlab/Nasir/data/shamoutlab/...)
# — confirm that this is the intended destination before enabling.
# cxr_merged.to_csv('/scratch/fs999/shamoutlab/data/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-mortality.csv')