In [1]:
import pandas as pd
import os
import argparse
import pandas as pd
import yaml
import random
random.seed(49297)
from tqdm import tqdm


In [2]:
CLASSES = [
       'Acute and unspecified renal failure', 'Acute cerebrovascular disease',
       'Acute myocardial infarction', 'Cardiac dysrhythmias',
       'Chronic kidney disease',
       'Chronic obstructive pulmonary disease and bronchiectasis',
       'Complications of surgical procedures or medical care',
       'Conduction disorders', 'Congestive heart failure; nonhypertensive',
       'Coronary atherosclerosis and other heart disease',
       'Diabetes mellitus with complications',
       'Diabetes mellitus without complication',
       'Disorders of lipid metabolism', 'Essential hypertension',
       'Fluid and electrolyte disorders', 'Gastrointestinal hemorrhage',
       'Hypertension with complications and secondary hypertension',
       'Other liver diseases', 'Other lower respiratory disease',
       'Other upper respiratory disease',
       'Pleurisy; pneumothorax; pulmonary collapse',
       'Pneumonia (except that caused by tuberculosis or sexually transmitted disease)',
       'Respiratory failure; insufficiency; arrest (adult)',
       'Septicemia (except in labor)', 'Shock'
    ]


phenotype_definitions = '/scratch/fs999/shamoutlab/Farah/MedFuse/mimic4extract/mimic3benchmark/resources/icd_9_10_definitions_2.yaml'

with open(phenotype_definitions) as definitions_file:
    definitions = yaml.load(definitions_file,Loader=yaml.Loader)
    
code_to_group = {}
   
for group in definitions:
    codes = definitions[group]['codes']
    for code in codes:
        if code not in code_to_group:
            code_to_group[code] = group
        else:
            #print(f'code, {code}')
            assert code_to_group[code] == group

id_to_group = sorted(definitions.keys())
group_to_id = dict((x, i) for (i, x) in enumerate(id_to_group))

In [6]:
cxr_metadata

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,CHEST (PA AND LAT),postero-anterior,Erect
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,10000032,50414267,CHEST (PA AND LAT),LATERAL,3056,2544,21800506,213014.531,CHEST (PA AND LAT),lateral,Erect
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,CHEST (PA AND LAT),PA,3056,2544,21800626,165500.312,CHEST (PA AND LAT),postero-anterior,Erect
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,10000032,53189527,CHEST (PA AND LAT),LATERAL,3056,2544,21800626,165500.312,CHEST (PA AND LAT),lateral,Erect
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,CHEST (PORTABLE AP),AP,2705,2539,21800723,80556.875,CHEST (PORTABLE AP),antero-posterior,
...,...,...,...,...,...,...,...,...,...,...,...,...
377105,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,19999733,57132437,CHEST (PA AND LAT),PA,3056,2544,21520708,224550.171,CHEST (PA AND LAT),postero-anterior,Erect
377106,58c403aa-35ff8bd9-73e39f54-8dc9cc5d-e0ec3fa9,19999733,57132437,CHEST (PA AND LAT),LATERAL,3056,2544,21520708,224550.171,CHEST (PA AND LAT),lateral,Erect
377107,58766883-376a15ce-3b323a28-6af950a0-16b793bd,19999987,55368167,CHEST (PORTABLE AP),AP,2544,3056,21451104,51448.218,CHEST (PORTABLE AP),antero-posterior,Erect
377108,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,19999987,58621812,CHEST (PORTABLE AP),AP,3056,2544,21451102,202809.234,CHEST (PORTABLE AP),antero-posterior,Erect


In [3]:
structure = pd.read_csv('/scratch/fs999/shamoutlab/Nasir/data/shamoutlab/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-chexpert.csv')

cxr_metadata = pd.read_csv('/scratch/fs999/shamoutlab/Nasir/data/shamoutlab/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-metadata.csv')

adm = pd.read_csv('/scratch/fs999/shamoutlab/Nasir/data/shamoutlab/physionet.org/files/mimiciv/0.4/core/admissions.csv')

diag = pd.read_csv('/scratch/fs999/shamoutlab/Nasir/data/shamoutlab/physionet.org/files/mimiciv/0.4/hosp/diagnoses_icd.csv')


In [21]:
icu_stays = pd.read_csv('/scratch/fs999/shamoutlab/data/mimic-iv-extracted/root/all_stays.csv')

In [23]:
icu_stays.loc[icu_stays.subject_id==17332963]

Unnamed: 0,subject_id,hadm_id,stay_id,last_careunit,intime,outtime,los,admittime,dischtime,deathtime,ethnicity,gender,anchor_age,dod,age,mortality_inunit,mortality,mortality_inhospital
2187,17332963,25958837,31798293,Medical Intensive Care Unit (MICU),2205-07-27 21:03:00,2205-07-29 03:30:10,1.268866,2205-07-27 19:16:00,2205-07-28 00:00:00,2205-07-28 23:15:00,WHITE,M,91,2205-07-28 00:00:00,91,1,1,1


In [8]:
diag['icd_code'] = diag['icd_code'].str.replace(" ", "")
diag['icd_group'] = diag.icd_code.apply(lambda x: code_to_group[str(x)] if str(x) in code_to_group.keys() else str(x)  )
diag_pheno = diag.loc[diag.icd_group.apply(lambda x: x in CLASSES)]
diag_pheno['value']=1
diag_pheno = diag_pheno.pivot_table(index='hadm_id', columns='icd_group', values='value', aggfunc='mean').reset_index().fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diag_pheno['value']=1


In [9]:
diag_pheno

icd_group,hadm_id,Acute and unspecified renal failure,Acute cerebrovascular disease,Acute myocardial infarction,Cardiac dysrhythmias,Chronic kidney disease,Chronic obstructive pulmonary disease and bronchiectasis,Complications of surgical procedures or medical care,Conduction disorders,Congestive heart failure; nonhypertensive,...,Gastrointestinal hemorrhage,Hypertension with complications and secondary hypertension,Other liver diseases,Other lower respiratory disease,Other upper respiratory disease,Pleurisy; pneumothorax; pulmonary collapse,Pneumonia (except that caused by tuberculosis or sexually transmitted disease),Respiratory failure; insufficiency; arrest (adult),Septicemia (except in labor),Shock
0,20000019,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,20000024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20000034,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20000041,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20000057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293536,29999625,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
293537,29999670,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
293538,29999809,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
293539,29999828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [17]:
# Link CXR with admission ID
print(len(cxr_metadata))

# combine study date time
cxr_metadata['StudyTime'] = cxr_metadata['StudyTime'].apply(lambda x: f'{int(float(x)):06}' )
cxr_metadata['StudyDateTime'] = pd.to_datetime(cxr_metadata['StudyDate'].astype(str) + ' ' + cxr_metadata['StudyTime'].astype(str) ,format="%Y%m%d %H%M%S")

# Merge with admissions and keep linked admission if datetimes match
cxr_merged = cxr_metadata.merge(adm[['subject_id', 'hadm_id', 'admittime', 'dischtime']], on='subject_id', how='left')
print(len(cxr_merged))
print(cxr_merged.loc[cxr_merged.dicom_id=='46c472ab-a5398f59-80ea7ac9-1f1bc81e-de63f39d'])
cxr_merged = cxr_merged.loc[(cxr_merged.StudyDateTime>=cxr_merged.admittime)&((cxr_merged.StudyDateTime<=cxr_merged.dischtime))]
print(len(cxr_merged))
print(cxr_merged.loc[cxr_merged.dicom_id=='46c472ab-a5398f59-80ea7ac9-1f1bc81e-de63f39d'])

# Drop duplicate dicom ids
cxr_merged = cxr_merged.drop_duplicates('dicom_id')
print(len(cxr_merged))


377110
2666533
                                             dicom_id  subject_id  study_id  \
1950489  46c472ab-a5398f59-80ea7ac9-1f1bc81e-de63f39d    17332963  52326595   

        PerformedProcedureStepDescription ViewPosition  Rows  Columns  \
1950489               CHEST (PORTABLE AP)           AP  3050     2539   

         StudyDate StudyTime ProcedureCodeSequence_CodeMeaning  \
1950489   22050728    005735               CHEST (PORTABLE AP)   

        ViewCodeSequence_CodeMeaning  \
1950489             antero-posterior   

        PatientOrientationCodeSequence_CodeMeaning       StudyDateTime  \
1950489                                        NaN 2205-07-28 00:57:35   

            hadm_id            admittime            dischtime  
1950489  25958837.0  2205-07-27 19:16:00  2205-07-28 00:00:00  
154303
Empty DataFrame
Columns: [dicom_id, subject_id, study_id, PerformedProcedureStepDescription, ViewPosition, Rows, Columns, StudyDate, StudyTime, ProcedureCodeSequence_CodeMeaning, Vi

In [16]:
print(cxr_merged.loc[cxr_merged.dicom_id=='46c472ab-a5398f59-80ea7ac9-1f1bc81e-de63f39d'])

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,StudyDateTime,hadm_id,admittime,dischtime


In [11]:
cxr_diag = cxr_merged[['subject_id', 'study_id', 'hadm_id']].merge(diag_pheno, how='left', on='hadm_id')

In [12]:
temp1 = cxr_merged['hadm_id'].unique()
temp2 = diag['hadm_id'].unique()
id_exclude = (list(set(temp1) - set(temp2))) # patients with no ICD codes because they are mostly emergency / not sure why else
cxr_diag = cxr_diag.loc[cxr_diag.hadm_id.apply(lambda x: x not in id_exclude)]

In [13]:
# If they had no diagnosis of all 25, fill them with 0
cxr_diag = cxr_diag.fillna(0)

In [14]:
cxr_diag.loc[cxr_diag.dicom_id=='46c472ab-a5398f59-80ea7ac9-1f1bc81e-de63f39d']

AttributeError: 'DataFrame' object has no attribute 'dicom_id'

In [150]:
cxr_diag.to_csv('/scratch/fs999/shamoutlab/data/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-pheno.csv')

In [146]:
split = pd.read_csv('/scratch/fs999/shamoutlab/Nasir/data/shamoutlab/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-ehr-split.csv').study_id.unique()

227835

In [149]:
(list(set(cxr_diag.study_id.unique()) - set(split)))

[]