In [20]:
# Report the combined size of the user-defined notebook variables

import sys

# Names that IPython itself puts into the namespace, plus this helper list
ipython_vars = ["In", "Out", "exit", "quit", "get_ipython", "ipython_vars"]

# Map every remaining public global name to the size reported by
# sys.getsizeof, ordered from largest to smallest
mem = dict(
    sorted(
        (
            (name, sys.getsizeof(globals().get(name)))
            for name in dir()
            if not (
                name.startswith("_")
                or name in sys.modules
                or name in ipython_vars
            )
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
)

print(f'Jupyter memory usage: {sum(mem.values()) / 1e9:.2f} GB')

Jupyter memory usage: 0.02 GB


# 0. Initiate

## Paths

In [4]:
# Load IPython's autoreload extension and switch it to mode 2 so that imported
# modules are re-imported when their source changes.
# NOTE(review): per the original author this does not work for names bound
# via `from x import y`.
%load_ext autoreload
%autoreload 2

In [5]:
# Global paths
# All file locations used by the notebook are derived here from a few roots.

# Raw data: MIMIC-IV tables
mimic_iv_path = '/n/scratch/users/a/ays124/mimiciv/'
icu_stays_path = mimic_iv_path + 'icu/icustays.csv.gz'
chart_events_path = mimic_iv_path + 'icu/chartevents.csv.gz'
patients_table_path = mimic_iv_path + 'hosp/patients.csv.gz'
admissions_table_path = mimic_iv_path + 'hosp/admissions.csv.gz'
lab_events_path = mimic_iv_path + 'hosp/labevents.csv.gz'

# Raw data: MIMIC-CXR-JPG tables
mimic_cxr_path = '/n/scratch/users/a/ays124/mimic-cxr-jpg/'
cxr_records_path = mimic_cxr_path + 'cxr-record-list.csv.gz'
cxr_metadata_path = mimic_cxr_path + 'mimic-cxr-2.0.0-metadata.csv.gz'
df_split_path = mimic_cxr_path + 'mimic-cxr-2.0.0-split.csv.gz'
negbio_path = mimic_cxr_path + 'mimic-cxr-2.0.0-negbio.csv.gz'
chexpert_path = mimic_cxr_path + 'mimic-cxr-2.0.0-chexpert.csv.gz'

# Folder for intermediate and final feature files (relative to the working directory)
feature_folder = 'CardiomegalyBiomarkers/Cardiomegaly_Classification/MIMIC_features/'

# MIMIC intermediate files written in sections 1 and 2 and reloaded in section 3
relevant_chart_events_save_path = feature_folder + 'RelevantChartEvents.pkl'
relevant_lab_events_save_path = feature_folder + 'RelevantLabEvents.pkl'
df_icu_xray_path = feature_folder + 'IcuXrayMatched.pkl'

# Biomarkers
# NOTE(review): the "Add IDPs" cell reassigns ctr_path / cpar_path to different
# absolute locations - confirm which location is current.
ctr_path = 'CardiomegalyBiomarkers/Biomarker_Extraction/save_folder/CTR/CTRs.csv'
cpar_path = 'CardiomegalyBiomarkers/Biomarker_Extraction/save_folder/CPAR/CPARs.csv'

# Final cleaned features
# Names the v3 file, which is the one the notebook saves after cleaning and
# loads for the descriptive analysis (this variable used to name the v2 file).
features_path = feature_folder + 'MIMIC_features_v3.pkl'

##  Parameters

In [6]:
# --- General parameters ---
# Label of the target disease. Choices: 'Atelectasis', 'Cardiomegaly',
# 'Consolidation', 'Edema', 'Enlarged Cardiomediastinum', 'Fracture',
# 'Lung Lesion', 'Lung Opacity', 'No Finding', 'Pleural Effusion',
# 'Pleural Other', 'Pneumonia', 'Pneumothorax', 'Support Devices'
label = 'Cardiomegaly'
# Chunk size used when iterating through the original lab and chart event
# files to extract the relevant events
chunk_size = 10_000_000

# --- MIMIC-CXR (imaging) parameters ---
# X-ray view position of interest: AP or PA (or None)
view = None

# --- MIMIC-IV (non-imaging) parameters ---
# Version of MIMIC-IV downloaded
MIMIC_IV_version = 3
# Number of days before ICU admission in which X-rays are looked for
days_before_icu = 365
# Optional gap (in days) after ICU discharge in which no X-rays are looked for
xray_gap_after_icu = 0
# X-rays are looked for between gap and gap + xray_max_time_after_icu days
# after out-time; without a gap this is simply the number of days after ICU
# discharge that is searched
xray_max_time_after_icu = 90
# 'Hourly' averages readings every hour (one hour per row);
# 'Stay' averages chart and lab values across a stay
average_by = 'Stay'
# Column used to select the features for the time-series preparation
filter_col = 'itemid'

In [7]:
# Item-id -> feature-name lookups for the desired non-imaging features

# Chart items that are aggregated with the mean
chart_labels_mean = {
    220045: 'HR_mean',
    220277: 'SpO2_mean',
    223761: 'Temp(F)_mean',
    220210: 'RR_mean',
    220052: 'ABPm_mean',
    220051: 'ABPd_mean',
    220050: 'ABPs_mean',
    220180: 'NBPd_mean',
    220181: 'NBPm_mean',
    220179: 'NBPs_mean',
    223835: 'FiO2_mean',
    220274: 'PH_mean',
    220235: 'PCO2_mean',
    220227: 'SaO2_mean',
    227457: 'PlateletCount_mean',
    227456: 'Albumin_mean',
    220603: 'Cholesterol_mean',
    220645: 'Sodium_mean',
    220224: 'PO2_mean',
}

# Chart items that also get a '_max' feature; the name is the matching
# '_mean' name with the suffix swapped
chart_labels_max = {
    itemid: chart_labels_mean[itemid].replace('_mean', '_max')
    for itemid in (
        220045, 220210, 220052, 220051, 220050, 220180,
        220181, 220179, 223835, 220235, 220645,
    )
}

# Chart items that also get a '_min' feature
chart_labels_min = {
    itemid: chart_labels_mean[itemid].replace('_mean', '_min')
    for itemid in (
        220045, 220277, 220210, 220052, 220051, 220050,
        220180, 220181, 220179, 220235, 220645,
    )
}

# Lab items that are aggregated with the mean
# ('Bilirubin' and 'Fibrinogen' carry no '_mean' suffix)
lab_labels_mean = {
    50826: 'Tidal_Volume_mean',
    51006: 'Urea_Nitrogren_mean',
    50863: 'Alkaline_Phosphatase_mean',
    50893: 'Calcium_Total_mean',
    50902: 'Chloride_mean',
    50931: 'Glucose_mean',
    50813: 'Lactate_mean',
    50960: 'Magnesium_mean',
    50970: 'Phosphate_mean',
    50971: 'Potassium_mean',
    50885: 'Bilirubin',
    51003: 'Troponin-T_mean',
    51221: 'Hematocrit_mean',
    50811: 'Hemoglobin_mean',
    50861: 'ALT_mean',
    50912: 'Creatinine_mean',
    51275: 'PTT_mean',
    51516: 'WBC_mean',
    51214: 'Fibrinogen',
}

# Lab items that also get a '_max' feature
lab_labels_max = {
    itemid: lab_labels_mean[itemid].replace('_mean', '_max')
    for itemid in (50971, 51003, 50811, 51516)
}

# Lab items that also get a '_min' feature
lab_labels_min = {
    itemid: lab_labels_mean[itemid].replace('_mean', '_min')
    for itemid in (50971, 50811, 51516)
}

# All laboratory item ids in one dict. For an id listed in several of the
# dicts the last one wins (min over max over mean), so the values are not a
# complete list of feature names.
LabItems = {**lab_labels_mean, **lab_labels_max, **lab_labels_min}

# All vital-sign / chart item ids in one dict (same overwrite rule as above)
ChartItems = {**chart_labels_mean, **chart_labels_max, **chart_labels_min}

# 4. Descriptive analysis

In [1]:
# Dependencies for the descriptive analysis
import pandas as pd

# Load the cleaned master dataframe (v3 pickle) from the feature folder
feature_folder = 'CardiomegalyBiomarkers/Cardiomegaly_Classification/MIMIC_features/'
df_master_cleaned = pd.read_pickle(f'{feature_folder}MIMIC_features_v3.pkl')

In [2]:
# Preview the cleaned master table (pandas truncates the rendered rows and columns)
df_master_cleaned

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los,Match,study_id,...,Hematocrit_mean,PTT_mean,WBC_mean,Hemoglobin_max,Potassium_max,Troponin-T_max,WBC_max,Hemoglobin_min,Potassium_min,WBC_min
0,10000032,29079034,39553978,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2180-07-23 14:00:00,2180-07-23 23:50:47,0.410266,1,53911762,...,33.450000,35.900000,,,5.2,,,,4.7,
1,10000980,26913865,39765666,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2189-06-27 08:42:00,2189-06-27 20:38:27,0.497535,1,51967283,...,26.783333,47.400000,,,4.5,0.15,,,4.2,
2,10001217,24597018,37067082,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-11-20 19:18:02,2157-11-21 22:08:00,1.118032,1,52067803,...,35.200000,33.650000,,,4.2,,,,3.6,
3,10001217,27703517,34592300,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-12-19 15:42:24,2157-12-20 14:27:41,0.948113,1,58913004,...,37.900000,32.550000,,,4.3,,,,4.1,
4,10001884,26184834,37510196,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2131-01-11 04:20:05,2131-01-20 08:27:30,9.171817,1,57156853,...,29.436842,58.061765,12.0,12.1,7.1,0.08,21.0,8.6,3.6,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22977,19999068,21606769,30143796,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2161-08-24 05:26:00,2161-08-30 23:48:04,6.765324,1,52434977,...,38.112500,31.500000,,,4.2,,,,3.1,
22978,19999287,20175828,35165301,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2197-08-04 00:02:00,2197-08-08 16:58:17,4.705752,1,53255195,...,34.771429,29.200000,,12.0,4.3,,,12.0,3.7,
22979,19999287,22997012,37692584,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2197-07-26 03:31:06,2197-07-27 16:07:29,1.525266,1,51885769,...,33.975000,,28.5,,4.3,,56.0,,3.8,1.0
22980,19999442,26785317,32336619,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2148-11-19 14:23:43,2148-11-26 13:12:15,6.950370,1,58708861,...,38.138462,26.140000,58.0,,4.9,,58.0,,3.6,58.0


In [11]:
# Snippet to remove all rows with 'Uncertain' or 'Disagreement' values from the df
#mask = ~df.apply(
#   lambda row: row.isin(['Uncertain', 'Disagreement']).any(), axis=1
#)
#df = df[mask]

In [9]:
# Column groups of the master table; several are used below to slice it
indexing_cols = ['subject_id', 'study_id']
imaging_cols = ['ViewPosition', 'path']
icu_cols = [
    'hadm_id', 'stay_id', 'first_careunit', 'last_careunit',
    'intime', 'outtime', 'los', 'Match',
    'EarlyBoundary', 'PostGapStart', 'PostGapStop',
]
label_cols = [
    'split',
    'No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly',
    'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation',
    'Pneumonia', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion',
    'Pleural Other', 'Fracture', 'Support Devices',
]
demographic_cols = ['ethnicity', 'anchor_age', 'anchor_year', 'gender']

# Feature column names, taken from the values of the label dictionaries
chart_labels_mean_cols = [*chart_labels_mean.values()]
chart_labels_max_cols = [*chart_labels_max.values()]
chart_labels_min_cols = [*chart_labels_min.values()]
lab_labels_mean_cols = [*lab_labels_mean.values()]
lab_labels_max_cols = [*lab_labels_max.values()]
lab_labels_min_cols = [*lab_labels_min.values()]

In [8]:
# Show the split and finding-label columns of the master table
df_master_cleaned[label_cols]

Unnamed: 0,split,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
0,train,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,train,,,,,1,,,1,,,,,,
2,train,,,,,,,,,1,,,,,
3,train,,,Disagreement,,,,,,,Disagreement,,,,Disagreement
4,train,,,,,,1,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22977,train,1,0,0,0,0,0,0,0,0,0,0,0,0,0
22978,train,,,,1,1,,,,,,,,,
22979,train,,,,,1,,,,1,,Uncertain,,,
22980,train,1,0,0,0,0,0,0,0,0,0,0,0,0,1


### Chart Values

In [10]:
# Show the mean-aggregated chart features
df_master_cleaned[chart_labels_mean_cols]

Unnamed: 0,HR_mean,SpO2_mean,Temp(F)_mean,RR_mean,ABPm_mean,ABPd_mean,ABPs_mean,NBPd_mean,NBPm_mean,NBPs_mean,FiO2_mean,PH_mean,PCO2_mean,SaO2_mean,PlateletCount_mean,Albumin_mean,Cholesterol_mean,Sodium_mean,PO2_mean
0,96.500000,96.300000,98.966667,20.700000,,,,54.100000,62.300000,88.900000,,,,,,,,132.000000,
1,73.636364,98.909091,98.133333,20.545455,,,,83.272727,97.545455,142.454545,40.000000,,,,,,,,
2,93.296296,96.074074,99.066667,21.259259,,,,81.333333,93.500000,136.296296,,,,,285.0,,,138.000000,
3,79.600000,93.880000,98.133333,16.640000,,,,73.478261,83.217391,115.869565,,,,,299.0,,,139.000000,
4,70.800000,97.648649,98.989130,19.044444,,,,74.669683,90.529412,129.176471,45.000000,7.355,54.000000,91.0,136.2,3.075,,136.500000,75.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22977,78.969512,97.433735,97.915000,13.477273,,,,497.085890,84.453988,118.865031,52.500000,,39.666667,,135.0,2.800,,141.571429,127.333333
22978,89.743119,95.008929,98.407692,19.870370,76.461538,54.538462,114.692308,57.708333,72.510204,116.468750,60.555556,,60.666667,98.0,384.4,,,139.800000,130.000000
22979,85.463415,95.463415,98.918182,19.219512,,,,62.342105,75.763158,120.473684,,,,,233.0,,,136.000000,
22980,59.453039,97.644444,98.548837,15.961326,95.810811,82.805556,127.472222,79.181818,93.548872,140.431818,42.000000,,34.250000,,130.5,3.900,,139.750000,167.750000


In [11]:
# Number of missing values per mean-aggregated chart feature
df_master_cleaned[chart_labels_mean_cols].isna().sum()

HR_mean                   6
SpO2_mean                36
Temp(F)_mean            329
RR_mean                  11
ABPm_mean             15452
ABPd_mean             15541
ABPs_mean             15540
NBPd_mean               225
NBPm_mean               211
NBPs_mean               223
FiO2_mean             11129
PH_mean               15471
PCO2_mean             12528
SaO2_mean             17876
PlateletCount_mean      865
Albumin_mean          14941
Cholesterol_mean      21870
Sodium_mean             722
PO2_mean              12528
dtype: int64

In [12]:
# First rows of the max-aggregated chart features
df_master_cleaned[chart_labels_max_cols].head()

Unnamed: 0,HR_max,RR_max,ABPm_max,ABPd_max,ABPs_max,NBPd_max,NBPm_max,NBPs_max,FiO2_max,PCO2_max,Sodium_max
0,105.0,24.0,,,,59.0,67.0,95.0,,,132.0
1,80.0,25.0,,,,127.0,135.0,158.0,40.0,,
2,106.0,27.0,,,,123.0,130.0,153.0,,,138.0
3,96.0,22.0,,,,97.0,114.0,167.0,,,139.0
4,87.0,32.0,,,,123.0,130.0,193.0,100.0,60.0,140.0


In [13]:
# Number of missing values per max-aggregated chart feature
df_master_cleaned[chart_labels_max_cols].isna().sum()

HR_max            6
RR_max           11
ABPm_max      15452
ABPd_max      15541
ABPs_max      15540
NBPd_max        225
NBPm_max        211
NBPs_max        223
FiO2_max      11129
PCO2_max      12528
Sodium_max      722
dtype: int64

In [14]:
# First rows of the min-aggregated chart features
df_master_cleaned[chart_labels_min_cols].head()

Unnamed: 0,HR_min,SpO2_min,RR_min,ABPm_min,ABPd_min,ABPs_min,NBPd_min,NBPm_min,NBPs_min,PCO2_min,Sodium_min
0,91.0,94.0,16.0,,,,41.0,56.0,82.0,,132.0
1,68.0,96.0,14.0,,,,69.0,83.0,130.0,,
2,78.0,92.0,13.0,,,,67.0,80.0,117.0,,138.0
3,66.0,90.0,11.0,,,,57.0,66.0,94.0,,139.0
4,38.0,45.0,8.0,,,,12.0,46.0,86.0,49.0,132.0


In [15]:
# Number of missing values per min-aggregated chart feature
df_master_cleaned[chart_labels_min_cols].isna().sum()

HR_min            6
SpO2_min         36
RR_min           11
ABPm_min      15452
ABPd_min      15541
ABPs_min      15540
NBPd_min        225
NBPm_min        211
NBPs_min        223
PCO2_min      12528
Sodium_min      722
dtype: int64

### Lab Values

In [16]:
# Show the mean-aggregated lab features
df_master_cleaned[lab_labels_mean_cols]

Unnamed: 0,Tidal_Volume_mean,Urea_Nitrogren_mean,Alkaline_Phosphatase_mean,Calcium_Total_mean,Chloride_mean,Glucose_mean,Lactate_mean,Magnesium_mean,Phosphate_mean,Potassium_mean,Bilirubin,Troponin-T_mean,Hematocrit_mean,Hemoglobin_mean,ALT_mean,Creatinine_mean,PTT_mean,WBC_mean,Fibrinogen
0,,32.000000,77.500000,9.033333,100.333333,114.333333,,2.133333,2.333333,4.966667,2.200000,,33.450000,,105.000000,0.433333,35.900000,,
1,,45.285714,61.000000,9.300000,106.000000,139.285714,,2.250000,4.783333,4.314286,0.300000,0.130000,26.783333,,12.000000,2.071429,47.400000,,
2,,9.000000,,8.500000,103.200000,102.600000,,2.100000,2.600000,3.920000,,,35.200000,,,0.500000,33.650000,,
3,,11.400000,56.000000,9.020000,104.000000,92.200000,,2.025000,3.960000,4.240000,0.400000,,37.900000,,12.000000,0.460000,32.550000,,
4,400.0,25.611111,56.727273,8.994444,95.777778,149.444444,1.800000,2.194444,3.172222,4.638889,0.372727,0.060000,29.436842,10.35,521.636364,0.905556,58.061765,12.0,339.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22977,400.0,3.625000,,8.387500,107.125000,109.250000,0.700000,1.988889,3.062500,3.700000,,,38.112500,,29.000000,0.625000,31.500000,,
22978,400.0,10.285714,,9.685714,102.285714,111.285714,0.800000,1.942857,3.057143,3.942857,,,34.771429,12.00,,0.671429,29.200000,,
22979,,9.500000,,9.400000,103.500000,101.666667,,1.925000,2.750000,4.075000,,,33.975000,,,0.700000,,28.5,
22980,,17.916667,50.000000,8.572727,104.583333,111.500000,2.266667,2.133333,2.858333,4.000000,0.500000,,38.138462,,37.666667,0.691667,26.140000,58.0,


In [17]:
# Number of missing values per mean-aggregated lab feature
df_master_cleaned[lab_labels_mean_cols].isna().sum()

Tidal_Volume_mean            18262
Urea_Nitrogren_mean            209
Alkaline_Phosphatase_mean     7658
Calcium_Total_mean             360
Chloride_mean                  208
Glucose_mean                   213
Lactate_mean                  6607
Magnesium_mean                 260
Phosphate_mean                 354
Potassium_mean                 207
Bilirubin                     7569
Troponin-T_mean              15981
Hematocrit_mean                226
Hemoglobin_mean              16581
ALT_mean                      7514
Creatinine_mean                205
PTT_mean                      2257
WBC_mean                     12210
Fibrinogen                   16882
dtype: int64

In [18]:
# First rows of the max-aggregated lab features
df_master_cleaned[lab_labels_max_cols].head()

Unnamed: 0,Potassium_max,Troponin-T_max,Hemoglobin_max,WBC_max
0,5.2,,,
1,4.5,0.15,,
2,4.2,,,
3,4.3,,,
4,7.1,0.08,12.1,21.0


In [19]:
# Number of missing values per max-aggregated lab feature
df_master_cleaned[lab_labels_max_cols].isna().sum()

Potassium_max       207
Troponin-T_max    15981
Hemoglobin_max    16581
WBC_max           12210
dtype: int64

In [20]:
# First rows of the min-aggregated lab features
df_master_cleaned[lab_labels_min_cols].head()

Unnamed: 0,Potassium_min,Hemoglobin_min,WBC_min
0,4.7,,
1,4.2,,
2,3.6,,
3,4.1,,
4,3.6,8.6,3.0


In [21]:
# Number of missing values per min-aggregated lab feature
df_master_cleaned[lab_labels_min_cols].isna().sum()

Potassium_min       207
Hemoglobin_min    16581
WBC_min           12210
dtype: int64

### Disease label (Cardiomegaly) analysis

In [16]:
# Whole-cohort summary: describe() of label / age / length of stay, then
# absolute counts and proportions for each demographic column
print(df_master_cleaned[['Cardiomegaly', 'anchor_age', 'los']].describe())
for demographic in ('ethnicity', 'gender'):
    for as_share in (False, True):
        print(df_master_cleaned[[demographic]].value_counts(normalize=as_share))

         anchor_age           los
count  22982.000000  22982.000000
mean      65.132756      3.558581
std       16.829952      4.908022
min       18.000000      0.005984
25%       55.000000      1.122344
50%       67.000000      2.006146
75%       78.000000      3.859320
max       99.000000     99.638449
ethnicity      
White              15394
Black               3123
Other               2641
Hispanic/Latino     1038
Asian                786
Name: count, dtype: int64
ethnicity      
White              0.669829
Black              0.135889
Other              0.114916
Hispanic/Latino    0.045166
Asian              0.034201
Name: proportion, dtype: float64
gender
M         12573
F         10409
Name: count, dtype: int64
gender
M         0.54708
F         0.45292
Name: proportion, dtype: float64


In [17]:
# Same summary restricted to class 1 (cardiomegaly positive)
print(
    df_master_cleaned.loc[
        df_master_cleaned['Cardiomegaly'] == 1, ['Cardiomegaly', 'anchor_age', 'los']
    ].describe()
)
for demographic in ('ethnicity', 'gender'):
    for as_share in (False, True):
        print(
            df_master_cleaned.loc[
                df_master_cleaned['Cardiomegaly'] == 1, [demographic]
            ].value_counts(normalize=as_share)
        )

        anchor_age          los
count  4867.000000  4867.000000
mean     68.320526     3.815205
std      15.735692     5.361110
min      19.000000     0.030359
25%      58.000000     1.204138
50%      70.000000     2.166574
75%      81.000000     4.131302
max      99.000000    99.638449
ethnicity      
White              3219
Black               793
Other               495
Hispanic/Latino     201
Asian               159
Name: count, dtype: int64
ethnicity      
White              0.661393
Black              0.162934
Other              0.101705
Hispanic/Latino    0.041299
Asian              0.032669
Name: proportion, dtype: float64
gender
M         2528
F         2339
Name: count, dtype: int64
gender
M         0.519416
F         0.480584
Name: proportion, dtype: float64


In [18]:
# Same summary restricted to class 0 (cardiomegaly negative)
print(
    df_master_cleaned.loc[
        df_master_cleaned['Cardiomegaly'] == 0, ['Cardiomegaly', 'anchor_age', 'los']
    ].describe()
)
for demographic in ('ethnicity', 'gender'):
    for as_share in (False, True):
        print(
            df_master_cleaned.loc[
                df_master_cleaned['Cardiomegaly'] == 0, [demographic]
            ].value_counts(normalize=as_share)
        )

        anchor_age          los
count  6556.000000  6556.000000
mean     60.658938     3.091404
std      17.703135     4.407884
min      18.000000     0.011481
25%      50.000000     1.020657
50%      62.000000     1.803345
75%      74.000000     3.174002
max      98.000000    95.838218
ethnicity      
White              4296
Black               905
Other               772
Hispanic/Latino     339
Asian               244
Name: count, dtype: int64
ethnicity      
White              0.655278
Black              0.138041
Other              0.117755
Hispanic/Latino    0.051708
Asian              0.037218
Name: proportion, dtype: float64
gender
M         3602
F         2954
Name: count, dtype: int64
gender
M         0.54942
F         0.45058
Name: proportion, dtype: float64


# 1. Get MIMIC-IV data (non-imaging) 

In [None]:
# Import functions
import pandas as pd

from CardiomegalyBiomarkers.Cardiomegaly_Classification.src.data_pipeline_functions import filter_pd_read_chunkwise

In [None]:
# MIMIC-IV: Extract the necessary chart events chunkwise from the raw
# chartevents file, using chunks of `chunk_size`.
# Only the item ids (the keys of ChartItems) are passed as filter values;
# presumably rows are kept when their `filter_col` value is among them -
# confirm in data_pipeline_functions.filter_pd_read_chunkwise.
df_icu_timeseries = filter_pd_read_chunkwise(
    file_path=chart_events_path,
    filter_col=filter_col,
    filter_list=ChartItems.keys(),
    chunksize=chunk_size,
)

In [None]:
# MIMIC-IV: Extract the necessary lab events chunkwise from the raw labevents
# file, filtering on the item ids that are the keys of LabItems
df_icu_lab = filter_pd_read_chunkwise(
    file_path=lab_events_path,
    filter_col=filter_col,
    filter_list=LabItems.keys(),
    chunksize=chunk_size,
)

In [None]:
# Save the extracted chart events; section 3 reloads them from this path
df_icu_timeseries.to_pickle(relevant_chart_events_save_path)

In [None]:
# Save the extracted lab events; section 3 reloads them from this path
df_icu_lab.to_pickle(relevant_lab_events_save_path)

# 2. Match MIMIC-IV (non-imaging) and MIMIC-CXR (imaging) data

In [12]:
# Import functions
import os
import datetime
import numpy as np
import pandas as pd
from typing import List, Dict
import matplotlib.pyplot as plt

from CardiomegalyBiomarkers.Cardiomegaly_Classification.src.utils.pandas_utils import explode, create_pivot, filter_df_isin
from CardiomegalyBiomarkers.Cardiomegaly_Classification.src.data_pipeline_functions import (x_ray_dataframe_generator, x_ray_dataframe_generator_v2, icu_xray_matcher, icu_xray_matcher_v2)

In [13]:
# Load the MIMIC-CXR tables: split assignment, image metadata, record list
# and the NegBio / CheXpert label files (comma-separated, header in first row)
df_split = pd.read_csv(df_split_path)
df_metadata = pd.read_csv(cxr_metadata_path)
df_cxr_records = pd.read_csv(cxr_records_path)
df_nb = pd.read_csv(negbio_path)
df_cx = pd.read_csv(chexpert_path)

In [None]:
# For Cardiomegaly Only
#df_xray = x_ray_dataframe_generator(
#    label=label,
#    df_cxr_records=df_cxr_records,
#    df_nb=df_nb,
#    df_cx=df_cx,
#    df_cxr_meta_data=df_metadata,
#    df_split=df_split,
#    view='PA')

In [14]:
# MIMIC-CXR: Create X-Ray dataframes (the table will only contain the paths to the actual pictures)
# The finding labels handed to the generator; the same list is reused below
# for the ICU matching step.
labels = [
    'No Finding',
    'Enlarged Cardiomediastinum',
    'Cardiomegaly',
    'Lung Opacity',
    'Lung Lesion',
    'Edema',
    'Consolidation',
    'Pneumonia',
    'Atelectasis',
    'Pneumothorax',
    'Pleural Effusion',
    'Pleural Other',
    'Fracture',
    'Support Devices']

# Build the X-ray table from the record list, both label files, the metadata
# and the split table. NOTE(review): how the NegBio and CheXpert labels are
# reconciled is decided inside x_ray_dataframe_generator_v2 - not visible here.
df_xray_v2 = x_ray_dataframe_generator_v2(
    labels=labels,
    df_cxr_records=df_cxr_records,
    df_nb=df_nb,
    df_cx=df_cx,
    df_cxr_meta_data=df_metadata,
    df_split=df_split)

In [16]:
# Link X-rays to ICU stays if they fall in the time window defined by
# days_before_icu, xray_gap_after_icu, and xray_max_time_after_icu
df_icu_stays = pd.read_csv(icu_stays_path)

# The matching logic itself lives in icu_xray_matcher_v2 (not visible here)
df_icu_xray_v2 = icu_xray_matcher_v2(
    labels=labels,
    days_before_icu=days_before_icu,
    xray_gap_after_icu=xray_gap_after_icu,
    xray_max_time_after_icu=xray_max_time_after_icu,
    df_xray=df_xray_v2,
    df_icu_stays=df_icu_stays)

In [17]:
# Save the matched ICU / X-ray table; section 3 reloads it from this path
df_icu_xray_v2.to_pickle(df_icu_xray_path)

# 3. Combine features and clean data

In [9]:
# Import functions
import sys
import importlib
import pandas as pd
import CardiomegalyBiomarkers.Cardiomegaly_Classification.src.data_pipeline_functions as data_pipeline_functions
# Re-import the pipeline module first, so the `from ... import` names bound
# below are taken from the reloaded module
importlib.reload(data_pipeline_functions)

from typing import List, Dict
from CardiomegalyBiomarkers.Cardiomegaly_Classification.src.utils.pandas_utils import explode, create_pivot, filter_df_isin
from CardiomegalyBiomarkers.Cardiomegaly_Classification.src.data_pipeline_functions import explode_icu_stays, dfCleaningNoIDP
from CardiomegalyBiomarkers.Cardiomegaly_Classification.src.data_pipeline_functions import SignalTableGeneratorNoIDP

In [21]:
# Load data: MIMIC-IV patients table
df_patients = pd.read_csv(patients_table_path)

In [22]:
# Load data: MIMIC-IV admissions table
df_admissions = pd.read_csv(admissions_table_path)

In [18]:
# Reload the matched ICU / X-ray table saved in section 2
df_icu_xray = pd.read_pickle(df_icu_xray_path)

In [20]:
# Inspect the StudyDate column of the matched table
df_icu_xray['StudyDate']

0        21800723
1        21890627
2        21571118
3        21571218
4        21310110
           ...   
24269    21610831
24270    21970803
24271    21970726
24272    21481119
24273    21451102
Name: StudyDate, Length: 24274, dtype: int64

In [23]:
# Reload the lab events extracted in section 1
df_icu_lab = pd.read_pickle(relevant_lab_events_save_path)

In [24]:
# Reload the chart events extracted in section 1
df_icu_timeseries = pd.read_pickle(relevant_chart_events_save_path)

In [25]:
# The admissions column 'ethnicity' was renamed 'race' in the version after
# MIMIC-IV v1.0 (v2.0); map it back to 'ethnicity' unless the data is v1
if MIMIC_IV_version != 1:
    df_admissions = df_admissions.rename(columns={'race': 'ethnicity'})

In [26]:
# Collate all features (MIMIC-IV features and MIMIC-CXR file paths) into one master table.
# The label dictionaries define which chart / lab items are aggregated as mean, max and min;
# `average_by` selects the aggregation granularity set in the parameters cell.
# NOTE(review): no biomarker values are passed here - they are merged later in the "Add IDPs" cell.
df_master = SignalTableGeneratorNoIDP(df_icu_xray, 
                                 df_icu_timeseries=df_icu_timeseries, 
                                 df_icu_lab=df_icu_lab, 
                                 df_patients=df_patients, 
                                 df_admissions=df_admissions, 
                                 chart_labels_mean=chart_labels_mean, 
                                 chart_labels_max=chart_labels_max, 
                                 chart_labels_min=chart_labels_min, 
                                 lab_labels_mean=lab_labels_mean, 
                                 lab_labels_max=lab_labels_max, 
                                 lab_labels_min=lab_labels_min, 
                                 average_by=average_by)

In [27]:
# Clean master table (the cleaning rules are defined in dfCleaningNoIDP, not visible here)
df_master_cleaned = dfCleaningNoIDP(df_master)

In [28]:
# Save cleaned master dataframe as the v3 feature file in the feature folder
df_master_cleaned.to_pickle(feature_folder + 'MIMIC_features_v3.pkl')

### Add IDPs

In [12]:
import os
import numpy as np
import pandas as pd

# Biomarker and df paths
# NOTE(review): ctr_path / cpar_path are also assigned in the "Global paths"
# cell with different, relative locations; the absolute paths below replace
# them from here on - confirm which location is current.
ctr_path =  '/home/ays124/mimic/CardiomegalyBiomarkers/ctr-cpar-cardiomegaly/CTRs.csv'
cpar_path = '/home/ays124/mimic/CardiomegalyBiomarkers/ctr-cpar-cardiomegaly/CPARs.csv'
# Loads the MIMIC_features_v3.pkl feature file; note that the name df_master
# is reused here, whereas section 3 used it for the table before cleaning.
df_master = pd.read_pickle('/home/ays124/mimic/CardiomegalyBiomarkers/Cardiomegaly_Classification/MIMIC_features/MIMIC_features_v3.pkl')
df_ctr = pd.read_csv(ctr_path)
df_cpar = pd.read_csv(cpar_path)

# Attach the CTR and CPAR biomarkers to the master table.
# The join key 'dicom_file' is cut out of the image path with a fixed slice
# (presumably the file name without its 4-character extension - confirm
# against the path layout). Both merges use the default join type, so rows
# without a match in either biomarker file are dropped. The helper key
# column is removed again at the end.
df_master = (
    df_master
    .assign(dicom_file=df_master.path.str[-48:-4])
    .merge(df_ctr, on='dicom_file')
    .merge(df_cpar, on='dicom_file')
    .drop(columns='dicom_file')
)

# Replace unrealistic CTR and CPAR values (>= 1) with NaN
df_master['CTR'] = df_master['CTR'].mask(df_master['CTR'] >= 1, np.nan)
df_master['CPAR'] = df_master['CPAR'].mask(df_master['CPAR'] >= 1, np.nan)

In [14]:
# (rows, columns) of the table after merging the biomarkers
df_master.shape

(2375, 104)

In [15]:
# (rows, columns) of the rows assigned to the training split
df_master[df_master['split'] == 'train'].shape

(2306, 104)

In [16]:
# (rows, columns) of the rows assigned to the test split
df_master[df_master['split'] == 'test'].shape

(46, 104)

In [13]:
# Preview the first rows, now including the CTR and CPAR columns
df_master.head()

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los,Match,study_id,...,WBC_mean,Hemoglobin_max,Potassium_max,Troponin-T_max,WBC_max,Hemoglobin_min,Potassium_min,WBC_min,CTR,CPAR
0,10001217,24597018,37067082,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-11-20 19:18:02,2157-11-21 22:08:00,1.118032,1,52067803,...,,,4.2,,,,3.6,,0.47855,0.336592
1,10003019,22774359,30676350,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2175-10-08 18:58:00,2175-10-09 11:59:16,0.709213,1,58505074,...,,,4.8,,,,3.1,,0.528035,0.378997
2,10004457,23251352,31494479,Cardiac Vascular Intensive Care Unit (CVICU),Cardiac Vascular Intensive Care Unit (CVICU),2141-12-17 10:24:25,2141-12-18 14:16:17,1.161019,1,55439624,...,,10.5,4.7,,,6.8,3.9,,0.47013,0.279968
3,10010471,29842315,32119961,Coronary Care Unit (CCU),Coronary Care Unit (CCU),2155-12-02 20:33:00,2155-12-07 18:19:18,4.907153,1,51674194,...,,,6.0,7.31,,,4.7,,0.592068,0.411266
4,10021938,27154822,33083787,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2181-10-25 11:35:00,2181-10-26 20:53:57,1.38816,1,54308908,...,,,5.5,,,,4.5,,0.53882,


In [19]:
# Save the feature table with the added CTR / CPAR columns (absolute, user-specific path)
df_master.to_pickle('/home/ays124/mimic/CardiomegalyBiomarkers/Cardiomegaly_Classification/MIMIC_features/MIMIC_features_with_IDPs.pkl')