# 0. Initiate

## Paths

In [1]:
# Global paths
# NOTE(review): the two raw-data roots below are absolute, user-specific
# locations; point them at your own copies of the datasets before running.

# Raw data: MIMIC-IV tables (every path is the root plus a relative file name)
mimic_iv_path = '/n/scratch/users/a/ays124/mimiciv/'
icu_stays_path = f'{mimic_iv_path}icu/icustays.csv.gz'
chart_events_path = f'{mimic_iv_path}icu/chartevents.csv.gz'
patients_table_path = f'{mimic_iv_path}hosp/patients.csv.gz'
admissions_table_path = f'{mimic_iv_path}hosp/admissions.csv.gz'
lab_events_path = f'{mimic_iv_path}hosp/labevents.csv.gz'

# Raw data: MIMIC-CXR-JPG tables
mimic_cxr_path = '/n/scratch/users/a/ays124/mimic-cxr-jpg/'
cxr_records_path = f'{mimic_cxr_path}cxr-record-list.csv.gz'
cxr_metadata_path = f'{mimic_cxr_path}mimic-cxr-2.0.0-metadata.csv.gz'
df_split_path = f'{mimic_cxr_path}mimic-cxr-2.0.0-split.csv.gz'
negbio_path = f'{mimic_cxr_path}mimic-cxr-2.0.0-negbio.csv.gz'
chexpert_path = f'{mimic_cxr_path}mimic-cxr-2.0.0-chexpert.csv.gz'

# Folder holding every intermediate / final feature file (relative path)
feature_folder = 'CardiomegalyBiomarkers/Cardiomegaly_Classification/MIMIC_features/'

# MIMIC intermediate files written and re-read by the pipeline cells
relevant_chart_events_save_path = f'{feature_folder}RelevantChartEvents.pkl'
relevant_lab_events_save_path = f'{feature_folder}RelevantLabEvents.pkl'
df_icu_xray_path = f'{feature_folder}IcuXrayMatched.pkl'

# Biomarker CSV files (CTR and CPAR)
ctr_path = 'CardiomegalyBiomarkers/Biomarker_Extraction/save_folder/CTR/CTRs.csv'
cpar_path = 'CardiomegalyBiomarkers/Biomarker_Extraction/save_folder/CPAR/CPARs.csv'

# Final cleaned features
features_path = f'{feature_folder}MIMIC_features.pkl'

##  Parameters

In [2]:
# General Parameters

# Label of the target disease. One of: 'Atelectasis', 'Cardiomegaly',
# 'Consolidation', 'Edema', 'Enlarged Cardiomediastinum', 'Fracture',
# 'Lung Lesion', 'Lung Opacity', 'No Finding', 'Pleural Effusion',
# 'Pleural Other', 'Pneumonia', 'Pneumothorax', 'Support Devices'.
label = 'Cardiomegaly'

# Chunk size used when iterating through the original lab and chart event
# files to extract the relevant rows.
chunk_size = 10_000_000

# MIMIC-CXR (imaging) Parameters

# X-ray view position of interest: 'AP', 'PA', or None.
# NOTE(review): the x_ray_dataframe_generator call in this notebook passes
# view='PA' literally instead of this variable — confirm which is intended.
view = None

# MIMIC-IV (non-imaging) Parameters

# Version of MIMIC-IV downloaded.
MIMIC_IV_version = 3

# Number of days before ICU admission in which we look for X-rays.
days_before_icu = 365

# Optional 'gap' (in days) after ICU discharge in which no X-rays are considered.
xray_gap_after_icu = 0

# X-rays are searched between Gap and Gap + xray_max_time_after_icu days after
# out-time; with no gap this is simply the number of days after ICU discharge.
xray_max_time_after_icu = 90

# 'Hourly': average readings every hour, one hour per row.
# 'Stay': average chart and lab values across a whole stay.
average_by = 'Stay'

# Column used to select the features for time-series preparation.
filter_col = 'itemid'

In [3]:
# Labels of the desired non-imaging features.
#
# Every dictionary maps an item id (matched against the `filter_col` column when
# the raw event files are filtered) to a label string; the labels are presumably
# the resulting feature names — verify against SignalTableGeneratorNoIDP.

# Chart items aggregated by mean
chart_labels_mean = {
    220045: 'HR_mean',
    220277: 'SpO2_mean',
    223761: 'Temp(F)_mean',
    220210: 'RR_mean',
    220052: 'ABPm_mean',
    220051: 'ABPd_mean',
    220050: 'ABPs_mean',
    220180: 'NBPd_mean',
    220181: 'NBPm_mean',
    220179: 'NBPs_mean',
    223835: 'FiO2_mean',
    220274: 'PH_mean',
    220235: 'PCO2_mean',
    220227: 'SaO2_mean',
    227457: 'PlateletCount_mean',
    227456: 'Albumin_mean',
    220603: 'Cholesterol_mean',
    220645: 'Sodium_mean',
    220224: 'PO2_mean',
}

# Chart items aggregated by maximum
chart_labels_max = {
    220045: 'HR_max',
    220210: 'RR_max',
    220052: 'ABPm_max',
    220051: 'ABPd_max',
    220050: 'ABPs_max',
    220180: 'NBPd_max',
    220181: 'NBPm_max',
    220179: 'NBPs_max',
    223835: 'FiO2_max',
    220235: 'PCO2_max',
    220645: 'Sodium_max',
}

# Chart items aggregated by minimum
chart_labels_min = {
    220045: 'HR_min',
    220277: 'SpO2_min',
    220210: 'RR_min',
    220052: 'ABPm_min',
    220051: 'ABPd_min',
    220050: 'ABPs_min',
    220180: 'NBPd_min',
    220181: 'NBPm_min',
    220179: 'NBPs_min',
    220235: 'PCO2_min',
    220645: 'Sodium_min',
}

# Laboratory items aggregated by mean
# NOTE(review): 'Bilirubin' and 'Fibrinogen' carry no '_mean' suffix and
# 'Urea_Nitrogren_mean' is spelled with an extra 'r'. The strings are kept
# unchanged because they are handed on to SignalTableGeneratorNoIDP; confirm
# nothing downstream relies on them before renaming.
lab_labels_mean = {
    50826: 'Tidal_Volume_mean',
    51006: 'Urea_Nitrogren_mean',
    50863: 'Alkaline_Phosphatase_mean',
    50893: 'Calcium_Total_mean',
    50902: 'Chloride_mean',
    50931: 'Glucose_mean',
    50813: 'Lactate_mean',
    50960: 'Magnesium_mean',
    50970: 'Phosphate_mean',
    50971: 'Potassium_mean',
    50885: 'Bilirubin',
    51003: 'Troponin-T_mean',
    51221: 'Hematocrit_mean',
    50811: 'Hemoglobin_mean',
    50861: 'ALT_mean',
    50912: 'Creatinine_mean',
    51275: 'PTT_mean',
    51516: 'WBC_mean',
    51214: 'Fibrinogen',
}

# Laboratory items aggregated by maximum
lab_labels_max = {
    50971: 'Potassium_max',
    51003: 'Troponin-T_max',
    50811: 'Hemoglobin_max',
    51516: 'WBC_max',
}

# Laboratory items aggregated by minimum
lab_labels_min = {
    50971: 'Potassium_min',
    50811: 'Hemoglobin_min',
    51516: 'WBC_min',
}

# Union of all laboratory item ids. Where an id occurs in several dictionaries
# the label from the last one (max, then min) wins; the extraction cells only
# use the keys of this dictionary.
LabItems = {**lab_labels_mean, **lab_labels_max, **lab_labels_min}

# Union of all vital-sign / chart item ids, built the same way.
ChartItems = {**chart_labels_mean, **chart_labels_max, **chart_labels_min}

# 1. Data pipeline

## Get MIMIC-IV data (non-imaging) 

In [1]:
# Import functions
import pandas as pd

from CardiomegalyBiomarkers.Cardiomegaly_Classification.src.data_pipeline_functions import filter_pd_read_chunkwise

In [5]:
# MIMIC-IV: Extract necessary features chunkwise
# Passes the chart events file to filter_pd_read_chunkwise together with the
# ChartItems item ids, the column to filter on (`filter_col`) and the chunk size.
# NOTE(review): presumably keeps only rows whose `filter_col` value is in
# `filter_list` — confirm against the helper's implementation.
df_icu_timeseries = filter_pd_read_chunkwise(
    file_path=chart_events_path,
    filter_col=filter_col,
    filter_list=ChartItems.keys(),
    chunksize=chunk_size,
)

In [6]:
# Same chunkwise extraction for the lab events file, using the LabItems item ids.
# NOTE(review): presumably keeps only rows whose `filter_col` value is in
# `filter_list` — confirm against the helper's implementation.
df_icu_lab = filter_pd_read_chunkwise(
    file_path=lab_events_path,
    filter_col=filter_col,
    filter_list=LabItems.keys(),
    chunksize=chunk_size,
)

In [11]:
# Persist the extracted chart events; the "Combine features" section reloads this pickle.
df_icu_timeseries.to_pickle(relevant_chart_events_save_path)

In [12]:
# Persist the extracted lab events; the "Combine features" section reloads this pickle.
df_icu_lab.to_pickle(relevant_lab_events_save_path)

## Match MIMIC-IV (non-imaging) and MIMIC-CXR (imaging) data

In [3]:
# Import functions
import os
import datetime
import numpy as np
import pandas as pd
from typing import List, Dict
import matplotlib.pyplot as plt

from CardiomegalyBiomarkers.Cardiomegaly_Classification.src.utils.pandas_utils import explode, create_pivot, filter_df_isin
from CardiomegalyBiomarkers.Cardiomegaly_Classification.src.data_pipeline_functions import (x_ray_dataframe_generator, x_ray_dataframe_generator_v2, icu_xray_matcher)


In [4]:
# Load the MIMIC-CXR tables consumed by the X-ray dataframe generators:
# split file, metadata file, record list, and the NegBio and CheXpert label files.
df_split = pd.read_csv(df_split_path)
df_metadata = pd.read_csv(cxr_metadata_path, header=0, sep=',')
df_cxr_records = pd.read_csv(cxr_records_path, header=0, sep=',')
df_nb = pd.read_csv(negbio_path)
df_cx = pd.read_csv(chexpert_path)

In [None]:
# For Cardiomegaly Only
# Builds the X-ray dataframe for the single target `label`.
# NOTE(review): the view is hard-coded to 'PA' here rather than taken from the
# `view` parameter, and `df_xray` is not referenced again in the visible
# notebook (the ICU matcher receives `df_xray_v2`) — confirm whether this cell
# is still needed.
df_xray = x_ray_dataframe_generator(
    label=label,
    df_cxr_records=df_cxr_records,
    df_nb=df_nb,
    df_cx=df_cx,
    df_cxr_meta_data=df_metadata,
    df_split=df_split,
    view='PA')

In [5]:
# MIMIC-CXR: Create X-Ray dataframes (the table will only contain the paths to the actual pictures)
# `labels` lists the 14 finding names requested from the generator; in the
# displayed result each of them appears as its own column.
labels = [
    'No Finding',
    'Enlarged Cardiomediastinum',
    'Cardiomegaly',
    'Lung Opacity',
    'Lung Lesion',
    'Edema',
    'Consolidation',
    'Pneumonia',
    'Atelectasis',
    'Pneumothorax',
    'Pleural Effusion',
    'Pleural Other',
    'Fracture',
    'Support Devices']

# Unlike the single-label generator call, no `view` argument is passed here.
df_xray_v2 = x_ray_dataframe_generator_v2(
    labels=labels,
    df_cxr_records=df_cxr_records,
    df_nb=df_nb,
    df_cx=df_cx,
    df_cxr_meta_data=df_metadata,
    df_split=df_split)

In [None]:
# Remove all rows with 'Uncertain' or 'Disagreement' values from the df
#mask = ~df_combined_v2.apply(
#   lambda row: row.isin(['Uncertain', 'Disagreement']).any(), axis=1
#)
#df_combined_filtered_v2 = df_combined_v2[mask]

In [6]:
df_xray_v2

Unnamed: 0,dicom_id,subject_id,study_id,StudyDate,StudyTime,ViewPosition,path,split,No Finding,Enlarged Cardiomediastinum,...,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,21800506,213014.531,PA,files/p10/p10000032/s50414267/02aa804e-bde0afd...,train,Positive,Negative,...,Negative,Negative,Negative,Negative,Negative,Negative,Negative,Negative,Negative,Negative
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,21800626,165500.312,PA,files/p10/p10000032/s53189527/2a2277a9-b0ded15...,train,Positive,Negative,...,Negative,Negative,Negative,Negative,Negative,Negative,Negative,Negative,Negative,Negative
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,21800723,80556.875,AP,files/p10/p10000032/s53911762/68b5c4b1-227d048...,train,Positive,Negative,...,Negative,Negative,Negative,Negative,Negative,Negative,Negative,Negative,Negative,Negative
5,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,10000032,53911762,21800723,80556.875,AP,files/p10/p10000032/s53911762/fffabebf-74fd3a1...,train,Positive,Negative,...,Negative,Negative,Negative,Negative,Negative,Negative,Negative,Negative,Negative,Negative
6,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,10000032,56699142,21800805,234424.765,AP,files/p10/p10000032/s56699142/ea030e7a-2e3b134...,train,Positive,Negative,...,Negative,Negative,Negative,Negative,Negative,Negative,Negative,Negative,Negative,Negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
377089,3fcd0406-9b111603-feae7033-96632b3a-111333e5,19999733,57132437,21520708,224550.171,PA,files/p19/p19999733/s57132437/3fcd0406-9b11160...,train,Positive,Negative,...,Negative,Negative,Negative,Negative,Negative,Negative,Negative,Negative,Negative,Negative
377090,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,19999733,57132437,21520708,224550.171,PA,files/p19/p19999733/s57132437/428e2c18-5721d8f...,train,Positive,Negative,...,Negative,Negative,Negative,Negative,Negative,Negative,Negative,Negative,Negative,Negative
377092,58766883-376a15ce-3b323a28-6af950a0-16b793bd,19999987,55368167,21451104,51448.218,AP,files/p19/p19999987/s55368167/58766883-376a15c...,train,,,...,Negative,,,,Positive,Negative,Negative,,,
377093,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,19999987,58621812,21451102,202809.234,AP,files/p19/p19999987/s58621812/7ba273af-3d290f8...,train,,,...,,,,,Positive,,,,,Positive


In [None]:
# Link X-Ray to ICU stays if in certain time window defined by days_before_icu, xray_gap_after_icu, and xray_max_time_after_icu
# Reads the ICU stays table and hands it, the multi-label X-ray dataframe
# (`df_xray_v2`) and the time-window parameters to icu_xray_matcher.
# NOTE(review): how `label` is used inside the matcher is not visible here — confirm.
df_icu_stays = pd.read_csv(icu_stays_path)

df_icu_xray = icu_xray_matcher(
    label=label,
    days_before_icu=days_before_icu,
    xray_gap_after_icu=xray_gap_after_icu,
    xray_max_time_after_icu=xray_max_time_after_icu,
    df_xray=df_xray_v2,
    df_icu_stays=df_icu_stays)

In [None]:
# Persist the ICU/X-ray match table; the "Combine features" section reloads this pickle.
df_icu_xray.to_pickle(df_icu_xray_path)

## Combine features and clean data

In [1]:
# Import functions
import pandas as pd

from CardiomegalyBiomarkers.Cardiomegaly_Classification.src.utils.pandas_utils import explode, create_pivot, filter_df_isin
from typing import List, Dict
from CardiomegalyBiomarkers.Cardiomegaly_Classification.src.data_pipeline_functions import explode_icu_stays, dfCleaningNoIDP
from CardiomegalyBiomarkers.Cardiomegaly_Classification.src.data_pipeline_functions import SignalTableGeneratorNoIDP

In [6]:
# Load data
# MIMIC-IV patients table (hosp/patients.csv.gz).
df_patients = pd.read_csv(patients_table_path)
# The CTR / CPAR biomarker tables are left disabled: the cells below call the
# *NoIDP functions, which are not given any biomarker table.
# df_cvtr = pd.read_csv(ctr_path)
# df_cpar = pd.read_csv(cpar_path)

In [7]:
# Row count and number of distinct subject_id values in the patients table.
print(f"Number of entries: {len(df_patients)}")
print(f"Number of patients: {df_patients.subject_id.nunique()}")

Number of entries: 364627
Number of patients: 364627


In [8]:
# MIMIC-IV admissions table (hosp/admissions.csv.gz).
df_admissions = pd.read_csv(admissions_table_path)

In [9]:
# Row count and number of distinct subject_id values in the admissions table.
print(f"Number of entries: {len(df_admissions)}")
print(f"Number of patients: {df_admissions.subject_id.nunique()}")

Number of entries: 546028
Number of patients: 223452


In [10]:
# Reload the ICU/X-ray match table written by the matching section.
# Only unpickle files you produced yourself: pickle can execute code on load.
df_icu_xray = pd.read_pickle(df_icu_xray_path)

In [14]:
df_icu_xray

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los,Match,study_id,Label,ViewPosition,path,EarlyBoundary,PostGapStart,PostGapStop,Cardiomegaly
0,10001884,26184834,37510196,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2131-01-11 04:20:05,2131-01-20 08:27:30,9.171817,1,50712381,0,AP,files/p10/p10001884/s50712381/7b25b3ed-e780a52...,2130-01-11 04:20:05,2131-01-20 08:27:30,2131-04-20 08:27:30,Positive
1,10002013,23581541,39060235,Cardiac Vascular Intensive Care Unit (CVICU),Cardiac Vascular Intensive Care Unit (CVICU),2160-05-18 10:00:53,2160-05-19 17:33:33,1.314352,1,52163036,0,PA,files/p10/p10002013/s52163036/1e647043-eed3576...,2159-05-19 10:00:53,2160-05-19 17:33:33,2160-08-17 17:33:33,Positive
2,10002428,20321825,34807493,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2156-04-30 21:53:00,2156-05-02 22:27:20,2.023843,1,54020701,0,AP,files/p10/p10002428/s54020701/40a4d537-de28a3a...,2155-05-01 21:53:00,2156-05-02 22:27:20,2156-07-31 22:27:20,Negative
3,10002428,23473524,35479615,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2156-05-11 14:49:34,2156-05-22 14:16:46,10.977222,1,52460896,0,AP,files/p10/p10002428/s52460896/54c2ed5c-f4fbc20...,2155-05-12 14:49:34,2156-05-22 14:16:46,2156-08-20 14:16:46,Negative
4,10002428,28662225,33987268,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2156-04-12 16:24:18,2156-04-17 15:57:08,4.981134,1,59098825,0,AP,files/p10/p10002428/s59098825/2337617e-d39e1d8...,2155-04-13 16:24:18,2156-04-17 15:57:08,2156-07-16 15:57:08,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16220,19998330,24492004,32641669,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2178-10-01 08:51:00,2178-10-03 23:25:08,2.607037,1,50226892,0,,files/p19/p19998330/s50226892/b675377a-a139712...,2177-10-01 08:51:00,2178-10-03 23:25:08,2179-01-01 23:25:08,Positive
16221,19998770,28494258,37676535,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2182-06-14 15:31:41,2182-06-15 08:45:48,0.718137,1,59764676,0,AP,files/p19/p19998770/s59764676/cc97d929-a31eae7...,2181-06-14 15:31:41,2182-06-15 08:45:48,2182-09-13 08:45:48,Negative
16222,19999068,21606769,30143796,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2161-08-24 05:26:00,2161-08-30 23:48:04,6.765324,1,52793893,0,,files/p19/p19999068/s52793893/af21a5e8-fcfe93a...,2160-08-24 05:26:00,2161-08-30 23:48:04,2161-11-28 23:48:04,Negative
16223,19999287,20175828,35165301,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2197-08-04 00:02:00,2197-08-08 16:58:17,4.705752,1,53282218,0,PA,files/p19/p19999287/s53282218/5a5eddf4-b64e5e4...,2196-08-04 00:02:00,2197-08-08 16:58:17,2197-11-06 16:58:17,Negative


In [15]:
# Row count and number of distinct subject_id values in the ICU/X-ray match table.
print(f"Number of entries: {len(df_icu_xray)}")
print(f"Number of patients: {df_icu_xray.subject_id.nunique()}")

Number of entries: 16225
Number of patients: 10215


In [7]:
# Reload the extracted lab events written by the MIMIC-IV extraction section.
df_icu_lab = pd.read_pickle(relevant_lab_events_save_path)

In [None]:
# Reload the extracted chart events written by the MIMIC-IV extraction section.
df_icu_timeseries = pd.read_pickle(relevant_chart_events_save_path)

In [11]:
# The admissions column 'ethnicity' of MIMIC-IV v1.0 was renamed 'race' in the
# following version (v2.0). For any version other than 1, rename it back to
# 'ethnicity' so the cells below can use one column name.
# The rename is in place; it has no effect if 'race' is absent.
if MIMIC_IV_version != 1:
    df_admissions.rename(columns={'race':'ethnicity'}, inplace=True)

In [15]:
# Collate all features into one master table: the ICU/X-ray match table
# (including the MIMIC-CXR file paths), chart and lab events, patients and
# admissions, plus the label dictionaries and the `average_by` setting.
# No image-derived biomarker table is passed in this call.
df_master = SignalTableGeneratorNoIDP(df_icu_xray, 
                                 df_icu_timeseries=df_icu_timeseries, 
                                 df_icu_lab=df_icu_lab, 
                                 df_patients=df_patients, 
                                 df_admissions=df_admissions, 
                                 chart_labels_mean=chart_labels_mean, 
                                 chart_labels_max=chart_labels_max, 
                                 chart_labels_min=chart_labels_min, 
                                 lab_labels_mean=lab_labels_mean, 
                                 lab_labels_max=lab_labels_max, 
                                 lab_labels_min=lab_labels_min, 
                                 average_by=average_by)

In [19]:
# Clean master table
# The cleaning rules live in dfCleaningNoIDP (imported from
# data_pipeline_functions); they are not visible in this notebook.
df_master_cleaned = dfCleaningNoIDP(df_master)

In [30]:
# Mapping of original ethnicities to standardized categories
# ('White', 'Black', 'Asian', 'Hispanic/Latino', 'Other').
ethnicity_mapping = {
    "WHITE": "White",
    "WHITE - OTHER EUROPEAN": "White",
    "WHITE - RUSSIAN": "White",
    "WHITE - EASTERN EUROPEAN": "White",
    "WHITE - BRAZILIAN": "White",
    "BLACK/AFRICAN AMERICAN": "Black",
    "BLACK/CAPE VERDEAN": "Black",
    "BLACK/CARIBBEAN ISLAND": "Black",
    "BLACK/AFRICAN": "Black",
    "ASIAN": "Asian",
    "ASIAN - CHINESE": "Asian",
    "ASIAN - SOUTH EAST ASIAN": "Asian",
    "ASIAN - ASIAN INDIAN": "Asian",
    "ASIAN - KOREAN": "Asian",
    # Coarse category believed to be used by the MIMIC-IV v1.x 'ethnicity'
    # column (this notebook supports MIMIC_IV_version == 1) — TODO confirm
    # against the release in use. Harmless if the value never occurs.
    "HISPANIC/LATINO": "Hispanic/Latino",
    "HISPANIC/LATINO - PUERTO RICAN": "Hispanic/Latino",
    "HISPANIC/LATINO - DOMINICAN": "Hispanic/Latino",
    "HISPANIC/LATINO - GUATEMALAN": "Hispanic/Latino",
    "HISPANIC/LATINO - SALVADORAN": "Hispanic/Latino",
    "HISPANIC OR LATINO": "Hispanic/Latino",
    "HISPANIC/LATINO - MEXICAN": "Hispanic/Latino",
    "HISPANIC/LATINO - HONDURAN": "Hispanic/Latino",
    "HISPANIC/LATINO - CUBAN": "Hispanic/Latino",
    "HISPANIC/LATINO - COLUMBIAN": "Hispanic/Latino",
    "HISPANIC/LATINO - CENTRAL AMERICAN": "Hispanic/Latino",
    "SOUTH AMERICAN": "Hispanic/Latino",
    "NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER": "Asian",
    "AMERICAN INDIAN/ALASKA NATIVE": "Other",
    "MULTIPLE RACE/ETHNICITY": "Other",
    "OTHER": "Other",
    "UNKNOWN": "Other",
    "UNABLE TO OBTAIN": "Other",
    "PATIENT DECLINED TO ANSWER": "Other",
    "PORTUGUESE": "Other"
}

# Apply the mapping to the 'ethnicity' column
df_master_cleaned['race'] = df_master_cleaned['ethnicity'].map(ethnicity_mapping)

# Series.map returns NaN for every value that is not a key of the mapping, and
# value_counts() drops NaN by default, so an uncovered category would silently
# vanish from the descriptive statistics. Report such categories instead of
# hiding them; nothing is printed when the mapping covers all values.
unmapped_mask = df_master_cleaned['race'].isna() & df_master_cleaned['ethnicity'].notna()
unmapped_categories = sorted(str(value) for value in df_master_cleaned.loc[unmapped_mask, 'ethnicity'].unique())
if unmapped_categories:
    print(
        f"WARNING: {int(unmapped_mask.sum())} rows have an 'ethnicity' value that is missing "
        f"from ethnicity_mapping and were left as NaN in 'race': {unmapped_categories}"
    )

In [31]:
# Save cleaned master dataframe
# The pickle location comes from `features_path`, declared with the other
# global paths as feature_folder + 'MIMIC_features.pkl', so the file name is
# not spelled out a second time here. The parquet copy has no dedicated path
# constant and keeps its explicit name.
df_master_cleaned.to_pickle(features_path)
df_master_cleaned.to_parquet(feature_folder + 'MIMIC_features.parquet')

# 3. Descriptive analysis

In [21]:
# Import functions
import pandas as pd

# Import cleaned master dataframe
# `features_path` is declared with the other global paths as
# feature_folder + 'MIMIC_features.pkl'; using it avoids repeating the file name.
# Only unpickle files you produced yourself: pickle can execute code on load.
df_master_cleaned = pd.read_pickle(features_path)

In [32]:
# Analysis of MIMIC features
# Summary statistics for the numeric columns, then absolute and relative
# frequencies for each categorical column (race first, then gender).
print(df_master_cleaned[['Cardiomegaly', 'anchor_age', 'los']].describe())
for categorical_column in ('race', 'gender'):
    single_column_frame = df_master_cleaned[[categorical_column]]
    print(single_column_frame.value_counts())
    print(single_column_frame.value_counts(normalize=True))

       Cardiomegaly    anchor_age           los
count  13901.000000  13901.000000  13901.000000
mean       0.732825     66.298612      4.379305
std        0.442500     16.291070      5.779057
min        0.000000     18.000000      0.005984
25%        0.000000     56.000000      1.302361
50%        1.000000     68.000000      2.432072
75%        1.000000     79.000000      4.931308
max        1.000000     99.000000     99.638449
race           
White              9313
Black              1898
Other              1609
Hispanic/Latino     608
Asian               473
Name: count, dtype: int64
race           
White              0.669952
Black              0.136537
Other              0.115747
Hispanic/Latino    0.043738
Asian              0.034026
Name: proportion, dtype: float64
gender
M         7598
F         6303
Name: count, dtype: int64
gender
M         0.546579
F         0.453421
Name: proportion, dtype: float64


In [33]:
# Analysis of MIMIC features for class 1 (cardiomegaly positive)
# Same report as for the full table, restricted to rows with Cardiomegaly == 1.
positive_rows = df_master_cleaned.loc[df_master_cleaned.Cardiomegaly == 1]
print(positive_rows[['Cardiomegaly', 'anchor_age', 'los']].describe())
for categorical_column in ('race', 'gender'):
    single_column_frame = positive_rows[[categorical_column]]
    print(single_column_frame.value_counts())
    print(single_column_frame.value_counts(normalize=True))

       Cardiomegaly    anchor_age           los
count       10187.0  10187.000000  10187.000000
mean            1.0     68.141062      4.408641
std             0.0     15.652659      5.786261
min             1.0     18.000000      0.011748
25%             1.0     58.000000      1.329063
50%             1.0     69.000000      2.501887
75%             1.0     80.000000      4.980411
max             1.0     99.000000     99.638449
race           
White              6781
Black              1523
Other              1101
Hispanic/Latino     450
Asian               332
Name: count, dtype: int64
race           
White              0.665652
Black              0.149504
Other              0.108079
Hispanic/Latino    0.044174
Asian              0.032591
Name: proportion, dtype: float64
gender
M         5471
F         4716
Name: count, dtype: int64
gender
M         0.537057
F         0.462943
Name: proportion, dtype: float64


In [34]:
# Analysis of MIMIC features for class 0 (cardiomegaly negative)
# Same report as for the full table, restricted to rows with Cardiomegaly == 0.
negative_rows = df_master_cleaned.loc[df_master_cleaned.Cardiomegaly == 0]
print(negative_rows[['Cardiomegaly', 'anchor_age', 'los']].describe())
for categorical_column in ('race', 'gender'):
    single_column_frame = negative_rows[[categorical_column]]
    print(single_column_frame.value_counts())
    print(single_column_frame.value_counts(normalize=True))

       Cardiomegaly   anchor_age          los
count        3714.0  3714.000000  3714.000000
mean            0.0    61.245019     4.298841
std             0.0    16.927986     5.759262
min             0.0    18.000000     0.005984
25%             0.0    51.000000     1.252187
50%             0.0    63.000000     2.270689
75%             0.0    73.000000     4.718325
max             0.0    98.000000    77.740706
race           
White              2532
Other               508
Black               375
Hispanic/Latino     158
Asian               141
Name: count, dtype: int64
race           
White              0.681745
Other              0.136780
Black              0.100969
Hispanic/Latino    0.042542
Asian              0.037964
Name: proportion, dtype: float64
gender
M         2127
F         1587
Name: count, dtype: int64
gender
M         0.572698
F         0.427302
Name: proportion, dtype: float64


In [None]:
# Analysis of the image-derived biomarker (CTR) by class + histograms of values
#print(df_master_cleaned.loc[:, ['CTR']].describe())
#print(df_master_cleaned.loc[df_master_cleaned.Cardiomegaly == 1, ['CTR']].describe())
#print(df_master_cleaned.loc[df_master_cleaned.Cardiomegaly == 0, ['CTR']].describe())
#
#df_master_cleaned.hist(column = 'CTR', bins=100, by='Cardiomegaly', range =[0,1])

In [None]:
# Analysis of the image-derived biomarker (CPAR) by class + histograms of values
#print(df_master_cleaned.loc[:, ['CPAR']].describe())
#print(df_master_cleaned.loc[df_master_cleaned.Cardiomegaly == 1, ['CPAR']].describe())
#print(df_master_cleaned.loc[df_master_cleaned.Cardiomegaly == 0, ['CPAR']].describe())
#
#df_master_cleaned.hist(column = 'CPAR', bins=100, by='Cardiomegaly', range =[0,1])

In [None]:
# Additional analysis of CTR and CPAR success rates
#df_ctr = pd.read_csv(ctr_path)
#df_cpar = pd.read_csv(cpar_path)
#
#ctr_heart = (df_ctr.CTR == 2).sum()
#ctr_lungs = (df_ctr.CTR == 3).sum()
#ctr_both = (df_ctr.CTR == 4).sum()
#ctr = ctr_both+ctr_heart+ctr_lungs
#
#cpar_heart = (df_cpar.CPAR == 2).sum()
#cpar_lungs = (df_cpar.CPAR == 3).sum()
#cpar_both = (df_cpar.CPAR == 4).sum()
#cpar = cpar_both+cpar_heart+cpar_lungs
#
#print(f'CTR general success rate: {round((len(df_ctr)-ctr)/len(df_ctr)*100,3)}% (total of {ctr} errors)')
#print(f'\t of which are {ctr_heart} are heart based ({round(ctr_heart/ctr*100,3)}%)')
#print(f'\t of which are {ctr_lungs} are lung based ({round(ctr_lungs/ctr*100,3)}%)')
#print(f'\t of which are {ctr_both} are both based ({round(ctr_both/ctr*100,3)}%)\n')
#
#print(f'CPAR general success rate: {round((len(df_cpar)-cpar)/len(df_cpar)*100,3)}% (total of {cpar} errors)')
#print(f'\t of which are {cpar_heart} are heart based ({round(cpar_heart/cpar*100,3)}%)')
#print(f'\t of which are {cpar_lungs} are lung based ({round(cpar_lungs/cpar*100,3)}%)')
#print(f'\t of which are {cpar_both} are both based ({round(cpar_both/cpar*100,3)}%)')