In [1]:
import matplotlib.pyplot as plt
import numpy as np
from scipy import interp, stats
import pandas as pd
import os
import sys
from tqdm import tqdm
import math

from scipy.stats import chi2_contingency
from scipy.stats import chi2, kstest, ranksums
import seaborn as sns

from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, roc_auc_score, precision_recall_curve

import warnings
warnings.filterwarnings('ignore')

from edm.utils.measures import perf_measure, calculate_output_statistics

pd.set_option('display.max_columns', None)

In [2]:
def print_statistics(df_cohort):
    df_pos = df_cohort[df_cohort["outcome"] == 1]
    df_neg = df_cohort[df_cohort["outcome"] == 0]
    print(f"Shape {df_pos.shape} for outcome = 1")
    print(f"Shape {df_neg.shape} for outcome = 0")
    print(f"---")
#     SpO2	RR	HR	Temp	SBP	DBP
    def print_numeric(col):
        pos_stat = f"{df_pos[col].describe()['50%']} [{df_pos[col].describe()['25%']}-{df_pos[col].describe()['75%']}]"
        neg_stat = f"{df_neg[col].describe()['50%']} [{df_neg[col].describe()['25%']}-{df_neg[col].describe()['75%']}]"
        pos_missing = df_pos[col].isna().sum()
        neg_missing = df_pos[col].isna().sum()
#         p_val = stats.ttest_ind(df_pos[col], df_neg[col], nan_policy="omit", equal_var=False).pvalue
        w, p_val = ranksums(df_pos[col], df_neg[col])

        print(f"{col}, {pos_stat}, {pos_missing}, {neg_stat}, {neg_missing}, {p_val}")
    
    def print_binary_col(col, val_1, val_2):
        contingency_table = [
            [df_pos[df_pos[col] == val_2].shape[0], df_neg[df_neg[col] == val_2].shape[0]],
            [df_pos[df_pos[col] == val_1].shape[0], df_neg[df_neg[col] == val_1].shape[0]]
        ]

#         print(contingency_table)

        stat, p, dof, expected = chi2_contingency(contingency_table, correction=False)
#         print(stat, p, dof, expected)

#         prob = 0.95
#         critical = chi2.ppf(prob, dof)
#         if abs(stat) >= critical:
#             print(f"Dependent (reject H0) at {stat}")
#         else:
#             print(f"Independent (fail to reject H0) at {stat}")

        pos_f = df_pos[df_pos[col] == val_1].shape[0]
        pos_m = df_pos[df_pos[col] == val_2].shape[0]
        neg_f = df_neg[df_neg[col] == val_1].shape[0]
        neg_m = df_neg[df_neg[col] == val_2].shape[0]
        pos_f_missing = df_pos[df_pos[col] == val_1][col].isna().sum()
        neg_f_missing = df_neg[df_neg[col] == val_1][col].isna().sum()
        pos_m_missing = df_pos[df_pos[col] == val_2][col].isna().sum()
        neg_m_missing = df_neg[df_neg[col] == val_2][col].isna().sum()
        print(f"{col} - {val_1}, {pos_f} ({round(100*pos_f/(pos_f+pos_m), 2)}), {pos_f_missing}, {neg_f} ({round(100*neg_f/(neg_f+neg_m), 2)}), {neg_f_missing}, {p}")
        print(f"{col} - {val_2}, {pos_m} ({round(100*pos_m/(pos_f+pos_m), 2)}), {pos_m_missing}, {neg_m} ({round(100*neg_m/(neg_f+neg_m), 2)}), {neg_m_missing},")
    
    def print_risk_factor(col):
        pos_missing = df_pos[col].isna().sum()
        neg_missing = df_neg[col].isna().sum()
        pval = stats.ttest_ind(df_pos[col], df_neg[col], nan_policy="omit", equal_var=False).pvalue
        print(f"{col}, {int(df_pos[col].sum())} ({round(100 * df_pos[col].sum() / (len(df_pos[col]) - int(df_pos[col].isna().sum())), 2)}), {pos_missing}, {int(df_neg[col].sum())} ({round(100 * df_neg[col].sum() / (len(df_neg[col]) - int(df_neg[col].isna().sum())), 2)}), {neg_missing}, {pval}")

    # ---
    print("Characteristic, Pos Statistic, Pos Missing, Neg Statistic, Neg Missing, p-value")
    print_numeric("Age")
    print_binary_col("Gender", "F", "M")
    print_binary_col("CC_CP_SOB", 0, 1)
    print_numeric("SpO2")
    print_numeric("RR")
    print_numeric("HR")
    print_numeric("Temp")
    print_numeric("SBP")
    print_numeric("DBP")
    print_numeric("First_trop")
    print_numeric("Max_trop")
    print_numeric("ED_LOS")
    print_numeric("First_trop_result_time-waveform_start_time")
    print_numeric("First_trop_result_time-Arrival_time")
    print_numeric("trim_length")
    print_risk_factor("Athero")
    print_risk_factor("HTN")
    print_risk_factor("HLD")
    print_risk_factor("DM")
    print_risk_factor("Obese")
    print_risk_factor("Smoking")

## Load Files

In [13]:
df_annotated_manual = pd.read_csv("/deep/group/ed-monitor/patient_data_v9/consolidated.filtered.csv")
print(df_annotated_manual.shape)

(10874, 77)


In [None]:
df_annotated_manual.describe()

In [4]:
df_pos = df_annotated_manual[df_annotated_manual["outcome"] == 1]
print(df_pos.shape)
df_neg = df_annotated_manual[df_annotated_manual["outcome"] == 0]
print(df_neg.shape)

(1057, 79)
(9817, 79)


## Statistics

In [19]:
print_statistics(df_annotated_manual)

Shape (1057, 79) for outcome = 1
Shape (9817, 79) for outcome = 0
---
Characteristic, Pos Statistic, Pos Missing, Neg Statistic, Neg Missing, p-value
Age, 71.0 [59.0-83.0], 0, 63.0 [47.0-76.0], 0, 6.4975853236443074e-43
Gender - F, 444 (42.09), 0, 5136 (52.32), 0, 2.587171245842113e-10
Gender - M, 611 (57.91), 0, 4680 (47.68), 0,
CC_CP_SOB - 0, 644 (60.93), 0, 6501 (66.22), 0, 0.0005697700577133987
CC_CP_SOB - 1, 413 (39.07), 0, 3316 (33.78), 0,
SpO2, 98.0 [96.0-100.0], 2, 99.0 [97.0-100.0], 2, 3.9983671651375525e-13
RR, 18.0 [18.0-22.0], 1, 18.0 [16.0-20.0], 1, 7.702467483918752e-19
HR, 87.0 [73.0-106.0], 1, 84.0 [72.0-99.0], 1, 3.283582954343181e-05
Temp, 36.7 [36.5-36.9], 65, 36.7 [36.5-36.9], 65, 0.038499824253701444
SBP, 137.0 [116.0-156.0], 3, 138.0 [123.0-153.0], 3, 0.02133306751585782
DBP, 78.0 [66.0-91.0], 3, 79.0 [69.0-89.0], 3, 0.13611389432796914
First_trop, 0.124 [0.072-0.327], 0, 0.0 [0.0-0.0], 0, 0.0
Max_trop, 0.164 [0.091-0.492], 0, 0.0 [0.0-0.0], 0, 0.0
ED_LOS, 6.47 [4

In [24]:
# First trop neg?
tot_patient = df_pos.shape[0]
mi_patients = df_pos[df_pos["First_trop"] > 0.055].shape[0]
print(f"")
print(f"Number of patients who had first trop pos {mi_patients}/{tot_patient} ({100*mi_patients/tot_patient}%)")
print(f"Number of patients who had first trop neg {(tot_patient-mi_patients)}/{tot_patient} ({100*(tot_patient-mi_patients)/tot_patient}%)")


Number of patients who had first trop pos 957/1057 (90.53926206244087%)
Number of patients who had first trop neg 100/1057 (9.460737937559129%)


In [26]:
# How long did MI patients who had first trop neg have to wait until pos trop result?
from dateutil import parser

wait_times = []
for i, row in df_pos[df_pos["First_trop"] <= 0.055].iterrows():
    trop_available = parser.parse(row["Max_trop_result_time"])
    arrival_time = parser.parse(row["Arrival_time"])
    wait_times.append((trop_available - arrival_time).total_seconds())
    
print(len(wait_times))
print(f"MI patients who had first trop neg have to wait {np.mean(wait_times) / 3600} hours until first pos trop")
    

100
MI patients who had first trop neg have to wait 6.244333333333333 hours until first pos trop


In [32]:
# CI difference in populations
# https://www.dummies.com/education/math/statistics/creating-a-confidence-interval-for-the-difference-of-two-means-with-known-standard-deviations/

def diff_in_cols(df_pos, df_neg, col):
    acs_std = df_pos[col].describe()["std"]
    nonacs_std = df_neg[col].describe()["std"]
    ci_diff = 1.96 * math.sqrt(((acs_std ** 2) / df_pos.shape[0]) + ((nonacs_std ** 2) / df_neg.shape[0]))
    diff = df_pos[col].describe()["50%"] - df_neg[col].describe()["50%"] 
    print(f"{col} difference = {round(diff, 3)} [{round(diff - ci_diff, 3)}-{round(diff + ci_diff, 3)}]")


In [33]:
# What's the difference between bedside monitoring lengths?

diff_in_cols(df_pos, df_neg, "trim_length")

trim_length difference = -1403.0 [-1809.623--996.377]


In [34]:
# What's the difference between bedside ED stay?

diff_in_cols(df_pos, df_neg, "ED_LOS")

ED_LOS difference = 0.72 [0.53-0.91]


In [37]:
# How many were admitted to inpatient care?

inpatient_values = [
    "Admit to Inpatient", 
    "Place in Observation", 
    "Place in Observation-CDU", 
    "Transfer to Outside Facility/Hospital", 
    "Transfer to LPCH/PEC"
]

df_pos_inpatient = df_pos[df_pos["ED_dispo"].isin(inpatient_values)]
print(f"For pos, {df_pos_inpatient.shape[0]}/{df_pos.shape[0]} ({100*df_pos_inpatient.shape[0]/df_pos.shape[0]}%) patients were admitted to inpatient")

df_neg_inpatient = df_neg[df_neg["ED_dispo"].isin(inpatient_values)]
print(f"For neg, {df_neg_inpatient.shape[0]}/{df_neg.shape[0]} ({100*df_neg_inpatient.shape[0]/df_neg.shape[0]}%) patients were admitted to inpatient")

diff_in_cols(df_pos, df_neg, "ED_LOS")

For pos, 1009/1057 (95.45884578997162%) patients were admitted to inpatient
For neg, 4543/9817 (46.27686665987573%) patients were admitted to inpatient


```
> prop.test(x=c(1009, 4543), n=c(1057, 9817), correct=FALSE)

	2-sample test for equality of proportions without continuity correction

data:  c(1009, 4543) out of c(1057, 9817)
X-squared = 923.7, df = 1, p-value < 2.2e-16
alternative hypothesis: two.sided
95 percent confidence interval:
 0.4758564 0.5077832
sample estimates:
   prop 1    prop 2 
0.9545885 0.4627687 
```

## Independence

### Validate independence of test set with overall SOB dataset

In [None]:
df_annotated_manual = pd.read_csv("/deep/group/ed-monitor/patient_data_v9/consolidated.filtered.csv")
df_annotated_manual = df_annotated_manual[(df_annotated_manual["Case_for_test"] == 1) | (df_annotated_manual["Case_for_test"] == 0)]
print(df_annotated_manual.shape)
df_annotated_manual.head(1)

In [None]:
df_annotated_manual.describe()

In [None]:
df_annotated_manual_test = pd.read_csv("/deep/group/ed-monitor/patient_data_v9/consolidated.filtered.test.txt", sep="\t")
print(df_annotated_manual_test.shape)
df_annotated_manual_test.head(1)

In [36]:
def print_numeric_stat(df_annotated_manual, df_annotated_manual_test, col):
    x = np.array(df_annotated_manual[col], dtype=np.float)
    y = np.array(df_annotated_manual_test[col], dtype=np.float)
    statistic, pval = stats.ttest_ind(x[~np.isnan(x)], y[~np.isnan(y)], equal_var=False)
    print(f"{col}: {statistic} pval={pval}")

In [37]:
def print_cat_stat(df_annotated_manual, df_annotated_manual_test, col, mappings):
    contingency_table = []
    for k in mappings.keys():
        row = []
        
        count = 0
        for r in df_annotated_manual[col].tolist():
            if r == k:
                count += 1
        row.append(count)

        count = 0
        for r in df_annotated_manual_test[col].tolist():
            if r == k:
                count += 1
        row.append(count)

        contingency_table.append(row)
        
    stat, p, dof, expected = chi2_contingency(contingency_table, correction=False)

    prob = 0.95
    critical = chi2.ppf(prob, dof)
    if abs(stat) >= critical:
        print(f"{col}: {contingency_table} statistically significant")
    else:
        print(f"{col}: {contingency_table} not statistically significant")


In [42]:
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "Age")
print_cat_stat(df_annotated_manual, df_annotated_manual_test, "Gender", {"F": 0, "M": 1})
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "SpO2")
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "RR")
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "HR")
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "Temp")
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "SBP")
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "DBP")
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "First_trop")
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "Max_trop")
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "ED_LOS")
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "First_trop_result_time-waveform_start_time")
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "First_trop_result_time-Arrival_time")
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "trim_length")
print_cat_stat(df_annotated_manual, df_annotated_manual_test, "Athero", {0: 0, 1: 1})
print_cat_stat(df_annotated_manual, df_annotated_manual_test, "HTN", {0: 0, 1: 1})
print_cat_stat(df_annotated_manual, df_annotated_manual_test, "HLD", {0: 0, 1: 1})
print_cat_stat(df_annotated_manual, df_annotated_manual_test, "DM", {0: 0, 1: 1})
print_cat_stat(df_annotated_manual, df_annotated_manual_test, "Obese", {0: 0, 1: 1})
print_cat_stat(df_annotated_manual, df_annotated_manual_test, "Smoking", {0: 0, 1: 1})

Age: -0.0815471189210677 pval=0.9350139729115687
Gender: [[2111, 712], [2019, 643]] not statistically significant
SpO2: 0.1449359465478856 pval=0.8847745822279938
RR: -0.6037434299757884 pval=0.546077232451806
HR: 0.3043184824916402 pval=0.760912457715388
Temp: -0.8721309776977275 pval=0.3832364753214976
SBP: 0.0050405249274155075 pval=0.9959787034858913
DBP: 0.701384375054689 pval=0.4831334649096156
First_trop: -1.0973691477534222 pval=0.27262655787734447
Max_trop: -1.125362364398196 pval=0.2605823666221569
ED_LOS: -0.07051907298445818 pval=0.943786560802435
First_trop_result_time-waveform_start_time: -0.47989196235282483 pval=0.631350727296526
First_trop_result_time-Arrival_time: 0.24758566168948865 pval=0.8044774535444357
trim_length: 0.023040519027420898 pval=0.9816199461876243
Athero: [[1959, 672], [545, 182]] not statistically significant
HTN: [[1717, 601], [787, 253]] not statistically significant
HLD: [[1837, 662], [667, 192]] statistically significant
DM: [[1890, 636], [614, 2

### Validate independence of val set with overall SOB dataset

In [None]:
df_annotated_manual = pd.read_csv("/deep/group/ed-monitor/patient_data_v9/consolidated.filtered.val.txt", sep="\t")
print(df_annotated_manual.shape)
df_annotated_manual.head(1)

In [None]:
df_annotated_manual.describe()

In [None]:
df_annotated_manual_test = pd.read_csv("/deep/group/ed-monitor/patient_data_v9/consolidated.filtered.test.txt", sep="\t")
print(df_annotated_manual_test.shape)
df_annotated_manual_test.head(1)

In [55]:
def print_numeric_stat(df_annotated_manual, df_annotated_manual_test, col):
    x = np.array(df_annotated_manual[col], dtype=np.float)
    y = np.array(df_annotated_manual_test[col], dtype=np.float)
    statistic, pval = stats.ttest_ind(x[~np.isnan(x)], y[~np.isnan(y)], equal_var=False)
    print(f"{col}: {statistic} pval={pval}")

In [56]:
def print_cat_stat(df_annotated_manual, df_annotated_manual_test, col, mappings):
    contingency_table = []
    for k in mappings.keys():
        row = []
        
        count = 0
        for r in df_annotated_manual[col].tolist():
            if r == k:
                count += 1
        row.append(count)

        count = 0
        for r in df_annotated_manual_test[col].tolist():
            if r == k:
                count += 1
        row.append(count)

        contingency_table.append(row)
        
    stat, p, dof, expected = chi2_contingency(contingency_table, correction=False)

    prob = 0.95
    critical = chi2.ppf(prob, dof)
    if abs(stat) >= critical:
        print(f"{col}: {contingency_table} statistically significant")
    else:
        print(f"{col}: {contingency_table} not statistically significant")


In [57]:
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "Age")
print_cat_stat(df_annotated_manual, df_annotated_manual_test, "Gender", {"F": 0, "M": 1})
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "SpO2")
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "RR")
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "HR")
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "Temp")
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "SBP")
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "DBP")
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "First_trop")
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "Max_trop")
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "ED_LOS")
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "First_trop_result_time-waveform_start_time")
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "First_trop_result_time-Arrival_time")
print_numeric_stat(df_annotated_manual, df_annotated_manual_test, "trim_length")
print_cat_stat(df_annotated_manual, df_annotated_manual_test, "Athero", {0: 0, 1: 1})
print_cat_stat(df_annotated_manual, df_annotated_manual_test, "HTN", {0: 0, 1: 1})
print_cat_stat(df_annotated_manual, df_annotated_manual_test, "HLD", {0: 0, 1: 1})
print_cat_stat(df_annotated_manual, df_annotated_manual_test, "DM", {0: 0, 1: 1})
print_cat_stat(df_annotated_manual, df_annotated_manual_test, "Obese", {0: 0, 1: 1})
print_cat_stat(df_annotated_manual, df_annotated_manual_test, "Smoking", {0: 0, 1: 1})

Age: 0.8206702197157482 pval=0.4119063320485823
Gender: [[687, 712], [669, 643]] not statistically significant
SpO2: 0.28540941971372413 pval=0.775352432076268
RR: -0.8203944979663653 pval=0.41206425705507777
HR: 0.7105116689368924 pval=0.4774480557371259
Temp: -1.1620882336041958 pval=0.24530929885192615
SBP: -0.5521722206055424 pval=0.5808761064154928
DBP: 0.7644176037928548 pval=0.4446850001552457
First_trop: -1.1572396657347122 pval=0.2472950109395037
Max_trop: -1.208118861933213 pval=0.22712047967023463
ED_LOS: 0.38387931099428363 pval=0.7010980832502776
First_trop_result_time-waveform_start_time: -0.8659342409463803 pval=0.3866028890627804
First_trop_result_time-Arrival_time: -0.23935978406527025 pval=0.8108447262454463
trim_length: 0.5780481744617176 pval=0.5632796337214367
Athero: [[616, 672], [184, 182]] not statistically significant
HTN: [[558, 601], [242, 253]] not statistically significant
HLD: [[584, 662], [216, 192]] statistically significant
DM: [[618, 636], [182, 218]] 