In [2]:
import pandas as pd
import numpy as np
import scipy as sp
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import patsy 
import math

In [3]:
# Load the sepsis cohort CSV (per its file name, no exclusions have been applied yet).
sepsis_data = pd.read_csv("./sepsis3-df-no-exclusions.csv")

# List the available columns. Column labels do not depend on which rows are
# selected, so read them straight off the frame instead of building a
# row-filtered copy first.
sepsis_data.columns

Index(['subject_id', 'icustay_id', 'hadm_id', 'excluded', 'intime', 'outtime',
       'dbsource', 'suspected_infection_time_poe',
       'suspected_infection_time_poe_days', 'specimen_poe',
       'positiveculture_poe', 'antibiotic_time_poe', 'blood_culture_time',
       'blood_culture_positive', 'age', 'gender', 'is_male', 'ethnicity',
       'race_white', 'race_black', 'race_hispanic', 'race_other',
       'metastatic_cancer', 'diabetes', 'height', 'weight', 'bmi',
       'first_service', 'hospital_expire_flag', 'thirtyday_expire_flag',
       'icu_los', 'hosp_los', 'sepsis_angus', 'sepsis_martin',
       'sepsis_explicit', 'septic_shock_explicit', 'severe_sepsis_explicit',
       'sepsis_nqf', 'sepsis_cdc', 'sepsis_cdc_simple', 'elixhauser_hospital',
       'vent', 'sofa', 'lods', 'sirs', 'qsofa', 'qsofa_sysbp_score',
       'qsofa_gcs_score', 'qsofa_resprate_score', 'exclusion_secondarystay',
       'exclusion_nonadult', 'exclusion_csurg', 'exclusion_carevue',
       'exclusion_ear

In [4]:
# Parse the `intime` / `outtime` columns as datetimes so they support
# timestamp arithmetic in later cells.
sepsis_data["intime"] = pd.to_datetime(sepsis_data["intime"])
sepsis_data["outtime"] = pd.to_datetime(sepsis_data["outtime"])

# Keep only patients with neither diabetes nor metastatic cancer,
# i.e. a patient is removed if *either* flag is set.
no_comorbidity = (sepsis_data["diabetes"] == 0) & (sepsis_data["metastatic_cancer"] == 0)
sepsis_data_rm = sepsis_data[no_comorbidity]

# Drop rows that are missing any of the model covariates (list could change later).
# The explicit .copy() makes `sepsis_mod` an independent frame, so adding
# columns to it later does not trigger SettingWithCopyWarning.
sepsis_mod = sepsis_data_rm.dropna(subset=["age", "gender", "weight"]).copy()

# Preview only the first rows rather than rendering the whole frame.
sepsis_mod.head(10)

Unnamed: 0,subject_id,icustay_id,hadm_id,excluded,intime,outtime,dbsource,suspected_infection_time_poe,suspected_infection_time_poe_days,specimen_poe,...,exclusion_carevue,exclusion_early_suspicion,exclusion_late_suspicion,exclusion_bad_data,composite_outcome,blood culture,suspicion_poe,abx_poe,sepsis-3,sofa>=2
1,27513,200003,163557,1,2199-08-02 19:50:04,2199-08-08 17:09:18,carevue,2199-08-02 21:02:00,-0.049954,BLOOD CULTURE,...,1,0,0,0,1,True,True,True,1,1
3,20707,200007,129310,1,2109-02-17 10:03:37,2109-02-18 17:03:12,carevue,,,,...,1,0,0,0,0,False,False,False,0,0
4,29904,200009,129607,1,2189-11-30 10:34:32,2189-12-02 14:17:37,carevue,,,,...,1,0,0,0,0,False,False,True,0,1
6,93535,200011,121562,1,2188-08-06 01:39:24,2188-08-07 16:50:53,metavision,2188-08-05 21:41:00,0.165556,URINE,...,0,0,0,0,0,True,True,True,1,1
7,28448,200012,177527,1,2153-12-23 05:12:55,2153-12-23 15:55:54,carevue,2153-12-23 00:30:00,0.196470,BLOOD CULTURE,...,1,0,0,0,0,True,True,True,0,0
8,9514,200014,127229,1,2105-02-16 23:16:48,2105-02-18 16:53:29,carevue,2105-02-17 00:00:00,-0.030000,URINE,...,1,0,0,0,0,True,True,True,1,1
9,74032,200016,117458,1,2150-12-02 15:59:20,2150-12-03 14:54:29,metavision,,,,...,0,0,0,0,0,True,False,False,0,0
12,21789,200019,112486,1,2178-07-08 09:03:12,2178-07-11 10:28:40,carevue,2178-07-08 21:51:00,-0.533194,BLOOD CULTURE,...,1,0,0,0,1,True,True,True,1,1
14,61691,200021,109307,0,2114-12-26 19:45:12,2114-12-27 22:46:28,metavision,,,,...,0,0,0,0,0,True,False,False,0,1
19,23650,200029,102161,1,2115-03-29 20:01:10,2115-03-30 01:13:37,carevue,,,,...,1,0,0,0,1,False,False,False,0,1


In [5]:
#sepsis_mod

In [6]:
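# Create a line plot of the time-series data: one horizontal line per stay, starting at
# its admission time relative to the study origin and extending for the stay duration.
# NOTE: this snippet is disabled and would not run correctly as written:
#   * it reads `stay_duration_days` and `corrected_intime`, which are only created in the
#     "Data Preprocessing" cell further down;
#   * it passes the index label from iterrows() to the positional `.iloc`, and the two no
#     longer line up once rows have been filtered out;
#   * `corrected_intime` is in seconds while `stay_duration_days` is in days, so `s+dur`
#     mixes units.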
#fig, ax = plt.subplots()
#for i, val in sepsis_mod.iterrows():
#    dur = (sepsis_mod.iloc[i])['stay_duration_days']
#    s = (sepsis_mod.iloc[i])['corrected_intime']
#    #e = (sepsis_mod.iloc[i])['corrected_outtime']
#    ax.hlines(y=0+(i/10), xmin=s, xmax=s+dur, color='r')
#    plt.show()

### Data Preprocessing



In [8]:

# After filtering, express every stay relative to a common origin and sort by admission.

SECONDS_PER_DAY = 24 * 3600

# The time origin for the survival analysis is the earliest `intime` in the
# filtered data set. `.min()` finds it directly, without sorting the whole frame.
time_baseline = sepsis_mod["intime"].min()

# Build the derived columns with `.assign`, which returns a new frame instead of
# writing into one that may be a slice of `sepsis_data` (avoiding
# SettingWithCopyWarning). Later keyword arguments may refer to columns created
# by earlier ones in the same call.
sepsis_mod = (
    sepsis_mod
    .assign(
        # seconds elapsed from the origin to admission / discharge
        corrected_intime=lambda df: (df["intime"] - time_baseline).dt.total_seconds(),
        corrected_outtime=lambda df: (df["outtime"] - time_baseline).dt.total_seconds(),
        # length of stay, in seconds and in days
        stay_duration_sec=lambda df: df["corrected_outtime"] - df["corrected_intime"],
        stay_duration_days=lambda df: df["stay_duration_sec"] / SECONDS_PER_DAY,
    )
    .sort_values(by="corrected_intime")
)

sepsis_mod.shape


(32862, 66)