In [3]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
import sys
import os
from tqdm import tqdm

# 1. for details on data extraction, look at data_extraction.csv

# 2. data preprocessing after extraction

### head view of raw dataset before processing
- Of note, "csn" and "pat_id" are discarded for patient privacy

In [21]:
labs = pd.read_csv("../real_time_sepsis_development/real_time_data/labs_temp.csv")
labs.iloc[:,2:].head()

Unnamed: 0,recorded_time,AST,Alkalinephos,BUN,BaseExcess,Bilirubin_direct,Bilirubin_total,Calcium,Chloride,Creatinine,...,PaCO2,PaO2,Phosphate,Platelets,Potassium,SaO2,Sodium,TroponinI,WBC,pH
0,2021-06-20 20:17:00,18.0,64.0,21.0,,,0.4,9.3,107.0,,...,,,,206.0,,,138.0,,3.6,
1,2020-03-24 12:24:00,29.0,79.0,14.0,,,1.1,9.9,104.0,0.85,...,,,,60.0,5.7,,142.0,,6.5,
2,2020-03-24 14:06:00,,,,,,,,,,...,,,,,4.0,,,,,
3,2020-03-26 03:34:00,,,12.0,,,,8.6,107.0,0.78,...,,,,61.0,3.4,,140.0,,4.1,
4,2021-04-15 15:14:00,,,,,,,7.3,102.0,0.48,...,,,,257.0,,,139.0,,9.2,


In [22]:
vitals = pd.read_csv("../real_time_sepsis_development/real_time_data/vitals_temp.csv")
vitals.iloc[:,2:].head()

Unnamed: 0,recorded_time,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2
0,2020-05-16 13:23:00,96.0,95.0,38.2,125.0,,,18.0,
1,2020-05-16 14:28:00,91.0,94.0,,148.0,89.0,,18.0,
2,2020-05-16 14:30:00,88.0,93.0,,148.0,89.0,,18.0,
3,2020-05-16 15:00:00,85.0,93.0,,145.0,86.0,,18.0,
4,2020-05-16 15:31:00,105.0,93.0,,,,,18.0,


### 2.1 basic data cleaning
- discard non-numeric values
- discard elements not within reasonable threshold (outliers mostly due to insert error)
- renaming columns


In [13]:
def data_clean(chunk, thresholds):
    process_cols = list(thresholds.keys())
    for feature in process_cols:
        chunk.loc[:,feature] = chunk[feature].replace(r'\>|\<|\%|\/|\s','',regex=True)
        chunk.loc[:,feature] = pd.to_numeric(chunk[feature], errors='coerce')
        mask_ind = (chunk[feature] < thresholds[feature][1]) & (chunk[feature] > thresholds[feature][0])
        chunk.loc[~mask_ind, feature]  = np.nan
    return vitals

In [None]:

vitals_thresh = { "pulse": (0,250),
                 "spo2": (0,100),
                 "temperature": (25,45),
                 "sbp_cuff": (0,260),
                 "sbp_line": (0,260),
                 "dbp_cuff": (0, 220),
                 "dbp_line": (0, 220),
                 "map_cuff": (0,260),
                 "map_line": (0,260),
                 'unassisted_resp_rate': (0,80),
                 'end_tidal_co2': (0, 60),
                "o2_flow_rate": (0, 1000000)}

vitals_clean = data_clean(vitals, vitals_thresh)

In [16]:
# renaming
vitals_rename = { "pulse": "HR",
                 "spo2": "O2Sat",
                 "temperature": "Temp",
                 "sbp_line": "SBP",
                 "map_line": "MAP",
                 "dbp_line": "DBP",
                 'unassisted_resp_rate': "Resp",
                 'end_tidal_co2': "EtCO2"}

vitals = vitals.rename(columns = vitals_rename)

In [None]:
# format time to datetime
vitals["recorded_time"] = pd.to_datetime(vitals["recorded_time"], format = "%m/%d/%Y %H:%M:%S")

In [None]:
labs_thresh = { "pH": (6.7, 8),
              "PaCO2": (15, 150),
              "SaO2": (0,100),
              "AST": (0, 10000),
              "BUN": (0,200),
              "Alkalinephos": (0, 10000),
              "Calcium": (0,20),
              "Chloride": (60,150),
              "Creatinine": (0, 15),
              "Glucose": (0, 1200),
              "Lactate": (0,20),
              "Magnesium": (0,10), 
              "Phosphate": (0,20),
              "Potassium": (0,10),
              "Bilirubin_total": (0,30),
              "Hct": (0, 75),
               "Hgb": (0,25),
               "PTT": (0,150),
               "WBC": (0,150),
               "Fibrinogen": (0,1000),
               "Platelets": (0,1000)
              }
labs_clean = data_clean(labs,labs_thresh)

In [None]:
labs["recorded_time"] = pd.to_datetime(labs["recorded_time"], format = "%m/%d/%Y %H:%M:%S")

### 2.2 Label change

In [None]:
di = {"Female": 0, "Male": 1}
dem = dem.replace({"gender": di})
dem["gender"] = dem["gender"].fillna(-1)

#### After merging vitals, labs, demographics, times

In [23]:
new_merged = pd.read_csv("../real_time_sepsis_development/real_time_data/merged_longitudinal_0420.csv")

In [25]:
new_merged.iloc[:, 2:].head(10)

Unnamed: 0,recorded_time,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,AST,...,gcs_total_score,Awaiting,ED,Floor,ICU,Labor & Delivery,NICU,Nursery,Other,hospital_admission_date_time
0,2020-05-16 13:23:00,96.0,95.0,38.2,125.0,,,18.0,,,...,,,,,,,,,,2020-05-16 15:29:00
1,2020-05-16 14:28:00,91.0,94.0,,148.0,89.0,,18.0,,,...,,,,,,,,,,2020-05-16 15:29:00
2,2020-05-16 14:30:00,88.0,93.0,,148.0,89.0,,18.0,,,...,,,,,,,,,,2020-05-16 15:29:00
3,2020-05-16 15:00:00,85.0,93.0,,145.0,86.0,,18.0,,,...,,,,,,,,,,2020-05-16 15:29:00
4,2020-05-16 15:31:00,105.0,93.0,,,,,18.0,,,...,,,,,,,,,,2020-05-16 15:29:00
5,2020-05-16 16:24:00,105.0,,,148.0,,,18.0,,,...,,,,,,,,,,2020-05-16 15:29:00
6,2020-05-16 16:32:00,102.0,95.0,38.0,149.0,91.0,,15.0,,,...,,,,,,,,,,2020-05-16 15:29:00
7,2020-05-16 17:25:00,,,,,,,,,,...,,,,,,,,,,2020-05-16 15:29:00
8,2020-05-16 20:18:00,,,37.0,,,,,,,...,,,,,,,,,,2020-05-16 15:29:00
9,2020-05-16 20:22:00,92.0,97.0,37.0,133.0,91.0,,18.0,,,...,,,,,,,,,,2020-05-16 15:29:00


### Please ignore columns ['TroponinI', 'Bilirubin_direct', 'Awaiting', 'ED', 'Floor', 'ICU', 'Labor & Delivery', 'NICU', 'Nursery', 'Other'].
- these are unused in the model

In [28]:
new_merged.columns

Index(['csn', 'pat_id', 'recorded_time', 'HR', 'O2Sat', 'Temp', 'SBP', 'MAP',
       'DBP', 'Resp', 'EtCO2', 'AST', 'Alkalinephos', 'BUN', 'BaseExcess',
       'Bilirubin_direct', 'Bilirubin_total', 'Calcium', 'Chloride',
       'Creatinine', 'FiO2', 'Fibrinogen', 'Glucose', 'HCO3', 'Hct', 'Hgb',
       'Lactate', 'Magnesium', 'PTT', 'PaCO2', 'PaO2', 'Phosphate',
       'Platelets', 'Potassium', 'SaO2', 'Sodium', 'TroponinI', 'WBC', 'pH',
       'gcs_total_score', 'Awaiting', 'ED', 'Floor', 'ICU', 'Labor & Delivery',
       'NICU', 'Nursery', 'Other', 'hospital_admission_date_time'],
      dtype='object')

# 3. Resampling longitudinal data (vitals & labs)
### 3.1 will only use first 0-49 hour (relative to hospital admission time) of the data

In [None]:
# set random origin, cutoff at 0-49 hour 
origin = pd.to_datetime("2000-01-01")

new_merged["rel_time"] = ((new_merged["recorded_time"] - new_merged["hospital_admission_date_time"]).dt.total_seconds() / (60 * 60)) <= 49
new_merged = new_merged[new_merged["rel_time"]]
new_merged["rel_time"] = ((new_merged["recorded_time"] - new_merged["hospital_admission_date_time"]).dt.total_seconds() / (60 * 60)) >= 0
new_merged = new_merged[new_merged["rel_time"]]
new_merged["SepsisLabel"] = (new_merged["recorded_time"] >= new_merged["t_sepsis3"])

In [None]:
# sort index
new_merged["rel_time"] = (merged["recorded_time"] - merged["hospital_admission_date_time"]) + origin
new_merged = new_merged.set_index("rel_time").sort_index()

# aggregate each hour using median value

df = new_merged.groupby(by = ["pat_id", "csn"]).resample("1H", label = "right", origin = origin).median()

In [None]:
sorted_df = df.copy()
sorted_df = sorted_df.drop(["csn", "pat_id"],axis=1)
sorted_df = sorted_df.reset_index()
sorted_df["rel_time"] = (sorted_df["rel_time"] - origin).dt.total_seconds() / (60 * 60)

### 3.2 Apply overlapping, rolling median

In [17]:
def rolling_overlap(temp, window, variables, overlap):
    rolled= temp.copy()
    rolled[variables] = rolled.rolling(window, min_periods = 1)[variables].aggregate("median")
    #rolled[bed_var] = rolled.rolling(window, min_periods = 1)[bed_var].aggregate("max")
    rolled = rolled.reset_index(drop = True)
    start = window - 1
    return rolled.iloc[0::overlap]

In [None]:
k = sorted_df.copy()
k = k[k.rel_time >= 1]
k = k[k.rel_time <= 49]

In [None]:
new = k.groupby(["pat_id", "csn"]).apply(lambda v: rolling_overlap(v, 6, variables, 3))

In [None]:
final_df = new.drop(["pat_id", "csn"], axis = 1).reset_index(drop = False).rename(columns = {"level_2" : "LOS"})

### 3.3 merge static data

In [30]:
stat = pd.read_csv("../real_time_sepsis_development/real_time_data/merged_stat_0420.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [34]:
# of note, ignore ['ed_presentation_time', 'hospital_discharge_date_time', 'In_hospital_death', 'year', 'race']. These were for exploratory analysis. 
stat.iloc[:,2:].head()

Unnamed: 0,age,ed_presentation_time,hospital_admission_date_time,hospital_discharge_date_time,In_hospital_death,year,t_sepsis3,race,gender
0,60,NaT,2020-01-05 09:13:00,2020-09-03 18:05:00,False,2020.0,2020-01-04 15:06:00,Caucasian or White,0.0
1,62,2020-01-24 12:14:10,2020-01-24 16:34:00,2021-04-08 17:37:00,False,2020.0,2020-01-24 13:28:00,Multiple,1.0
2,61,NaT,2020-04-15 12:55:00,2020-09-04 17:03:00,False,2020.0,2020-04-19 14:00:00,,
3,65,2020-04-19 16:04:49,2020-04-19 18:37:00,2020-10-07 13:35:00,False,2020.0,2020-04-26 21:00:00,,
4,64,2020-05-01 19:29:11,2020-05-02 15:43:00,2020-09-04 16:30:00,False,2020.0,2020-05-11 21:00:00,Caucasian or White,1.0


In [None]:
final_df = final_df.merge(stat, on = ["pat_id", "csn"], how = "left")

In [None]:
final_df["abs_time"] = final_df["hospital_admission_date_time"] + pd.to_timedelta(final_df['LOS'], unit='h')
final_df["SepsisLabel"] = (final_df["abs_time"] >= final_df["t_sepsis3"])

### 3.4 make 10:1 controls:sepsis ratio

In [None]:
sepsis_48 = list(final_df[final_df["SepsisLabel"] == True].csn.unique())
control_48 = list(set(final_df[final_df["SepsisLabel"] == False].csn.unique()) - set(sepsis_48))
control_random_48 = np.random.choice(control_48,(len(sepsis_48)*10), replace = False)
np.save("control_resampled_48.npy", control_random_48)
np.save("sepsis_48.npy", sepsis_48)
get_csns = list(control_random_48) + list(sepsis_48)

In [None]:
final_df = final_df[final_df.csn.isin(get_csns)]

### 3.5 save

In [36]:
final_df = pd.read_csv("../real_time_sepsis_development/real_time_data/2021_6hr_48_complete_0426.csv")
final_df.iloc[:,2:].head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,LOS,rel_time,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,...,age,ed_presentation_time,hospital_admission_date_time,hospital_discharge_date_time,In_hospital_death,year,t_sepsis3,race,gender,abs_time
0,0,8.0,,,,,,,,,...,84,2021-08-31 21:55:15,2021-08-31 21:54:00,2021-09-01 04:43:00,False,2021.0,,Caucasian or White,1.0,2021-08-31 21:54:00
1,3,11.0,,,,,,,,,...,84,2021-08-31 21:55:15,2021-08-31 21:54:00,2021-09-01 04:43:00,False,2021.0,,Caucasian or White,1.0,2021-09-01 00:54:00
2,6,14.0,,,,,,,,,...,84,2021-08-31 21:55:15,2021-08-31 21:54:00,2021-09-01 04:43:00,False,2021.0,,Caucasian or White,1.0,2021-09-01 03:54:00
3,9,17.0,,,,,,,,,...,84,2021-08-31 21:55:15,2021-08-31 21:54:00,2021-09-01 04:43:00,False,2021.0,,Caucasian or White,1.0,2021-09-01 06:54:00
4,12,20.0,,,,,,,,,...,84,2021-08-31 21:55:15,2021-08-31 21:54:00,2021-09-01 04:43:00,False,2021.0,,Caucasian or White,1.0,2021-09-01 09:54:00


In [43]:
final_df.columns

Index(['pat_id', 'csn', 'LOS', 'rel_time', 'HR', 'O2Sat', 'Temp', 'SBP', 'MAP',
       'DBP', 'Resp', 'EtCO2', 'AST', 'Alkalinephos', 'BUN', 'BaseExcess',
       'Bilirubin_direct', 'Bilirubin_total', 'Calcium', 'Chloride',
       'Creatinine', 'FiO2', 'Fibrinogen', 'Glucose', 'HCO3', 'Hct', 'Hgb',
       'Lactate', 'Magnesium', 'PTT', 'PaCO2', 'PaO2', 'Phosphate',
       'Platelets', 'Potassium', 'SaO2', 'Sodium', 'TroponinI', 'WBC', 'pH',
       'gcs_total_score', 'SepsisLabel', 'age', 'ed_presentation_time',
       'hospital_admission_date_time', 'hospital_discharge_date_time',
       'In_hospital_death', 'year', 't_sepsis3', 'race', 'gender', 'abs_time'],
      dtype='object')

In [None]:
final_df.to_csv("2021_6hr_48_complete_0426.csv", index = False)

# 4. Derived features

In [44]:
df = pd.read_csv("../real_time_sepsis_development/real_time_data/2021_6hr_48_complete_0426.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [47]:
df = df[df.LOS <= 48]
df.iloc[:, 2:].head()

Unnamed: 0,LOS,rel_time,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,...,age,ed_presentation_time,hospital_admission_date_time,hospital_discharge_date_time,In_hospital_death,year,t_sepsis3,race,gender,abs_time
0,0,8.0,,,,,,,,,...,84,2021-08-31 21:55:15,2021-08-31 21:54:00,2021-09-01 04:43:00,False,2021.0,,Caucasian or White,1.0,2021-08-31 21:54:00
1,3,11.0,,,,,,,,,...,84,2021-08-31 21:55:15,2021-08-31 21:54:00,2021-09-01 04:43:00,False,2021.0,,Caucasian or White,1.0,2021-09-01 00:54:00
2,6,14.0,,,,,,,,,...,84,2021-08-31 21:55:15,2021-08-31 21:54:00,2021-09-01 04:43:00,False,2021.0,,Caucasian or White,1.0,2021-09-01 03:54:00
3,9,17.0,,,,,,,,,...,84,2021-08-31 21:55:15,2021-08-31 21:54:00,2021-09-01 04:43:00,False,2021.0,,Caucasian or White,1.0,2021-09-01 06:54:00
4,12,20.0,,,,,,,,,...,84,2021-08-31 21:55:15,2021-08-31 21:54:00,2021-09-01 04:43:00,False,2021.0,,Caucasian or White,1.0,2021-09-01 09:54:00


In [None]:
sep_r = np.load("./real_time_data/sepsis_48.npy")
control = np.load("./real_time_data/control_resampled_48.npy")
total_csn = list(sep_r) + list(control)

In [None]:
sep_index = ['BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST',
             'BUN', 'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine',
             'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
             'Bilirubin_total', 'Hct', 'Hgb', 'PTT', 'WBC', 'Platelets']
con_index = ['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2']

# drop three variables due to their massive missing values
# df_process = df.drop(columns=['Bilirubin_direct', 'TroponinI', 'Fibrinogen'])


In [None]:
def feature_informative_missingness(patient, sep_columns = con_index + sep_index):
    
    for sep_column in con_index + sep_index:
        
        nonmissing_idx = patient.index[~patient[sep_column].isna()].tolist()
        f1_name = sep_column + "_interval_f1"
        f2_name = sep_column + "_interval_f2"
        diff_name = sep_column + "_diff"

        patient.loc[nonmissing_idx,f1_name] = np.arange(1,len(nonmissing_idx)+1)
        patient[f1_name] = patient[f1_name].ffill().fillna(0)

        v = (0+patient[sep_column].isna()).replace(0,np.nan)
        cumsum = v.cumsum().fillna(method='pad')
        reset = -cumsum[v.isnull()].diff().fillna(cumsum)
        patient[f2_name] = v.where(v.notnull(), reset).cumsum().fillna(0)
        
        if nonmissing_idx==[]:
            patient.loc[:, f2_name] = -1
        else:
            patient.loc[:nonmissing_idx[0]-1, f2_name] = -1
        
        patient[diff_name] = patient.loc[nonmissing_idx, sep_column].diff()
        patient[diff_name] = patient[diff_name].fillna(method = "ffill")    
            
        
    return patient

In [None]:
def feature_slide_window(vitals):

    diff = vitals.shift(-1) - vitals
    rolling_mean = vitals.groupby("csn").rolling(6, min_periods = 1).mean().reset_index(drop = True)
    rolling_mean = rolling_mean.rename(columns = {'HR': "HR_mean", 'O2Sat': "O2Sat_mean", 'SBP':"SBP_mean", 
                                                  'MAP': "MAP_mean", 'Resp': "Resp_mean"})

    rolling_median = vitals.groupby("csn").rolling(6, min_periods = 1).median().reset_index(drop = True)
    rolling_median = rolling_median.iloc[:,1:].rename(columns = {'HR': "HR_median", 'O2Sat': "O2Sat_median", 'SBP':"SBP_median", 
                                                  'MAP': "MAP_median", 'Resp': "Resp_median"})

    rolling_min = vitals.groupby("csn").rolling(6, min_periods = 1).min().reset_index(drop = True)
    rolling_min = rolling_min.iloc[:,1:].rename(columns = {'HR': "HR_min", 'O2Sat': "O2Sat_min", 'SBP':"SBP_min", 
                                                  'MAP': "MAP_min", 'Resp': "Resp_min"})

    rolling_max = vitals.groupby("csn").rolling(6, min_periods = 1).max().reset_index(drop = True)
    rolling_max = rolling_max.iloc[:,1:].rename(columns = {'HR': "HR_max", 'O2Sat': "O2Sat_max", 'SBP':"SBP_max", 
                                                  'MAP': "MAP_max", 'Resp': "Resp_max"})

    rolling_std = vitals.groupby("csn").rolling(6, min_periods = 1).std().reset_index(drop = True)
    rolling_std = rolling_std.iloc[:,1:].rename(columns = {'HR': "HR_std", 'O2Sat': "O2Sat_std", 'SBP':"SBP_std", 
                                                  'MAP': "MAP_std", 'Resp': "Resp_std"})

    rolling_diff_std = diff.groupby("csn").rolling(6, min_periods = 1).std().reset_index(drop = True)
    rolling_diff_std = rolling_diff_std.iloc[:,1:].rename(columns = {'HR': "HR_dstd", 'O2Sat': "O2Sat_dstd", 'SBP':"SBP_dstd", 
                                                  'MAP': "MAP_dstd", 'Resp': "Resp_dstd"})

    rolling_vitals = pd.concat([rolling_mean, rolling_median, rolling_min, rolling_max, rolling_std, rolling_diff_std], axis = 1)
    
    
    return rolling_vitals
    

In [None]:

def feature_empiric_score(temp):
    
    
    # HEART RATE SCORING
    temp["HR_score"] = 0
    mask = (temp["HR"] <= 40) | (temp["HR"] >= 131)
    temp.loc[mask,"HR_score"] = 3
    mask = (temp["HR"] <= 130) & (temp["HR"] >= 111)
    temp.loc[mask,"HR_score"] = 2
    mask = ((temp["HR"] <= 50) & (temp["HR"] >= 41)) | ((temp["HR"] <= 110) & (temp["HR"] >= 91))
    temp.loc[mask,"HR_score"] = 1
    temp.loc[temp["HR"].isna(),"HR_score"] = np.nan


    # TEMPERATURE SCORING

    temp["Temp_score"] = 0

    mask = (temp["Temp"] <= 35)
    temp.loc[mask,"Temp_score"] = 3
    mask = (temp["Temp"] >= 39.1)
    temp.loc[mask,"Temp_score"] = 2
    mask = ((temp["Temp"] <= 36.0) & (temp["Temp"] >= 35.1)) | ((temp["Temp"] <= 39.0) & (temp["HR"] >= 38.1))
    temp.loc[mask,"Temp_score"] = 1

    temp.loc[temp["Temp"].isna(),"Temp_score"] = np.nan


    # Resp Score

    temp["Resp_score"] = 0

    mask = (temp["Resp"] < 8) | (temp["Resp"] > 25)
    temp.loc[mask,"Resp_score"] = 3
    mask = ((temp["Resp"] <= 24) & (temp["Resp"] >= 21))
    temp.loc[mask,"Resp_score"] = 2
    mask = ((temp["Resp"] <=11) & (temp["Resp"] >= 9))
    temp.loc[mask,"Resp_score"] = 1

    temp.loc[temp["Resp"].isna(),"Resp_score"] = np.nan

    #MAP Score
    temp["MAP_score"] = 1
    mask = (temp["MAP"] >= 70)
    temp.loc[mask, "MAP_score"] = 0
    temp.loc[temp["MAP"].isna(),"MAP_score"] = np.nan
    
    # Creatinine score:

    temp["Creatinine_score"] = 3

    mask = (temp["Creatinine"] < 3.5)
    temp.loc[mask, "Creatinine_score"] = 2
    mask = (temp["Creatinine"] < 2)
    temp.loc[mask, "Creatinine_score"] = 1
    mask = (temp["Creatinine"] < 1.2)
    temp.loc[mask, "Creatinine_score"] = 0
    temp.loc[temp["Creatinine"].isna(),"Creatinine_score"] = np.nan


    # qsofa:
    temp["qsofa"] = 0
    mask = (temp["SBP"] <= 100) & (temp["Resp"] >= 22)
    temp.loc[mask, "qsofa"] = 1
    mask = (temp["SBP"].isna()) | (temp["Resp"].isna())
    temp.loc[mask, "qsofa"] = np.nan

    # Platelets score:
    temp["Platelets_score"] = 0
    mask = (temp["Platelets"] <= 150)
    temp.loc[mask, "Platelets_score"] = 1
    mask = (temp["Platelets"] <= 100)
    temp.loc[mask, "Platelets_score"] = 2
    mask = (temp["Platelets"] <= 50)
    temp.loc[mask, "Platelets_score"] = 3

    temp.loc[temp["Platelets"].isna(),"Platelets_score"] = np.nan



    # Bilirubin score:
    temp["Bilirubin_score"] = 3
    mask = (temp["Bilirubin_total"] < 6)
    temp.loc[mask, "Bilirubin_score"] = 2
    mask = (temp["Bilirubin_total"] < 2)
    temp.loc[mask, "Bilirubin_score"] = 1
    mask = (temp["Bilirubin_total"] < 1.2)
    temp.loc[mask, "Bilirubin_score"] = 0
    temp.loc[temp["Bilirubin_total"].isna(),"Bilirubin_score"] = np.nan
    
    return(temp)


In [None]:
# 62 informative missingness features, 31 differential features and 37 raw variables

def preprocess(df_process):
    
    print("Extracting informative features")
    
    groups = []
    
    with tqdm(total= df_process.csn.nunique()) as pbar:
        for _, case in df_process.groupby(["csn", "pat_id"]):
            groups.append(feature_informative_missingness(case))             
            pbar.update(1)
            
    temp = pd.concat(groups).reset_index(drop=True)  
    
    print("Completed Extracting informative features")

    temp = temp.fillna(method='ffill').reset_index(drop = True)
    print("Extracting Rolling features")
    
    vitals = temp.copy()
    vitals = vitals[["csn", 'HR', 'O2Sat', 'SBP', 'MAP', 'Resp']]
    vitals = feature_slide_window(vitals).reset_index(drop = True).drop(["csn"], axis = 1)
    print("Completed Extracting Rolling features")
    
    new = pd.concat([temp, vitals], axis = 1)
    # add 8 empiric features scorings
    print("Extracting Score Features")
    
    new = feature_empiric_score(new)
    print("Completed Extracting Score Features")
    print("Preprocessing completed with total of", len(list(new.columns)), "features")
        
    return new

In [None]:
processed_df = preprocess(df_process)

In [48]:
processed_df = pd.read_csv("../real_time_sepsis_development/real_time_data/2021_6hr_preprocessed_48_0426.csv")

In [54]:
#processed_df = processed_df.drop(['ed_presentation_time', 'hospital_admission_date_time', 'hospital_discharge_date_time', 'In_hospital_death', 'year', 't_sepsis3', 'race', 'abs_time'], axis = 1)

In [55]:
processed_df.iloc[:,2:]

Unnamed: 0,LOS,rel_time,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,...,MAP_dstd,Resp_dstd,HR_score,Temp_score,Resp_score,MAP_score,Creatinine_score,qsofa,Platelets_score,Bilirubin_score
0,0,1.0,,,,,,,,,...,,,,,,,,,0.0,0.0
1,3,4.0,,,,,,,,,...,,,,,,,,,0.0,0.0
2,6,7.0,,,,,,,,,...,,,,,,,,,0.0,0.0
3,9,10.0,,,,,,,,,...,,,,,,,,,0.0,0.0
4,12,13.0,,,,,,,,,...,,,,,,,,,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201023,3,45.0,104.0,99.0,38.0,120.0,92.0,57.0,18.5,26.0,...,,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
201024,6,48.0,96.5,99.0,38.0,121.0,92.0,57.0,18.5,26.0,...,,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
201025,0,44.0,80.0,97.0,36.9,130.0,92.0,57.0,17.0,26.0,...,,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
201026,0,46.0,76.0,97.0,36.8,133.0,92.0,57.0,15.0,26.0,...,,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
list(processed_df.columns)

['pat_id',
 'csn',
 'LOS',
 'rel_time',
 'HR',
 'O2Sat',
 'Temp',
 'SBP',
 'MAP',
 'DBP',
 'Resp',
 'EtCO2',
 'AST',
 'Alkalinephos',
 'BUN',
 'BaseExcess',
 'Bilirubin_total',
 'Calcium',
 'Chloride',
 'Creatinine',
 'FiO2',
 'Glucose',
 'HCO3',
 'Hct',
 'Hgb',
 'Lactate',
 'Magnesium',
 'PTT',
 'PaCO2',
 'PaO2',
 'Phosphate',
 'Platelets',
 'Potassium',
 'SaO2',
 'Sodium',
 'WBC',
 'pH',
 'gcs_total_score',
 'SepsisLabel',
 'age',
 'gender',
 'HR_interval_f1',
 'HR_interval_f2',
 'HR_diff',
 'O2Sat_interval_f1',
 'O2Sat_interval_f2',
 'O2Sat_diff',
 'Temp_interval_f1',
 'Temp_interval_f2',
 'Temp_diff',
 'SBP_interval_f1',
 'SBP_interval_f2',
 'SBP_diff',
 'MAP_interval_f1',
 'MAP_interval_f2',
 'MAP_diff',
 'DBP_interval_f1',
 'DBP_interval_f2',
 'DBP_diff',
 'Resp_interval_f1',
 'Resp_interval_f2',
 'Resp_diff',
 'EtCO2_interval_f1',
 'EtCO2_interval_f2',
 'EtCO2_diff',
 'BaseExcess_interval_f1',
 'BaseExcess_interval_f2',
 'BaseExcess_diff',
 'HCO3_interval_f1',
 'HCO3_interval_f2

In [None]:
processed_df.to_csv("./real_time_data/2021_6hr_preprocessed_48_0426.csv", index = False)