In [1]:
# Imports
import matplotlib.pyplot as plt
import tqdm
plt.rcParams["axes.grid"] = False #disable white lines which are present in google colab for matplotlib
import numpy as np
import datetime
today = datetime.datetime.today() #To work with datetime values. Only relative time matters in this project, so selecting a random date is not a problem. 
from sklearn.metrics import classification_report,auc,r2_score,matthews_corrcoef
import shap
from catboost import CatBoostClassifier,CatBoostRegressor,Pool
from catboost.utils import get_roc_curve
import xgboost as xgb
import pandas as pd
import re
import os
from scipy.stats import linregress,ttest_ind,ranksums
pd.set_option('display.max_columns', None)

path = "../../../MIMIC_IV/mimic-iv-1.0/"

# CSV preparation for easy extraction of features

In [None]:
# Load the ICU item dictionary (d_items); the following cells search it by label.
df_d_items = pd.read_csv(path+"icu/csv/d_items.csv",sep=',')


In [None]:
# Show the d_items rows whose label contains "height" (case-insensitive).
df_d_items[df_d_items.label.str.contains("height",regex=True,flags=re.IGNORECASE)]

In [None]:
# Load the lab item dictionary (d_labitems).
d_labitems = pd.read_csv(path+"hosp/csv/d_labitems.csv",sep=',')
# Replace missing labels with "" so the str.contains lookups below return booleans only.
d_labitems.label = d_labitems.label.fillna("")

In [None]:
# Show the d_labitems rows whose label contains "oxygen" (case-insensitive).
d_labitems[d_labitems.label.str.contains("oxygen",regex=True,flags=re.IGNORECASE)]

## CHARTEVENTS

In [None]:
# Split chartevents.csv into one CSV per itemid so single features can be loaded
# later without scanning the whole table.
# Original author's note: there are 98 iterations with this code, taking about 2 hours.
# NOTE: the per-item files are opened in append mode, so re-running this cell
# without deleting the output directory first duplicates rows.
chartevents_dir = path+"icu/csv/chartevents/"  # same location the concatenation cell reads from
os.makedirs(chartevents_dir, exist_ok=True)

for chunk in tqdm.tqdm(pd.read_csv(path+"icu/csv/chartevents.csv",sep=',',
                                   chunksize=10000000,
                                   iterator=True,low_memory=False),ascii=True):

    # One pass over the chunk instead of one boolean filter per itemid.
    for itemid, item_rows in chunk.groupby("itemid"):
        # Deliberately not named `path`: that global holds the dataset root and
        # is reused by every later cell.
        item_csv = chartevents_dir+str(itemid)+".csv"
        already_started = os.path.exists(item_csv)
        # Write the header only when the file is created; later chunks append rows.
        item_rows.to_csv(item_csv,
                         mode='a' if already_started else 'w',
                         index=False,
                         header=not already_started)

## CHART EVENTS Feature concatenation

In [None]:
# Concatenate the per-item chartevents files of the selected features into one CSV.
# Original author's note: the loop takes around 7 minutes, saving another 5 minutes.
chart_items_itemids = [
                    220074, #CVD  (NOTE(review): possibly CVP / central venous pressure - confirm in d_items)
                    223834, #O2 Flow
                    220045, #Heart rate
                    220339, #PEEP set
                    224828, #Arterial Base Excess = B.E.
                    220050, #Arterial Blood Pressure systolic
                    224639, #Daily Weight
                    226707, #Height
                    226512, #Admission  Weight (Kg)
                    223835, #Inspired O2 Fraction
                    220210, #respiratory rate monitor
                    220052, #Arterial blood pressure mean
                    ]

chart_columns = ["subject_id","hadm_id","stay_id","storetime","itemid","value","valuenum","valueuom"]

# Collect the frames and concatenate once; concatenating inside the loop
# re-copies the accumulated frame on every iteration.
chart_frames = []
for itemid in tqdm.tqdm(chart_items_itemids,ascii=True):
    item_csv = path+"icu/csv/chartevents/"+str(itemid)+".csv"
    # Items without a per-item file are skipped silently, as before.
    if os.path.exists(item_csv):
        chart_frames.append(pd.read_csv(item_csv)[chart_columns])

if not chart_frames:
    # Previously this surfaced as an AttributeError on None.to_csv.
    raise FileNotFoundError("No per-item chartevents files found under "+path+"icu/csv/chartevents/")

chart_items_af_features = pd.concat(chart_frames)
print("SAVING")
chart_items_af_features.to_csv("Data/MIMIC_extracted/chart_items_af_features.csv",index=False)

## LABEVENTS

In [None]:
# Split labevents.csv into one CSV per itemid so single features can be loaded
# later without scanning the whole table.
# Original author's note: "98 iterations, 2 hours" (NOTE(review): identical to the
# chartevents cell, so possibly copy-pasted - confirm for labevents).
# NOTE: the per-item files are opened in append mode, so re-running this cell
# without deleting the output directory first duplicates rows.
labevents_dir = path+"hosp/csv/labevents/"
os.makedirs(labevents_dir, exist_ok=True)

for chunk in tqdm.tqdm(pd.read_csv(path+"hosp/csv/labevents.csv",sep=',',
                                   chunksize=10000000,
                                   iterator=True,low_memory=False),ascii=True):

    # One pass over the chunk instead of one boolean filter per itemid.
    for itemid, item_rows in chunk.groupby("itemid"):
        # Build the file name from the fixed directory. The previous
        # `path = path + ...` extended the global on every iteration, so every
        # file name after the first was wrong and later cells lost the dataset root.
        item_csv = labevents_dir+str(itemid)+".csv"
        already_started = os.path.exists(item_csv)
        # Write the header only when the file is created; later chunks append rows.
        item_rows.to_csv(item_csv,
                         mode='a' if already_started else 'w',
                         index=False,
                         header=not already_started)

## LAB EVENTS Feature concatenation

In [None]:
# Concatenate the per-item labevents files of the selected features into one CSV.
# Original author's note: the loop takes around 7 minutes, saving another 5 minutes.
lab_items_itemids = [51265, #Platelet count
                     50889, #C-reactive protein blood
                     51275, #PTT (but is actually APTT) blood
                     50821, #PO2 blood
                     50818, #Pco2 monitor blood
                     50820, #pH blood
                     50882, #Bicarbonate (HCO3)
                     50813, #Lactate blood
                     51222, #Hemoglobin blood
                     50817, #Oxygen Saturation
                     50970, #phosphate blood
                     51006] #urea nitrogen blood

lab_columns = ["subject_id","hadm_id","storetime","itemid","value","valuenum","valueuom"]

# Collect the frames and concatenate once; concatenating inside the loop
# re-copies the accumulated frame on every iteration.
lab_frames = []
for itemid in tqdm.tqdm(lab_items_itemids,ascii=True):
    item_csv = path+"hosp/csv/labevents/"+str(itemid)+".csv"
    # Items without a per-item file are skipped silently, as before.
    if os.path.exists(item_csv):
        lab_frames.append(pd.read_csv(item_csv)[lab_columns])

if not lab_frames:
    # Previously this surfaced as an AttributeError on None.to_csv.
    raise FileNotFoundError("No per-item labevents files found under "+path+"hosp/csv/labevents/")

lab_items_af_features = pd.concat(lab_frames)
print("SAVING")
lab_items_af_features.to_csv("Data/MIMIC_extracted/lab_items_af_features.csv",index=False)

## Outputevents/inputevents Feature Concatenation
Fluid balance is calculated directly from inputevents and outputevents during feature creation, because both tables can be used with a simple time-window method as they are. Inputevents contains mixed units, so only the volume rows (ml, L, uL) are selected; outputevents is recorded entirely in ml.

In [None]:
# Stream outputevents.csv in chunks, keep only the rows of itemid 226559 and
# save them as one CSV.
# DataFrame.append was removed in pandas 2.0, so the filtered chunks are
# collected in a list and concatenated once.
selected_output_chunks = []

for chunk in tqdm.tqdm(pd.read_csv(path+"icu/csv/outputevents.csv",sep=',',
                                   chunksize=10000000,
                                   iterator=True,low_memory=False),ascii=True):
    selected_output_chunks.append(chunk[chunk.itemid==226559])

output_af_features = pd.concat(selected_output_chunks)
output_af_features.to_csv("Data/MIMIC_extracted/output_af_features.csv",index=False)

In [None]:
# Export the inputevents rows that are recorded in a volume unit.
# NOTE(review): nrows=100000 means only the first 100,000 rows of the file are
# read and exported - confirm this limit is intended.
inp_ev = pd.read_csv(path+"icu/csv/inputevents.csv",sep=',',nrows=100000)
volume_units = ["ml","L","uL"]
is_volume_row = inp_ev.amountuom.isin(volume_units)
inp_ev[is_volume_row].to_csv("Data/MIMIC_extracted/input_fluid_af_features.csv",index=False)

In [None]:
# Load the first 100,000 rows of outputevents.
# NOTE(review): the path is built with a doubled slash (path already ends in "/"),
# and out_ev is not used again in the visible part of the notebook.
out_ev = pd.read_csv(path+"/icu/csv/outputevents.csv",sep=',',
                             nrows=100000)

# Inputevents concatenation

In [None]:
# Regex of the medication names of interest (alternatives separated by "|").
emar_medications_r = r'Magnesium Sulfate|Calcium Carbonate|Calcium Acetate|Calcium Gluconate|bumetanide|furosemide|norepinephrine|propofol|fentanyl citrate|dopamine|fentanyl'
# Show itemid and label of every d_items row whose label matches the regex (case-insensitive).
df_d_items[(df_d_items.label.str.contains(emar_medications_r,flags=re.IGNORECASE, regex=True))][["itemid","label"]]

In [None]:
# itemids of the medications extracted from inputevents
inputevents_medication_features = [
    221456,# 	Calcium Gluconate
    221662,# 	Dopamine
    221794,# 	Furosemide (Lasix)
    221906,# 	Norepinephrine
    222011,# 	Magnesium Sulfate
    222168,# 	Propofol
    227210,# 	Propofol (Intubation)
    227523,# 	Magnesium Sulfate (Bolus)
    221744,#    Fentanyl
    227524,# 	Magnesium Sulfate (OB-GYN)
    227525,# 	Calcium Gluconate (CRRT)
    228317,# 	Calcium Gluconate (Bolus)_OLD_1
    228340,# 	Furosemide (Lasix) 250/50
    229639,# 	Bumetanide (Bumex)
    229640# 	Calcium Gluconate (Bolus)
    ]

medication_columns = [
    'subject_id', 'hadm_id', 'stay_id', 'starttime', 'endtime', 'rateuom',
    'itemid', 'amount',  'rate',  'totalamount'
]

inputevents = pd.read_csv(path+"icu/csv/inputevents.csv")
# .copy() makes the selection an independent frame, so the assignment below does
# not write into a chained slice (SettingWithCopyWarning).
df_AF_medication = inputevents.loc[
    inputevents.itemid.isin(inputevents_medication_features), medication_columns
].copy()

# Rows that have a rate and whose rate unit mentions "hour"; na=False keeps the
# mask strictly boolean when rateuom is missing.
has_hourly_rate = df_AF_medication.rate.notna() & df_AF_medication.rateuom.str.contains("hour", na=False)
# NOTE(review): hourly rates are multiplied by 60 and rateuom is left unchanged.
# Converting "per hour" to "per minute" would divide by 60 - confirm the intended
# target unit before relying on this column.
df_AF_medication.loc[has_hourly_rate,"rate"] = df_AF_medication.loc[has_hourly_rate,"rate"] * 60

df_AF_medication.to_csv("Data/MIMIC_extracted/df_AF_inputevents_medication.csv",index=False)

In [None]:
# Re-reads the full inputevents table (the same file the previous cell already loaded into `inputevents`).
inputevents = pd.read_csv(path+"icu/csv/inputevents.csv")


## Medication EVENTS Feature concatenation

In [None]:
# Regex of medication names; differs from the earlier definition only in the
# capitalisation of the last alternative ("Fentanyl").
emar_medications_r = r'Magnesium Sulfate|Calcium Carbonate|Calcium Acetate|Calcium Gluconate|bumetanide|furosemide|norepinephrine|propofol|fentanyl citrate|dopamine|Fentanyl'
# Exact medication strings kept from emar (matched with isin, so spelling and case matter).
emar_medications = ['Magnesium Sulfate',
                    'Propofol', 'Fentanyl Citrate', 'NORepinephrine','Norepinephrine', 'Furosemide',
                    'Calcium Carbonate', 'Calcium Gluconate','Calcium Acetate','Fentanyl',
                    'DOPamine', 'Bumetanide','Calcium Carbonate Suspension',
                    'furosemide', 'fentaNYL citrate', ]

# event_txt values that are excluded when emar is filtered.
not_wanted_events = [ 'Not Started', 'Stopped - Unscheduled',
       'Not Given', 'Delayed Stopped',
       'Delayed Not Started', 'Delayed Not Confirmed']

In [None]:
# Filter emar down to the selected medications, drop the unwanted event types
# and save the reduced table.
df_emar = pd.read_csv(path+"hosp/csv/emar.csv",sep=',')
emar_columns = ['subject_id', 'hadm_id', 'emar_id',
                'charttime', 'medication', 'scheduletime', 'storetime']
is_wanted_event = ~df_emar.event_txt.isin(not_wanted_events)
df_emar = df_emar[is_wanted_event]
is_selected_medication = df_emar.medication.isin(emar_medications)
df_emar = df_emar[is_selected_medication][emar_columns]
# Merge the two spellings of norepinephrine into one label.
df_emar.loc[df_emar.medication=="Norepinephrine","medication"]='NORepinephrine'
df_emar.to_csv("Data/MIMIC_extracted/emar_extracted.csv",index=False)

In [None]:
# Stream emar_detail.csv in chunks and keep only the rows that belong to the
# previously extracted emar records and have a dose_given value.
df_emar = pd.read_csv("Data/MIMIC_extracted/emar_extracted.csv")
# Computed once; it does not change between chunks.
wanted_emar_ids = df_emar.emar_id.unique()

# DataFrame.append was removed in pandas 2.0, so the filtered chunks are
# collected in a list and concatenated once.
detail_chunks = []
for chunk in tqdm.tqdm(pd.read_csv(path+"hosp/csv/emar_detail.csv",sep=',',
                                   chunksize=10000000,
                                   iterator=True,low_memory=False),ascii=True):
    detail_chunks.append(chunk[chunk.emar_id.isin(wanted_emar_ids)])

df_emar_detail = pd.concat(detail_chunks)
df_emar_detail = df_emar_detail[df_emar_detail.dose_given.notna()]
df_emar_detail.to_csv("Data/MIMIC_extracted/emar_detail_extracted.csv",index=False)

In [None]:
# Reload both extracted emar tables and attach the detail rows to each emar
# record (left join, so records without detail rows are kept).
df_emar = pd.read_csv("Data/MIMIC_extracted/emar_extracted.csv")
df_emar_detail = pd.read_csv("Data/MIMIC_extracted/emar_detail_extracted.csv")
emar_join_keys = ["subject_id","emar_id"]
df_AF_medication = pd.merge(df_emar, df_emar_detail, how="left", on=emar_join_keys)

In [None]:
# Number of merged rows per medication.
df_AF_medication.medication.value_counts()

In [None]:
# Load the full emar table again; this replaces the filtered df_emar from the cells above.
df_emar = pd.read_csv(path+"hosp/csv/emar.csv",sep=',')

In [None]:
# Load the ICU stays table.
df_icustay = pd.read_csv(path+"icu/csv/icustays.csv",sep=',')

In [None]:
# Restrict emar to patients who have at least one ICU stay.
df_emar = df_emar[df_emar.subject_id.isin(df_icustay.subject_id)]
# Administration count per medication as a two-column frame (medication, count).
# The column names are set explicitly: value_counts().reset_index() yields
# ("index", "medication") on pandas < 2 but ("medication", "count") on pandas >= 2,
# where the old rename produced two "count" columns and no "medication" column.
emar_meds_ser = (
    df_emar.medication.value_counts()
    .rename_axis("medication")
    .reset_index(name="count")
)

# Medications whose name contains "amida" (case-insensitive).
emar_meds_ser[(~emar_meds_ser.medication.isna()) & (emar_meds_ser.medication.str.contains("amida",flags=re.IGNORECASE,regex=True))]

In [None]:
# Show the merged medication rows whose medication name contains "nor" (case-insensitive).
df_AF_medication[df_AF_medication.medication.str.contains("nor",flags=re.IGNORECASE,regex=True)]

In [None]:
# Save the merged emar medication table.
df_AF_medication.to_csv("Data/MIMIC_extracted/df_AF_medication.csv",index=False)

In [None]:
# NOTE(review): the previous cell wrote df_AF_medication.csv to
# "Data/MIMIC_extracted/", but it is read back here from path+"hosp/csv/" -
# confirm that both locations hold the same file.
df_AF_medication = pd.read_csv(path+"hosp/csv/df_AF_medication.csv")
# mimic_norepinephrine.csv is not produced in the visible part of this notebook.
df_nor = pd.read_csv(path+"mimic_norepinephrine.csv")

In [None]:
# Number of distinct stay_ids in the norepinephrine table.
len(df_nor.stay_id.unique())

In [None]:
# Number of distinct hadm_ids with a medication containing "NORepine" (case-sensitive).
len(df_AF_medication[df_AF_medication.medication.str.contains("NORepine")].hadm_id.unique())

# Dictionary 

In [None]:
# NOTE(review): this import sits mid-notebook rather than in the imports cell.
import amsterdamumcdb as adb
dictionary = adb.get_dictionary()
# Show the dictionary rows whose `item` contains "fenta" (case-insensitive).
dictionary[dictionary.item.str.contains("fenta",flags=re.IGNORECASE,regex=True)]

In [None]:
# Load the full emar table, this time from the "Data/MIMIC/" location rather than `path`.
df_emar = pd.read_csv(r"Data/MIMIC/mimic-iv-1.0/hosp/csv/emar.csv",sep=',')

In [None]:
# List the emar column names.
df_emar.columns

In [None]:
# Restrict emar to patients who have at least one ICU stay.
df_emar = df_emar[df_emar.subject_id.isin(df_icustay.subject_id)]
# Administration count per medication as a two-column frame (medication, count).
# The column names are set explicitly: value_counts().reset_index() yields
# ("index", "medication") on pandas < 2 but ("medication", "count") on pandas >= 2,
# where the old rename produced two "count" columns and no "medication" column.
emar_meds_ser = (
    df_emar.medication.value_counts()
    .rename_axis("medication")
    .reset_index(name="count")
)

In [None]:
# Medications whose name contains "nore" (case-insensitive), with their counts.
emar_meds_ser[(~emar_meds_ser.medication.isna()) & (emar_meds_ser.medication.str.contains("nore",flags=re.IGNORECASE,regex=True))]

In [None]:
# Reload the ICU item dictionary from the "Data/MIMIC/" location and display it.
df_d_items = pd.read_csv(r"Data/MIMIC/mimic-iv-1.0/icu/csv/d_items.csv",sep=',')
df_d_items

In [None]:
# d_items rows whose label contains "NOR" (case-insensitive).
df_d_items[(df_d_items.label.str.contains(r'NOR',flags=re.IGNORECASE, regex=True))]

In [None]:
# d_items rows whose label contains "SOFA" (case-insensitive).
df_d_items[(df_d_items.label.str.contains('SOFA',flags=re.IGNORECASE, regex=True))]

In [None]:
# Load the lab item dictionary from the "Data/MIMIC/" location and display it.
df_d_labitems = pd.read_csv(r"Data/MIMIC/mimic-iv-1.0/hosp/csv/d_labitems.csv",sep=',')
df_d_labitems

In [None]:
# Lab items with a non-null label containing "urea" (case-insensitive).
df_d_labitems[(~df_d_labitems.label.isnull())&(df_d_labitems.label.str.contains("urea",flags=re.IGNORECASE, regex=True))]
# Earlier exploratory query, kept commented out:
#df_d_labitems[(~df_d_labitems.label.isnull())&(df_d_labitems.fluid.str.contains("Blood"))&(df_d_labitems.category.str.contains("Hematology"))][240:290]#&(df_d_labitems.label.str.contains("lets"))]

## AF patients preprocessing

In [None]:
# Load the AF table (AF.csv is not produced in the visible part of this notebook).
df_AF = pd.read_csv(r"Data/MIMIC/mimic-iv-1.0/AF.csv",sep=',')

In [None]:
# Load the hospital admissions table.
df_adm = pd.read_csv(r"Data/MIMIC/mimic-iv-1.0/core/csv/admissions.csv",sep=',')

In [None]:
# Load the ICU stays table from the "Data/MIMIC/" location.
df_icustay = pd.read_csv(r"Data/MIMIC/mimic-iv-1.0/icu/csv/icustays.csv",sep=',')

In [None]:
#Create admissioncount: 0 for a patient's first ICU stay, 1 for the second, and so on.
df_icustay["intime"] = pd.to_datetime(df_icustay.intime)
df_icustay = df_icustay.sort_values(["subject_id","intime"]).reset_index(drop=True)
# With the rows ordered by patient and admission time, the running count within
# each patient equals "previous row's count + 1 when the patient is the same".
# cumcount computes it in one vectorised pass; the former row-by-row loop built
# a full boolean index mask for every row (quadratic in the number of stays).
df_icustay["admissioncount"] = df_icustay.groupby("subject_id").cumcount()

In [None]:
# Load the patients table.
df_patients = pd.read_csv(r"Data/MIMIC/mimic-iv-1.0/core/csv/patients.csv",sep=',')

In [None]:
# Join the AF records to the admissions of patients who have an ICU stay,
# matching on hadm_id and subject_id (merge default: inner join).
AF_hadm = df_AF.merge(df_adm[df_adm.subject_id.isin(df_icustay.subject_id.unique())],on=['hadm_id',"subject_id"])

In [None]:
# Time between hospital admission (admittime) and the AF record's storetime.
AF_hadm["diff_to_adm"] = pd.to_datetime(AF_hadm.storetime) - pd.to_datetime(AF_hadm.admittime)

In [None]:
# Right join on (subject_id, hadm_id): every ICU stay row is kept and receives its admission columns.
# NOTE: df_adm is overwritten, so re-running this cell merges the ICU columns a second time.
df_adm = df_adm.merge(df_icustay,how="right",on=["subject_id","hadm_id"])

In [None]:
# One row per stay_id: after sorting by hadm_id, stay_id and storetime,
# groupby(...).first() takes the first non-null value of each column within the stay.
patient_AF_episode = df_AF.sort_values(["hadm_id","stay_id","storetime"]).groupby("stay_id").first().reset_index()

In [None]:
# AF flag per ICU stay: 1 when the stay has an AF episode, otherwise 0.
af_stay_ids = patient_AF_episode.stay_id.unique()
df_adm["AF"] = df_adm.stay_id.isin(af_stay_ids).astype("int64")

In [None]:
# First-AF storetime per stay (patient_AF_episode has one row per stay_id).
first_af_storetime = patient_AF_episode.set_index("stay_id").storetime
has_af_episode = df_adm.stay_id.isin(first_af_storetime.index)

# AF flag per ICU stay: 1 when the stay has an AF episode, otherwise 0.
df_adm["AF"] = has_af_episode.astype("int64")

# AF_measuredat holds the storetime of the first AF record for AF stays and 0
# for all other stays. It is built in one vectorised lookup as an object column;
# the former per-stay loop rescanned both frames for every stay and wrote
# strings into an integer column.
df_adm["AF_measuredat"] = (
    df_adm.stay_id.map(first_af_storetime).astype(object).where(has_af_episode, 0)
)

In [None]:
# Display the stays flagged with AF.
df_adm[df_adm.AF==1]

In [None]:
# subject_ids with at least one AF record stored less than 12 hours after hospital admission.
early_onset_pat = AF_hadm[AF_hadm.diff_to_adm<pd.to_timedelta(12,unit='h')].subject_id.unique()

In [None]:
# Per-admission ICD diagnosis and procedure codes.
df_diag = pd.read_csv(r"Data/MIMIC/mimic-iv-1.0/hosp/csv/diagnoses_icd.csv",sep=',')
df_proced = pd.read_csv(r"Data/MIMIC/mimic-iv-1.0/hosp/csv/procedures_icd.csv",sep=',')

In [None]:
# df_services is otherwise first loaded further down the notebook, so on a fresh
# kernel this cell failed with a NameError. Load it here as well.
df_services = pd.read_csv(r"Data/MIMIC/mimic-iv-1.0/hosp/csv/services.csv",sep=',')
# Number of distinct admissions with a current service containing "CSURG".
len(df_services[df_services.curr_service.str.contains("CSURG")].hadm_id.unique())

In [None]:
# ICD dictionaries (code -> long_title) for diagnoses and procedures.
df_diag_icd = pd.read_csv(r"Data/MIMIC/mimic-iv-1.0/hosp/csv/d_icd_diagnoses.csv",sep=',')
df_proced_icd = pd.read_csv(r"Data/MIMIC/mimic-iv-1.0/hosp/csv/d_icd_procedures.csv",sep=',')

# Cardiac / cardiosurgical titles, excluding matches followed by " for esophag".
# The group is non-capturing so str.contains does not warn about match groups,
# and "myocardial infarction" is spelled correctly (the former "infarcation"
# could never match a title).
re_cardiosurg_new = (
    r'(?:CABG|AVR|heart surgery|cardiovascular disease|heart valves|'
    r'MVP|MVR|mitral|tricuspid|pericard|aortic.*valve|Bentall|'
    r'myocardial infarction|VSR|ASD|intracardiac|congenital defect repair)(?! for esophag)'
)

# Sepsis-related patterns; not used in the visible part of the notebook.
re_sepsis_surg = r'sepsis|pneumoni|GI perforation|perforation|rupture|infection|abscess|GI Vascular ischemia|diverticular|appendectomy|peritonitis'
re_sepsis_med = r'sepsis|septic|infect|pneumoni|cholangitis|pancr|endocarditis|meningitis|GI perforation|abces|abscess|colon ischemi|GI vascular|fasciitis|inflammatory|peritonitis'


# ICD codes selected by a case-insensitive title match; a missing title counts as no match.
surg_icds = df_proced_icd[df_proced_icd.long_title.str.contains("surgery|surg", na=False,flags=re.IGNORECASE,regex=True)].icd_code
card_icds = df_diag_icd[df_diag_icd.long_title.str.contains(re_cardiosurg_new, na=False,flags=re.IGNORECASE,regex=True)].icd_code
seps = df_diag_icd[df_diag_icd.long_title.str.contains("sepsis|seps", na=False,flags=re.IGNORECASE,regex=True)].icd_code

In [None]:
# Admissions (hadm_id) that carry at least one of the selected ICD codes.
# NOTE: a later cell redefines card_hadms, surg_hadms and seps_hadms from the
# services and sepsis-3 tables.
card_hadms = df_diag[df_diag.icd_code.isin(card_icds)].hadm_id.unique()
# surg_icds comes from the procedure dictionary, so it is matched against the
# per-admission procedure codes (df_proced) rather than the diagnosis codes.
surg_hadms = df_proced[df_proced.icd_code.isin(surg_icds)].hadm_id.unique()
seps_hadms = df_diag[df_diag.icd_code.isin(seps)].hadm_id.unique()

In [None]:
# Load the hospital services table.
df_services = pd.read_csv(r"Data/MIMIC/mimic-iv-1.0/hosp/csv/services.csv",sep=',')

In [None]:
# Spot check: service rows of one specific admission.
df_services[df_services.hadm_id==29988601]

In [None]:
# Admission groups derived from the current hospital service (regex match on curr_service).
curr_service = df_services.curr_service
on_cardiac_service = curr_service.str.contains(r"CMED|CSURG",regex=True)
on_surgical_service = curr_service.str.contains(r"NSURG|CSURG|PSURG|SURG|TSURG|VSURG",regex=True)
on_medical_service = curr_service.str.contains(r"CMED|MED|NMED|OMED|PSYCH",regex=True)

card_hadms = df_services[on_cardiac_service].hadm_id.unique()
surg_hadms = df_services[on_surgical_service].hadm_id.unique()
medical_hadms = df_services[on_medical_service].hadm_id.unique()

# NOTE: despite its name, seps_hadms holds stay_ids (taken from the sepsis-3 table).
df_sepsis3 = pd.read_csv(r"Data/MIMIC/mimic-iv-1.0/mimic_sepsis3.csv",sep=',')
seps_hadms = df_sepsis3.stay_id.unique()

In [None]:
# Stays whose admission appears in both the medical and the surgical service lists.
df_adm[(df_adm.hadm_id.isin(medical_hadms))&(df_adm.hadm_id.isin(surg_hadms))]

In [None]:
# 0/1 indicator columns for the patient groups.
# Sepsis is matched on stay_id; the service-based groups are matched on hadm_id.
df_adm["sepsis_bool"] = df_adm.stay_id.isin(seps_hadms).astype("int64")
df_adm["surgery_bool"] = df_adm.hadm_id.isin(surg_hadms).astype("int64")
df_adm["cardiac_bool"] = df_adm.hadm_id.isin(card_hadms).astype("int64")
df_adm["medical_bool"] = df_adm.hadm_id.isin(medical_hadms).astype("int64")

# Cardiac surgery: flagged as both cardiac and surgical.
is_cardiac_surgery = (df_adm.cardiac_bool==1) & (df_adm.surgery_bool==1)
df_adm["cardiac_bool_new"] = is_cardiac_surgery.astype("int64")

In [None]:
# Summary statistics of the rows whose patient has an ICU stay.
df_adm[(df_adm.subject_id.isin(df_icustay.subject_id.unique()))].describe()

In [None]:
# Re-reads the patients table (the same file was already loaded into df_patients earlier).
df_patients = pd.read_csv(r"Data/MIMIC/mimic-iv-1.0/core/csv/patients.csv",sep=',')

In [None]:
# List the patients column names.
df_patients.columns

In [None]:
# Keep the rows whose patient has an ICU stay and save them.
df_adm_icu = df_adm[(df_adm.subject_id.isin(df_icustay.subject_id.unique()))]
# NOTE(review): the "Data preparation" section reads df_adm_icu.csv from
# mimic_extracted_path, not from this location - confirm the file is copied there.
df_adm_icu.to_csv("Data/MIMIC/mimic-iv-1.0/df_adm_icu.csv",index=False)

In [None]:
# Display the saved ICU admission table.
df_adm_icu

In [None]:
# Read the saved ICU admission table back from disk.
df_adm_af = pd.read_csv("Data/MIMIC/mimic-iv-1.0/df_adm_icu.csv")

In [None]:
# Normalised histograms of hospital_expire_flag for stays without and with AF, overlaid.
for af_flag, group_label, extra_style in ((0, "NO AF", {}), (1, "AF", {"alpha": 0.5})):
    group_values = df_adm_af[df_adm_af.AF==af_flag].hospital_expire_flag
    plt.hist(group_values,density=True,label=group_label,**extra_style)
plt.legend()

In [None]:
# Normalised histograms of `los` for stays without and with AF, overlaid.
# Both groups share the same bins: width 2, from 0 up to 48.
los_bins = np.arange(0,50,2)
for af_flag, group_label, extra_style in ((0, "NO AF", {}), (1, "AF", {"alpha": 0.5})):
    group_values = df_adm_af[df_adm_af.AF==af_flag].los
    plt.hist(group_values,density=True,label=group_label,bins=los_bins,**extra_style)
plt.legend()

In [None]:
# Re-reads the ICU item dictionary (already loaded into df_d_items by earlier cells).
df_d_items = pd.read_csv(r"Data/MIMIC/mimic-iv-1.0/icu/csv/d_items.csv",sep=',')

In [None]:
# Display the ICU item dictionary.
df_d_items

In [None]:
# d_items rows whose label contains "vanco" (case-insensitive).
df_d_items[df_d_items.label.str.contains("vanco",flags=re.IGNORECASE,regex=True)]

In [None]:
# Load the full inputevents table from the "Data/MIMIC/" location.
df_inputevents = pd.read_csv(r"Data/MIMIC/mimic-iv-1.0/icu/csv/inputevents.csv",sep=',')

In [None]:
# Input events of itemid 225798 (presumably found via the "vanco" lookup above - TODO confirm).
df_inp_vanco = df_inputevents[df_inputevents.itemid==225798]

In [None]:
# Display the selected input events.
df_inp_vanco

In [None]:
# Reload the lab item dictionary.
df_d_labitems = pd.read_csv(r"Data/MIMIC/mimic-iv-1.0/hosp/csv/d_labitems.csv",sep=',')
# Replace missing labels with "" so the str.contains lookup below returns booleans only.
df_d_labitems.label = df_d_labitems.label.fillna("")

In [None]:
# Lab items whose label contains "vanco" (case-insensitive).
df_d_labitems[df_d_labitems.label.str.contains("vanco",flags=re.IGNORECASE,regex=True)]

In [None]:
# Load the per-item labevents file of itemid 51009 (one of the files written by the labevents split cell).
df_vanco = pd.read_csv(r"Data/MIMIC/mimic-iv-1.0/hosp/csv/labevents/51009.csv",sep=',',low_memory=False)

In [None]:
# Display the loaded lab events.
df_vanco

# Data preparation
Start executing here to get the relevant datasets.

In [2]:
# Imports
import matplotlib.pyplot as plt
from tqdm import tqdm
plt.rcParams["axes.grid"] = False #disable ugly white lines which are present in google colab for matplotlib
import numpy as np
import datetime
today = datetime.datetime.today() #To work with datetime values. Only relative time matters in this project, so selecting a random date is not a problem. 
from sklearn.metrics import classification_report,auc,r2_score,matthews_corrcoef
import shap
from catboost import CatBoostClassifier,CatBoostRegressor,Pool
from catboost.utils import get_roc_curve
import pandas as pd
import re
import os
from scipy.stats import linregress,ttest_ind,ranksums
mimic_extracted_path = "../../IC_DC_AF/Data/MIMIC_extracted/"
mimic_base_path = "../../../MIMIC_IV/mimic-iv-1.0/"
from tabulate import tabulate

## Admission and patient preparation

In [3]:
# Load the ICU admission table and bring it into the naming and units used below.
admissions_pd = pd.read_csv(mimic_extracted_path+"df_adm_icu.csv",sep=',')
admissions_pd = admissions_pd[['subject_id', 'hadm_id', 'admittime', 'AF', 'AF_measuredat','intime','outtime',
       'sepsis_bool', 'surgery_bool', 'cardiac_bool', 'cardiac_bool_new',
       'stay_id', 'los', 'admissionid' if False else 'admissioncount']]
admissions_pd = admissions_pd.rename(columns={"subject_id":"patientid","stay_id":"admissionid","los":"lengthofstay"})

# Plain column assignment replaces the column, dtype included. With
# `.loc[:, col] = ...` pandas >= 2.0 writes into the existing object column, so
# the timestamps would stay object-typed and break later `.dt` arithmetic.
admissions_pd["intime"] = pd.to_datetime(admissions_pd.intime)
admissions_pd["outtime"] = pd.to_datetime(admissions_pd.outtime)
# lengthofstay is multiplied by 24 (it is compared against windows in hours below).
admissions_pd["lengthofstay"] = admissions_pd.lengthofstay*24

# After the CSV round trip the "no AF" marker is the string "0"; turn it back into the number 0.
admissions_pd.loc[admissions_pd.AF_measuredat=="0","AF_measuredat"]=0
# For all other rows, replace the AF timestamp by the number of seconds between ICU intime and that timestamp.
has_af_time = admissions_pd.AF_measuredat!=0
admissions_pd.loc[has_af_time,"AF_measuredat"]=(
    pd.to_datetime(admissions_pd.loc[has_af_time,"AF_measuredat"])
    - admissions_pd.loc[has_af_time,"intime"]
).dt.total_seconds()

In [4]:
# Model variant switches (set to True / False).
one_half_hour_model = True
non_biased_model = False #Match the NO AF measurement point distribution to the AF patients to avoid time-dependent treatment bias
six_hour_model = False

# time_shift and margin_time (both in hours) per variant.
# six_hour_model takes precedence over one_half_hour_model when both are set.
if six_hour_model:
    time_shift, margin_time = 6, 0
elif one_half_hour_model:
    time_shift, margin_time = 1.5, 1.5
else:
    time_shift, margin_time = 12, 0

hours_to_first_AF = 12
to_hour_multiplier = 60*60 # seconds per hour
total_window = hours_to_first_AF+time_shift

In [5]:
# Build AF_admission_dataset: the ICU stays used for modelling, each with a
# reference time AF_measuredat (seconds since ICU intime).
# NOTE: the random draws below depend on statement order and on the global numpy seed.
only_data_of_first_hours = False #Use only data of the first X hours of admission

total_window = hours_to_first_AF+time_shift

np.random.seed(42)

# Keep only stays that last at least total_window hours.
admission_pd_min_hour = admissions_pd[(admissions_pd.lengthofstay>=total_window)].copy(deep=True)

# Shuffle the rows reproducibly, then keep the non-AF stays plus the AF stays
# whose AF time is at least total_window hours after intime.
admission_pd_min_hour = admission_pd_min_hour.sample(len(admission_pd_min_hour),random_state=42)
admission_pd_min_hour = admission_pd_min_hour[(admission_pd_min_hour.AF==0)|(admission_pd_min_hour.AF_measuredat>=to_hour_multiplier*total_window)]

if non_biased_model:
    # Pool of AF stays whose AF time can be handed out to non-AF stays.
    AF_measuredat_sample_df = admission_pd_min_hour[(admission_pd_min_hour.AF==1)&(admission_pd_min_hour.AF_measuredat>to_hour_multiplier*(total_window))][["admissionid","AF_measuredat"]].copy(deep=True)
    admission_pd_min_hour["date_corresponds_to_AF_admid"] = admission_pd_min_hour.admissionid
    for admissionid in admission_pd_min_hour[admission_pd_min_hour.AF==0].admissionid.values:
        if len(AF_measuredat_sample_df) == 0:
            # Pool exhausted: the remaining non-AF stays get a random time further down.
            break
        else:
            # Only proceed if some pooled AF time fits within this stay's length.
            # NOTE(review): this check uses lengthofstay, while the draw below uses lengthofstay+0.1 - confirm the difference is intended.
            if len(AF_measuredat_sample_df[AF_measuredat_sample_df.AF_measuredat<=((admission_pd_min_hour[admission_pd_min_hour.admissionid==admissionid]['lengthofstay'].values[0])*to_hour_multiplier)])>0:
                # Per-admission generator seeded with the admission id, so the draw does not use the global seed.
                random_state_admission = np.random.RandomState(admissionid)
                choice = random_state_admission.choice(AF_measuredat_sample_df[AF_measuredat_sample_df.AF_measuredat<=((admission_pd_min_hour[admission_pd_min_hour.admissionid==admissionid]['lengthofstay'].values[0]+0.1)*to_hour_multiplier)]['admissionid'].values)
                # Copy the chosen AF stay's time and id onto this non-AF stay.
                admission_pd_min_hour.loc[admission_pd_min_hour.admissionid==admissionid,"AF_measuredat"] = AF_measuredat_sample_df[AF_measuredat_sample_df.admissionid==choice].AF_measuredat.values[0]
                admission_pd_min_hour.loc[admission_pd_min_hour.admissionid==admissionid,"date_corresponds_to_AF_admid"] = AF_measuredat_sample_df[AF_measuredat_sample_df.admissionid==choice].admissionid.values[0]
                # Remove the chosen AF stay from the pool (each AF time is handed out at most once).
                AF_measuredat_sample_df = AF_measuredat_sample_df[AF_measuredat_sample_df.admissionid!=choice]   
    # Non-AF stays still without a time (0 or null) get a random hour in [total_window, lengthofstay+1), stored in seconds.
    admission_pd_min_hour.loc[:,"AF_measuredat"] = admission_pd_min_hour.apply(lambda row: to_hour_multiplier*np.random.randint(total_window,row['lengthofstay']+1) if ((row['AF']==0)&((row['AF_measuredat'] == 0)|( pd.isnull(row['AF_measuredat'])))) else row['AF_measuredat'],axis=1).values 

else:
    # Every non-AF stay without a time (0 or null) gets a random hour in [total_window, lengthofstay+1), stored in seconds.
    # NOTE(review): both randint bounds can be floats here (total_window is 13.5 for the 1.5 h model) - confirm how the installed numpy handles that.
    admission_pd_min_hour.loc[:,"AF_measuredat"] = admission_pd_min_hour.apply(lambda row: to_hour_multiplier*np.random.randint(total_window,row['lengthofstay']+1) if ((row['AF']==0)&((row['AF_measuredat'] == 0)|( pd.isnull(row['AF_measuredat'])))) else row['AF_measuredat'],axis=1).values 



# Keep a copy of the AF flag under AF_orig.
admission_pd_min_hour["AF_orig"]=admission_pd_min_hour.AF
# Final cohort: stays whose reference time is at least total_window hours after intime.
AF_admission_dataset = admission_pd_min_hour[admission_pd_min_hour.AF_measuredat>=total_window*to_hour_multiplier]

# Drop references that are no longer needed.
first_AFs_pd_timed = None
temp_pd = None
admission_pd_min_hour = None

## Feature extraction chartitems

In [9]:
def linreg_except(x,value,measuredat):
    """Return the slope of a linear regression of `value` on `measuredat`.

    Parameters
    ----------
    x : mapping of column name to 1-D numeric data (e.g. a DataFrame group)
    value : key of the dependent variable in `x`
    measuredat : key of the independent variable (time) in `x`

    Returns
    -------
    float
        The first element of the `linregress` result (the slope), or `np.nan`
        when the regression cannot be computed (for example empty input or a
        missing column).
    """
    try:
        return linregress(x[measuredat],x[value])[0]
    except Exception:
        # Best effort: a group that cannot be fitted gets no slope.
        # `Exception` (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate, so a long groupby/apply can still be interrupted.
        return np.nan

In [10]:
# Load only the chart-event columns that are used below. The numeric reading
# (valuenum) becomes `value`; the raw `value`, `hadm_id` and `valueuom` columns
# are never read, which avoids overwriting an object column in place with
# `.loc[:, "value"] = ...` (keeps object dtype on pandas >= 2.0) and avoids
# parsing unused mixed-type columns.
chart_feature_columns = ["subject_id","stay_id","storetime","itemid","valuenum"]
numeric_items_pd = pd.read_csv(mimic_extracted_path+"chart_items_af_features.csv",sep=',',usecols=chart_feature_columns)
numeric_items_pd = numeric_items_pd.rename(columns={"storetime":"measuredat","stay_id":"admissionid","subject_id":"patientid","valuenum":"value"})
numeric_items_pd = numeric_items_pd[['patientid', 'admissionid', 'measuredat', 'itemid', 'value']]

# ICU item dictionary, used to translate itemids into labels.
df_d_items = pd.read_csv(mimic_base_path+"icu/csv/d_items.csv",sep=',')

Columns (7) have mixed types.Specify dtype option on import or set low_memory=False.


In [None]:
#NUMERIC ITEMS PREPROCESSING, this takes around 2 minutes
# Attach each measurement to its stay's reference time (AF_measuredat) and ICU intime.
admission_columns = ["admissionid","AF_measuredat","intime"]
if non_biased_model:
    admission_columns.insert(2,"date_corresponds_to_AF_admid")
numeric_pd_patients = (
    numeric_items_pd[numeric_items_pd.admissionid.isin(AF_admission_dataset.admissionid)]
    .merge(AF_admission_dataset[admission_columns],how='left',on='admissionid')
)

#MIMIC specific preprocessing
# Measurement time in seconds since ICU intime. Plain column assignment replaces
# the string column with a float column; `.loc[:, col] = ...` keeps object dtype
# on pandas >= 2.0 and np.floor below then fails.
numeric_pd_patients["measuredat"] = (
    pd.to_datetime(numeric_pd_patients.measuredat) - pd.to_datetime(numeric_pd_patients.intime)
).dt.total_seconds()
numeric_pd_patients["measuredat_min"] = np.floor(numeric_pd_patients.measuredat/60) #per minute
numeric_pd_patients = numeric_pd_patients.drop(columns=["intime"])

# Seconds between the measurement and (reference time minus margin_time).
# AF_measuredat may arrive as an object column of numbers, hence to_numeric.
af_reference_seconds = pd.to_numeric(numeric_pd_patients.AF_measuredat)
numeric_pd_patients["time_to_AF"] = (af_reference_seconds-60*60*margin_time) - numeric_pd_patients.measuredat #add one margin_time to AF extra
# Keep measurements inside the observation window before the reference time.
numeric_pd_patients = numeric_pd_patients[(numeric_pd_patients.time_to_AF > (time_shift-margin_time)*60*60) & (numeric_pd_patients.time_to_AF/60/60 <= (time_shift+hours_to_first_AF-margin_time-1) )]

# Replace itemids by their d_items label in a single lookup.
label_by_itemid = df_d_items.drop_duplicates("itemid").set_index("itemid").label
unknown_itemids = set(numeric_items_pd.itemid.unique()) - set(label_by_itemid.index)
if unknown_itemids:
    # The former per-item replace loop failed on the same condition (IndexError).
    raise KeyError("itemids missing from d_items: "+str(sorted(unknown_itemids)))
numeric_pd_patients["itemid"] = numeric_pd_patients.itemid.map(label_by_itemid)

numeric_items_pd = None #RAM Optimization

# Per stay and item: mean, min, max and kurtosis of the values. The aggregations
# are passed as a list (not a set) so the resulting column order is the same on every run.
numeric_pd_patients_agg = numeric_pd_patients[["admissionid","itemid","value"]].groupby(["admissionid","itemid"]).agg(['mean','min','max',pd.DataFrame.kurt]).reset_index()
numeric_pd_patients_agg.itemid = numeric_pd_patients_agg.itemid.astype(str)
# Flatten the two-level column names and drop the "value_" prefix.
numeric_pd_patients_agg.columns = ['_'.join(col).rstrip('_') for col in numeric_pd_patients_agg.columns.values]
numeric_pd_patients_agg.columns = [col.replace('value_','') if 'value_' in col else col for col in numeric_pd_patients_agg.columns.values]
# One row per stay, one column per (statistic, item).
numeric_pd_patients_agg = numeric_pd_patients_agg.pivot(index='admissionid', columns='itemid')
numeric_pd_patients_agg.columns = ['_'.join(col).rstrip('_') for col in numeric_pd_patients_agg.columns.values]

# Per stay and item: slope of value over time (in minutes); NaN when it cannot be fitted.
numeric_pd_patients_slope = numeric_pd_patients[["admissionid","itemid","measuredat_min","value"]].groupby(["admissionid","itemid"]).apply(lambda x:linreg_except(x,"value","measuredat_min")).reset_index()
numeric_pd_patients_slope.columns = [str(col) for col in numeric_pd_patients_slope.columns.values]
numeric_pd_patients_slope = numeric_pd_patients_slope.rename(columns={'0':"slope"})
numeric_pd_patients_slope.itemid = numeric_pd_patients_slope.itemid.astype(str)
numeric_pd_patients_slope = numeric_pd_patients_slope.pivot(index='admissionid', columns='itemid')
numeric_pd_patients_slope.columns = ['_'.join(col).rstrip('_') for col in numeric_pd_patients_slope.columns.values]
numeric_pd_patients_slope = numeric_pd_patients_slope.reset_index()

# Combine the aggregate and slope features per stay.
numeric_pd_patients_total = numeric_pd_patients_agg.merge(numeric_pd_patients_slope,how='left',on='admissionid')
numeric_pd_patients = None #Save RAM
numeric_pd_patients_slope = None
numeric_pd_patients_agg = None

## Feature extraction Labitems

In [None]:
# Load the extracted lab events and keep only patients that are in the cohort.
lab_items_af_features = pd.read_csv(mimic_extracted_path+"lab_items_af_features.csv",sep=',')
lab_items_af_features = lab_items_af_features[lab_items_af_features.subject_id.isin(AF_admission_dataset.patientid.values)].reset_index(drop=True)
# Plain column assignment replaces the column, dtype included. With
# `.loc[:, col] = ...` pandas >= 2.0 writes into the existing object column, so
# storetime would stay object-typed instead of datetime64.
lab_items_af_features["storetime"] = pd.to_datetime(lab_items_af_features.storetime)
# Use the numeric reading as the value.
lab_items_af_features["value"] = lab_items_af_features.valuenum
lab_items_af_features = lab_items_af_features.rename(columns={"subject_id":"patientid","storetime":"measuredat"})

# Lab item dictionary, used to translate itemids into labels.
df_d_labitems = pd.read_csv(mimic_base_path+"hosp/csv/d_labitems.csv",sep=',')
lab_items_af_features

In [None]:
# Attach the admission-level columns to every lab measurement of the same
# patient and keep only measurements taken between intime and outtime of that
# admission.
lab_items_patients = lab_items_af_features.merge(AF_admission_dataset,how="left",on="patientid")
measured_during_stay = (lab_items_patients.measuredat>=lab_items_patients.intime)&(lab_items_patients.measuredat<=lab_items_patients.outtime)
# .copy() so the column assignments below write to an independent frame and not
# to a filtered view of the merge result.
lab_items_patients = lab_items_patients[measured_during_stay].copy()

#MIMIC specific preprocessing
# Plain `frame[col] = ...` is used instead of `frame.loc[:, col] = ...`: recent
# pandas versions make the `.loc` form write into the existing column in place,
# which can leave `measuredat` with a non-numeric dtype and break np.floor below.
lab_items_patients["measuredat"] = (pd.to_datetime(lab_items_patients.measuredat)-lab_items_patients.intime).dt.total_seconds() #seconds since intime; INTIME is already in datetime
lab_items_patients["measuredat_min"] = np.floor(lab_items_patients.measuredat/60) #per minute
lab_items_patients = lab_items_patients.drop(columns=["intime","outtime"])

# Seconds between the measurement and the AF reference time, after moving the
# reference margin_time hours earlier.
lab_items_patients["time_to_AF"]=(lab_items_patients.AF_measuredat.values-60*60*margin_time) - lab_items_patients.measuredat.values #add one margin_time to AF extra
# Observation window: the lower bound is compared in seconds, the upper bound in hours.
lab_items_patients = lab_items_patients[(lab_items_patients.time_to_AF > (time_shift-margin_time)*60*60) & (lab_items_patients.time_to_AF/60/60 <= (time_shift+hours_to_first_AF-margin_time-1) )].copy()

# Translate itemid -> label in one vectorised pass (the previous per-itemid
# replace loop scanned the whole column once per item). The first dictionary row
# wins for a given itemid; an id missing from the dictionary is reported explicitly.
itemid_to_label = df_d_labitems.drop_duplicates("itemid").set_index("itemid").label
unknown_itemids = lab_items_patients.itemid[~lab_items_patients.itemid.isin(itemid_to_label.index)].unique()
if len(unknown_itemids) > 0:
    raise KeyError("itemid(s) missing from d_labitems: "+str(list(unknown_itemids)))
lab_items_patients["itemid"] = lab_items_patients.itemid.map(itemid_to_label)

lab_items_af_features = None #RAM Optimization

# Per (admission, item) statistics. The aggregations are passed as a list (a set
# has no defined order, so the column order could change between runs) and the
# kurtosis uses pd.Series.kurt because every group is handed over as a Series.
lab_items_patients_agg = lab_items_patients[["admissionid","itemid","value"]].groupby(["admissionid","itemid"]).agg(['mean','min','max',pd.Series.kurt]).reset_index()
# Flatten the two-level column index: "admissionid", "itemid", "value_mean", ...
lab_items_patients_agg.columns = ['_'.join(col).rstrip('_') for col in lab_items_patients_agg.columns.values]
lab_items_patients_agg["itemid"] = lab_items_patients_agg["itemid"].astype(str)
# Strip the "value_" prefix so the statistic name alone remains ("mean", "min", ...).
lab_items_patients_agg.columns = [col.replace('value_','') for col in lab_items_patients_agg.columns.values]
# Long -> wide: one row per admissionid, one "<statistic>_<label>" column per item.
lab_items_patients_agg = lab_items_patients_agg.pivot(index='admissionid', columns='itemid')
lab_items_patients_agg.columns = ['_'.join(col).rstrip('_') for col in lab_items_patients_agg.columns.values]

# Trend feature per (admission, item). linreg_except is defined elsewhere in the
# notebook; presumably it returns the slope of value over measuredat_min - TODO confirm.
lab_items_patients_slope = lab_items_patients[["admissionid","itemid","measuredat_min","value"]].groupby(["admissionid","itemid"]).apply(lambda group:linreg_except(group,"value","measuredat_min")).reset_index()
# reset_index() leaves the apply result in a column labelled 0 -> rename it to "slope".
lab_items_patients_slope.columns = [str(col) for col in lab_items_patients_slope.columns.values]
lab_items_patients_slope = lab_items_patients_slope.rename(columns={'0':"slope"})
lab_items_patients_slope["itemid"] = lab_items_patients_slope["itemid"].astype(str)
lab_items_patients_slope = lab_items_patients_slope.pivot(index='admissionid', columns='itemid')
lab_items_patients_slope.columns = ['_'.join(col).rstrip('_') for col in lab_items_patients_slope.columns.values]
lab_items_patients_slope = lab_items_patients_slope.reset_index()

# Final lab feature table: statistics plus slopes, joined on admissionid.
lab_items_patients_total = lab_items_patients_agg.merge(lab_items_patients_slope,how='left',on='admissionid')
lab_items_patients = None #Save RAM
lab_items_patients_slope = None
lab_items_patients_agg = None

In [None]:
# Display the merged lab feature table (aggregates + slopes) for a visual check.
lab_items_patients_total

## Feature extraction Medication

In [None]:
# Load the pre-extracted medication administrations and normalise them to the
# column names used by the other feature tables (measuredat / value / patientid).
df_AF_medication = pd.read_csv(mimic_extracted_path+"df_AF_medication.csv",sep=',')

# Plain `frame[col] = ...` is used instead of `frame.loc[:, col] = ...`: recent
# pandas versions make the `.loc` form write into the existing column in place,
# which can leave the column with its old (object) dtype.
df_AF_medication["storetime"] = pd.to_datetime(df_AF_medication.storetime)
df_AF_medication = df_AF_medication.rename(columns={"storetime":"measuredat","dose_given":"value","subject_id":"patientid"})

# Administrations without a recorded dose: mark the unit as "Missing" and count
# the dose as 1.
df_AF_medication.loc[df_AF_medication.value.isna(),"dose_given_unit"] = "Missing"
# Element-wise string conversion. The previous `str(series)` produced the repr of
# the WHOLE Series and wrote that one string into every row, so every parsed dose
# came from the repr text instead of from the row's own dose.
df_AF_medication["value"] = df_AF_medication.value.fillna("1").astype(str)

# Keep only doses that contain a number and parse it; the pattern keeps an
# optional decimal part so that e.g. "2.5" is not truncated to 2.
df_AF_medication = df_AF_medication[df_AF_medication.value.str.contains(r"\d+",regex=True)].copy()
df_AF_medication["value"] = df_AF_medication.value.str.extract(r'(\d+(?:\.\d+)?)',expand=False).astype(np.float32)

# Unit filter: for these medications keep only rows given in the listed unit
# (or rows whose dose was missing); all other medications pass unchanged.
units_kept_per_medication = {
    "Magnesium Sulfate": ["Missing","gm"],
    "Propofol": ["Missing","mg"],
    "Fentanyl Citrate": ["Missing","mcg"],
    "Calcium Carbonate": ["Missing","mg"],
}
for medication_name, units_kept in units_kept_per_medication.items():
    df_AF_medication = df_AF_medication[(df_AF_medication.medication!=medication_name) | df_AF_medication.dose_given_unit.isin(units_kept)]
# Calcium Gluconate is the inverse rule: drop only the rows given in "mg".
df_AF_medication = df_AF_medication[(df_AF_medication.medication!="Calcium Gluconate") | (~df_AF_medication.dose_given_unit.isin(["mg"]))]


#unit correction
#does not actually matter, as only "is given" will be used

# Preview only the first rows instead of rendering the whole table.
df_AF_medication.head()

In [None]:
# Attach the admission-level columns to every medication administration of the
# same patient and keep only administrations recorded between intime and outtime
# of that admission.
medication_patients = df_AF_medication.merge(AF_admission_dataset,how="left",on="patientid")
given_during_stay = (medication_patients.measuredat>=medication_patients.intime)&(medication_patients.measuredat<=medication_patients.outtime)
# .copy() so the column assignments below write to an independent frame and not
# to a filtered view of the merge result.
medication_patients = medication_patients[given_during_stay].copy()

#MIMIC specific preprocessing
# Plain `frame[col] = ...` is used instead of `frame.loc[:, col] = ...`: recent
# pandas versions make the `.loc` form write into the existing column in place,
# which can leave `measuredat` with a non-numeric dtype and break np.floor below.
medication_patients["measuredat"] = (pd.to_datetime(medication_patients.measuredat)-medication_patients.intime).dt.total_seconds() #seconds since intime; INTIME is already in datetime
medication_patients["measuredat_min"] = np.floor(medication_patients.measuredat/60) #per minute
medication_patients = medication_patients.drop(columns=["intime","outtime"])

# Seconds between the administration and the AF reference time, after moving the
# reference margin_time hours earlier.
medication_patients["time_to_AF"]=(medication_patients.AF_measuredat.values-60*60*margin_time) - medication_patients.measuredat.values #add one margin_time to AF extra
# Observation window: the lower bound is compared in seconds, the upper bound in hours.
medication_patients = medication_patients[(medication_patients.time_to_AF > (time_shift-margin_time)*60*60) & (medication_patients.time_to_AF/60/60 <= (time_shift+hours_to_first_AF-margin_time-1) )].copy()

# The medication name plays the role of the item identifier in the pivots below.
medication_patients["itemid"] = medication_patients.medication

df_AF_medication = None #RAM Optimization

# Per (admission, medication) statistics. The aggregations are passed as a list
# (a set has no defined order, so the column order could change between runs) and
# the kurtosis uses pd.Series.kurt because every group is handed over as a Series.
medication_patients_agg = medication_patients[["admissionid","itemid","value"]].groupby(["admissionid","itemid"]).agg(['mean','min','max',pd.Series.kurt]).reset_index()
# Flatten the two-level column index: "admissionid", "itemid", "value_mean", ...
medication_patients_agg.columns = ['_'.join(col).rstrip('_') for col in medication_patients_agg.columns.values]
medication_patients_agg["itemid"] = medication_patients_agg["itemid"].astype(str)
# Strip the "value_" prefix so the statistic name alone remains ("mean", "min", ...).
medication_patients_agg.columns = [col.replace('value_','') for col in medication_patients_agg.columns.values]
# Long -> wide: one row per admissionid, one "<statistic>_<medication>" column each.
medication_patients_agg = medication_patients_agg.pivot(index='admissionid', columns='itemid')
medication_patients_agg.columns = ['_'.join(col).rstrip('_') for col in medication_patients_agg.columns.values]

# Trend feature per (admission, medication). linreg_except is defined elsewhere in
# the notebook; presumably it returns the slope of value over measuredat_min - TODO confirm.
medication_patients_slope = medication_patients[["admissionid","itemid","measuredat_min","value"]].groupby(["admissionid","itemid"]).apply(lambda group:linreg_except(group,"value","measuredat_min")).reset_index()
# reset_index() leaves the apply result in a column labelled 0 -> rename it to "slope".
medication_patients_slope.columns = [str(col) for col in medication_patients_slope.columns.values]
medication_patients_slope = medication_patients_slope.rename(columns={'0':"slope"})
medication_patients_slope["itemid"] = medication_patients_slope["itemid"].astype(str)
medication_patients_slope = medication_patients_slope.pivot(index='admissionid', columns='itemid')
medication_patients_slope.columns = ['_'.join(col).rstrip('_') for col in medication_patients_slope.columns.values]
medication_patients_slope = medication_patients_slope.reset_index()

# Final eMAR medication feature table: statistics plus slopes, joined on admissionid.
medication_patients_total_emar = medication_patients_agg.merge(medication_patients_slope,how='left',on='admissionid')
medication_patients = None #Save RAM
medication_patients_slope = None
medication_patients_agg = None

In [None]:
# Column names of the eMAR medication features: the admission key followed by
# the kurt / mean / max / min / slope column of each of five medications.
emar_features = (
    ["admissionid"]
    # The kurtosis entries list Fentanyl Citrate before Calcium Gluconate.
    + ["kurt_" + medication_name for medication_name in (
        "Calcium Acetate", "Calcium Carbonate", "Calcium Carbonate Suspension",
        "Fentanyl Citrate", "Calcium Gluconate",
    )]
    # The remaining statistics share one medication order.
    + [statistic + "_" + medication_name
       for statistic in ("mean", "max", "min", "slope")
       for medication_name in (
           "Calcium Acetate", "Calcium Carbonate", "Calcium Carbonate Suspension",
           "Calcium Gluconate", "Fentanyl Citrate",
       )]
)

# The eMAR feature table is used, unfiltered, as the medication feature table.
medication_patients_total = medication_patients_total_emar

In [None]:
# Summary statistics of every medication feature column, as a sanity check.
medication_patients_total.describe()

## Feature extraction output features

In [None]:
# Read the pre-extracted output events, switch to the notebook's common column
# names and parse the measurement timestamp.
output_af_features = (
    pd.read_csv(mimic_extracted_path+"output_af_features.csv",sep=',')
    .rename(columns={"storetime":"measuredat","subject_id":"patientid","stay_id":"admissionid"})
    .assign(measuredat=lambda events: pd.to_datetime(events.measuredat))
)

# ICU item dictionary (itemid -> label), used to name the feature columns.
df_d_items = pd.read_csv(mimic_base_path+"icu/csv/d_items.csv",sep=',')

output_af_features

In [None]:
#NUMERIC ITEMS PREPROCESSING, this takes around 2 minutes
# Admission-level columns needed to place each event relative to the AF reference
# time; the distribution-matched variant carries one extra column.
cohort_columns = ["admissionid","AF_measuredat","intime"]
if non_biased_model:
    cohort_columns.insert(2,"date_corresponds_to_AF_admid")
# Keep only output events of cohort admissions and attach those columns.
output_pd_patients = (output_af_features[output_af_features.admissionid.isin(AF_admission_dataset.admissionid)]).merge(AF_admission_dataset[cohort_columns],how='left',on='admissionid')

#MIMIC specific preprocessing
# Plain `frame[col] = ...` is used instead of `frame.loc[:, col] = ...`: recent
# pandas versions make the `.loc` form write into the existing column in place,
# which does not reliably switch the datetime column to float seconds.
output_pd_patients["measuredat"] = (pd.to_datetime(output_pd_patients.measuredat)-output_pd_patients.intime).dt.total_seconds() #seconds since intime; INTIME is already in datetime
output_pd_patients["measuredat_min"] = np.floor(output_pd_patients.measuredat/60) #per minute
output_pd_patients = output_pd_patients.drop(columns=["intime"])

# Seconds between the event and the AF reference time, after moving the reference
# margin_time hours earlier.
output_pd_patients["time_to_AF"]=(output_pd_patients.AF_measuredat.values-60*60*margin_time) - output_pd_patients.measuredat.values #add one margin_time to AF extra
# Observation window: the lower bound is compared in seconds, the upper bound in
# hours. .copy() so the assignment below writes to an independent frame.
output_pd_patients = output_pd_patients[(output_pd_patients.time_to_AF > (time_shift-margin_time)*60*60) & (output_pd_patients.time_to_AF/60/60 <= (time_shift+hours_to_first_AF-margin_time-1) )].copy()

# Translate itemid -> label in one vectorised pass (the previous per-itemid
# replace loop scanned the whole column once per item). The first dictionary row
# wins for a given itemid; an id missing from the dictionary is reported explicitly.
itemid_to_label = df_d_items.drop_duplicates("itemid").set_index("itemid").label
unknown_itemids = output_pd_patients.itemid[~output_pd_patients.itemid.isin(itemid_to_label.index)].unique()
if len(unknown_itemids) > 0:
    raise KeyError("itemid(s) missing from d_items: "+str(list(unknown_itemids)))
output_pd_patients["itemid"] = output_pd_patients.itemid.map(itemid_to_label)

output_af_features = None #RAM Optimization

# Per (admission, item) statistics. The aggregations are passed as a list (a set
# has no defined order, so the column order could change between runs) and the
# kurtosis uses pd.Series.kurt because every group is handed over as a Series.
output_pd_patients_agg = output_pd_patients[["admissionid","itemid","value"]].groupby(["admissionid","itemid"]).agg(['mean','min','max',pd.Series.kurt]).reset_index()
# Flatten the two-level column index: "admissionid", "itemid", "value_mean", ...
output_pd_patients_agg.columns = ['_'.join(col).rstrip('_') for col in output_pd_patients_agg.columns.values]
output_pd_patients_agg["itemid"] = output_pd_patients_agg["itemid"].astype(str)
# Strip the "value_" prefix so the statistic name alone remains ("mean", "min", ...).
output_pd_patients_agg.columns = [col.replace('value_','') for col in output_pd_patients_agg.columns.values]
# Long -> wide: one row per admissionid, one "<statistic>_<label>" column per item.
output_pd_patients_agg = output_pd_patients_agg.pivot(index='admissionid', columns='itemid')
output_pd_patients_agg.columns = ['_'.join(col).rstrip('_') for col in output_pd_patients_agg.columns.values]

# Trend feature per (admission, item). linreg_except is defined elsewhere in the
# notebook; presumably it returns the slope of value over measuredat_min - TODO confirm.
output_pd_patients_slope = output_pd_patients[["admissionid","itemid","measuredat_min","value"]].groupby(["admissionid","itemid"]).apply(lambda group:linreg_except(group,"value","measuredat_min")).reset_index()
# reset_index() leaves the apply result in a column labelled 0 -> rename it to "slope".
output_pd_patients_slope.columns = [str(col) for col in output_pd_patients_slope.columns.values]
output_pd_patients_slope = output_pd_patients_slope.rename(columns={'0':"slope"})
output_pd_patients_slope["itemid"] = output_pd_patients_slope["itemid"].astype(str)
output_pd_patients_slope = output_pd_patients_slope.pivot(index='admissionid', columns='itemid')
output_pd_patients_slope.columns = ['_'.join(col).rstrip('_') for col in output_pd_patients_slope.columns.values]
output_pd_patients_slope = output_pd_patients_slope.reset_index()

# Final output-event feature table: statistics plus slopes, joined on admissionid.
output_pd_patients_total = output_pd_patients_agg.merge(output_pd_patients_slope,how='left',on='admissionid')
output_pd_patients = None #Save RAM
output_pd_patients_slope = None
output_pd_patients_agg = None

## Fluid balance features

In [None]:
# Fluid intake: read the pre-extracted input events, rename to the notebook's
# common column names ("amount" becomes "fluidin") and parse both timestamps.
input_fluid_af_features = (
    pd.read_csv(mimic_extracted_path+"input_fluid_af_features.csv",sep=',')
    .rename(columns={"storetime":"measuredat","subject_id":"patientid","stay_id":"admissionid","amount":"fluidin"})
    .assign(
        starttime=lambda events: pd.to_datetime(events.starttime),
        endtime=lambda events: pd.to_datetime(events.endtime),
    )
)

# Fluid output: read the raw output events, rename to the same column names
# ("value" becomes "fluidout") and parse the measurement timestamp.
outputevents = (
    pd.read_csv(mimic_base_path+"icu/csv/outputevents.csv",sep=',')
    .rename(columns={"storetime":"measuredat","subject_id":"patientid","stay_id":"admissionid","value":"fluidout"})
    .assign(measuredat=lambda events: pd.to_datetime(events.measuredat))
)


In [None]:
#NUMERIC ITEMS PREPROCESSING, this takes around 2 minutes
# Admission-level columns needed to place each fluid event relative to the AF
# reference time; the distribution-matched variant carries one extra column.
cohort_columns = ["admissionid","AF_measuredat","intime"]
if non_biased_model:
    cohort_columns.insert(2,"date_corresponds_to_AF_admid")

# ---- Fluid input ----
# Keep only input events of cohort admissions and attach the admission columns.
input_fluid_pd_patients = (input_fluid_af_features[input_fluid_af_features.admissionid.isin(AF_admission_dataset.admissionid)]).merge(AF_admission_dataset[cohort_columns],how='left',on='admissionid')

#MIMIC specific preprocessing
# Plain `frame[col] = ...` is used instead of `frame.loc[:, col] = ...`: the `.loc`
# form tries to write the float seconds into the existing datetime column in
# place, which recent pandas versions warn about or reject.
input_fluid_pd_patients["starttime"] = (pd.to_datetime(input_fluid_pd_patients.starttime)-input_fluid_pd_patients.intime).dt.total_seconds() #seconds since intime; INTIME is already in datetime
input_fluid_pd_patients["endtime"] = (pd.to_datetime(input_fluid_pd_patients.endtime)-input_fluid_pd_patients.intime).dt.total_seconds() #seconds since intime; INTIME is already in datetime
input_fluid_pd_patients = input_fluid_pd_patients.drop(columns=["intime"])

# Seconds between each end of the event and the AF reference time, after moving
# the reference margin_time hours earlier.
input_fluid_pd_patients["start_time_to_AF"]=(input_fluid_pd_patients.AF_measuredat.values-60*60*margin_time) - input_fluid_pd_patients.starttime.values #add one margin_time to AF extra
input_fluid_pd_patients["stop_time_to_AF"]=(input_fluid_pd_patients.AF_measuredat.values-60*60*margin_time) - input_fluid_pd_patients.endtime.values #add one margin_time to AF extra

# An input event is kept when its start OR its end lies inside the observation
# window (lower bound compared in seconds, upper bound in hours).
starts_in_window = (input_fluid_pd_patients.start_time_to_AF > (time_shift-margin_time)*60*60) & (input_fluid_pd_patients.start_time_to_AF/60/60 <= (time_shift+hours_to_first_AF-margin_time-1) )
stops_in_window = (input_fluid_pd_patients.stop_time_to_AF > (time_shift-margin_time)*60*60) & (input_fluid_pd_patients.stop_time_to_AF/60/60 <= (time_shift+hours_to_first_AF-margin_time-1) )
input_fluid_pd_patients = input_fluid_pd_patients[starts_in_window | stops_in_window]

# NOTE(review): this resets output_af_features, not the fluid source frames loaded
# for this section - it looks like a copy-paste leftover. input_fluid_af_features
# and outputevents therefore stay in memory; free them here if no later cell
# needs them.
output_af_features = None #RAM Optimization

# Total fluid intake per admission inside the window.
input_fluid_pd_patients_total = input_fluid_pd_patients[["admissionid","fluidin"]].groupby(["admissionid"]).sum().reset_index()
input_fluid_pd_patients = None #Save RAM

# ---- Fluid output ----
# Keep only output events of cohort admissions and attach the admission columns.
output_fluid_pd_patients = (outputevents[outputevents.admissionid.isin(AF_admission_dataset.admissionid)]).merge(AF_admission_dataset[cohort_columns],how='left',on='admissionid')

#MIMIC specific preprocessing
output_fluid_pd_patients["measuredat"] = (pd.to_datetime(output_fluid_pd_patients.measuredat)-output_fluid_pd_patients.intime).dt.total_seconds() #seconds since intime; INTIME is already in datetime
output_fluid_pd_patients = output_fluid_pd_patients.drop(columns=["intime"])

output_fluid_pd_patients["time_to_AF"]=(output_fluid_pd_patients.AF_measuredat.values-60*60*margin_time) - output_fluid_pd_patients.measuredat.values #add one margin_time to AF extra
output_fluid_pd_patients = output_fluid_pd_patients[(output_fluid_pd_patients.time_to_AF > (time_shift-margin_time)*60*60) & (output_fluid_pd_patients.time_to_AF/60/60 <= (time_shift+hours_to_first_AF-margin_time-1) )]

# Total fluid output per admission inside the window.
output_fluid_pd_patients_total = output_fluid_pd_patients[["admissionid","fluidout"]].groupby(["admissionid"]).sum().reset_index()
output_fluid_pd_patients = None #Save RAM

# ---- Fluid balance ----
# Outer merge keeps admissions that have only intake or only output in the window.
fluid_total = output_fluid_pd_patients_total.merge(input_fluid_pd_patients_total,on="admissionid",how="outer")
# NOTE(review): for such admissions the missing side is NaN, so fluid_balance is
# NaN as well - confirm whether a missing total should count as 0 instead.
fluid_total["fluid_balance"]=fluid_total.fluidin-fluid_total.fluidout

## Full dataset merging

In [None]:
# Build the modelling table: start from the admission cohort and left-join every
# feature table on admissionid, so each cohort admission is kept even when a
# feature table has no row for it.
AF_dataset = (
    AF_admission_dataset
    .merge(numeric_pd_patients_total,how='left',on='admissionid')   # chart-event features
    .merge(lab_items_patients_total,how='left',on='admissionid')    # lab features
    .merge(medication_patients_total,how='left',on='admissionid')   # medication features
    .merge(output_pd_patients_total,how='left',on='admissionid')    # output-event features
    .merge(fluid_total,how='left',on='admissionid')                 # fluid in / out / balance
)

In [None]:
# Weight: mean admission weight, falling back to the mean daily weight where the
# admission weight is missing.
AF_dataset["Weight"] = AF_dataset["mean_Admission Weight (Kg)"].fillna(AF_dataset["mean_Daily Weight"])

# Height: copy of the mean height feature under a short name.
AF_dataset["Height"] = AF_dataset["mean_Height"].to_numpy()

In [None]:
# Write the dataset to "AF_dataset_<horizon>_hours[_no_af_distribution_matched].csv".
# Horizon tag: time_shift for the six-hour model, otherwise "1_5" or "12".
if six_hour_model:
    horizon_tag = str(time_shift)
elif one_half_hour_model:
    horizon_tag = "1_5"
else:
    horizon_tag = "12"

# The distribution-matched (non-biased) variant gets its own file name suffix.
matching_tag = "_no_af_distribution_matched" if non_biased_model else ""

AF_dataset.to_csv(mimic_extracted_path+"AF_dataset_"+horizon_tag+"_hours"+matching_tag+".csv",index=False)