In [None]:
# Reload imported modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import tqdm
import os
import numpy as np
import re
from scipy.stats import linregress

In [None]:
# Common prefix of the "rep312_*" export files; file names are appended to it directly.
data_path = "./Data/312_Validatie_VKF_Predictiemodellen/rep312_"

# Preparatory work

In [None]:
# One-off preparation: split the monitoring export into one CSV per VariableID
# and write a VariableID -> Var_Abbr dictionary next to those files.
# Disabled by default because it rewrites files on disk.
execute_bool = False
if execute_bool:
    vars_dir = data_path + "mondata_validated_vars/"
    ids = []
    names = []
    # Files already (re)created during this run. The first write of a run
    # replaces any file left over from an earlier run; later chunks append.
    # Checking os.path.exists() instead would duplicate all rows on a re-run.
    written_paths = set()
    chunk_reader = pd.read_csv(data_path+"mondata_validated.csv", sep=";", encoding="ISO-8859-1",
                               chunksize=10000000, iterator=True, low_memory=False)
    for chunk in tqdm.tqdm(chunk_reader, ascii=True):
        # sort=False keeps the variables in order of first appearance in the chunk.
        for itemid, item_rows in chunk.groupby("VariableID", sort=False):
            path = vars_dir + str(itemid) + ".csv"

            if path in written_paths:
                item_rows.to_csv(path, mode='a', index=False, header=False)
            else:
                item_rows.to_csv(path, index=False)
                written_paths.add(path)

            ids.append(itemid)
            names.append(item_rows.Var_Abbr.unique()[0])

    # One dictionary row per VariableID (first abbreviation seen wins).
    temp_df = pd.DataFrame({"id": ids, "Var_Abbr": names})
    temp_df = temp_df.drop_duplicates("id")
    temp_df.to_csv(vars_dir + "dictionary.csv", index=False)
            

In [None]:
# Load the VariableID -> Var_Abbr dictionary written by the preparation cell.
dict_df = pd.read_csv("./Data/312_Validatie_VKF_Predictiemodellen/rep312_mondata_validated_vars/dictionary.csv")

# Patient and time-independent variables aggregation

## patients + extra

In [None]:
# Load the admission cohort; a Length of -1 is treated as missing.
cohort_df = pd.read_csv(data_path+"cohort.csv",sep=";", encoding = "ISO-8859-1")
# np.nan instead of the np.NaN alias, which no longer exists in NumPy 2.0.
cohort_df.loc[cohort_df.Length==-1,"Length"] = np.nan
cohort_df

In [None]:
# Per-admission sepsis flag taken from the "sepsis opnam" rows of the extra
# admission-data export.
Extr_OpnameGegevens_df = pd.read_csv(data_path+"Extr_OpnameGegevens.csv",sep=";", encoding = "ISO-8859-1")
# .copy() so the new column is written to an independent frame rather than
# to a filtered slice of the raw export (avoids SettingWithCopyWarning).
Extr_OpnameGegevens_df = Extr_OpnameGegevens_df[Extr_OpnameGegevens_df.variable=="sepsis opnam"].copy()
# 1 where code == 1, otherwise 0 (a missing code also yields 0).
Extr_OpnameGegevens_df["sepsis_bool_extra_opn"] = (Extr_OpnameGegevens_df.code==1).astype("int64")
Extr_OpnameGegevens_df = Extr_OpnameGegevens_df.rename(columns={'mins_since_admission':'sepsis_extra_opn_measuredat'})
Extr_OpnameGegevens_df = Extr_OpnameGegevens_df[['ICUSessionID','sepsis_extra_opn_measuredat', 'sepsis_bool_extra_opn']]
# When a session has several rows, the last one in file order is kept.
Extr_OpnameGegevens_df = Extr_OpnameGegevens_df.drop_duplicates("ICUSessionID",keep="last")

In [None]:
# Base table: one row per cohort admission with the static columns used later
# plus the admission sepsis flag (left join, so every cohort row is kept).
base_columns = [
    'ICUSessionID', 'HospAdmissionID', 'ICUAdmissionSequence', 'AgeOnAdmission',
    'ICUAdmissionTime', 'icu_los', 'icuurg', 'Length', 'Weight', 'bmi', 'bsa',
    'sepsis_extra_opn_measuredat', 'sepsis_bool_extra_opn',
]
AF_dataset_df = pd.merge(cohort_df, Extr_OpnameGegevens_df, on="ICUSessionID", how="left")[base_columns]
# Parse the admission timestamp so that it can be compared and shifted later.
AF_dataset_df["ICUAdmissionTime"] = pd.to_datetime(AF_dataset_df["ICUAdmissionTime"])

AF_dataset_df

## observrec

In [None]:
# Raw observation records (source of the "VKF" annotations used below).
observrec_df = pd.read_csv(f"{data_path}observrec.csv", encoding="ISO-8859-1", sep=";")
observrec_df

In [None]:
# First "VKF" (AF) observation per ICU session.
# Sorting by time first guarantees that keep="first" selects the earliest
# record regardless of the row order in the export; mergesort is stable, so an
# already time-sorted export gives exactly the same rows as before.
af_icusession_id_df = (
    observrec_df[observrec_df.stringvalue=="VKF"]
    .sort_values("mins_since_admission", kind="mergesort")
    .drop_duplicates("icusessionid",keep="first")
)
af_icusession_id_df = af_icusession_id_df.rename(columns={"mins_since_admission":"AF_measuredat","icusessionid":"ICUSessionID"})
af_icusession_id_df["AF"] = 1
af_icusession_id_df = af_icusession_id_df[["ICUSessionID","AF_measuredat","AF"]]

AF_dataset_df = AF_dataset_df.merge(af_icusession_id_df,how="left",on="ICUSessionID")
# Sessions without a VKF record: AF = 0 and AF_measuredat = 0. The 0 is only a
# placeholder; the cohort-selection cell later replaces it with a sampled time.
AF_dataset_df.loc[AF_dataset_df.AF.isna(),"AF"]=0

AF_dataset_df.loc[AF_dataset_df.AF_measuredat.isna(),"AF_measuredat"]=0

AF_dataset_df

## Comorb

In [None]:
# Comorbidity export; missing diagnosis texts become empty strings so that the
# string search in the next cell works on every row.
comorb_df = pd.read_csv(f"{data_path}comorb.csv", sep=";", encoding="ISO-8859-1")
comorb_df["Diagnose"] = comorb_df["Diagnose"].fillna("")
comorb_df

In [None]:
# Rows whose diagnosis text mentions "sepsis" (case-insensitive), and the ICU
# sessions they belong to.
sepsis_rows = comorb_df[comorb_df.Diagnose.str.contains("sepsis",flags=re.IGNORECASE)]
ids_sepsis = sepsis_rows.ICUSessionID.unique()
sepsis_rows.sort_values("ICUSessionID")

In [None]:
# 1 when the session has a sepsis entry in the comorbidity export, else 0.
AF_dataset_df["sepsis_bool_comorb_source"] = AF_dataset_df.ICUSessionID.isin(ids_sepsis).astype("int64")
AF_dataset_df

## apache

In [None]:
# APACHE II export.
apache_2_df = pd.read_csv(f"{data_path}apache2.csv", encoding="ISO-8859-1", sep=";")
apache_2_df

In [None]:
# Attach all APACHE II columns; admissions without a record keep NaN.
AF_dataset_df = pd.merge(AF_dataset_df, apache_2_df, on="ICUSessionID", how="left")
AF_dataset_df

In [None]:
# APACHE IV export.
apache_4_df = pd.read_csv(f"{data_path}apache4.csv", encoding="ISO-8859-1", sep=";")
apache_4_df

In [None]:
# Only the APACHE IV score column is attached to the base table.
ap4_scores = apache_4_df.loc[:, ["ICUSessionID","ap4score"]]
AF_dataset_df = pd.merge(AF_dataset_df, ap4_scores, on="ICUSessionID", how="left")
AF_dataset_df

## Secondary diagnoses

In [None]:
# Secondary diagnoses export (loaded for inspection; not merged in the cells shown here).
SecDiagnoses_df = pd.read_csv(f"{data_path}SecDiagnoses.csv", encoding="ISO-8859-1", sep=";")
SecDiagnoses_df

# Preparing full dataset with preprocessing on time

## Preparing

In [None]:
# Model configuration. All durations below are in minutes.
one_half_hour_model = False
six_hour_model = True
non_biased_model = True  # Match the NO AF measurement point distribution to the AF patients to avoid time-dependent treatment bias

# time_shift: gap between the end of the observation window and the reference time.
# margin_time: extra margin subtracted from the reference time.
# six_hour_model takes precedence over one_half_hour_model when both are set.
if six_hour_model:
    time_shift = 6*60
    margin_time = 0*60
elif one_half_hour_model:
    time_shift = 1.5*60
    margin_time = 1.5*60
else:
    time_shift = 12*60
    margin_time = 0*60

# Length of the observation window and the total stay required before the reference time.
hours_to_first_AF = 12*60
total_window = hours_to_first_AF + time_shift

to_hour_multiplier = 1

In [None]:
# Cohort selection and assignment of a reference time (AF_measuredat) to every
# admission. AF admissions keep their first AF time; non-AF admissions receive
# either the AF time of a matched AF admission (non_biased_model) or a random time.
only_data_of_first_hours = False #Use only data of the first X hours of admission
include_AF_patients_without_AF_at_sample = True #Include moments in time of AF patients before the AF diagnosis in the dataset. (not referenced in this cell)

np.random.seed(42)

#exclusion, minimum 2014
AF_dataset_df = AF_dataset_df[AF_dataset_df.ICUAdmissionTime>pd.to_datetime("2014")]

# Stay must cover the full window (icu_los is compared with minute-based values;
# presumably it is in minutes - TODO confirm).
AF_dataset_df = AF_dataset_df[(AF_dataset_df.icu_los>=total_window)].copy(deep=True)

# Reproducible shuffle, then keep non-AF admissions and AF admissions whose
# first AF does not occur before the end of the window.
AF_dataset_df = AF_dataset_df.sample(len(AF_dataset_df),random_state=42)
# .copy(): the columns written below must go to an independent frame, not to a filtered slice.
AF_dataset_df = AF_dataset_df[(AF_dataset_df.AF==0)|(AF_dataset_df.AF_measuredat>=to_hour_multiplier*total_window)].copy()

if non_biased_model:
    # Pool of AF times that can be handed out; each AF admission is used at most once.
    AF_measuredat_sample_df = AF_dataset_df[(AF_dataset_df.AF==1)&(AF_dataset_df.AF_measuredat>(total_window))][["ICUSessionID","AF_measuredat"]].copy(deep=True)
    # Defaults to the admission's own id; overwritten with the donor id on a match.
    AF_dataset_df["date_corresponds_to_AF_admid"] = AF_dataset_df.ICUSessionID
    for ICUSessionID in AF_dataset_df[AF_dataset_df.AF==0].ICUSessionID.values:
        if len(AF_measuredat_sample_df) == 0:
            break
        is_current = AF_dataset_df.ICUSessionID==ICUSessionID
        icu_los_current = AF_dataset_df[is_current]['icu_los'].values[0]
        # NOTE(review): eligibility is tested against icu_los, the draw against
        # icu_los+0.1 - kept as in the original; confirm the tolerance is intended.
        if len(AF_measuredat_sample_df[AF_measuredat_sample_df.AF_measuredat<=icu_los_current])>0:
            # Seeded per admission, so the draw does not depend on the iteration order.
            random_state_admission = np.random.RandomState(ICUSessionID)
            choice = random_state_admission.choice(AF_measuredat_sample_df[AF_measuredat_sample_df.AF_measuredat<=(icu_los_current+0.1)]['ICUSessionID'].values)
            chosen_rows = AF_measuredat_sample_df[AF_measuredat_sample_df.ICUSessionID==choice]
            AF_dataset_df.loc[is_current,"AF_measuredat"] = chosen_rows.AF_measuredat.values[0]
            AF_dataset_df.loc[is_current,"date_corresponds_to_AF_admid"] = chosen_rows.ICUSessionID.values[0]
            AF_measuredat_sample_df = AF_measuredat_sample_df[AF_measuredat_sample_df.ICUSessionID!=choice]

# Fallback used in both modes: a non-AF admission that still has no reference
# time (NaN or the 0 placeholder) gets a random one drawn by
# np.random.randint(total_window, icu_los + 1).
AF_dataset_df.loc[:,"AF_measuredat"] = AF_dataset_df.apply(lambda row: np.random.randint(total_window,row['icu_los']+1) if (row['AF']==0) and ((pd.isnull(row["AF_measuredat"])|(row["AF_measuredat"]==0))) else row['AF_measuredat'],axis=1).values #the no AF patients should have a timesample to "measure" AF.


AF_dataset_df["AF_orig"]=AF_dataset_df.AF
# .copy(): this frame is modified below when only_data_of_first_hours is set;
# writing to a filtered slice would trigger SettingWithCopyWarning.
AF_admission_dataset = AF_dataset_df[AF_dataset_df.AF_measuredat>=to_hour_multiplier*total_window].copy()

if only_data_of_first_hours:
    AF_admission_dataset["AF_measuredat"] = total_window #use only the first total_window minutes of every admission

# Names left over from an earlier version of this cell; reset to None.
first_AFs_pd_timed = None
temp_pd = None

## labdata

In [None]:
# Lab results, with the time and value columns renamed to the names shared by
# the preprocessing cells.
labdata_df = (
    pd.read_csv(f"{data_path}labdata.csv", encoding="ISO-8859-1", sep=";")
    .rename(columns={"mins_since_icu_adm":"measuredat","VariableValue":"value"})
)
labdata_df

In [None]:
#NUMERIC ITEMS PREPROCESSING, this takes around 2 minutes
# Per admission and lab variable: mean/min/max/kurtosis and the linear slope of
# the values measured inside the observation window before AF_measuredat.
if non_biased_model:
    labdata_df_patients = (labdata_df[labdata_df.ICUSessionID.isin(AF_admission_dataset.ICUSessionID)]).merge(AF_admission_dataset[["ICUSessionID","AF_measuredat","date_corresponds_to_AF_admid"]],how='left',on='ICUSessionID')
else:
    labdata_df_patients = (labdata_df[labdata_df.ICUSessionID.isin(AF_admission_dataset.ICUSessionID)]).merge(AF_admission_dataset[["ICUSessionID","AF_measuredat"]],how='left',on='ICUSessionID')

# Minutes between each measurement and the margin-corrected reference time.
labdata_df_patients["time_to_AF"]=(labdata_df_patients.AF_measuredat.values-margin_time) - labdata_df_patients.measuredat.values #add one margin_time to AF extra
# Keep only measurements inside the observation window.
labdata_df_patients = labdata_df_patients[(labdata_df_patients.time_to_AF > (time_shift-margin_time)) & (labdata_df_patients.time_to_AF <= (time_shift+hours_to_first_AF-margin_time) )]

# Translate VariableID to its abbreviation with one vectorised lookup (first
# Var_Abbr seen per VariableID) instead of one full-column replace() per variable.
abbr_by_variable_id = labdata_df.drop_duplicates("VariableID").set_index("VariableID")["Var_Abbr"]
labdata_df_patients.VariableID = labdata_df_patients.VariableID.map(abbr_by_variable_id)

labdata_df = None #RAM Optimization

# A list (not a set) of aggregations keeps the column order identical between
# runs. pd.Series.kurt is used because every aggregated group is a Series.
labdata_df_patients_agg = labdata_df_patients[["ICUSessionID","VariableID","value"]].groupby(["ICUSessionID","VariableID"]).agg(['mean','min','max',pd.Series.kurt]).reset_index()
labdata_df_patients_agg.VariableID = labdata_df_patients_agg.VariableID.astype(str)
# Flatten ("value", "<stat>") to "<stat>", pivot to one row per admission, then
# flatten again to "<stat>_<variable>".
labdata_df_patients_agg.columns = ['_'.join(col).rstrip('_') for col in labdata_df_patients_agg.columns.values]
labdata_df_patients_agg.columns = [col.replace('value_','') if 'value_' in col else col for col in labdata_df_patients_agg.columns.values]
labdata_df_patients_agg = labdata_df_patients_agg.pivot(index='ICUSessionID', columns='VariableID')
labdata_df_patients_agg.columns = ['_'.join(col).rstrip('_') for col in labdata_df_patients_agg.columns.values]

def _lab_slope(group):
    """Slope of value over measuredat for one group; NaN when linregress rejects the input."""
    try:
        return linregress(group.measuredat, group.value)[0]
    except ValueError:
        # e.g. several measurements that all share one timestamp
        return np.nan

labdata_df_patients_slope = labdata_df_patients[["ICUSessionID","VariableID","measuredat","value"]].groupby(["ICUSessionID","VariableID"]).apply(_lab_slope).reset_index()
# The unnamed result column is called 0; make all labels strings and rename it.
labdata_df_patients_slope.columns = [str(col) for col in labdata_df_patients_slope.columns.values]
labdata_df_patients_slope = labdata_df_patients_slope.rename(columns={'0':"slope"})
labdata_df_patients_slope.VariableID = labdata_df_patients_slope.VariableID.astype(str)
labdata_df_patients_slope = labdata_df_patients_slope.pivot(index='ICUSessionID', columns='VariableID')
labdata_df_patients_slope.columns = ['_'.join(col).rstrip('_') for col in labdata_df_patients_slope.columns.values]
labdata_df_patients_slope = labdata_df_patients_slope.reset_index()

labdata_df_patients_total = labdata_df_patients_agg.merge(labdata_df_patients_slope,how='left',on='ICUSessionID')
labdata_df_patients = None #Save RAM
labdata_df_patients_slope = None
labdata_df_patients_agg = None

## diagnosisprocedures

In [None]:
# Diagnoses and procedures; every row gets value = 1 so that it can be
# aggregated like the numeric sources.
diagnosis_and_procedures_df = (
    pd.read_csv(f"{data_path}diagnosis_and_procedures.csv", encoding="ISO-8859-1", sep=";")
    .rename(columns={"start_mins_since_icu_adm":"start_measuredat",
                     "end_mins_since_icu_adm":"end_measuredat"})
    .assign(value=1)
)
diagnosis_and_procedures_df


In [None]:
#NUMERIC ITEMS PREPROCESSING, this takes around 2 minutes
# Per admission: one column per diagnosis/procedure whose start or end falls
# inside the observation window before AF_measuredat.
if non_biased_model:
    diagnosis_and_procedures_df_patients = (diagnosis_and_procedures_df[diagnosis_and_procedures_df.ICUSessionID.isin(AF_admission_dataset.ICUSessionID)]).merge(AF_admission_dataset[["ICUSessionID","AF_measuredat","date_corresponds_to_AF_admid"]],how='left',on='ICUSessionID')
else:
    diagnosis_and_procedures_df_patients = (diagnosis_and_procedures_df[diagnosis_and_procedures_df.ICUSessionID.isin(AF_admission_dataset.ICUSessionID)]).merge(AF_admission_dataset[["ICUSessionID","AF_measuredat"]],how='left',on='ICUSessionID')

# Minutes between the start / end of each entry and the margin-corrected reference time.
diagnosis_and_procedures_df_patients["start_time_to_AF"]=(diagnosis_and_procedures_df_patients.AF_measuredat.values-margin_time) - diagnosis_and_procedures_df_patients.start_measuredat.values #add one margin_time to AF extra
diagnosis_and_procedures_df_patients["stop_time_to_AF"]=(diagnosis_and_procedures_df_patients.AF_measuredat.values-margin_time) - diagnosis_and_procedures_df_patients.end_measuredat.values #add one margin_time to AF extra

# Keep an entry when its start OR its end lies inside the window.
diagnosis_and_procedures_df_patients = diagnosis_and_procedures_df_patients[((diagnosis_and_procedures_df_patients.start_time_to_AF > (time_shift-margin_time)) & (diagnosis_and_procedures_df_patients.start_time_to_AF <= (time_shift+hours_to_first_AF-margin_time) ))|
                            ((diagnosis_and_procedures_df_patients.stop_time_to_AF > (time_shift-margin_time)) & (diagnosis_and_procedures_df_patients.stop_time_to_AF <= (time_shift+hours_to_first_AF-margin_time) ))]

# Translate VariableID to its abbreviation with one vectorised lookup (first
# Var_Abbr seen per VariableID) instead of one full-column replace() per variable.
abbr_by_variable_id = diagnosis_and_procedures_df.drop_duplicates("VariableID").set_index("VariableID")["Var_Abbr"]
diagnosis_and_procedures_df_patients.VariableID = diagnosis_and_procedures_df_patients.VariableID.map(abbr_by_variable_id)

diagnosis_and_procedures_df = None #RAM Optimization

# Aggregations are passed as a list, the documented list-like form for agg().
diagnosis_and_procedures_df_patients_agg = diagnosis_and_procedures_df_patients[["ICUSessionID","VariableID","value"]].groupby(["ICUSessionID","VariableID"]).agg(['mean']).reset_index()
diagnosis_and_procedures_df_patients_agg.VariableID = diagnosis_and_procedures_df_patients_agg.VariableID.astype(str)
# Flatten ("value", "mean") to "mean", pivot to one row per admission, then
# flatten again to "mean_<variable>".
diagnosis_and_procedures_df_patients_agg.columns = ['_'.join(col).rstrip('_') for col in diagnosis_and_procedures_df_patients_agg.columns.values]
diagnosis_and_procedures_df_patients_agg.columns = [col.replace('value_','') if 'value_' in col else col for col in diagnosis_and_procedures_df_patients_agg.columns.values]
diagnosis_and_procedures_df_patients_agg = diagnosis_and_procedures_df_patients_agg.pivot(index='ICUSessionID', columns='VariableID')
diagnosis_and_procedures_df_patients_agg.columns = ['_'.join(col).rstrip('_') for col in diagnosis_and_procedures_df_patients_agg.columns.values]

diagnosis_and_procedures_df_patients_total = diagnosis_and_procedures_df_patients_agg
diagnosis_and_procedures_df_patients = None #Save RAM
diagnosis_and_procedures_df_patients_slope = None
diagnosis_and_procedures_df_patients_agg = None

## medtreatment

In [None]:
# Medication records; Dose becomes the numeric "value" column.
medtreatment_df = pd.read_csv(data_path+"medtreatment.csv",sep=";", encoding = "ISO-8859-1")
medtreatment_df = medtreatment_df.rename(columns={"start_mins_since_icu_adm":"start_measuredat","Dose":"value",
                                                                          "end_mins_since_icu_adm":"end_measuredat"})
# Drop doses that contain a comma, since they cannot be parsed by to_numeric.
# NOTE(review): these may be decimal commas - confirm that dropping is intended.
# na=False keeps rows with a missing dose (they stay NaN) instead of failing on
# a mask that contains NaN; regex=False because "," is a literal; .copy() so
# the assignment below does not write to a filtered slice.
medtreatment_df = medtreatment_df[~medtreatment_df.value.str.contains(",", regex=False, na=False)].copy()
medtreatment_df["value"] = pd.to_numeric(medtreatment_df["value"])
medtreatment_df

In [None]:
#NUMERIC ITEMS PREPROCESSING, this takes around 2 minutes
# Per admission: mean dose per medication for treatments whose start or end
# falls inside the observation window before AF_measuredat.
if non_biased_model:
    medtreatment_df_patients = (medtreatment_df[medtreatment_df.ICUSessionID.isin(AF_admission_dataset.ICUSessionID)]).merge(AF_admission_dataset[["ICUSessionID","AF_measuredat","date_corresponds_to_AF_admid"]],how='left',on='ICUSessionID')
else:
    medtreatment_df_patients = (medtreatment_df[medtreatment_df.ICUSessionID.isin(AF_admission_dataset.ICUSessionID)]).merge(AF_admission_dataset[["ICUSessionID","AF_measuredat"]],how='left',on='ICUSessionID')

# Minutes between the start / end of each treatment and the margin-corrected reference time.
medtreatment_df_patients["start_time_to_AF"]=(medtreatment_df_patients.AF_measuredat.values-margin_time) - medtreatment_df_patients.start_measuredat.values #add one margin_time to AF extra
medtreatment_df_patients["stop_time_to_AF"]=(medtreatment_df_patients.AF_measuredat.values-margin_time) - medtreatment_df_patients.end_measuredat.values #add one margin_time to AF extra

# Keep a treatment when its start OR its end lies inside the window.
medtreatment_df_patients = medtreatment_df_patients[((medtreatment_df_patients.start_time_to_AF > (time_shift-margin_time)) & (medtreatment_df_patients.start_time_to_AF <= (time_shift+hours_to_first_AF-margin_time) ))|
                            ((medtreatment_df_patients.stop_time_to_AF > (time_shift-margin_time)) & (medtreatment_df_patients.stop_time_to_AF <= (time_shift+hours_to_first_AF-margin_time) ))]

# Translate VariableID to its abbreviation with one vectorised lookup (first
# Var_Abbr seen per VariableID) instead of one full-column replace() per variable.
abbr_by_variable_id = medtreatment_df.drop_duplicates("VariableID").set_index("VariableID")["Var_Abbr"]
medtreatment_df_patients.VariableID = medtreatment_df_patients.VariableID.map(abbr_by_variable_id)

medtreatment_df = None #RAM Optimization

# Aggregations are passed as a list, the documented list-like form for agg().
medtreatment_df_patients_agg = medtreatment_df_patients[["ICUSessionID","VariableID","value"]].groupby(["ICUSessionID","VariableID"]).agg(['mean']).reset_index()
medtreatment_df_patients_agg.VariableID = medtreatment_df_patients_agg.VariableID.astype(str)
# Flatten ("value", "mean") to "mean", pivot to one row per admission, then
# flatten again to "mean_<variable>".
medtreatment_df_patients_agg.columns = ['_'.join(col).rstrip('_') for col in medtreatment_df_patients_agg.columns.values]
medtreatment_df_patients_agg.columns = [col.replace('value_','') if 'value_' in col else col for col in medtreatment_df_patients_agg.columns.values]
medtreatment_df_patients_agg = medtreatment_df_patients_agg.pivot(index='ICUSessionID', columns='VariableID')
medtreatment_df_patients_agg.columns = ['_'.join(col).rstrip('_') for col in medtreatment_df_patients_agg.columns.values]

medtreatment_df_patients_total = medtreatment_df_patients_agg
medtreatment_df_patients = None #Save RAM
medtreatment_df_patients_slope = None
medtreatment_df_patients_agg = None

## izis dervals

In [None]:
# Derived IZIS values: convert the value column to numeric, then rename the
# columns to the names shared by the preprocessing cells.
izisdervals_df = pd.read_csv(f"{data_path}izisdervals.csv", encoding="ISO-8859-1", sep=";")
izisdervals_df["varvalue"] = pd.to_numeric(izisdervals_df["varvalue"])
izisdervals_df = izisdervals_df.rename(columns={
    "icusessionid":"ICUSessionID",
    "mins_since_admission":"measuredat",
    "varvalue":"value",
    "variablename":"Var_Abbr",
})
izisdervals_df

In [None]:
#NUMERIC ITEMS PREPROCESSING, this takes around 2 minutes
# Per admission and derived variable: mean/min/max/kurtosis of the values
# measured inside the observation window before AF_measuredat.
if non_biased_model:
    izisdervals_df_patients = (izisdervals_df[izisdervals_df.ICUSessionID.isin(AF_admission_dataset.ICUSessionID)]).merge(AF_admission_dataset[["ICUSessionID","AF_measuredat","date_corresponds_to_AF_admid"]],how='left',on='ICUSessionID')
else:
    izisdervals_df_patients = (izisdervals_df[izisdervals_df.ICUSessionID.isin(AF_admission_dataset.ICUSessionID)]).merge(AF_admission_dataset[["ICUSessionID","AF_measuredat"]],how='left',on='ICUSessionID')

# Minutes between each measurement and the margin-corrected reference time.
izisdervals_df_patients["time_to_AF"]=(izisdervals_df_patients.AF_measuredat.values-margin_time) - izisdervals_df_patients.measuredat.values #add one margin_time to AF extra
# Keep only measurements inside the observation window.
izisdervals_df_patients = izisdervals_df_patients[(izisdervals_df_patients.time_to_AF > (time_shift-margin_time)) & (izisdervals_df_patients.time_to_AF <= (time_shift+hours_to_first_AF-margin_time) )]

# This source already carries the variable name in Var_Abbr. The former loop
# replaced every name by itself (one full-column pass per variable, and an
# IndexError on a missing name), so it has been removed.

izisdervals_df = None #RAM Optimization

# A list (not a set) of aggregations keeps the column order identical between
# runs. pd.Series.kurt is used because every aggregated group is a Series.
izisdervals_df_patients_agg = izisdervals_df_patients[["ICUSessionID","Var_Abbr","value"]].groupby(["ICUSessionID","Var_Abbr"]).agg(['mean','min','max',pd.Series.kurt]).reset_index()
izisdervals_df_patients_agg.Var_Abbr = izisdervals_df_patients_agg.Var_Abbr.astype(str)
# Flatten ("value", "<stat>") to "<stat>", pivot to one row per admission, then
# flatten again to "<stat>_<variable>".
izisdervals_df_patients_agg.columns = ['_'.join(col).rstrip('_') for col in izisdervals_df_patients_agg.columns.values]
izisdervals_df_patients_agg.columns = [col.replace('value_','') if 'value_' in col else col for col in izisdervals_df_patients_agg.columns.values]
izisdervals_df_patients_agg = izisdervals_df_patients_agg.pivot(index='ICUSessionID', columns='Var_Abbr')
izisdervals_df_patients_agg.columns = ['_'.join(col).rstrip('_') for col in izisdervals_df_patients_agg.columns.values]

izisdervals_df_patients_total = izisdervals_df_patients_agg
izisdervals_df_patients = None #Save RAM
izisdervals_df_patients_slope = None
izisdervals_df_patients_agg = None

## izis monvals

In [None]:
# Monitored IZIS values; varvalue becomes the numeric "value" column.
izismonvals_df = pd.read_csv(data_path+"izismonvals.csv",sep=";", encoding = "ISO-8859-1")
# Drop values that contain a comma, since they cannot be parsed by to_numeric.
# NOTE(review): these may be decimal commas - confirm that dropping is intended.
# na=False keeps rows with a missing value (they stay NaN) instead of failing
# on a mask that contains NaN; regex=False because "," is a literal; .copy() so
# the assignment below does not write to a filtered slice.
izismonvals_df = izismonvals_df[~izismonvals_df.varvalue.str.contains(",", regex=False, na=False)].copy()
izismonvals_df["varvalue"] = pd.to_numeric(izismonvals_df["varvalue"])
izismonvals_df = izismonvals_df.rename(columns={"icusessionid":"ICUSessionID","mins_since_admission":"measuredat","varvalue":"value","variablename":"Var_Abbr"})
izismonvals_df

In [None]:
#NUMERIC ITEMS PREPROCESSING, this takes around 2 minutes
# Per admission and monitored variable: mean/min/max/kurtosis of the values
# measured inside the observation window before AF_measuredat.
if non_biased_model:
    izismonvals_df_patients = (izismonvals_df[izismonvals_df.ICUSessionID.isin(AF_admission_dataset.ICUSessionID)]).merge(AF_admission_dataset[["ICUSessionID","AF_measuredat","date_corresponds_to_AF_admid"]],how='left',on='ICUSessionID')
else:
    izismonvals_df_patients = (izismonvals_df[izismonvals_df.ICUSessionID.isin(AF_admission_dataset.ICUSessionID)]).merge(AF_admission_dataset[["ICUSessionID","AF_measuredat"]],how='left',on='ICUSessionID')

# Minutes between each measurement and the margin-corrected reference time.
izismonvals_df_patients["time_to_AF"]=(izismonvals_df_patients.AF_measuredat.values-margin_time) - izismonvals_df_patients.measuredat.values #add one margin_time to AF extra
# Keep only measurements inside the observation window.
izismonvals_df_patients = izismonvals_df_patients[(izismonvals_df_patients.time_to_AF > (time_shift-margin_time)) & (izismonvals_df_patients.time_to_AF <= (time_shift+hours_to_first_AF-margin_time) )]

# This source already carries the variable name in Var_Abbr. The former loop
# replaced every name by itself (one full-column pass per variable, and an
# IndexError on a missing name), so it has been removed.

izismonvals_df = None #RAM Optimization

# A list (not a set) of aggregations keeps the column order identical between
# runs. pd.Series.kurt is used because every aggregated group is a Series.
izismonvals_df_patients_agg = izismonvals_df_patients[["ICUSessionID","Var_Abbr","value"]].groupby(["ICUSessionID","Var_Abbr"]).agg(['mean','min','max',pd.Series.kurt]).reset_index()
izismonvals_df_patients_agg.Var_Abbr = izismonvals_df_patients_agg.Var_Abbr.astype(str)
# Flatten ("value", "<stat>") to "<stat>", pivot to one row per admission, then
# flatten again to "<stat>_<variable>".
izismonvals_df_patients_agg.columns = ['_'.join(col).rstrip('_') for col in izismonvals_df_patients_agg.columns.values]
izismonvals_df_patients_agg.columns = [col.replace('value_','') if 'value_' in col else col for col in izismonvals_df_patients_agg.columns.values]
izismonvals_df_patients_agg = izismonvals_df_patients_agg.pivot(index='ICUSessionID', columns='Var_Abbr')
izismonvals_df_patients_agg.columns = ['_'.join(col).rstrip('_') for col in izismonvals_df_patients_agg.columns.values]

izismonvals_df_patients_total = izismonvals_df_patients_agg
izismonvals_df_patients = None #Save RAM
izismonvals_df_patients_slope = None
izismonvals_df_patients_agg = None

## mondata val

In [None]:
# Validated monitoring data, with the time and value columns renamed to the
# names shared by the preprocessing cells.
mondata_validated_df = (
    pd.read_csv(f"{data_path}mondata_validated.csv", encoding="ISO-8859-1", sep=";")
    .rename(columns={"mins_since_icu_adm":"measuredat","VariableValue":"value"})
)

In [None]:
# List the distinct variable abbreviations present in the monitoring data.
mondata_validated_df.Var_Abbr.unique()

In [None]:
#NUMERIC ITEMS PREPROCESSING
# Per admission and monitoring variable: mean/min/max/kurtosis of the values
# measured inside the observation window before AF_measuredat.
if non_biased_model:
    mondata_validated_df_patients = (mondata_validated_df[mondata_validated_df.ICUSessionID.isin(AF_admission_dataset.ICUSessionID)]).merge(AF_admission_dataset[["ICUSessionID","AF_measuredat","date_corresponds_to_AF_admid"]],how='left',on='ICUSessionID')
else:
    mondata_validated_df_patients = (mondata_validated_df[mondata_validated_df.ICUSessionID.isin(AF_admission_dataset.ICUSessionID)]).merge(AF_admission_dataset[["ICUSessionID","AF_measuredat"]],how='left',on='ICUSessionID')

# Minutes between each measurement and the margin-corrected reference time.
mondata_validated_df_patients["time_to_AF"]=(mondata_validated_df_patients.AF_measuredat.values-margin_time) - mondata_validated_df_patients.measuredat.values #add one margin_time to AF extra
# Keep only measurements inside the observation window.
mondata_validated_df_patients = mondata_validated_df_patients[(mondata_validated_df_patients.time_to_AF > (time_shift-margin_time)) & (mondata_validated_df_patients.time_to_AF <= (time_shift+hours_to_first_AF-margin_time) )]

# Translate VariableID to its abbreviation with one vectorised lookup (first
# Var_Abbr seen per VariableID) instead of one full-column replace() per variable.
abbr_by_variable_id = mondata_validated_df.drop_duplicates("VariableID").set_index("VariableID")["Var_Abbr"]
mondata_validated_df_patients.VariableID = mondata_validated_df_patients.VariableID.map(abbr_by_variable_id)

mondata_validated_df = None #RAM Optimization

# A list (not a set) of aggregations keeps the column order identical between
# runs. pd.Series.kurt is used because every aggregated group is a Series.
mondata_validated_df_patients_agg = mondata_validated_df_patients[["ICUSessionID","VariableID","value"]].groupby(["ICUSessionID","VariableID"]).agg(['mean','min','max',pd.Series.kurt]).reset_index()
mondata_validated_df_patients_agg.VariableID = mondata_validated_df_patients_agg.VariableID.astype(str)
# Flatten ("value", "<stat>") to "<stat>", pivot to one row per admission, then
# flatten again to "<stat>_<variable>".
mondata_validated_df_patients_agg.columns = ['_'.join(col).rstrip('_') for col in mondata_validated_df_patients_agg.columns.values]
mondata_validated_df_patients_agg.columns = [col.replace('value_','') if 'value_' in col else col for col in mondata_validated_df_patients_agg.columns.values]
mondata_validated_df_patients_agg = mondata_validated_df_patients_agg.pivot(index='ICUSessionID', columns='VariableID')
mondata_validated_df_patients_agg.columns = ['_'.join(col).rstrip('_') for col in mondata_validated_df_patients_agg.columns.values]

def linreg_except(x,value,measuredat):
    """Return the linear-regression slope of ``x[value]`` over ``x[measuredat]``.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame group)
        Object indexable by the two column names.
    value : str
        Name of the dependent (y) column.
    measuredat : str
        Name of the independent (x, time) column.

    Returns
    -------
    float
        The slope reported by ``scipy.stats.linregress``, or ``np.nan`` when
        the regression raises (e.g. all time stamps in the group are equal).
    """
    try:
        return linregress(x[measuredat],x[value])[0]
    except Exception:
        # Best effort per group. Exception (not a bare except) so that
        # KeyboardInterrupt / SystemExit still stop a long-running apply().
        return np.nan

# Linear slope of value over time per admission and variable (NaN where the
# regression cannot be computed).
mondata_validated_df_patients_slope = (
    mondata_validated_df_patients[["ICUSessionID","VariableID","measuredat","value"]]
    .groupby(["ICUSessionID","VariableID"])
    .apply(lambda grp: linreg_except(grp,"value","measuredat"))
    .reset_index()
)
# The unnamed result column is called 0; make all labels strings and rename it.
mondata_validated_df_patients_slope.columns = list(map(str, mondata_validated_df_patients_slope.columns.values))
mondata_validated_df_patients_slope = mondata_validated_df_patients_slope.rename(columns={'0':"slope"})
mondata_validated_df_patients_slope["VariableID"] = mondata_validated_df_patients_slope["VariableID"].astype(str)
# One row per admission, one "slope_<variable>" column per variable.
mondata_validated_df_patients_slope = mondata_validated_df_patients_slope.pivot(index='ICUSessionID', columns='VariableID')
mondata_validated_df_patients_slope.columns = ['_'.join(level_names).rstrip('_') for level_names in mondata_validated_df_patients_slope.columns.values]
mondata_validated_df_patients_slope = mondata_validated_df_patients_slope.reset_index()

# Aggregates and slopes side by side; intermediates are released afterwards.
mondata_validated_df_patients_total = mondata_validated_df_patients_agg.merge(mondata_validated_df_patients_slope,on='ICUSessionID',how='left')
mondata_validated_df_patients = None #Save RAM
mondata_validated_df_patients_slope = None
mondata_validated_df_patients_agg = None

## scores

In [None]:
# Recalculated SOFA components; missing entries count as 0 and the total score
# is the sum of the six organ-system components.
sofa_df = (
    pd.read_csv(f"{data_path}sofa_recalc.csv", encoding="ISO-8859-1", sep=";")
    .fillna(0)
    .rename(columns={"icusessionid":"ICUSessionID"})
)
sofa_df["sofa"] = (
    sofa_df["resp"] + sofa_df["coag"] + sofa_df["liver"]
    + sofa_df["cardio"] + sofa_df["cns"] + sofa_df["renal"]
)
sofa_df

In [None]:
#NUMERIC ITEMS PREPROCESSING
# SOFA score of the admission day on which the reference time (AF_measuredat) falls.
if non_biased_model:
    sofa_df_patients = (sofa_df[sofa_df.ICUSessionID.isin(AF_admission_dataset.ICUSessionID)]).merge(AF_admission_dataset[["ICUSessionID","AF_measuredat","date_corresponds_to_AF_admid","ICUAdmissionTime"]],how='left',on='ICUSessionID')
else:
    sofa_df_patients = (sofa_df[sofa_df.ICUSessionID.isin(AF_admission_dataset.ICUSessionID)]).merge(AF_admission_dataset[["ICUSessionID","AF_measuredat","ICUAdmissionTime"]],how='left',on='ICUSessionID')

# Calendar-day difference between the reference time and the admission.
# normalize() keeps both sides datetime64, so the subtraction yields a proper
# timedelta column (going through .dt.date would produce object-dtype values).
af_timestamp = sofa_df_patients.ICUAdmissionTime+pd.to_timedelta(sofa_df_patients.AF_measuredat,"m")
sofa_df_patients["AF_admissionday"] = (af_timestamp.dt.normalize() - sofa_df_patients.ICUAdmissionTime.dt.normalize()).dt.days
# NOTE(review): assumes AdmissionDay counts calendar days from 0 on the admission date - confirm.
sofa_df_total = sofa_df_patients[sofa_df_patients.AF_admissionday == sofa_df_patients.AdmissionDay][["ICUSessionID","sofa"]]

## Merging them all

In [None]:
# Left-join every feature table onto the admission table, in a fixed order,
# so that every selected admission keeps exactly its own row(s).
AF_full_dataset = AF_admission_dataset
for feature_table in (
    sofa_df_total,
    labdata_df_patients_total,
    mondata_validated_df_patients_total,
    izismonvals_df_patients_total,
    izisdervals_df_patients_total,
    medtreatment_df_patients_total,
    diagnosis_and_procedures_df_patients_total,
):
    AF_full_dataset = AF_full_dataset.merge(feature_table, on='ICUSessionID', how='left')

In [None]:
# Display the merged feature table for a visual check.
AF_full_dataset

In [None]:
# Write the final dataset; the file name encodes the prediction horizon and
# whether the non-AF time points were matched to the AF distribution.
data_save_path = "./Data/312_Validatie_VKF_Predictiemodellen/"

# six_hour_model takes precedence over one_half_hour_model, as in the configuration cell.
if six_hour_model:
    horizon_label = str(int(time_shift/60))
elif one_half_hour_model:
    horizon_label = "1_5"
else:
    horizon_label = "12"

matched_suffix = "_no_af_distribution_matched" if non_biased_model else ""

AF_full_dataset.to_csv(data_save_path+"AF_dataset_"+horizon_label+"_hours"+matched_suffix+".csv",index=False)