In [18]:
# Imports
import matplotlib.pyplot as plt
from tqdm import tqdm
plt.rcParams["axes.grid"] = False #disable ugly white lines which are present in google colab for matplotlib
import numpy as np
import datetime
today = datetime.datetime.today() #To work with datetime values. Only relative time matters in this project, so selecting a random date is not a problem. 
from sklearn.metrics import classification_report,auc,r2_score,matthews_corrcoef
import shap
from catboost import CatBoostClassifier,CatBoostRegressor,Pool
from catboost.utils import get_roc_curve
import pandas as pd
import re
import os
from scipy.stats import linregress,ttest_ind,ranksums
mimic_extracted_path = "../data/mimic/"
amsterdam_data_path = "../data/amsterdam/"
mimic_base_path = "../../../MIMIC_IV/mimic-iv-1.0/"
from tabulate import tabulate

In [8]:
pd.set_option('display.max_columns', None)

In [9]:
present = False
if present:
    params = {"ytick.color" : "w",
            "xtick.color" : "w",
            "axes.titlecolor" : "w",
            "axes.labelcolor" : "w",
            "axes.edgecolor" : "w"}
    plt.rcParams.update(params)
    plt.rcParams.update({'font.size': 18})
else:
    params = {"ytick.color" : "black",
            "xtick.color" : "black",
            "axes.titlecolor" : "black",
            "axes.labelcolor" : "black",
            "axes.edgecolor" : "black"}
    plt.rcParams.update(params)

# External Validation

In [10]:
def pandas_result_AF_maker(patient_group_string,X,y,model,append_df):
    (fpr, tpr, thresholds) = get_roc_curve(model, Pool(data=X,label=y), plot=False)
    res_dict = classification_report(y,model.predict(X),target_names=["No-AF","AF"],output_dict=True)

    append_df = pd.concat([append_df,pd.DataFrame(data=[[model_string,validation_type_string,patient_group_string,
                    res_dict["No-AF"]["support"],res_dict["AF"]["support"],
                    res_dict["No-AF"]["recall"],res_dict["AF"]["recall"],
                    res_dict["No-AF"]["precision"],res_dict["AF"]["precision"],
                    res_dict["No-AF"]["f1-score"],res_dict["AF"]["f1-score"],
                    matthews_corrcoef(y,model.predict(X)),auc(fpr,tpr)
                   ]],columns=mimic_result_dataframe_columns)])
    
    return append_df

mimic_result_dataframe_columns=["model","validation_type","patient_group","NO AF patients","AF patients","NO AF recall","AF recall","NO AF precision","AF precision","NO AF f1","AF f1","mcc","auc"]
# mimic_result_dataframe = pd.DataFrame(columns=mimic_result_dataframe_columns)

mimic_p_value_result_dataframe_columns=["model","comparison","AF_group","p_value"]
# mimic_p_value_result_dataframe = pd.DataFrame(columns=mimic_p_value_result_dataframe_columns)

# missing_dataframe = pd.DataFrame(columns=["model","feature","train amst","balanced test amst","all test amst","train mimic","balanced test mimic","all test mimic"])

In [11]:
#mimic_result_dataframe.reset_index(drop=True).to_csv("../results/mimic_af_prediction_results.csv")

In [12]:
#False
#True

one_half_hour_model = True
six_hour_model = False

model_string = ("1.5"*int(one_half_hour_model)+"12"*int(1-one_half_hour_model))*int(1-six_hour_model)+"6"*int(six_hour_model)
print(model_string)

1.5


## Transfer validation

In [13]:
validation_type_string = "transfer"

### Data import

In [40]:
if one_half_hour_model:
    AF_dataset = pd.read_csv(mimic_extracted_path+"AF_dataset_1_5_hours.csv")
else:

    AF_dataset = pd.read_csv(mimic_extracted_path+"AF_dataset_12_hours.csv")

if six_hour_model:
    AF_dataset = pd.read_csv(mimic_extracted_path+"AF_dataset_6_hours.csv")

        
df_patients = pd.read_csv(mimic_extracted_path+"patients.csv",sep=',')
df_patients = df_patients.rename(columns={"subject_id":"patientid","anchor_age":"Age"})

AF_dataset = AF_dataset.merge(df_patients[['patientid','Age']],how='left',on='patientid')


In [41]:
admissions_pd_mimic = pd.read_csv(mimic_extracted_path+"df_adm_icu.csv",sep=',')
admissions_pd_mimic["urgency"]=0
admissions_pd_mimic.loc[admissions_pd_mimic.admission_type.isin(['DIRECT EMER.', 'EW EMER.', 'URGENT']),"urgency"]=1
AF_dataset = AF_dataset.merge(admissions_pd_mimic[["hadm_id","ethnicity","urgency"]],how='left',on="hadm_id")

In [140]:
AF_dataset["is_given_NORepinefrine"]=0
AF_dataset.loc[AF_dataset['mean_NORepinephrine']>0,"is_given_NORepinefrine"]=1

AF_dataset["cardiac_surg_bool"]=0
AF_dataset.loc[(AF_dataset["cardiac_bool"]==1)&(AF_dataset["surgery_bool"]==1),"cardiac_surg_bool"] = 1

AF_dataset["is_given_Calcium Glubionaat (Calcium Sandoz)"]=0
if "mean_Calcium Carbonate Suspension" in AF_dataset.columns:
    AF_dataset.loc[(AF_dataset["mean_Calcium Carbonate"]>1)|(AF_dataset["mean_Calcium Gluconate"]>1)|(("mean_Calcium Carbonate Suspension" in AF_dataset.columns)&(AF_dataset["mean_Calcium Carbonate Suspension"]>1))|(AF_dataset["mean_Calcium Acetate"]>1),"is_given_Calcium Glubionaat (Calcium Sandoz)"] = 1
else:
    AF_dataset.loc[(AF_dataset["mean_Calcium Carbonate"]>1)|(AF_dataset["mean_Calcium Gluconate"]>1)|(AF_dataset["mean_Calcium Acetate"]>1),"is_given_Calcium Glubionaat (Calcium Sandoz)"] = 1
    
AF_dataset["is_given_Dopamine (Inotropin)"]=0
AF_dataset.loc[AF_dataset['mean_DOPamine']>0,"is_given_Dopamine (Inotropin)"]=1

AF_dataset['is_given_Magnesiumsulfaat (MgSO4)']=0
AF_dataset.loc[AF_dataset['mean_Magnesium Sulfate']>0,'is_given_Magnesiumsulfaat (MgSO4)']=1

AF_dataset['is_given_Propofol (Diprivan)']=0
AF_dataset.loc[AF_dataset['mean_Propofol']>0,'is_given_Propofol (Diprivan)']=1

AF_dataset['is_given_Fentanyl']=0
AF_dataset.loc[AF_dataset['mean_Fentanyl Citrate']>0,'is_given_Fentanyl']=1

AF_dataset["is_given_Furosemide (Lasix)"]=0
AF_dataset.loc[AF_dataset['mean_Furosemide']>0,"is_given_Furosemide (Lasix)"]=1

AF_dataset["is_given_LoopDiuretics"]=0
AF_dataset.loc[(AF_dataset['mean_Bumetanide']>0)|(AF_dataset['mean_Furosemide']>0),"is_given_LoopDiuretics"]=1


In [141]:
#Rename for distr change 12 hour model
AF_dataset = AF_dataset.rename(columns={'mean_PEEP set':'mean_PEEP (Set)', 'is_given_NORepinefrine':'is_given_Noradrenaline (Norepinefrine)', 'mean_Foley':'mean_UrineCAD', 'max_Foley':'max_UrineCAD', 
                            'min_Foley':'min_UrineCAD', 'slope_Foley':'slope_UrineCAD', 'max_Central Venous Pressure':'max_CVD','mean_Phosphate':'mean_Fosfaat (bloed)', 'max_Phosphate':'max_Fosfaat (bloed)', 
                            'mean_Lactate':'mean_Lactaat (bloed)','max_Lactate':'max_Lactaat (bloed)','min_Lactate':'min_Lactaat (bloed)',
                            'mean_pH':'mean_pH (bloed)','max_pH':'max_pH (bloed)','min_pH':'min_pH (bloed)',
                            'mean_Arterial Base Excess':'mean_B.E. (bloed)', 'mean_Arterial Blood Pressure systolic':'mean_ABP systolisch',
                            'min_Arterial Base Excess':'min_B.E. (bloed)', 'min_Arterial Blood Pressure systolic':'min_ABP systolisch',
                            'max_Arterial Base Excess':'max_B.E. (bloed)', 'max_Arterial Blood Pressure systolic':'max_ABP systolisch',
                            'slope_Arterial Base Excess':'slope_B.E. (bloed)', 'slope_Arterial Blood Pressure systolic':'slope_ABP systolisch',
                            'mean_Inspired O2 Fraction':'mean_O2 concentratie (Set)','min_Inspired O2 Fraction':'min_O2 concentratie (Set)','max_Inspired O2 Fraction':'max_O2 concentratie (Set)',
                            'mean_Oxygen Saturation':'mean_O2-Saturatie (bloed)','min_Oxygen Saturation':'min_O2-Saturatie (bloed)','max_Oxygen Saturation':'max_O2-Saturatie (bloed)',
                            'min_Propofol':'min_Propofol (Diprivan)', 'min_Fentanyl Citrate':'min_Fentanyl', 'mean_O2 Flow':'mean_O2 l/min', 'cardiac_bool_new':'cardio_surgery_new',
                            'mean_Phosphate':'mean_Fosfaat (bloed)', 'kurt_Central Venous Pressure':'kurt_CVD', 'max_Heart Rate':'max_Hartfrequentie', "max_Platelet Count":"max_Thrombo's (bloed)"})

#Rename for distr change 6 hour model
AF_dataset = AF_dataset.rename(columns={'min_PEEP set':'min_PEEP (Set)','max_PEEP set':'max_PEEP (Set)', 'is_given_NORepinefrine':'is_given_Noradrenaline (Norepinefrine)', 'max_O2 Flow':'max_O2 l/min', 'slope_O2 Flow':'slope_O2 l/min', 
                                                    'cardiac_bool_new':'cardio_surgery_new','min_Phosphate':'min_Fosfaat (bloed)', 
                                                    'min_Arterial Blood Pressure mean': 'min_ABP gemiddeld','max_Arterial Blood Pressure mean': 'max_ABP gemiddeld',
                                                    'slope_Arterial Blood Pressure mean': 'slope_ABP gemiddeld',
                                                    'slope_Bicarbonate':'slope_Act.HCO3 (bloed)','max_Bicarbonate':'max_Act.HCO3 (bloed)','min_Bicarbonate':'min_Act.HCO3 (bloed)',
                                                    'mean_Bicarbonate':'mean_Act.HCO3 (bloed)',
                                                    'mean_Arterial Blood Pressure mean': 'mean_ABP gemiddeld',"slope_Platelet Count":"slope_Thrombo's (bloed)",
                                                    'mean_Central Venous Pressure':'mean_CVD', 'min_Heart Rate':'min_Hartfrequentie', "min_Platelet Count":"min_Thrombo's (bloed)"})

#Rename for distr change 1.5 hour model
AF_dataset = AF_dataset.rename(columns={ 'max_O2 Flow':'max_O2 l/min','min_Phosphate':'min_Fosfaat (bloed)','max_Urea Nitrogen':'max_Ureum (bloed)',
                                            'mean_Urea Nitrogen':'mean_Ureum (bloed)','min_Urea Nitrogen':'min_Ureum (bloed)', 'slope_Heart Rate':'slope_Hartfrequentie','mean_Heart Rate':'mean_Hartfrequentie',
                                                    'mean_Central Venous Pressure':'mean_CVD'})

#Rename for standard 12 hour model
AF_dataset = AF_dataset.rename(columns={"mean_Hemoglobin":"mean_Hb (bloed)","slope_pO2":'slope_PO2 (bloed)',"mean_pO2":'mean_PO2 (bloed)',"max_pO2":'max_PO2 (bloed)',
                                        'mean_pCO2':'mean_pCO2 (bloed)','slope_pCO2':'slope_pCO2 (bloed)','min_pCO2':'min_pCO2 (bloed)','max_pCO2':'max_pCO2 (bloed)',
                                        "min_pO2":'min_PO2 (bloed)'})

#Rename for standard 1.5 hour model
AF_dataset = AF_dataset.rename(columns={ 'min_O2 Flow':'min_O2 l/min', "mean_Platelet Count":"mean_Thrombo's (bloed)","mean_PTT":"mean_APTT  (bloed)",
                                        'min_Central Venous Pressure':'min_CVD',"min_C-Reactive Protein":"min_CRP (bloed)"})
                                        

In [142]:
# AF_dataset[AF_dataset.AF==1].AF_measuredat.values/1000/60/60

In [143]:
AF_temp_db = AF_dataset[(AF_dataset.AF==0)&(AF_dataset.AF_orig==0)&(AF_dataset.admissionid!=AF_dataset.date_corresponds_to_AF_admid)]

AF_NOW = AF_temp_db.append(AF_dataset[AF_dataset.AF==1]).reset_index(drop=True)

In [144]:
from sklearn.model_selection import train_test_split

train_patientid,test_patientid = train_test_split(AF_NOW.patientid.unique(), test_size=0.2, random_state=42)#,stratify=AF_NOW["AF"])

train_AF_dataset = AF_NOW[AF_NOW.patientid.isin(train_patientid)]
test_AF_dataset = AF_NOW[AF_NOW.patientid.isin(test_patientid)]

In [145]:
X_train_or = train_AF_dataset#[train_AF_dataset.columns.drop(drop_columns)]
X_test_or = test_AF_dataset#[test_AF_dataset.columns.drop(drop_columns)]
y_train = train_AF_dataset.AF
y_test = test_AF_dataset.AF

In [146]:
X_train_or = train_AF_dataset[(train_AF_dataset.AF==1)|((train_AF_dataset.AF_orig==0)&(train_AF_dataset.AF==0))]#[train_AF_dataset.columns.drop(drop_columns)]
X_test_or = test_AF_dataset[(test_AF_dataset.AF==1)|((test_AF_dataset.AF_orig==0)&(test_AF_dataset.AF==0))]#[test_AF_dataset.columns.drop(drop_columns)]
y_train = train_AF_dataset[(train_AF_dataset.AF==1)|((train_AF_dataset.AF_orig==0)&(train_AF_dataset.AF==0))].AF
y_test = test_AF_dataset[(test_AF_dataset.AF==1)|((test_AF_dataset.AF_orig==0)&(test_AF_dataset.AF==0))].AF

### Testing

In [147]:
if one_half_hour_model:
    feature_columns = ['Age',
        'mean_Lactaat (bloed)',
        'slope_Hartfrequentie',
        'mean_UrineCAD',
        'mean_O2-Saturatie (bloed)',
        'max_PO2 (bloed)',
        'mean_O2 concentratie (Set)',
        'urgency',
        "min_Thrombo's (bloed)",
        'is_given_Noradrenaline (Norepinefrine)',
        'min_Ureum (bloed)',
        'is_given_Furosemide (Lasix)',
        'min_Act.HCO3 (bloed)',
        'Weight',
        'mean_PEEP (Set)',
        'fluid_balance',
        'mean_CVD',
        'slope_ABP systolisch',
        'min_pH (bloed)',
        'slope_ABP gemiddeld',
        ]

elif six_hour_model:
    feature_columns = ['Age',
        'slope_Hartfrequentie',
        'is_given_Noradrenaline (Norepinefrine)',
        'mean_Lactaat (bloed)',
        'min_UrineCAD',
        'mean_O2 concentratie (Set)',
        'max_PO2 (bloed)',
        'min_CVD',
        'is_given_Furosemide (Lasix)',
        'fluid_balance',
        'min_ABP gemiddeld',
        'mean_PEEP (Set)',
        'mean_Act.HCO3 (bloed)',
        'min_ABP systolisch',
        'urgency',
        'slope_B.E. (bloed)']

else:
    feature_columns = ['Age',
        'slope_Hartfrequentie',
        'mean_O2 concentratie (Set)',
        'mean_UrineCAD',
        'max_PO2 (bloed)',
        'max_CVD',
        'mean_Lactaat (bloed)',
        'min_pH (bloed)',
        'mean_O2-Saturatie (bloed)',
        'urgency',
        'mean_PEEP (Set)']

X_train=X_train_or[feature_columns]    
X_test=X_test_or[feature_columns]   

In [148]:
AF_class_balance = [y_train.sum()/len(y_train),1-y_train.sum()/len(y_train)]

if one_half_hour_model:
    CB_AF = CatBoostClassifier(
        verbose=100, iterations=300, l2_leaf_reg=6, class_weights=AF_class_balance)
elif six_hour_model:
    CB_AF = CatBoostClassifier(
        verbose=100, iterations=400, depth=5, l2_leaf_reg=2, class_weights=AF_class_balance)
else:
    CB_AF = CatBoostClassifier(
        verbose=100, iterations=300, depth=4, l2_leaf_reg=2, class_weights=AF_class_balance)

CB_AF.fit(X_train,y_train)

0:	learn: 0.6873364	total: 2.68ms	remaining: 1.07s
100:	learn: 0.5635846	total: 297ms	remaining: 878ms
200:	learn: 0.5435286	total: 576ms	remaining: 570ms
300:	learn: 0.5309451	total: 856ms	remaining: 281ms
399:	learn: 0.5190707	total: 1.13s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x16e6ec206a0>

In [149]:
print("BALANCED TRAIN PATIENTS")
print(classification_report(y_train,CB_AF.predict(X_train),target_names=["No-AF","AF"]))
print("\n")
(fpr, tpr, thresholds) = get_roc_curve(CB_AF, Pool(data=X_train,label=y_train), plot=False)
print("AUC = "+str(np.round(auc(fpr,tpr),4)))
print("MCC = "+str(np.round(matthews_corrcoef(y_train,CB_AF.predict(X_train)),4)))
print(50*"=")

BALANCED TRAIN PATIENTS
              precision    recall  f1-score   support

       No-AF       0.74      0.72      0.73      4796
          AF       0.74      0.76      0.75      4996

    accuracy                           0.74      9792
   macro avg       0.74      0.74      0.74      9792
weighted avg       0.74      0.74      0.74      9792



AUC = 0.824
MCC = 0.4752


In [150]:
print("BALANCED TEST PATIENTS")
print(classification_report(y_test,CB_AF.predict(X_test),target_names=["No-AF","AF"]))
print("\n")
(fpr, tpr, thresholds) = get_roc_curve(CB_AF, Pool(data=X_test,label=y_test), plot=False)
print("AUC = "+str(np.round(auc(fpr,tpr),4)))
print("MCC = "+str(np.round(matthews_corrcoef(y_test,CB_AF.predict(X_test)),4)))
print(50*"=")

BALANCED TEST PATIENTS
              precision    recall  f1-score   support

       No-AF       0.67      0.67      0.67      1217
          AF       0.68      0.68      0.68      1253

    accuracy                           0.68      2470
   macro avg       0.68      0.68      0.68      2470
weighted avg       0.68      0.68      0.68      2470



AUC = 0.7529
MCC = 0.3513


In [151]:
transfer_y_test = y_test
transfer_predict = CB_AF.predict(X_test)

In [152]:
temp_large_db = AF_dataset[((AF_dataset.AF==0)&(~AF_dataset.patientid.isin(AF_NOW.patientid)))|(AF_dataset.patientid.isin(test_AF_dataset.patientid))]

y_test_all_pat = temp_large_db[(temp_large_db.AF==1)|((temp_large_db.AF_orig==0)&(temp_large_db.AF==0))].AF
X_test_all_pat = temp_large_db[(temp_large_db.AF==1)|((temp_large_db.AF_orig==0)&(temp_large_db.AF==0))][feature_columns]


print("ALL PATIENTS")
print(classification_report(y_test_all_pat,CB_AF.predict(X_test_all_pat),target_names=["No-AF","AF"]))
print("\n")
(fpr, tpr, thresholds) = get_roc_curve(CB_AF, Pool(data=X_test_all_pat,label=y_test_all_pat), plot=False)
print("AUC = "+str(np.round(auc(fpr,tpr),4)))
print("MCC = "+str(np.round(matthews_corrcoef(y_test_all_pat,CB_AF.predict(X_test_all_pat)),4)))
print(50*"=")

mimic_result_dataframe = pandas_result_AF_maker("all patients",X_test_all_pat,y_test_all_pat,CB_AF,mimic_result_dataframe)

print("BALANCED TEST PATIENTS")
print(classification_report(y_test,CB_AF.predict(X_test),target_names=["No-AF","AF"]))
print("\n")
(fpr, tpr, thresholds) = get_roc_curve(CB_AF, Pool(data=X_test,label=y_test), plot=False)
print("AUC = "+str(np.round(auc(fpr,tpr),4)))
print("MCC = "+str(np.round(matthews_corrcoef(y_test,CB_AF.predict(X_test)),4)))
print(50*"=")

mimic_result_dataframe = pandas_result_AF_maker("balanced test",X_test,y_test,CB_AF,mimic_result_dataframe)

ALL PATIENTS
              precision    recall  f1-score   support

       No-AF       0.99      0.65      0.79     53203
          AF       0.04      0.68      0.08      1253

    accuracy                           0.65     54456
   macro avg       0.52      0.67      0.43     54456
weighted avg       0.97      0.65      0.77     54456



AUC = 0.7426
MCC = 0.104
BALANCED TEST PATIENTS
              precision    recall  f1-score   support

       No-AF       0.67      0.67      0.67      1217
          AF       0.68      0.68      0.68      1253

    accuracy                           0.68      2470
   macro avg       0.68      0.68      0.68      2470
weighted avg       0.68      0.68      0.68      2470



AUC = 0.7529
MCC = 0.3513


## Combo validation 

In [153]:
validation_type_string = "combo"

### Data Import

In [154]:
if one_half_hour_model:
        AF_dataset_mimic = pd.read_csv(mimic_extracted_path+"AF_dataset_1_5_hours.csv") 
        AF_dataset = pd.read_csv(amsterdam_data_path+"AF_dataset_1_5_hours.csv") 
elif six_hour_model:
        AF_dataset_mimic = pd.read_csv(mimic_extracted_path+"AF_dataset_6_hours.csv") 
        AF_dataset = pd.read_csv(amsterdam_data_path+"AF_dataset_6_hours.csv") 
else:
        AF_dataset_mimic = pd.read_csv(mimic_extracted_path+"AF_dataset_12_hours.csv") 
        AF_dataset = pd.read_csv(amsterdam_data_path+"AF_dataset_12_hours.csv") 

        
df_patients = pd.read_csv(mimic_extracted_path+"patients.csv",sep=',')
df_patients = df_patients.rename(columns={"subject_id":"patientid","anchor_age":"Age"})

AF_dataset_mimic = AF_dataset_mimic.merge(df_patients[['patientid','Age']],how='left',on='patientid')

admissions_pd_mimic = pd.read_csv(mimic_extracted_path+"df_adm_icu.csv",sep=',')
admissions_pd_mimic["urgency"]=0
admissions_pd_mimic.loc[admissions_pd_mimic.admission_type.isin(['DIRECT EMER.', 'EW EMER.', 'URGENT']),"urgency"]=1
AF_dataset_mimic = AF_dataset_mimic.merge(admissions_pd_mimic[["hadm_id","ethnicity","urgency"]],how='left',on="hadm_id")

In [155]:
AF_dataset_mimic["is_given_NORepinefrine"]=0
AF_dataset_mimic.loc[AF_dataset_mimic['mean_NORepinephrine']>0,"is_given_NORepinefrine"]=1

AF_dataset_mimic["cardiac_surg_bool"]=0
AF_dataset_mimic.loc[(AF_dataset_mimic["cardiac_bool"]==1)&(AF_dataset_mimic["surgery_bool"]==1),"cardiac_surg_bool"] = 1

AF_dataset_mimic["is_given_Calcium Glubionaat (Calcium Sandoz)"]=0
if "mean_Calcium Carbonate Suspension" in AF_dataset.columns:
    AF_dataset_mimic.loc[(AF_dataset_mimic["mean_Calcium Carbonate"]>1)|(AF_dataset_mimic["mean_Calcium Gluconate"]>1)|(("mean_Calcium Carbonate Suspension" in AF_dataset_mimic.columns)&(AF_dataset_mimic["mean_Calcium Carbonate Suspension"]>1))|(AF_dataset_mimic["mean_Calcium Acetate"]>1),"is_given_Calcium Glubionaat (Calcium Sandoz)"] = 1
else:
    AF_dataset_mimic.loc[(AF_dataset_mimic["mean_Calcium Carbonate"]>1)|(AF_dataset_mimic["mean_Calcium Gluconate"]>1)|(AF_dataset_mimic["mean_Calcium Acetate"]>1),"is_given_Calcium Glubionaat (Calcium Sandoz)"] = 1
    
AF_dataset_mimic["is_given_Dopamine (Inotropin)"]=0
AF_dataset_mimic.loc[AF_dataset_mimic['mean_DOPamine']>0,"is_given_Dopamine (Inotropin)"]=1

AF_dataset_mimic[ 'is_given_Magnesiumsulfaat (MgSO4)']=0
AF_dataset_mimic.loc[AF_dataset_mimic['mean_Magnesium Sulfate']>0, 'is_given_Magnesiumsulfaat (MgSO4)']=1

AF_dataset_mimic['is_given_Propofol (Diprivan)']=0
AF_dataset_mimic.loc[AF_dataset_mimic['mean_Propofol']>0,'is_given_Propofol (Diprivan)']=1

AF_dataset_mimic['is_given_Fentanyl']=0
AF_dataset_mimic.loc[AF_dataset_mimic['mean_Fentanyl Citrate']>0,'is_given_Fentanyl']=1

AF_dataset_mimic["is_given_Furosemide (Lasix)"]=0
AF_dataset_mimic.loc[AF_dataset_mimic['mean_Furosemide']>0,"is_given_Furosemide (Lasix)"]=1

AF_dataset_mimic["is_given_LoopDiuretics"]=0
AF_dataset_mimic.loc[(AF_dataset_mimic['mean_Bumetanide']>0)|(AF_dataset_mimic['mean_Furosemide']>0),"is_given_LoopDiuretics"]=1

In [156]:
AF_dataset["is_given_Magnesiumsulfaat (MgSO4)"]=0
AF_dataset.loc[AF_dataset['mean_Magnesiumsulfaat (MgSO4)']>0,"is_given_Magnesiumsulfaat (MgSO4)"]=1
#AF_dataset["min_Magnesiumsulfaat (MgSO4)"] = AF_dataset["min_Magnesiumsulfaat (MgSO4)"].fillna(0)
AF_dataset["is_given_Calcium Glubionaat (Calcium Sandoz)"]=0
AF_dataset.loc[AF_dataset['mean_Calcium Glubionaat (Calcium Sandoz)']>0,"is_given_Calcium Glubionaat (Calcium Sandoz)"]=1

AF_dataset["is_given_LoopDiuretics"]=0
# AF_dataset.loc[(AF_dataset['mean_Bumetanide (Burinex)']>0) | (AF_dataset['mean_Furosemide (Lasix)']>0),"is_given_LoopDiuretics"]=1
AF_dataset.loc[(AF_dataset['mean_Furosemide (Lasix)']>0),"is_given_LoopDiuretics"]=1

AF_dataset['is_given_Propofol (Diprivan)']=0
AF_dataset.loc[AF_dataset['mean_Propofol (Diprivan)']>0,'is_given_Propofol (Diprivan)']=1

AF_dataset['is_given_Dopamine (Inotropin)']=0
AF_dataset.loc[AF_dataset['mean_Dopamine (Inotropin)']>0,'is_given_Dopamine (Inotropin)']=1

AF_dataset['is_given_Enoximon (Perfan)']=0
AF_dataset.loc[AF_dataset['mean_Enoximon (Perfan)']>0,'is_given_Enoximon (Perfan)']=1

AF_dataset.loc[AF_dataset['mean_PEEP (Set)'].isna(),'mean_PEEP (Set)']=0

AF_dataset['is_given_Hydrocortison (Solu Cortef)']=0
AF_dataset.loc[AF_dataset['mean_Hydrocortison (Solu Cortef)']>0,'is_given_Hydrocortison (Solu Cortef)']=1

AF_dataset['is_given_Midazolam (Dormicum)']=0
AF_dataset.loc[AF_dataset['mean_Midazolam (Dormicum)']>0,'is_given_Midazolam (Dormicum)']=1

AF_dataset['is_given_Morfine']=0
AF_dataset.loc[AF_dataset['mean_Morfine']>0,'is_given_Morfine']=1

AF_dataset['is_given_Fentanyl']=0
AF_dataset.loc[AF_dataset['mean_Fentanyl']>0,'is_given_Fentanyl']=1

In [157]:
#Rename for distr change 12 hour model
AF_dataset_mimic = AF_dataset_mimic.rename(columns={'mean_PEEP set':'mean_PEEP (Set)', 'is_given_NORepinefrine':'is_given_Noradrenaline (Norepinefrine)', 'mean_Foley':'mean_UrineCAD', 'max_Foley':'max_UrineCAD', 
                            'min_Foley':'min_UrineCAD', 'slope_Foley':'slope_UrineCAD', 'max_Central Venous Pressure':'max_CVD','mean_Phosphate':'mean_Fosfaat (bloed)', 'max_Phosphate':'max_Fosfaat (bloed)', 
                            'mean_Lactate':'mean_Lactaat (bloed)','max_Lactate':'max_Lactaat (bloed)','min_Lactate':'min_Lactaat (bloed)',
                            'mean_pH':'mean_pH (bloed)','max_pH':'max_pH (bloed)','min_pH':'min_pH (bloed)',
                            'mean_Arterial Base Excess':'mean_B.E. (bloed)', 'mean_Arterial Blood Pressure systolic':'mean_ABP systolisch',
                            'min_Arterial Base Excess':'min_B.E. (bloed)', 'min_Arterial Blood Pressure systolic':'min_ABP systolisch',
                            'max_Arterial Base Excess':'max_B.E. (bloed)', 'max_Arterial Blood Pressure systolic':'max_ABP systolisch',
                            'slope_Arterial Base Excess':'slope_B.E. (bloed)', 'slope_Arterial Blood Pressure systolic':'slope_ABP systolisch',
                            'mean_Inspired O2 Fraction':'mean_O2 concentratie (Set)','min_Inspired O2 Fraction':'min_O2 concentratie (Set)','max_Inspired O2 Fraction':'max_O2 concentratie (Set)',
                            'mean_Oxygen Saturation':'mean_O2-Saturatie (bloed)','min_Oxygen Saturation':'min_O2-Saturatie (bloed)','max_Oxygen Saturation':'max_O2-Saturatie (bloed)',
                            'min_Propofol':'min_Propofol (Diprivan)', 'min_Fentanyl Citrate':'min_Fentanyl', 'mean_O2 Flow':'mean_O2 l/min', 'cardiac_bool_new':'cardio_surgery_new',
                            'mean_Phosphate':'mean_Fosfaat (bloed)', 'kurt_Central Venous Pressure':'kurt_CVD', 'max_Heart Rate':'max_Hartfrequentie', "max_Platelet Count":"max_Thrombo's (bloed)"})

#Rename for distr change 6 hour model
AF_dataset_mimic = AF_dataset_mimic.rename(columns={'min_PEEP set':'min_PEEP (Set)','max_PEEP set':'max_PEEP (Set)', 'is_given_NORepinefrine':'is_given_Noradrenaline (Norepinefrine)', 'max_O2 Flow':'max_O2 l/min', 'slope_O2 Flow':'slope_O2 l/min', 
                                                    'cardiac_bool_new':'cardio_surgery_new','min_Phosphate':'min_Fosfaat (bloed)', 
                                                    'min_Arterial Blood Pressure mean': 'min_ABP gemiddeld','max_Arterial Blood Pressure mean': 'max_ABP gemiddeld',
                                                    'slope_Arterial Blood Pressure mean': 'slope_ABP gemiddeld',
                                                    'slope_Bicarbonate':'slope_Act.HCO3 (bloed)','max_Bicarbonate':'max_Act.HCO3 (bloed)','min_Bicarbonate':'min_Act.HCO3 (bloed)',
                                                    'mean_Bicarbonate':'mean_Act.HCO3 (bloed)',
                                                    'mean_Arterial Blood Pressure mean': 'mean_ABP gemiddeld',"slope_Platelet Count":"slope_Thrombo's (bloed)",
                                                    'mean_Central Venous Pressure':'mean_CVD', 'min_Heart Rate':'min_Hartfrequentie', "min_Platelet Count":"min_Thrombo's (bloed)"})

#Rename for distr change 1.5 hour model
AF_dataset_mimic = AF_dataset_mimic.rename(columns={ 'max_O2 Flow':'max_O2 l/min','min_Phosphate':'min_Fosfaat (bloed)','max_Urea Nitrogen':'max_Ureum (bloed)',
                                            'mean_Urea Nitrogen':'mean_Ureum (bloed)','min_Urea Nitrogen':'min_Ureum (bloed)', 'slope_Heart Rate':'slope_Hartfrequentie','mean_Heart Rate':'mean_Hartfrequentie',
                                                    'mean_Central Venous Pressure':'mean_CVD'})

#Rename for standard 12 hour model
AF_dataset_mimic = AF_dataset_mimic.rename(columns={"mean_Hemoglobin":"mean_Hb (bloed)","slope_pO2":'slope_PO2 (bloed)',"mean_pO2":'mean_PO2 (bloed)',"max_pO2":'max_PO2 (bloed)',
                                        'mean_pCO2':'mean_pCO2 (bloed)','slope_pCO2':'slope_pCO2 (bloed)','min_pCO2':'min_pCO2 (bloed)','max_pCO2':'max_pCO2 (bloed)',
                                        "min_pO2":'min_PO2 (bloed)'})

#Rename for standard 1.5 hour model
AF_dataset_mimic = AF_dataset_mimic.rename(columns={ 'min_O2 Flow':'min_O2 l/min', "mean_Platelet Count":"mean_Thrombo's (bloed)","mean_PTT":"mean_APTT  (bloed)",
                                        'min_Central Venous Pressure':'min_CVD',"min_C-Reactive Protein":"min_CRP (bloed)"})

In [158]:
AF_temp_db_mimic = AF_dataset_mimic[(AF_dataset_mimic.AF==0)&(AF_dataset_mimic.AF_orig==0)&(AF_dataset_mimic.admissionid!=AF_dataset_mimic.date_corresponds_to_AF_admid)]

AF_temp_db = AF_dataset[(AF_dataset.AF==0)&(AF_dataset.AF_orig==0)&(AF_dataset.admissionid!=AF_dataset.date_corresponds_to_AF_admid)]

AF_NOW_mimic = AF_temp_db_mimic.append(AF_dataset_mimic[AF_dataset_mimic.AF==1]).reset_index(drop=True)
AF_NOW = AF_temp_db.append(AF_dataset[AF_dataset.AF==1]).reset_index(drop=True)

In [159]:
from sklearn.model_selection import train_test_split

train_patientid_mimic,test_patientid_mimic = train_test_split(AF_NOW_mimic.patientid.unique(), test_size=0.2, random_state=42)
train_patientid,test_patientid = train_test_split(AF_NOW.patientid.unique(), test_size=0.2, random_state=42)

train_AF_dataset = AF_NOW[AF_NOW.patientid.isin(train_patientid)]
train_AF_dataset_mimic = AF_NOW_mimic[AF_NOW_mimic.patientid.isin(train_patientid_mimic)]
test_AF_dataset = AF_NOW[AF_NOW.patientid.isin(test_patientid)]
test_AF_dataset_mimic = AF_NOW_mimic[AF_NOW_mimic.patientid.isin(test_patientid_mimic)]

In [160]:
mimic_drop_columns = ['patientid', 'hadm_id', 'admittime', 'AF_measuredat', 'intime',
       'outtime', 'admissionid', 'lengthofstay']

drop_columns = ["dateofdeath_delta","admittedat_delta","admissionid","origin",
                "lengthofstay","destination","weightgroup","agegroup","dateofdeath",
                "admittedat","heightgroup","specialty","dateofdeath_delta","admittedat_delta","weightsource","dischargedat","heightsource",
                "gender","Mortality","AF_orig","AF_measuredat","AF","new_onset_AF","Preadmission_AF","patientid","location","admissionyeargroup"]

mimic_drop_columns.append('date_corresponds_to_AF_admid')
drop_columns.append('date_corresponds_to_AF_admid')

In [161]:
X_train_or_amst = train_AF_dataset[(train_AF_dataset.AF==1)|((train_AF_dataset.AF_orig==0)&(train_AF_dataset.AF==0))]#[train_AF_dataset.columns.drop(drop_columns)]
X_test_or_amst = test_AF_dataset[(test_AF_dataset.AF==1)|((test_AF_dataset.AF_orig==0)&(test_AF_dataset.AF==0))]#[test_AF_dataset.columns.drop(mimic_drop_columns)]
y_train_amst = train_AF_dataset[(train_AF_dataset.AF==1)|((train_AF_dataset.AF_orig==0)&(train_AF_dataset.AF==0))].AF
y_test_amst = test_AF_dataset[(test_AF_dataset.AF==1)|((test_AF_dataset.AF_orig==0)&(test_AF_dataset.AF==0))].AF

X_train_or_mimic = train_AF_dataset_mimic[(train_AF_dataset_mimic.AF==1)|((train_AF_dataset_mimic.AF_orig==0)&(train_AF_dataset_mimic.AF==0))]
X_test_or_mimic = test_AF_dataset_mimic[(test_AF_dataset_mimic.AF==1)|((test_AF_dataset_mimic.AF_orig==0)&(test_AF_dataset_mimic.AF==0))]
y_train_mimic = train_AF_dataset_mimic[(train_AF_dataset_mimic.AF==1)|((train_AF_dataset_mimic.AF_orig==0)&(train_AF_dataset_mimic.AF==0))].AF
y_test_mimic = test_AF_dataset_mimic[(test_AF_dataset_mimic.AF==1)|((test_AF_dataset_mimic.AF_orig==0)&(test_AF_dataset_mimic.AF==0))].AF

### Testing

In [162]:
if one_half_hour_model:
    feature_columns = ['Age',
        'mean_Lactaat (bloed)',
        'slope_Hartfrequentie',
        'mean_UrineCAD',
        'mean_O2-Saturatie (bloed)',
        'max_PO2 (bloed)',
        'mean_O2 concentratie (Set)',
        'urgency',
        "min_Thrombo's (bloed)",
        'is_given_Noradrenaline (Norepinefrine)',
        'min_Ureum (bloed)',
        'is_given_Furosemide (Lasix)',
        'min_Act.HCO3 (bloed)',
        'Weight',
        'mean_PEEP (Set)',
        'fluid_balance',
        'mean_CVD',
        'slope_ABP systolisch',
        'min_pH (bloed)',
        'slope_ABP gemiddeld',
        ]

elif six_hour_model:
    feature_columns = ['Age',
        'slope_Hartfrequentie',
        'is_given_Noradrenaline (Norepinefrine)',
        'mean_Lactaat (bloed)',
        'min_UrineCAD',
        'mean_O2 concentratie (Set)',
        'max_PO2 (bloed)',
        'min_CVD',
        'is_given_Furosemide (Lasix)',
        'fluid_balance',
        'min_ABP gemiddeld',
        'mean_PEEP (Set)',
        'mean_Act.HCO3 (bloed)',
        'min_ABP systolisch',
        'urgency',
        'slope_B.E. (bloed)']

else:
    feature_columns = ['Age',
        'slope_Hartfrequentie',
        'mean_O2 concentratie (Set)',
        'mean_UrineCAD',
        'max_PO2 (bloed)',
        'max_CVD',
        'mean_Lactaat (bloed)',
        'min_pH (bloed)',
        'mean_O2-Saturatie (bloed)',
        'urgency',
        'mean_PEEP (Set)']
        
X_train_amst = X_train_or_amst[feature_columns]
X_test_amst = X_test_or_amst[feature_columns]
X_train_mimic = X_train_or_mimic[feature_columns]
X_test_mimic = X_test_or_mimic[feature_columns]

X_train = X_train_amst.append(X_train_mimic).reset_index(drop=True).sample(
    len(X_train_amst)+len(X_train_mimic), random_state=42)
#X_train = X_train_amst
#X_train = X_train_mimic

y_train = np.append(y_train_amst, y_train_mimic)[X_train.index.values]
#y_train = y_train_amst
#y_train = y_train_mimic


In [163]:
AF_class_balance = [y_train.sum()/len(y_train),1-y_train.sum()/len(y_train)]
if one_half_hour_model:
    CB_AF = CatBoostClassifier(
        verbose=100, iterations=300, l2_leaf_reg=6, class_weights=AF_class_balance)
elif six_hour_model:
    CB_AF = CatBoostClassifier(
        verbose=100, iterations=400, depth=5, l2_leaf_reg=2, class_weights=AF_class_balance)
else:
    CB_AF = CatBoostClassifier(
        verbose=100, iterations=300, depth=4, l2_leaf_reg=2, class_weights=AF_class_balance)

CB_AF.fit(X_train,y_train)

0:	learn: 0.6874910	total: 3.45ms	remaining: 1.38s
100:	learn: 0.5716608	total: 366ms	remaining: 1.08s
200:	learn: 0.5544851	total: 733ms	remaining: 726ms
300:	learn: 0.5436401	total: 1.08s	remaining: 356ms
399:	learn: 0.5324397	total: 1.42s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x16e6809c880>

In [164]:
print(classification_report(y_train,CB_AF.predict(X_train),target_names=["No-AF","AF"]))

              precision    recall  f1-score   support

       No-AF       0.73      0.70      0.71      6101
          AF       0.72      0.76      0.74      6357

    accuracy                           0.73     12458
   macro avg       0.73      0.73      0.73     12458
weighted avg       0.73      0.73      0.73     12458



In [165]:
print("BALANCED TEST AMSTERDAM PATIENTS")
print(classification_report(y_test_amst,CB_AF.predict(X_test_amst),target_names=["No-AF","AF"]))
print("\n")
(fpr, tpr, thresholds) = get_roc_curve(CB_AF, Pool(data=X_test_amst,label=y_test_amst), plot=False)
print("AUC = "+str(np.round(auc(fpr,tpr),4)))
print("MCC = "+str(np.round(matthews_corrcoef(y_test_amst,CB_AF.predict(X_test_amst)),4)))
print(50*"=")

BALANCED TEST AMSTERDAM PATIENTS
              precision    recall  f1-score   support

       No-AF       0.75      0.70      0.72       350
          AF       0.69      0.74      0.71       311

    accuracy                           0.72       661
   macro avg       0.72      0.72      0.72       661
weighted avg       0.72      0.72      0.72       661



AUC = 0.7842
MCC = 0.4356


In [166]:
print("BALANCED TEST MIMIC PATIENTS")
print(classification_report(y_test_mimic,CB_AF.predict(X_test_mimic),target_names=["No-AF","AF"]))
print("\n")
(fpr, tpr, thresholds) = get_roc_curve(CB_AF, Pool(data=X_test_mimic,label=y_test_mimic), plot=False)
print("AUC = "+str(np.round(auc(fpr,tpr),4)))
print("MCC = "+str(np.round(matthews_corrcoef(y_test_mimic,CB_AF.predict(X_test_mimic)),4)))
print(50*"=")

BALANCED TEST MIMIC PATIENTS
              precision    recall  f1-score   support

       No-AF       0.67      0.67      0.67      1217
          AF       0.68      0.68      0.68      1253

    accuracy                           0.68      2470
   macro avg       0.68      0.68      0.68      2470
weighted avg       0.68      0.68      0.68      2470



AUC = 0.751
MCC = 0.3521


In [167]:
combo_mimic_y_test = y_test_mimic
combo_mimic_predict = CB_AF.predict(X_test_mimic)
combo_amst_y_test = y_test_amst
combo_amst_predict = CB_AF.predict(X_test_amst)

In [168]:
temp_large_db = AF_dataset_mimic[((AF_dataset_mimic.AF==0)&(~AF_dataset_mimic.patientid.isin(AF_NOW_mimic.patientid)))|(AF_dataset_mimic.patientid.isin(test_patientid_mimic))]

y_test_all_pat = temp_large_db[(temp_large_db.AF==1)|((temp_large_db.AF_orig==0)&(temp_large_db.AF==0))].AF
X_test_all_pat = temp_large_db[(temp_large_db.AF==1)|((temp_large_db.AF_orig==0)&(temp_large_db.AF==0))][feature_columns]


print("ALL PATIENTS")
print(classification_report(y_test_all_pat,CB_AF.predict(X_test_all_pat),target_names=["No-AF","AF"]))
print("\n")
(fpr, tpr, thresholds) = get_roc_curve(CB_AF, Pool(data=X_test_all_pat,label=y_test_all_pat), plot=False)
print("AUC = "+str(np.round(auc(fpr,tpr),4)))
print("MCC = "+str(np.round(matthews_corrcoef(y_test_all_pat,CB_AF.predict(X_test_all_pat)),4)))
print(50*"=")

mimic_result_dataframe = pandas_result_AF_maker("all patients mimic",X_test_all_pat,y_test_all_pat,CB_AF,mimic_result_dataframe)


print("BALANCED TEST PATIENTS")
print(classification_report(y_test_mimic,CB_AF.predict(X_test_mimic),target_names=["No-AF","AF"]))
print("\n")
(fpr, tpr, thresholds) = get_roc_curve(CB_AF, Pool(data=X_test_mimic,label=y_test_mimic), plot=False)
print("AUC = "+str(np.round(auc(fpr,tpr),4)))
print("MCC = "+str(np.round(matthews_corrcoef(y_test_mimic,CB_AF.predict(X_test_mimic)),4)))
print(50*"=")

mimic_result_dataframe = pandas_result_AF_maker("balanced test mimic",X_test,y_test,CB_AF,mimic_result_dataframe)

ALL PATIENTS
              precision    recall  f1-score   support

       No-AF       0.99      0.65      0.79     53203
          AF       0.04      0.68      0.08      1253

    accuracy                           0.66     54456
   macro avg       0.52      0.67      0.44     54456
weighted avg       0.97      0.66      0.77     54456



AUC = 0.7415
MCC = 0.105
BALANCED TEST PATIENTS
              precision    recall  f1-score   support

       No-AF       0.67      0.67      0.67      1217
          AF       0.68      0.68      0.68      1253

    accuracy                           0.68      2470
   macro avg       0.68      0.68      0.68      2470
weighted avg       0.68      0.68      0.68      2470



AUC = 0.751
MCC = 0.3521


In [169]:
temp_large_db = AF_dataset[((AF_dataset.AF==0)&(~AF_dataset.patientid.isin(AF_NOW.patientid)))|(AF_dataset.patientid.isin(test_patientid))]

y_test_all_pat = temp_large_db[(temp_large_db.AF==1)|((temp_large_db.AF_orig==0)&(temp_large_db.AF==0))].AF
X_test_all_pat = temp_large_db[(temp_large_db.AF==1)|((temp_large_db.AF_orig==0)&(temp_large_db.AF==0))][feature_columns]


print("ALL PATIENTS")
print(classification_report(y_test_all_pat,CB_AF.predict(X_test_all_pat),target_names=["No-AF","AF"]))
print("\n")
(fpr, tpr, thresholds) = get_roc_curve(CB_AF, Pool(data=X_test_all_pat,label=y_test_all_pat), plot=False)
print("AUC = "+str(np.round(auc(fpr,tpr),4)))
print("MCC = "+str(np.round(matthews_corrcoef(y_test_all_pat,CB_AF.predict(X_test_all_pat)),4)))
print(50*"=")

mimic_result_dataframe = pandas_result_AF_maker("all patients amsterdam",X_test_all_pat,y_test_all_pat,CB_AF,mimic_result_dataframe)


print("BALANCED TEST PATIENTS")
print(classification_report(y_test_amst,CB_AF.predict(X_test_amst),target_names=["No-AF","AF"]))
print("\n")
(fpr, tpr, thresholds) = get_roc_curve(CB_AF, Pool(data=X_test_amst,label=y_test_amst), plot=False)
print("AUC = "+str(np.round(auc(fpr,tpr),4)))
print("MCC = "+str(np.round(matthews_corrcoef(y_test_amst,CB_AF.predict(X_test_amst)),4)))
print(50*"=")

mimic_result_dataframe = pandas_result_AF_maker("balanced test amsterdam",X_test,y_test,CB_AF,mimic_result_dataframe)

ALL PATIENTS
              precision    recall  f1-score   support

       No-AF       0.99      0.59      0.74     13320
          AF       0.04      0.74      0.08       311

    accuracy                           0.59     13631
   macro avg       0.51      0.66      0.41     13631
weighted avg       0.97      0.59      0.72     13631



AUC = 0.7364
MCC = 0.0988
BALANCED TEST PATIENTS
              precision    recall  f1-score   support

       No-AF       0.75      0.70      0.72       350
          AF       0.69      0.74      0.71       311

    accuracy                           0.72       661
   macro avg       0.72      0.72      0.72       661
weighted avg       0.72      0.72      0.72       661



AUC = 0.7842
MCC = 0.4356


## Direct validation

In [170]:
validation_type_string = "direct"

### Data Import

In [171]:
if one_half_hour_model:
        AF_dataset_mimic = pd.read_csv(mimic_extracted_path+"AF_dataset_1_5_hours.csv") 
        AF_dataset = pd.read_csv(amsterdam_data_path+"AF_dataset_1_5_hours.csv") 
elif six_hour_model:
        AF_dataset_mimic = pd.read_csv(mimic_extracted_path+"AF_dataset_6_hours.csv") 
        AF_dataset = pd.read_csv(amsterdam_data_path+"AF_dataset_6_hours.csv") 
else:
        AF_dataset_mimic = pd.read_csv(mimic_extracted_path+"AF_dataset_12_hours.csv") 
        AF_dataset = pd.read_csv(amsterdam_data_path+"AF_dataset_12_hours.csv") 


df_patients = pd.read_csv(mimic_extracted_path+"patients.csv",sep=',')
df_patients = df_patients.rename(columns={"subject_id":"patientid","anchor_age":"Age"})

AF_dataset_mimic = AF_dataset_mimic.merge(df_patients[['patientid','Age']],how='left',on='patientid')

admissions_pd_mimic = pd.read_csv(mimic_extracted_path+"df_adm_icu.csv",sep=',')
admissions_pd_mimic["urgency"]=0
admissions_pd_mimic.loc[admissions_pd_mimic.admission_type.isin(['DIRECT EMER.', 'EW EMER.', 'URGENT']),"urgency"]=1
AF_dataset_mimic = AF_dataset_mimic.merge(admissions_pd_mimic[["hadm_id","ethnicity","urgency"]],how='left',on="hadm_id")

In [172]:
AF_dataset_mimic["is_given_NORepinefrine"]=0
AF_dataset_mimic.loc[AF_dataset_mimic['mean_NORepinephrine']>0,"is_given_NORepinefrine"]=1

AF_dataset_mimic["cardiac_surg_bool"]=0
AF_dataset_mimic.loc[(AF_dataset_mimic["cardiac_bool"]==1)&(AF_dataset_mimic["surgery_bool"]==1),"cardiac_surg_bool"] = 1

AF_dataset_mimic["is_given_Calcium Glubionaat (Calcium Sandoz)"]=0
if "mean_Calcium Carbonate Suspension" in AF_dataset.columns:
    AF_dataset_mimic.loc[(AF_dataset_mimic["mean_Calcium Carbonate"]>1)|(AF_dataset_mimic["mean_Calcium Gluconate"]>1)|(("mean_Calcium Carbonate Suspension" in AF_dataset_mimic.columns)&(AF_dataset_mimic["mean_Calcium Carbonate Suspension"]>1))|(AF_dataset_mimic["mean_Calcium Acetate"]>1),"is_given_Calcium Glubionaat (Calcium Sandoz)"] = 1
else:
    AF_dataset_mimic.loc[(AF_dataset_mimic["mean_Calcium Carbonate"]>1)|(AF_dataset_mimic["mean_Calcium Gluconate"]>1)|(AF_dataset_mimic["mean_Calcium Acetate"]>1),"is_given_Calcium Glubionaat (Calcium Sandoz)"] = 1
    
AF_dataset_mimic["is_given_Dopamine (Inotropin)"]=0
AF_dataset_mimic.loc[AF_dataset_mimic['mean_DOPamine']>0,"is_given_Dopamine (Inotropin)"]=1

AF_dataset_mimic[ 'is_given_Magnesiumsulfaat (MgSO4)']=0
AF_dataset_mimic.loc[AF_dataset_mimic['mean_Magnesium Sulfate']>0, 'is_given_Magnesiumsulfaat (MgSO4)']=1

AF_dataset_mimic['is_given_Propofol (Diprivan)']=0
AF_dataset_mimic.loc[AF_dataset_mimic['mean_Propofol']>0,'is_given_Propofol (Diprivan)']=1

AF_dataset_mimic['is_given_Fentanyl']=0
AF_dataset_mimic.loc[AF_dataset_mimic['mean_Fentanyl Citrate']>0,'is_given_Fentanyl']=1

AF_dataset_mimic["is_given_Furosemide (Lasix)"]=0
AF_dataset_mimic.loc[AF_dataset_mimic['mean_Furosemide']>0,"is_given_Furosemide (Lasix)"]=1

AF_dataset_mimic["is_given_LoopDiuretics"]=0
AF_dataset_mimic.loc[(AF_dataset_mimic['mean_Bumetanide']>0)|(AF_dataset_mimic['mean_Furosemide']>0),"is_given_LoopDiuretics"]=1

In [173]:
AF_dataset["is_given_Magnesiumsulfaat (MgSO4)"]=0
AF_dataset.loc[AF_dataset['mean_Magnesiumsulfaat (MgSO4)']>0,"is_given_Magnesiumsulfaat (MgSO4)"]=1
#AF_dataset["min_Magnesiumsulfaat (MgSO4)"] = AF_dataset["min_Magnesiumsulfaat (MgSO4)"].fillna(0)
AF_dataset["is_given_Calcium Glubionaat (Calcium Sandoz)"]=0
AF_dataset.loc[AF_dataset['mean_Calcium Glubionaat (Calcium Sandoz)']>0,"is_given_Calcium Glubionaat (Calcium Sandoz)"]=1

AF_dataset["is_given_LoopDiuretics"]=0
# AF_dataset.loc[(AF_dataset['mean_Bumetanide (Burinex)']>0) | (AF_dataset['mean_Furosemide (Lasix)']>0),"is_given_LoopDiuretics"]=1
AF_dataset.loc[(AF_dataset['mean_Furosemide (Lasix)']>0),"is_given_LoopDiuretics"]=1

AF_dataset['is_given_Propofol (Diprivan)']=0
AF_dataset.loc[AF_dataset['mean_Propofol (Diprivan)']>0,'is_given_Propofol (Diprivan)']=1

AF_dataset['is_given_Dopamine (Inotropin)']=0
AF_dataset.loc[AF_dataset['mean_Dopamine (Inotropin)']>0,'is_given_Dopamine (Inotropin)']=1

AF_dataset['is_given_Enoximon (Perfan)']=0
AF_dataset.loc[AF_dataset['mean_Enoximon (Perfan)']>0,'is_given_Enoximon (Perfan)']=1

AF_dataset.loc[AF_dataset['mean_PEEP (Set)'].isna(),'mean_PEEP (Set)']=0

AF_dataset['is_given_Hydrocortison (Solu Cortef)']=0
AF_dataset.loc[AF_dataset['mean_Hydrocortison (Solu Cortef)']>0,'is_given_Hydrocortison (Solu Cortef)']=1

AF_dataset['is_given_Midazolam (Dormicum)']=0
AF_dataset.loc[AF_dataset['mean_Midazolam (Dormicum)']>0,'is_given_Midazolam (Dormicum)']=1

AF_dataset['is_given_Morfine']=0
AF_dataset.loc[AF_dataset['mean_Morfine']>0,'is_given_Morfine']=1

AF_dataset['is_given_Fentanyl']=0
AF_dataset.loc[AF_dataset['mean_Fentanyl']>0,'is_given_Fentanyl']=1

In [174]:
#Rename for distr change 12 hour model
AF_dataset_mimic = AF_dataset_mimic.rename(columns={'mean_PEEP set':'mean_PEEP (Set)', 'is_given_NORepinefrine':'is_given_Noradrenaline (Norepinefrine)', 'mean_Foley':'mean_UrineCAD', 'max_Foley':'max_UrineCAD', 
                            'min_Foley':'min_UrineCAD', 'slope_Foley':'slope_UrineCAD', 'max_Central Venous Pressure':'max_CVD','mean_Phosphate':'mean_Fosfaat (bloed)', 'max_Phosphate':'max_Fosfaat (bloed)', 
                            'mean_Lactate':'mean_Lactaat (bloed)','max_Lactate':'max_Lactaat (bloed)','min_Lactate':'min_Lactaat (bloed)',
                            'mean_pH':'mean_pH (bloed)','max_pH':'max_pH (bloed)','min_pH':'min_pH (bloed)',
                            'mean_Arterial Base Excess':'mean_B.E. (bloed)', 'mean_Arterial Blood Pressure systolic':'mean_ABP systolisch',
                            'min_Arterial Base Excess':'min_B.E. (bloed)', 'min_Arterial Blood Pressure systolic':'min_ABP systolisch',
                            'max_Arterial Base Excess':'max_B.E. (bloed)', 'max_Arterial Blood Pressure systolic':'max_ABP systolisch',
                            'slope_Arterial Base Excess':'slope_B.E. (bloed)', 'slope_Arterial Blood Pressure systolic':'slope_ABP systolisch',
                            'mean_Inspired O2 Fraction':'mean_O2 concentratie (Set)','min_Inspired O2 Fraction':'min_O2 concentratie (Set)','max_Inspired O2 Fraction':'max_O2 concentratie (Set)',
                            'mean_Oxygen Saturation':'mean_O2-Saturatie (bloed)','min_Oxygen Saturation':'min_O2-Saturatie (bloed)','max_Oxygen Saturation':'max_O2-Saturatie (bloed)',
                            'min_Propofol':'min_Propofol (Diprivan)', 'min_Fentanyl Citrate':'min_Fentanyl', 'mean_O2 Flow':'mean_O2 l/min', 'cardiac_bool_new':'cardio_surgery_new',
                            'mean_Phosphate':'mean_Fosfaat (bloed)', 'kurt_Central Venous Pressure':'kurt_CVD', 'max_Heart Rate':'max_Hartfrequentie', "max_Platelet Count":"max_Thrombo's (bloed)"})

#Rename for distr change 6 hour model
AF_dataset_mimic = AF_dataset_mimic.rename(columns={'min_PEEP set':'min_PEEP (Set)','max_PEEP set':'max_PEEP (Set)', 'is_given_NORepinefrine':'is_given_Noradrenaline (Norepinefrine)', 'max_O2 Flow':'max_O2 l/min', 'slope_O2 Flow':'slope_O2 l/min', 
                                                    'cardiac_bool_new':'cardio_surgery_new','min_Phosphate':'min_Fosfaat (bloed)', 
                                                    'min_Arterial Blood Pressure mean': 'min_ABP gemiddeld','max_Arterial Blood Pressure mean': 'max_ABP gemiddeld',
                                                    'slope_Arterial Blood Pressure mean': 'slope_ABP gemiddeld',
                                                    'slope_Bicarbonate':'slope_Act.HCO3 (bloed)','max_Bicarbonate':'max_Act.HCO3 (bloed)','min_Bicarbonate':'min_Act.HCO3 (bloed)',
                                                    'mean_Bicarbonate':'mean_Act.HCO3 (bloed)',
                                                    'mean_Arterial Blood Pressure mean': 'mean_ABP gemiddeld',"slope_Platelet Count":"slope_Thrombo's (bloed)",
                                                    'mean_Central Venous Pressure':'mean_CVD', 'min_Heart Rate':'min_Hartfrequentie', "min_Platelet Count":"min_Thrombo's (bloed)"})

#Rename for distr change 1.5 hour model
AF_dataset_mimic = AF_dataset_mimic.rename(columns={ 'max_O2 Flow':'max_O2 l/min','min_Phosphate':'min_Fosfaat (bloed)','max_Urea Nitrogen':'max_Ureum (bloed)',
                                            'mean_Urea Nitrogen':'mean_Ureum (bloed)','min_Urea Nitrogen':'min_Ureum (bloed)', 'slope_Heart Rate':'slope_Hartfrequentie','mean_Heart Rate':'mean_Hartfrequentie',
                                                    'mean_Central Venous Pressure':'mean_CVD'})

#Rename for standard 12 hour model
AF_dataset_mimic = AF_dataset_mimic.rename(columns={"mean_Hemoglobin":"mean_Hb (bloed)","slope_pO2":'slope_PO2 (bloed)',"mean_pO2":'mean_PO2 (bloed)',"max_pO2":'max_PO2 (bloed)',
                                        'mean_pCO2':'mean_pCO2 (bloed)','slope_pCO2':'slope_pCO2 (bloed)','min_pCO2':'min_pCO2 (bloed)','max_pCO2':'max_pCO2 (bloed)',
                                        "min_pO2":'min_PO2 (bloed)'})

#Rename for standard 1.5 hour model
AF_dataset_mimic = AF_dataset_mimic.rename(columns={ 'min_O2 Flow':'min_O2 l/min', "mean_Platelet Count":"mean_Thrombo's (bloed)","mean_PTT":"mean_APTT  (bloed)",
                                        'min_Central Venous Pressure':'min_CVD',"min_C-Reactive Protein":"min_CRP (bloed)"})

In [175]:
AF_dataset_mimic = AF_dataset_mimic.drop_duplicates("admissionid")

In [176]:
AF_temp_db_mimic = AF_dataset_mimic[(AF_dataset_mimic.AF==0)&(AF_dataset_mimic.AF_orig==0)&(AF_dataset_mimic.admissionid!=AF_dataset_mimic.date_corresponds_to_AF_admid)]
AF_NOW_mimic = AF_temp_db_mimic.append(AF_dataset_mimic[(AF_dataset_mimic.AF==1)&(AF_dataset_mimic.admissionid.isin(AF_temp_db_mimic.date_corresponds_to_AF_admid.unique()))]).reset_index(drop=True)

AF_temp_db = AF_dataset[(AF_dataset.AF==0)&(AF_dataset.AF_orig==0)&(AF_dataset.admissionid!=AF_dataset.date_corresponds_to_AF_admid)]
AF_1_temp_db = AF_dataset[(AF_dataset.AF==1)&(AF_dataset.admissionid.isin(AF_temp_db.date_corresponds_to_AF_admid.unique()))]
AF_NOW = AF_temp_db[AF_temp_db.date_corresponds_to_AF_admid.isin(AF_1_temp_db.admissionid.values)].append(AF_1_temp_db).reset_index(drop=True)
    

In [177]:
from sklearn.model_selection import train_test_split

train_patientid_mimic,test_patientid_mimic = train_test_split(AF_NOW_mimic[AF_NOW_mimic.AF==1].admissionid.unique(), test_size=0.2, random_state=42)
train_patientid,test_patientid = train_test_split(AF_NOW[AF_NOW.AF==1].admissionid.unique(), test_size=0.2, random_state=42)

train_AF_dataset = AF_NOW[AF_NOW.date_corresponds_to_AF_admid.isin(train_patientid)]
test_AF_dataset = AF_NOW_mimic[AF_NOW_mimic.date_corresponds_to_AF_admid.isin(test_patientid_mimic)]
test_AF_dataset_amst = AF_NOW[AF_NOW.date_corresponds_to_AF_admid.isin(test_patientid)]

In [178]:
mimic_drop_columns = ['patientid', 'hadm_id', 'admittime', 'AF_measuredat', 'intime',
       'outtime', 'admissionid', 'lengthofstay']

drop_columns = ["dateofdeath_delta","admittedat_delta","admissionid","origin",
                "lengthofstay","destination","weightgroup","agegroup","dateofdeath",
                "admittedat","heightgroup","specialty","dateofdeath_delta","admittedat_delta","weightsource","dischargedat","heightsource",
                "gender","Mortality","AF_orig","AF_measuredat","AF","new_onset_AF","Preadmission_AF","patientid","location","admissionyeargroup"]

mimic_drop_columns.append('date_corresponds_to_AF_admid')
drop_columns.append('date_corresponds_to_AF_admid')

In [179]:
X_train_or = train_AF_dataset#[train_AF_dataset.columns.drop(drop_columns)]
X_test_or = test_AF_dataset#[test_AF_dataset.columns.drop(mimic_drop_columns)]
y_train = train_AF_dataset.AF
y_test = test_AF_dataset.AF

In [180]:
X_train_or = train_AF_dataset[(train_AF_dataset.AF==1)|((train_AF_dataset.AF_orig==0)&(train_AF_dataset.AF==0))]#[train_AF_dataset.columns.drop(drop_columns)]
X_test_or = test_AF_dataset[(test_AF_dataset.AF==1)|((test_AF_dataset.AF_orig==0)&(test_AF_dataset.AF==0))]#[test_AF_dataset.columns.drop(mimic_drop_columns)]
y_train = train_AF_dataset[(train_AF_dataset.AF==1)|((train_AF_dataset.AF_orig==0)&(train_AF_dataset.AF==0))].AF
y_test = test_AF_dataset[(test_AF_dataset.AF==1)|((test_AF_dataset.AF_orig==0)&(test_AF_dataset.AF==0))].AF

X_test_or_amst = test_AF_dataset_amst[(test_AF_dataset_amst.AF==1)|((test_AF_dataset_amst.AF_orig==0)&(test_AF_dataset_amst.AF==0))]#[test_AF_dataset.columns.drop(mimic_drop_columns)]
y_test_amst = test_AF_dataset_amst[(test_AF_dataset_amst.AF==1)|((test_AF_dataset_amst.AF_orig==0)&(test_AF_dataset_amst.AF==0))].AF

### Testing

In [181]:
if one_half_hour_model:
    feature_columns = ['Age',
        'mean_Lactaat (bloed)',
        'slope_Hartfrequentie',
        'mean_UrineCAD',
        'mean_O2-Saturatie (bloed)',
        'max_PO2 (bloed)',
        'mean_O2 concentratie (Set)',
        'urgency',
        "min_Thrombo's (bloed)",
        'is_given_Noradrenaline (Norepinefrine)',
        'min_Ureum (bloed)',
        'is_given_Furosemide (Lasix)',
        'min_Act.HCO3 (bloed)',
        'Weight',
        'mean_PEEP (Set)',
        'fluid_balance',
        'mean_CVD',
        'slope_ABP systolisch',
        'min_pH (bloed)',
        'slope_ABP gemiddeld',
        ]

elif six_hour_model:
    feature_columns = ['Age',
        'slope_Hartfrequentie',
        'is_given_Noradrenaline (Norepinefrine)',
        'mean_Lactaat (bloed)',
        'min_UrineCAD',
        'mean_O2 concentratie (Set)',
        'max_PO2 (bloed)',
        'min_CVD',
        'is_given_Furosemide (Lasix)',
        'fluid_balance',
        'min_ABP gemiddeld',
        'mean_PEEP (Set)',
        'mean_Act.HCO3 (bloed)',
        'min_ABP systolisch',
        'urgency',
        'slope_B.E. (bloed)']

else:
    feature_columns = ['Age',
        'slope_Hartfrequentie',
        'mean_O2 concentratie (Set)',
        'mean_UrineCAD',
        'max_PO2 (bloed)',
        'max_CVD',
        'mean_Lactaat (bloed)',
        'min_pH (bloed)',
        'mean_O2-Saturatie (bloed)',
        'urgency',
        'mean_PEEP (Set)']


X_train=X_train_or[feature_columns]    
X_test=X_test_or[feature_columns]   

X_test_amst = X_test_or_amst[feature_columns]

In [182]:
AF_class_balance = [y_train.sum()/len(y_train),1-y_train.sum()/len(y_train)]
if one_half_hour_model:
    CB_AF = CatBoostClassifier(
        verbose=100, iterations=300, l2_leaf_reg=6, class_weights=AF_class_balance)
elif six_hour_model:
    CB_AF = CatBoostClassifier(
        verbose=100, iterations=400, depth=5, l2_leaf_reg=2, class_weights=AF_class_balance)
else:
    CB_AF = CatBoostClassifier(
        verbose=100, iterations=300, depth=4, l2_leaf_reg=2, class_weights=AF_class_balance)

CB_AF.fit(X_train,y_train)

0:	learn: 0.6866957	total: 1.72ms	remaining: 686ms
100:	learn: 0.5270815	total: 151ms	remaining: 448ms
200:	learn: 0.4859056	total: 299ms	remaining: 296ms
300:	learn: 0.4537133	total: 446ms	remaining: 147ms
399:	learn: 0.4199914	total: 588ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x16e7701dd30>

In [183]:
print(classification_report(y_train,CB_AF.predict(X_train),target_names=["No-AF","AF"]))
print("\n")
(fpr, tpr, thresholds) = get_roc_curve(CB_AF, Pool(data=X_train,label=y_train), plot=False)
print("AUC = "+str(np.round(auc(fpr,tpr),4)))
print("MCC = "+str(np.round(matthews_corrcoef(y_train,CB_AF.predict(X_train)),4)))
print(50*"=")

              precision    recall  f1-score   support

       No-AF       0.84      0.81      0.82      1198
          AF       0.81      0.84      0.83      1198

    accuracy                           0.82      2396
   macro avg       0.82      0.82      0.82      2396
weighted avg       0.82      0.82      0.82      2396



AUC = 0.9142
MCC = 0.6489


In [184]:
print("BALANCED TEST PATIENTS")
print(classification_report(y_test,CB_AF.predict(X_test),target_names=["No-AF","AF"]))
print("\n")
(fpr, tpr, thresholds) = get_roc_curve(CB_AF, Pool(data=X_test,label=y_test), plot=False)
print("AUC = "+str(np.round(auc(fpr,tpr),4)))
print("MCC = "+str(np.round(matthews_corrcoef(y_test,CB_AF.predict(X_test)),4)))
print(50*"=")

BALANCED TEST PATIENTS
              precision    recall  f1-score   support

       No-AF       0.61      0.83      0.71       981
          AF       0.74      0.48      0.58       981

    accuracy                           0.65      1962
   macro avg       0.68      0.65      0.64      1962
weighted avg       0.68      0.65      0.64      1962



AUC = 0.7429
MCC = 0.3309


In [185]:
print("BALANCED amst TEST PATIENTS")
print(classification_report(y_test_amst,CB_AF.predict(X_test_amst),target_names=["No-AF","AF"]))
print("\n")
(fpr, tpr, thresholds) = get_roc_curve(CB_AF, Pool(data=X_test_amst,label=y_test_amst), plot=False)
print("AUC = "+str(np.round(auc(fpr,tpr),4)))
print("MCC = "+str(np.round(matthews_corrcoef(y_test_amst,CB_AF.predict(X_test_amst)),4)))
print(50*"=")

BALANCED amst TEST PATIENTS
              precision    recall  f1-score   support

       No-AF       0.71      0.68      0.69       300
          AF       0.69      0.72      0.71       300

    accuracy                           0.70       600
   macro avg       0.70      0.70      0.70       600
weighted avg       0.70      0.70      0.70       600



AUC = 0.7697
MCC = 0.4003


In [186]:
direct_y_test = y_test
direct_predict = CB_AF.predict(X_test)
AMST_y_test = y_test_amst
AMST_predict = CB_AF.predict(X_test_amst)

In [187]:
# plt.hist(CB_AF.predict(X_test)[np.where(y_test==0)],density=True)

In [188]:
temp_large_db = AF_dataset_mimic[((AF_dataset_mimic.AF==0)&(~AF_dataset_mimic.patientid.isin(AF_NOW_mimic.patientid)))|(AF_dataset_mimic.patientid.isin(test_AF_dataset.patientid))]

y_test_all_pat = temp_large_db[(temp_large_db.AF==1)|((temp_large_db.AF_orig==0)&(temp_large_db.AF==0))].AF
X_test_all_pat = temp_large_db[(temp_large_db.AF==1)|((temp_large_db.AF_orig==0)&(temp_large_db.AF==0))][feature_columns]


print("ALL PATIENTS")
print(classification_report(y_test_all_pat,CB_AF.predict(X_test_all_pat),target_names=["No-AF","AF"]))
print("\n")
(fpr, tpr, thresholds) = get_roc_curve(CB_AF, Pool(data=X_test_all_pat,label=y_test_all_pat), plot=False)
print("AUC = "+str(np.round(auc(fpr,tpr),4)))
print("MCC = "+str(np.round(matthews_corrcoef(y_test_all_pat,CB_AF.predict(X_test_all_pat)),4)))
print(50*"=")

mimic_result_dataframe = pandas_result_AF_maker("all patients",X_test_all_pat,y_test_all_pat,CB_AF,mimic_result_dataframe)


print("BALANCED TEST PATIENTS")
print(classification_report(y_test,CB_AF.predict(X_test),target_names=["No-AF","AF"]))
print("\n")
(fpr, tpr, thresholds) = get_roc_curve(CB_AF, Pool(data=X_test,label=y_test), plot=False)
print("AUC = "+str(np.round(auc(fpr,tpr),4)))
print("MCC = "+str(np.round(matthews_corrcoef(y_test,CB_AF.predict(X_test)),4)))
print(50*"=")

mimic_result_dataframe = pandas_result_AF_maker("balanced test",X_test,y_test,CB_AF,mimic_result_dataframe)

ALL PATIENTS
              precision    recall  f1-score   support

       No-AF       0.98      0.81      0.89     45814
          AF       0.06      0.46      0.10      1144

    accuracy                           0.81     46958
   macro avg       0.52      0.64      0.50     46958
weighted avg       0.96      0.81      0.87     46958



AUC = 0.7132
MCC = 0.1073
BALANCED TEST PATIENTS
              precision    recall  f1-score   support

       No-AF       0.61      0.83      0.71       981
          AF       0.74      0.48      0.58       981

    accuracy                           0.65      1962
   macro avg       0.68      0.65      0.64      1962
weighted avg       0.68      0.65      0.64      1962



AUC = 0.7429
MCC = 0.3309


# P-tests

In [189]:
mimic_result_dataframe

Unnamed: 0,model,validation_type,patient_group,NO AF patients,AF patients,NO AF recall,AF recall,NO AF precision,AF precision,NO AF f1,AF f1,mcc,auc
0,1.5,transfer,all patients,52637,1365,0.681137,0.70989,0.989076,0.054582,0.806719,0.10137,0.130658,0.768164
0,1.5,transfer,balanced test,1393,1365,0.661881,0.70989,0.699545,0.672917,0.680192,0.690909,0.372116,0.758558
0,1.5,combo,all patients mimic,52637,1365,0.684765,0.70696,0.989024,0.054961,0.809241,0.101992,0.131263,0.768894
0,1.5,combo,balanced test mimic,1393,1365,0.677674,0.70696,0.702381,0.682461,0.689806,0.694494,0.384738,0.758897
0,1.5,combo,all patients amsterdam,13191,344,0.646805,0.75,0.990021,0.052471,0.782429,0.09808,0.12985,0.795181
0,1.5,combo,balanced test amsterdam,1393,1365,0.677674,0.70696,0.702381,0.682461,0.689806,0.694494,0.384738,0.758897
0,1.5,direct,all patients,45349,1251,0.726874,0.589928,0.984676,0.056233,0.836359,0.102678,0.113841,0.735678
0,1.5,direct,balanced test,1070,1070,0.76729,0.602804,0.658909,0.721477,0.708981,0.656823,0.375204,0.760183
0,12.0,transfer,all patients,47109,1047,0.665839,0.724928,0.990902,0.045997,0.796481,0.086506,0.120079,0.755526
0,12.0,transfer,balanced test,1044,1047,0.667625,0.724928,0.707614,0.686257,0.687038,0.705063,0.393211,0.760111


In [190]:
# transfer_y_test = y_test
# transfer_predict = CB_AF.predict(X_test)

# combo_mimic_y_test = y_test_mimic
# combo_mimic_predict = CB_AF.predict(X_test_mimic)
# combo_amst_y_test = y_test_amst
# combo_amst_predict = CB_AF.predict(X_test_amst)

# direct_y_test = y_test
# direct_predict = CB_AF.predict(X_test)
# AMST_y_test = y_test_amst
# AMST_predict = CB_AF.predict(X_test_amst)

In [191]:
#bonferri correction: alpha/m, with m the amount of total tests
from scipy.stats import ranksums

# print("p-value comparisons")
# print(100*"=")
# print("")
    

# print("Direct - Internal")
# print("")
# print("Rank-sum p-value: "+ str(ranksums(direct_predict,AMST_predict)[1]))
# print(10*"-")
# print("AF ")
# print("Rank-sum p-value: "+ str(ranksums(direct_predict[np.where(direct_y_test==1)],AMST_predict[np.where(AMST_y_test==1)])[1]))
# print(10*"-")
# print("NO AF")
# print("Rank-sum p-value: "+ str(ranksums(direct_predict[np.where(direct_y_test==0)],AMST_predict[np.where(AMST_y_test==0)])[1]))
# print(40*"=")
# print("")
mimic_p_value_result_dataframe = pd.concat([mimic_p_value_result_dataframe,pd.DataFrame(data=[[model_string,"Direct - Internal","all",ranksums(direct_predict,AMST_predict)[1]]],columns=mimic_p_value_result_dataframe_columns)])
mimic_p_value_result_dataframe = pd.concat([mimic_p_value_result_dataframe,pd.DataFrame(data=[[model_string,"Direct - Internal","AF",ranksums(direct_predict[np.where(direct_y_test==1)],AMST_predict[np.where(AMST_y_test==1)])[1]]],columns=mimic_p_value_result_dataframe_columns)])
mimic_p_value_result_dataframe = pd.concat([mimic_p_value_result_dataframe,pd.DataFrame(data=[[model_string,"Direct - Internal","NO AF",ranksums(direct_predict[np.where(direct_y_test==0)],AMST_predict[np.where(AMST_y_test==0)])[1]]],columns=mimic_p_value_result_dataframe_columns)])


# print("Combo mimic - Internal ")
# print("")
# print("Rank-sum p-value: "+ str(ranksums(combo_mimic_predict,AMST_predict)[1]))
# print(10*"-")
# print("AF ")
# print("Rank-sum p-value: "+ str(ranksums(combo_mimic_predict[np.where(combo_mimic_y_test==1)],AMST_predict[np.where(AMST_y_test==1)])[1]))
# print(10*"-")
# print("NO AF")
# print("Rank-sum p-value: "+ str(ranksums(combo_mimic_predict[np.where(combo_mimic_y_test==0)],AMST_predict[np.where(AMST_y_test==0)])[1]))
# print(40*"=")
# print("")
mimic_p_value_result_dataframe = pd.concat([mimic_p_value_result_dataframe,pd.DataFrame(data=[[model_string,"Combo mimic - Internal","all",ranksums(combo_mimic_predict,AMST_predict)[1]]],columns=mimic_p_value_result_dataframe_columns)])
mimic_p_value_result_dataframe = pd.concat([mimic_p_value_result_dataframe,pd.DataFrame(data=[[model_string,"Combo mimic - Internal","AF",ranksums(combo_mimic_predict[np.where(combo_mimic_y_test==1)],AMST_predict[np.where(AMST_y_test==1)])[1]]],columns=mimic_p_value_result_dataframe_columns)])
mimic_p_value_result_dataframe = pd.concat([mimic_p_value_result_dataframe,pd.DataFrame(data=[[model_string,"Combo mimic - Internal","NO AF",ranksums(combo_mimic_predict[np.where(combo_mimic_y_test==0)],AMST_predict[np.where(AMST_y_test==0)])[1]]],columns=mimic_p_value_result_dataframe_columns)])


# print("Combo amst - Internal ")
# print("")
# print("Rank-sum p-value: "+ str(ranksums(combo_amst_predict,AMST_predict)[1]))
# print(10*"-")
# print("AF ")
# print("Rank-sum p-value: "+ str(ranksums(combo_amst_predict[np.where(combo_amst_y_test==1)],AMST_predict[np.where(AMST_y_test==1)])[1]))
# print(10*"-")
# print("NO AF")
# print("Rank-sum p-value: "+ str(ranksums(combo_amst_predict[np.where(combo_amst_y_test==0)],AMST_predict[np.where(AMST_y_test==0)])[1]))
# print(40*"=")
# print("")
mimic_p_value_result_dataframe = pd.concat([mimic_p_value_result_dataframe,pd.DataFrame(data=[[model_string,"Combo amst - Internal","all",ranksums(combo_amst_predict,AMST_predict)[1]]],columns=mimic_p_value_result_dataframe_columns)])
mimic_p_value_result_dataframe = pd.concat([mimic_p_value_result_dataframe,pd.DataFrame(data=[[model_string,"Combo amst - Internal","AF",ranksums(combo_amst_predict[np.where(combo_amst_y_test==1)],AMST_predict[np.where(AMST_y_test==1)])[1]]],columns=mimic_p_value_result_dataframe_columns)])
mimic_p_value_result_dataframe = pd.concat([mimic_p_value_result_dataframe,pd.DataFrame(data=[[model_string,"Combo amst - Internal","NO AF",ranksums(combo_amst_predict[np.where(combo_amst_y_test==0)],AMST_predict[np.where(AMST_y_test==0)])[1]]],columns=mimic_p_value_result_dataframe_columns)])


# print("Transfer - Internal ")
# print("")
# print("Rank-sum p-value: "+ str(ranksums(transfer_predict,AMST_predict)[1]))
# print(10*"-")
# print("AF ")
# print("Rank-sum p-value: "+ str(ranksums(transfer_predict[np.where(transfer_y_test==1)],AMST_predict[np.where(AMST_y_test==1)])[1]))
# print(10*"-")
# print("NO AF")
# print("Rank-sum p-value: "+ str(ranksums(transfer_predict[np.where(transfer_y_test==0)],AMST_predict[np.where(AMST_y_test==0)])[1]))
# print(40*"=")
# print("")
mimic_p_value_result_dataframe = pd.concat([mimic_p_value_result_dataframe,pd.DataFrame(data=[[model_string,"Transfer - Internal","all",ranksums(transfer_predict,AMST_predict)[1]]],columns=mimic_p_value_result_dataframe_columns)])
mimic_p_value_result_dataframe = pd.concat([mimic_p_value_result_dataframe,pd.DataFrame(data=[[model_string,"Transfer - Internal","AF",ranksums(transfer_predict[np.where(transfer_y_test==1)],AMST_predict[np.where(AMST_y_test==1)])[1]]],columns=mimic_p_value_result_dataframe_columns)])
mimic_p_value_result_dataframe = pd.concat([mimic_p_value_result_dataframe,pd.DataFrame(data=[[model_string,"Transfer - Internal","NO AF",ranksums(transfer_predict[np.where(transfer_y_test==0)],AMST_predict[np.where(AMST_y_test==0)])[1]]],columns=mimic_p_value_result_dataframe_columns)])


# print("Transfer - Combo mimic ")
# print("")
# print("Rank-sum p-value: "+ str(ranksums(transfer_predict,combo_mimic_predict)[1]))
# print(10*"-")
# print("AF ")
# print("Rank-sum p-value: "+ str(ranksums(transfer_predict[np.where(transfer_y_test==1)],combo_mimic_predict[np.where(combo_mimic_y_test==1)])[1]))
# print(10*"-")
# print("NO AF")
# print("Rank-sum p-value: "+ str(ranksums(transfer_predict[np.where(transfer_y_test==0)],combo_mimic_predict[np.where(combo_mimic_y_test==0)])[1]))
# print(40*"=")
# print("")
mimic_p_value_result_dataframe = pd.concat([mimic_p_value_result_dataframe,pd.DataFrame(data=[[model_string,"Transfer - Combo mimic","all",ranksums(transfer_predict,combo_mimic_predict)[1]]],columns=mimic_p_value_result_dataframe_columns)])
mimic_p_value_result_dataframe = pd.concat([mimic_p_value_result_dataframe,pd.DataFrame(data=[[model_string,"Transfer - Combo mimic","AF",ranksums(transfer_predict[np.where(transfer_y_test==1)],combo_mimic_predict[np.where(combo_mimic_y_test==1)])[1]]],columns=mimic_p_value_result_dataframe_columns)])
mimic_p_value_result_dataframe = pd.concat([mimic_p_value_result_dataframe,pd.DataFrame(data=[[model_string,"Transfer - Combo mimic","NO AF",ranksums(transfer_predict[np.where(transfer_y_test==0)],combo_mimic_predict[np.where(combo_mimic_y_test==0)])[1]]],columns=mimic_p_value_result_dataframe_columns)])


# print("Transfer - Direct ")
# print("")
# print("Rank-sum p-value: "+ str(ranksums(transfer_predict,direct_predict)[1]))
# print(10*"-")
# print("AF ")
# print("Rank-sum p-value: "+ str(ranksums(transfer_predict[np.where(transfer_y_test==1)],direct_predict[np.where(direct_y_test==1)])[1]))
# print(10*"-")
# print("NO AF")
# print("Rank-sum p-value: "+ str(ranksums(transfer_predict[np.where(transfer_y_test==0)],direct_predict[np.where(direct_y_test==0)])[1]))
# print(40*"=")
# print("")
mimic_p_value_result_dataframe = pd.concat([mimic_p_value_result_dataframe,pd.DataFrame(data=[[model_string,"Transfer - Direct","all",ranksums(transfer_predict,direct_predict)[1]]],columns=mimic_p_value_result_dataframe_columns)])
mimic_p_value_result_dataframe = pd.concat([mimic_p_value_result_dataframe,pd.DataFrame(data=[[model_string,"Transfer - Direct","AF",ranksums(transfer_predict[np.where(transfer_y_test==1)],direct_predict[np.where(direct_y_test==1)])[1]]],columns=mimic_p_value_result_dataframe_columns)])
mimic_p_value_result_dataframe = pd.concat([mimic_p_value_result_dataframe,pd.DataFrame(data=[[model_string,"Transfer - Direct","NO AF",ranksums(transfer_predict[np.where(transfer_y_test==0)],direct_predict[np.where(direct_y_test==0)])[1]]],columns=mimic_p_value_result_dataframe_columns)])


# print("Combo mimic - Direct ")
# print("")
# print("Rank-sum p-value: "+ str(ranksums(combo_mimic_predict,direct_predict)[1]))
# print(10*"-")
# print("AF ")
# print("Rank-sum p-value: "+ str(ranksums(combo_mimic_predict[np.where(combo_mimic_y_test==1)],direct_predict[np.where(direct_y_test==1)])[1]))
# print(10*"-")
# print("NO AF")
# print("Rank-sum p-value: "+ str(ranksums(combo_mimic_predict[np.where(combo_mimic_y_test==0)],direct_predict[np.where(direct_y_test==0)])[1]))
# print(40*"=")
mimic_p_value_result_dataframe = pd.concat([mimic_p_value_result_dataframe,pd.DataFrame(data=[[model_string,"Combo mimic - Direct","all",ranksums(combo_mimic_predict,direct_predict)[1]]],columns=mimic_p_value_result_dataframe_columns)])
mimic_p_value_result_dataframe = pd.concat([mimic_p_value_result_dataframe,pd.DataFrame(data=[[model_string,"Combo mimic - Direct","AF",ranksums(combo_mimic_predict[np.where(combo_mimic_y_test==1)],direct_predict[np.where(direct_y_test==1)])[1]]],columns=mimic_p_value_result_dataframe_columns)])
mimic_p_value_result_dataframe = pd.concat([mimic_p_value_result_dataframe,pd.DataFrame(data=[[model_string,"Combo mimic - Direct","NO AF",ranksums(combo_mimic_predict[np.where(combo_mimic_y_test==0)],direct_predict[np.where(direct_y_test==0)])[1]]],columns=mimic_p_value_result_dataframe_columns)])



In [192]:
mimic_p_value_result_dataframe

Unnamed: 0,model,comparison,AF_group,p_value
0,1.5,Direct - Internal,all,4.089310e-06
0,1.5,Direct - Internal,AF,5.840737e-05
0,1.5,Direct - Internal,NO AF,1.260765e-02
0,1.5,Combo mimic - Internal,all,3.341312e-01
0,1.5,Combo mimic - Internal,AF,2.278346e-01
...,...,...,...,...
0,6,Transfer - Direct,AF,8.460366e-17
0,6,Transfer - Direct,NO AF,5.418498e-11
0,6,Combo mimic - Direct,all,1.834078e-26
0,6,Combo mimic - Direct,AF,8.460366e-17


In [193]:
# mimic_p_value_result_dataframe.to_csv("../results/mimic_p_value_result_dataframe.csv",index=False)
# mimic_result_dataframe.to_csv("../results/mimic_result_dataframe.csv",index=False)

# Patient Demographics MIMIC 

In [4]:
admissions_pd_mimic = pd.read_csv(mimic_extracted_path+"df_adm_icu.csv",sep=',')
admissions_pd_mimic = admissions_pd_mimic.rename(columns={"subject_id":"patientid","stay_id":"admissionid","los":"lengthofstay"})
admissions_pd_mimic.loc[:,"intime"] = pd.to_datetime(admissions_pd_mimic.intime)
admissions_pd_mimic.loc[:,"outtime"] = pd.to_datetime(admissions_pd_mimic.outtime)
admissions_pd_mimic.loc[:,"lengthofstay"]=admissions_pd_mimic.lengthofstay*24
admissions_pd_mimic.loc[admissions_pd_mimic.AF_measuredat=="0","AF_measuredat"]=0.0
admissions_pd_mimic.loc[admissions_pd_mimic.AF_measuredat!=0,"AF_measuredat"]=(pd.to_datetime(admissions_pd_mimic[admissions_pd_mimic.AF_measuredat!=0].AF_measuredat)-pd.to_datetime(admissions_pd_mimic[admissions_pd_mimic.AF_measuredat!=0].intime)).dt.total_seconds()/60/60
admissions_pd_mimic["AF_measuredat"] = np.float32(admissions_pd_mimic["AF_measuredat"])
admissions_pd_mimic = admissions_pd_mimic[admissions_pd_mimic.lengthofstay>=12]

weight_csv = pd.read_csv(mimic_base_path+"icu/csv/chartevents/"+str(226512)+".csv")[["stay_id","valuenum"]]
weight_csv = weight_csv.rename(columns={"subject_id":"patientid","stay_id":"admissionid","valuenum":"weight"})
height_csv = pd.read_csv(mimic_base_path+"icu/csv/chartevents/"+str(226730)+".csv")[["stay_id","valuenum"]]
height_csv = height_csv.rename(columns={"subject_id":"patientid","stay_id":"admissionid","valuenum":"height"})


admissions_pd_mimic = admissions_pd_mimic.merge(height_csv,how='left',on='admissionid')
admissions_pd_mimic.loc[(admissions_pd_mimic.height<50)|(admissions_pd_mimic.height>250)]=int(admissions_pd_mimic[(admissions_pd_mimic.height<50)|(admissions_pd_mimic.height>250)].height.mean())
admissions_pd_mimic = admissions_pd_mimic.merge(weight_csv,how='left',on='admissionid')
admissions_pd_mimic.loc[(admissions_pd_mimic.weight<20)|(admissions_pd_mimic.weight>500)]=int(admissions_pd_mimic[(admissions_pd_mimic.weight<20)|(admissions_pd_mimic.weight>500)].weight.mean())
admissions_pd_mimic["BMI"]=admissions_pd_mimic.weight/((admissions_pd_mimic.height/100)**2)
    
df_patients = pd.read_csv(mimic_base_path+"core/csv/patients.csv",sep=',')
df_patients = df_patients.rename(columns={"subject_id":"patientid","anchor_age":"Age"})

admissions_pd_mimic = admissions_pd_mimic.merge(df_patients[['patientid','Age','gender']],how='left',on='patientid')
#If no gender was specified, the value is -1.
gender_category = {'M':1,"F":0,"":-1}

#Replace the strings with numbers
admissions_pd_mimic["gender"]=admissions_pd_mimic["gender"].replace(gender_category)

admissions_pd_mimic = admissions_pd_mimic[(admissions_pd_mimic.AF==0)|((admissions_pd_mimic.AF==1)&(admissions_pd_mimic.AF_measuredat>=12))]

print(len(admissions_pd_mimic.patientid.unique()))

46234


In [5]:
sofa_mimic_csv = pd.read_csv(mimic_base_path+"mimic_sofa.csv")[["stay_id","sofa_24hours","starttime","endtime"]]
sofa_mimic_csv = sofa_mimic_csv.rename(columns={"stay_id":"admissionid","sofa_24hours":"sofa"})
sofa_mimic_csv.loc[:,"starttime"]=pd.to_datetime(sofa_mimic_csv.starttime)
sofa_mimic_csv.loc[:,"endtime"]=pd.to_datetime(sofa_mimic_csv.endtime)
sofa_mimic_csv = sofa_mimic_csv.merge(admissions_pd_mimic[["admissionid","patientid","intime","outtime","AF_measuredat","AF"]],how="inner",on="admissionid")
sofa_mimic_csv["time_to_adm"] = (pd.to_datetime(sofa_mimic_csv.starttime)-pd.to_datetime(sofa_mimic_csv.intime)).dt.total_seconds()/60/60
sofa_mean_day_mimic = sofa_mimic_csv[sofa_mimic_csv.time_to_adm<=24][["patientid","admissionid","sofa"]].groupby(["patientid","admissionid"]).mean().reset_index()
sofa_mean_day_mimic = sofa_mean_day_mimic.rename(columns={"sofa":"sofa_first24h"})
sofa_AF_mimic = sofa_mimic_csv[(sofa_mimic_csv.AF==1)&(sofa_mimic_csv.AF_measuredat - sofa_mimic_csv.time_to_adm <= 1)&(sofa_mimic_csv.AF_measuredat - sofa_mimic_csv.time_to_adm >= 0)][["patientid","admissionid","sofa"]].groupby(["patientid","admissionid"]).mean().reset_index()
sofa_AF_mimic = sofa_AF_mimic.rename(columns={"sofa":"sofa_AF_matched"})
admissions_pd_mimic = admissions_pd_mimic.merge(sofa_mean_day_mimic,how='left',on=['patientid','admissionid'])
admissions_pd_mimic = admissions_pd_mimic.merge(sofa_AF_mimic,how='left',on=['patientid','admissionid'])

In [6]:
admissions_pd_amst = pd.read_csv(amsterdam_data_path + "admissions_demographics.csv")
admissions_pd_amst = admissions_pd_amst[admissions_pd_amst.lengthofstay>=12]
admissions_pd_amst.loc[admissions_pd_amst.Weight==0] = admissions_pd_amst[admissions_pd_amst.Weight>0].Weight.mean()
admissions_pd_amst.loc[admissions_pd_amst.Height==0] = admissions_pd_amst[admissions_pd_amst.Height>0].Height.mean()
admissions_pd_amst["BMI"]=admissions_pd_amst.Weight/((admissions_pd_amst.Height/100)**2)
admissions_pd_amst["AF_measuredat"] = admissions_pd_amst["AF_measuredat"]/1000/60/60
admissions_pd_amst = admissions_pd_amst[(admissions_pd_amst.AF==0)|((admissions_pd_amst.AF==1)&(admissions_pd_amst.AF_measuredat>=12))]

print(len(admissions_pd_amst.patientid.unique()))

amst_sofa = pd.read_csv(amsterdam_data_path + "Data/sofa.csv",sep=',')
amst_apache_II = pd.read_csv(amsterdam_data_path + "Data/apache_ii.csv",sep=',')
admissions_pd_amst = admissions_pd_amst.merge(amst_sofa[["admissionid","sofa_total_score"]],how='left',on='admissionid')
admissions_pd_amst = admissions_pd_amst.merge(amst_apache_II[["admissionid","apache_ii_total_score"]],how='left',on='admissionid')

16995


In [7]:
print(len(admissions_pd_mimic.patientid.unique()))
print(len(admissions_pd_amst.patientid.unique()))
print(len(admissions_pd_mimic[admissions_pd_mimic.AF==1].patientid.unique()))
print(len(admissions_pd_amst[admissions_pd_amst.AF==1].patientid.unique()))
print(len(admissions_pd_mimic[admissions_pd_mimic.AF==0].patientid.unique()))
print(len(admissions_pd_amst[admissions_pd_amst.AF==0].patientid.unique()))

46234
16995
5165
1868
42638
15450


In [63]:
print("AMSTERDAM COLUMNS")
print("")
print(admissions_pd_amst.columns)
print(50*"-")
print("")
print("MIMIC COLUMNS")
print("")
print(admissions_pd_mimic.columns)

AMSTERDAM COLUMNS

Index(['patientid', 'admissionid', 'admissioncount', 'location', 'urgency',
       'origin', 'admittedat', 'admissionyeargroup', 'dischargedat',
       'lengthofstay', 'destination', 'gender', 'agegroup', 'dateofdeath',
       'weightgroup', 'weightsource', 'heightgroup', 'heightsource',
       'specialty', 'dateofdeath_delta', 'admittedat_delta', 'Height', 'Age',
       'Weight', 'Gender_category', 'Mortality', 'sepsis_bool', 'neuro_bool',
       'cardiac_surg_bool', 'cardio_surgery_new', 'AF', 'new_onset_AF',
       'Preadmission_AF', 'AF_measuredat', 'BMI', 'sofa_total_score',
       'apache_ii_total_score'],
      dtype='object')
--------------------------------------------------

MIMIC COLUMNS

Index(['patientid', 'hadm_id', 'admittime', 'dischtime', 'deathtime',
       'admission_type', 'admission_location', 'discharge_location',
       'insurance', 'language', 'marital_status', 'ethnicity', 'edregtime',
       'edouttime', 'hospital_expire_flag', 'admissionid'

In [80]:
admissions_pd_mimic[admissions_pd_mimic.AF==1].admission_type.value_counts()

EW EMER.                       2325
URGENT                         1292
SURGICAL SAME DAY ADMISSION     695
OBSERVATION ADMIT               623
ELECTIVE                        362
DIRECT EMER.                    240
EU OBSERVATION                    2
Name: admission_type, dtype: int64

In [81]:
(1292+623)/len(admissions_pd_mimic[admissions_pd_mimic.AF==1])*100

12.54739122585304

In [75]:
admissions_pd_amst[admissions_pd_amst.AF==0].specialty.value_counts()

Cardiochirurgie                 6172
Neurochirurgie                  2092
Vaatchirurgie                   1019
Traumatologie                    931
Inwendig                         864
0                                815
Heelkunde Gastro-enterologie     793
Cardiologie                      733
Intensive Care Volwassenen       702
Neurologie                       578
Heelkunde Oncologie              374
Longziekte                       310
Nefrologie                       288
Keel, Neus & Oorarts             284
Heelkunde Longen/Oncologie       266
Urologie                         145
Hematologie                      143
Orthopedie                       134
Gynaecologie                     132
Maag-,Darm-,Leverziekten         119
Oncologie Inwendig                64
ders                              45
Plastische chirurgie              32
Mondheelkunde                     17
Verloskunde                       11
Obstetrie                         10
Oogheelkunde                       2
R

In [77]:
(6172+2092+1019+32)/len(admissions_pd_amst[admissions_pd_amst.AF==0])*100

54.55024595924104

In [266]:
admissions_pd_mimic.admission_type.value_counts()

EW EMER.                       33444
URGENT                         10756
OBSERVATION ADMIT               7707
SURGICAL SAME DAY ADMISSION     6761
DIRECT EMER.                    2416
ELECTIVE                        2195
EU OBSERVATION                   293
DIRECT OBSERVATION               113
AMBULATORY OBSERVATION            12
Name: admission_type, dtype: int64

In [256]:
admissions_pd_mimic[admissions_pd_mimic.AF>=0].drop_duplicates("patientid").ethnicity.value_counts()

WHITE                            30782
UNKNOWN                           4956
BLACK/AFRICAN AMERICAN            4437
OTHER                             2259
HISPANIC/LATINO                   1705
ASIAN                             1365
UNABLE TO OBTAIN                   652
AMERICAN INDIAN/ALASKA NATIVE       78
Name: ethnicity, dtype: int64

In [18]:
print(admissions_pd_amst[admissions_pd_amst.AF==0].cardio_surgery_new.sum())
print(admissions_pd_amst[admissions_pd_amst.AF==0].sepsis_bool.sum())
admissions_pd_amst[admissions_pd_amst.AF==0].drop_duplicates("patientid").describe()

4532.0
1835.0


Unnamed: 0,patientid,admissionid,admissioncount,urgency,admittedat,dischargedat,lengthofstay,dateofdeath,Height,Age,Weight,Gender_category,Mortality,sepsis_bool,neuro_bool,cardiac_surg_bool,cardio_surgery_new,AF,new_onset_AF,Preadmission_AF,AF_measuredat,BMI,sofa_total_score,apache_ii_total_score
count,15450.0,15450.0,15450.0,15450.0,15450.0,15450.0,15450.0,15450.0,15450.0,15450.0,15450.0,15450.0,15450.0,15450.0,15450.0,15450.0,15450.0,15450.0,15450.0,15450.0,0.0,15450.0,15450.0,15450.0
mean,10177.41055,11787.882783,1.019482,0.26,468266900.0,735690300.0,74.293981,18204640000.0,175.159871,61.450485,80.352751,0.651586,0.25534,0.108414,0.025696,0.315275,0.287249,0.0,0.0,0.0,,26.177913,5.633657,16.271942
std,5867.151499,6801.20593,0.148162,0.438648,8272208000.0,8288313000.0,151.841983,56297360000.0,9.51645,14.379398,14.567641,0.476483,0.436066,0.310913,0.158231,0.46464,0.452493,0.0,0.0,0.0,,4.30828,3.320151,6.555675
min,0.0,0.0,1.0,0.0,0.0,39780000.0,12.0,-138294400000.0,155.0,35.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,16.070124,0.0,0.0
25%,5106.25,5891.25,1.0,0.0,0.0,72900000.0,20.0,0.0,165.0,55.0,75.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,23.875115,3.0,12.0
50%,10174.5,11786.5,1.0,0.0,0.0,86580000.0,24.0,0.0,175.0,65.0,75.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,24.835646,5.0,15.5
75%,15277.75,17694.75,1.0,1.0,0.0,211185000.0,54.75,26730000.0,185.0,75.0,85.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,,27.757487,7.0,19.5
max,20326.0,23552.0,4.0,1.0,302560300000.0,302639000000.0,4850.0,451598800000.0,195.0,85.0,115.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,,47.866805,23.0,46.5


In [19]:
print(admissions_pd_mimic[admissions_pd_mimic.AF==0].cardiac_bool_new.sum())
print(admissions_pd_mimic[admissions_pd_mimic.AF==0].sepsis_bool.sum())
admissions_pd_mimic[admissions_pd_mimic.AF==0].drop_duplicates("patientid").describe()

7795
25657


Unnamed: 0,patientid,hadm_id,hospital_expire_flag,admissionid,lengthofstay,admissioncount,AF,AF_measuredat,sepsis_bool,surgery_bool,cardiac_bool,medical_bool,cardiac_bool_new,height,weight,BMI,Age,gender,sofa_first24h,sofa_AF_matched
count,42638.0,42638.0,42638.0,42638.0,42638.0,42638.0,42638.0,42638.0,42638.0,42638.0,42638.0,42638.0,42638.0,20832.0,41618.0,20706.0,42638.0,42638.0,42582.0,0.0
mean,14986130.0,24963660.0,0.079624,34987130.0,73.58019,0.039519,0.0,0.0,0.430602,0.396149,0.260753,0.658098,0.15573,169.192207,81.258499,29.470176,61.242718,0.558141,3.261266,
std,2885215.0,2886143.0,0.270713,2889631.0,101.831461,0.241681,0.0,0.0,0.495166,0.489102,0.439051,0.474352,0.362603,12.854252,22.72901,13.320304,17.134181,0.496614,2.685457,
min,10001220.0,20000150.0,0.0,30000150.0,12.003611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.0,20.5,8.425926,18.0,0.0,0.0,
25%,12472360.0,22464440.0,0.0,32472620.0,26.308472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,163.0,65.9,24.056935,51.0,0.0,1.0,
50%,14982890.0,24931090.0,0.0,34986490.0,43.157361,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,170.0,78.4,27.587326,63.0,1.0,2.68,
75%,17479060.0,27462600.0,0.0,37480960.0,75.436319,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,178.0,93.0,32.098765,74.0,1.0,4.68,
max,19999990.0,29999830.0,1.0,39999810.0,2645.574722,8.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,241.0,345.0,322.265625,91.0,1.0,20.8,


In [12]:
5165*0.692352

3575.99808

In [251]:
admissions_pd_mimic.AF_measuredat

0        0
1        0
2        0
3        0
4        0
        ..
73296    0
73297    0
73298    0
73299    0
73300    0
Name: AF_measuredat, Length: 63697, dtype: object

In [238]:
1-0.519807

0.480193

In [239]:
1868*0.480193

897.0005239999999

In [70]:
from scipy import stats

stats.ttest_ind(admissions_pd_amst[admissions_pd_amst.AF==0].sofa_total_score, admissions_pd_mimic[(admissions_pd_mimic.AF==0)&(~admissions_pd_mimic.sofa_first24h.isna())].sofa_first24h)

Ttest_indResult(statistic=89.04583511565711, pvalue=0.0)

In [177]:
stats.ranksums(admissions_pd_amst[admissions_pd_amst.AF==0].Gender_category, admissions_pd_mimic[admissions_pd_mimic.AF==0].gender)

RanksumsResult(statistic=19.22334668618072, pvalue=2.3604300563401304e-82)