In [52]:
import pandas as pd
import numpy as np
import os
import json
import sys
import argparse

# Load data

In [53]:
# Name of the endpoint column that is merged in from TARGET_dataset.csv.
target = "Conclusion_micro"

path_data = "../../results/00_datasets"
# Create the output directory with the standard library instead of a
# `!mkdir -p` shell escape, so the cell also runs on platforms without a
# POSIX shell and when the notebook is executed as a plain Python script.
os.makedirs("../../results/10_preprocessed", exist_ok=True)

### Read time-dissected datasets
sets = "BL_dataset.csv"

### Load datasets
data = pd.read_csv(os.path.join(path_data, "time_dissection", sets), low_memory=False)
print(f"Reading dataset: {sets} of size {data.shape}")

### Read in TARGET
data_target = pd.read_csv(f"{path_data}/TARGET_dataset.csv", low_memory=False)

### Concat TARGET
print("Adding target endpoint to dataset...")
# Only the patient key and the endpoint column are taken from the target file.
data_target2 = data_target[["PATIENT_ID", target]]
# Inner join: patients that are missing from either file are dropped.
data = pd.merge(data, data_target2, on="PATIENT_ID", how="inner")


print(data[target].value_counts())

Reading dataset: BL_dataset.csv of size (409, 877)
Adding target endpoint to dataset...
Conclusion_micro
poly                 146
mono_GAS             114
mono_gramneg_aero     18
poly_STAU             13
poly_GBS              13
mono_STAU             13
poly_GAS              12
mono_CLOST            10
mono_GCS               8
poly_CLOST             7
poly_GGS               7
mono_GGS               7
poly_GCS               6
mono_strep_other       4
mono_anaerob           3
mono_GBS               2
Name: count, dtype: int64


### Binarize 'Conclusion_micro' to only GAS vs non-GAS

In [54]:
# 2 categories: GAS, non-GAS (="OTHER")
conc_micro_dic = {"GAS": ["mono_GAS", "poly_GAS"],
                  "OTHER":["poly","poly_CLOST","poly_STAU",'poly_C+LOST', 'poly_GBS', 'mono_gramneg_aero', 'mono_STAU', 'poly_GCS', 'mono_anaerob','mono_CLOST', "nan", 'mono_GCS', 'poly_GGS', 'mono_strep_other', 'mono_GGS', 'mono_GBS']}

# Invert the grouping into a flat {raw label -> group} lookup.
label_to_group = {label: group
                  for group, labels in conc_micro_dic.items()
                  for label in labels}

# Recode ONLY the target column. Calling `replace` on the whole DataFrame
# would also rewrite any other column that happens to contain one of these
# strings (e.g. "poly").
# Plain assignment is used instead of `data[target].<op>(..., inplace=True)`:
# in-place calls on a selected column are deprecated chained assignment and
# do not update `data` once pandas Copy-on-Write is active.
data[target] = data[target].replace(label_to_group)
# Patients without a label are counted as non-GAS.
data[target] = data[target].fillna("OTHER")

### BINARIZE
# GAS -> 1, OTHER -> 0. Any value outside the two groups is passed through
# unchanged, so an unexpected raw label stays visible in value_counts().
group_to_code = {"GAS": 1, "OTHER": 0}
data[target] = data[target].map(lambda group: group_to_code.get(group, group))

print(data.shape)
print(data[target].value_counts())

(409, 878)
Conclusion_micro
0    283
1    126
Name: count, dtype: int64


# Data cleaning

- Remove patients with NaN in target label
- Remove biasing/unnecessary variables (DATE/TIME columns, other_bact_sample_x, other_bact_blood_x)
- Remove hospital-name columns (HOSPITAL_*, hospital_surgery_x)

In [55]:
# Clean an independent (deep) copy; the merged frame in `data` is left as is.
df = data.copy(deep=True)
# Echo which endpoint column this run is preprocessing for.
print(target)

Conclusion_micro


### Remove patients with NaN in target label

In [56]:
### Drop every patient (row) whose target label is missing ###
n_patients_before = len(df)
df.dropna(subset=[target], axis="index", inplace=True)
n_patients_removed = n_patients_before - len(df)
print(f"Removed patients with NaN in target label: {n_patients_removed}")

Removed patients with NaN in target label: 0


### Remove biasing/unnecessary variables

In [57]:
print("Removing biasing/unnecessary variables:")
n_cols_start = df.shape[1]

### Columns whose name contains "date", "time" or "hospital" (case-insensitive) ###
name_pattern = "|".join(["date", "time", "hospital"])
is_date_time_hospital = df.columns.str.contains(name_pattern, case=False, regex=True)
date_time_hospital_cols = df.columns[is_date_time_hospital]
df.drop(columns=date_time_hospital_cols, inplace=True)
print(f"\t{date_time_hospital_cols}")

### "other_bact_sample_x" (decided it was too messy) and "other_bact_blood_x" ###
for name_part in ("other_bact_sample", "other_bact_blood"):
    matching_cols = df.columns[df.columns.str.contains(name_part, case=False)]
    df.drop(columns=matching_cols, inplace=True)

# Total over all three removal steps.
n_cols_removed = n_cols_start - df.shape[1]
print(f"\tNumber of removed unnecessary variables: {n_cols_removed}")



Removing biasing/unnecessary variables:
	Index(['DATE_FIRST_ADMISSION', 'HOSPITAL_FORST_ADMIS', 'DATE_DIAGNOSIS',
       'DATE_SPEC_HOSP', 'DATE_ICU_ADMISSION', 'date_of_data_2',
       'date_specimen_sample_1', 'date_specimen_sample_2',
       'date_specimen_sample_3', 'date_specimen_sample_4',
       ...
       'hospital_surgery_8', 'hospital_surgery_9', 'hospital_surgery_10',
       'hospital_surgery_11', 'hospital_surgery_12', 'hospital_surgery_13',
       'hospital_surgery_14', 'hospital_surgery_15', 'HOSPITAL_PREOP',
       'TIME_DISCHARGE'],
      dtype='object', length=103)
	Number of removed unnecessary variables: 103


### Remove variables with missingness > 10%

In [58]:
### Remove variables whose fraction of missing values exceeds `miss_thresh` ###
# The cut-off in effect is 0.1 (10 %).
miss_thresh = 0.1
before=df.columns
var2 = df.shape[1]
# Snapshot of the frame before filtering, kept for inspection.
dfbefore = df.copy()

# Compare each column's actual missing fraction against the threshold.
# The previous `dropna(thresh=round(n_rows*(1-miss_thresh)))` rounded the
# required number of non-missing rows, so a column marginally above the
# threshold (e.g. 41 of 409 rows missing = 10.02 %) was still kept.
missing_fraction = df.isna().mean()
too_sparse = (missing_fraction > miss_thresh).to_numpy()
removed_vars = list(before[too_sparse])
df.drop(columns=removed_vars, inplace=True)

print(f"Removed variables with a missingess of > {miss_thresh}: {var2 - df.shape[1]}\n")
print(f"Removed variables: {removed_vars}")

Removed variables with a missingess of > 0.1: 13

Removed variables: ['ALCOHOL', 'SMOKING', 'SKIN_ANAESTHESIA_PREOP', 'CREPITUS_PREOP', 'GAS_RADIOLOGY_PREOP', 'lactate_preop', 'glucose_preop', 'hgb_preop_d', 'hgb_preop_n', 'hgb_preop_s', 'PCT_BL', 'fibrinogen_BL', 'd_dimer_BL']


## Include septic shock

In [60]:
# low_memory=False, as for the other CSV reads in this notebook: dtypes are
# inferred from the whole file instead of chunk by chunk. The saved output of
# this cell shows a warning raised on this read (presumably a mixed-type
# DtypeWarning -- TODO confirm).
data_allVars = pd.read_csv(f"{path_data}/allFeatures_fullDataset.csv", low_memory=False)

# NOTE(review): shock_BL is attached by ROW LABEL, not by patient. This is only
# correct if row i of allFeatures_fullDataset.csv describes the same patient as
# row i of `df`. When both frames carry PATIENT_ID, verify that assumption and
# report (without aborting) if it does not hold.
if "PATIENT_ID" in df.columns and "PATIENT_ID" in data_allVars.columns:
    ids_in_df = df["PATIENT_ID"].astype(str).to_numpy()
    ids_in_allVars = data_allVars.loc[df.index, "PATIENT_ID"].astype(str).to_numpy()
    n_mismatch = int((ids_in_df != ids_in_allVars).sum())
    if n_mismatch:
        print(f"WARNING: PATIENT_ID differs on {n_mismatch} aligned rows between df and "
              "allFeatures_fullDataset.csv; shock_BL may be attached to the wrong patients.")

df["shock_BL"] = data_allVars.loc[df.index, "shock_BL"]

  data_allVars = pd.read_csv(f"{path_data}/allFeatures_fullDataset.csv")


# Save Dataset

In [61]:
print(f"Final dataset size: {df.shape}")
### Save preprocessed dataset ###
print("Saving dataset...\n\n")
# Dataset tag = the part of the file name before the first "_" (dots count as
# "_"), e.g. "BL_dataset.csv" -> "BL".
dataset = sets.replace(".", "_").partition("_")[0]
out_file = f"../../results/10_preprocessed/{dataset}_{target}_preprocessed.csv"
df.to_csv(out_file, index=False)

Final dataset size: (409, 762)
Saving dataset...


