# Clean validation RKZ data

- recode missing-value codes (-99 to -95, and 2) as NaN
- remove patients with a missing GAS label (`GAS_yes_no`)
- encode anatomical location (combine left and right for lower and upper arm)
- prepare and save the ENTRY and PRESURGERY datasets

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
# Use seaborn's "whitegrid" theme for any figures drawn later in the notebook
sns.set_theme(style="whitegrid")


# Project root used to build the data and results paths below.
# Defaults to the original location; set the PERMIT_NSTI_GAS_BASE environment
# variable to run the notebook from a different checkout or machine.
PATH_base = os.environ.get("PERMIT_NSTI_GAS_BASE", "/home/WUR/katz001/PROJECTS/permit-nsti-gas")

# Load data

In [2]:
# Raw validation export; a lone blank (" ") is read as missing on top of
# pandas' default NA markers.
# NOTE(review): this path is relative to the working directory, while the
# other cells build their paths from PATH_base — confirm both point at the
# same project tree.
raw_validation_csv = "../../data/validation/raw_data/Datafile_NSTI_216primair_tavGASpredictions.csv"
df_prim = pd.read_csv(raw_validation_csv, na_values=" ")
df_prim.shape

(216, 354)

# Recode missing-value codes to NaN

In [3]:
### Recode the sentinel codes used for missing measurements as NaN
# NOTE(review): the replacement is applied to every column, so a genuine
# measurement equal to 2 (or to one of the negative codes) also becomes NaN —
# confirm that 2 only ever means "missing/unknown" in this export.
missing_codes = (-99, -98, -97, -96, -95, 2)
na_other = {code: np.nan for code in missing_codes}
df_prim = df_prim.replace(na_other)

# Remove patients with missing target label

In [4]:
# Number of patients without a value for the target label
print(df_prim["GAS_yes_no"].isna().sum())

# Keep only patients with a target label. The explicit .copy() makes the
# result an independent frame, so adding columns to it afterwards does not
# trigger pandas' SettingWithCopyWarning.
df_prim = df_prim[df_prim["GAS_yes_no"].notna()].copy()
df_prim.shape

8


(208, 354)

# Encode body parts (merge left, right)

In [5]:
# Side-specific columns that are collapsed into one side-independent column.
merged_bodyparts = {
    'bodypart_adm_pres#Lower_arm': ['bodypart_adm_pres#Left_Lower_arm',
                                    'bodypart_adm_pres#Right_Lower_arm'],
    'bodypart_adm_pres#Upper_arm': ['bodypart_adm_pres#Left_Upper_arm',
                                    'bodypart_adm_pres#Right_Upper_arm'],
}

# The merged value is the row-wise maximum of the left and right columns
# (presumably 0/1 indicators, so the result flags "either side" — TODO confirm).
for merged_col, side_cols in merged_bodyparts.items():
    df_prim[merged_col] = df_prim[side_cols].max(axis=1)
df_prim.shape

(208, 356)

# Prepare ENTRY and PRESURGERY datasets

In [7]:
# Maps the feature names used in the feature-selection files to the column
# names of df_prim.
with open(f"{PATH_base}/data/validation/variables_translation.json", "r") as f:
    varTranslation = json.load(f)


def _select_validation_features(timepoint):
    """Build the validation dataset for one timepoint.

    Reads the feature list stored for `timepoint`, translates every feature
    name through `varTranslation`, appends the target column "GAS_yes_no" and
    copies those columns out of `df_prim`. The translated column list and the
    shape of the resulting frame are printed.

    Parameters
    ----------
    timepoint : str
        Name of the feature-selection result folder, e.g. "ENTRY".

    Returns
    -------
    tuple
        (selected feature names, translated column names including the
        target, DataFrame copy restricted to those columns).

    Raises
    ------
    KeyError
        If a selected feature has no entry in `varTranslation`, or a
        translated column is not present in `df_prim`.
    """
    selected = pd.read_csv(f"{PATH_base}/results/20_featureSelection/{timepoint}/CV/Conclusion_micro_bootstrapped_iterativeBoruta_100perc.txt",
                           header=None)[0].tolist()

    # Fail with the full list of offending names instead of only the first one.
    untranslated = [name for name in selected if name not in varTranslation]
    if untranslated:
        raise KeyError(f"No validation-column translation for {timepoint} features: {untranslated}")

    columns = [varTranslation[name] for name in selected] + ["GAS_yes_no"]
    print(columns)
    subset = df_prim.loc[:, columns].copy()
    print(subset.shape)
    return selected, columns, subset


# ENTRY
datasetTimepoint = "ENTRY"
fs_entry, fs_entry_validationDataset, df_entry_val = _select_validation_features(datasetTimepoint)

# PRESURGERY
datasetTimepoint = "PRESURGERY"
fs_presurgery, fs_presurgery_validationDataset, df_presurgery_val = _select_validation_features(datasetTimepoint)


['age_pres', 'bodypart_adm_pres#Upper_arm', 'bodypart_adm_pres#Lower_arm', 'bodypart_adm_pres#Anogenital_region', 'surg_area_NSTI', 'Diabetes_M', 'GAS_yes_no']
(208, 7)
['Adm_pres_Creat', 'Adm_pres_Hb_alt', 'bodypart_adm_pres#Upper_arm', 'bodypart_adm_pres#Lower_arm', 'bodypart_adm_pres#Anogenital_region', 'Diabetes_M', 'surg_area_NSTI', 'age_pres', 'GAS_yes_no']
(208, 9)


# Save validation data

In [8]:
### Write both validation datasets to disk ###
print("Saving dataset...\n\n")

# (frame, destination) pairs, written in this order
validation_outputs = (
    (df_entry_val, f"{PATH_base}/results/10_preprocessed/validation/ENTRY_Conclusion_micro_validationData.csv"),
    (df_presurgery_val, f"{PATH_base}/results/10_preprocessed/validation/PRESURGERY_Conclusion_micro_validationData.csv"),
)

# index=False keeps the row index out of the CSV files
for frame, csv_path in validation_outputs:
    frame.to_csv(csv_path, index=False)

Saving dataset...


