In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
import os 
import copy
from fancyimpute import KNN 
from sklearn.model_selection import GroupShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction import DictVectorizer
import joblib
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore")

In [None]:
# Load the binned MIMIC extract and report its size.
MIMIC_data = pd.read_csv('../commondata/MIMIC_data_allbins.csv')
print('total row and columns in MIMIC data', MIMIC_data.shape)
# Number of distinct ICU stays (missing ids are not counted).
print('total patient in the file ', MIMIC_data['icustay_id'].nunique())

### Discretize action values 

In [None]:
def confusion_matrix(df: pd.DataFrame, col1: str, col2: str):
    """
    Cross-count two categorical columns of a dataframe.

    The result is indexed by the values of ``col1`` and has one column per
    value of ``col2``; each cell is the number of rows of ``df`` holding that
    (col1, col2) pair, with 0 where a pair never occurs.

    use like:

    >>> confusion_matrix(test_df, 'actual_label', 'predicted_label')
    """
    # One count per (col1, col2) pair that occurs in the data.
    pair_counts = df.groupby([col1, col2]).size()
    # Move col2 from the row index into the columns; absent pairs become 0.
    return pair_counts.unstack(fill_value=0)


In [None]:
# Rows whose source system is CareVue.
k2 = MIMIC_data.loc[MIMIC_data['dbsource'] == 'carevue']

In [None]:
# IV-bin x VP-bin counts for the CareVue subset.
# NOTE(review): discrete_IV / discrete_VP are (re)assigned only in a later cell,
# so this presumably relies on those columns already being in the loaded CSV —
# confirm with Restart Kernel -> Run All.
confusion_matrix(k2, col1='discrete_IV', col2='discrete_VP')

In [None]:
# Quartiles of the non-zero doses, computed on MetaVision rows only. They are kept
# for inspection; the edges actually applied below are hard-coded (the original
# author notes having checked them against these quartiles).
is_metavision = MIMIC_data.dbsource == 'metavision'
vp_quartile = pd.qcut(MIMIC_data.loc[is_metavision & (MIMIC_data['max_VP'] != 0), 'max_VP'], q=4)
iv_quartile = pd.qcut(MIMIC_data.loc[is_metavision & (MIMIC_data['total_IV'] != 0), 'total_IV'], q=4)

# Bin edges; -inf / +inf close the lowest and highest bins. pd.cut intervals are
# right-inclusive by default, so bin 0 is (-inf, first edge].
vp_quartile_bins = [-float("inf"), 0.009, 0.11, 0.225, 0.451, float("inf")]
iv_quartile_bins = [-float("inf"), 0, 100, 292, 753, float("inf")]
dose_bin_labels = [0, 1, 2, 3, 4]

# Discretize with the named edges above, so the documented bins and the applied
# bins cannot drift apart.
vp_discrete = pd.cut(MIMIC_data.max_VP, bins=vp_quartile_bins, labels=dose_bin_labels)
iv_discrete = pd.cut(MIMIC_data.total_IV, bins=iv_quartile_bins, labels=dose_bin_labels)

# Integer bin codes (0-4). pandas uses code -1 for a missing value, so a row with
# a NaN dose would produce an action outside 0-24.
vp_codes = vp_discrete.cat.codes
iv_codes = iv_discrete.cat.codes

# Joint action index: 5 * vasopressor bin + IV bin.
action = vp_codes * 5 + iv_codes

# Store the discretized values on the dataset.
MIMIC_data['discrete_IV'] = iv_codes
MIMIC_data['discrete_VP'] = vp_codes
MIMIC_data['discrete_action'] = action.values

In [None]:
### confusion matrix
# CareVue rows: counts of IV bin (rows) against vasopressor bin (columns)
confusion_matrix(
    MIMIC_data.loc[MIMIC_data['dbsource'] == 'carevue'],
    col1='discrete_IV',
    col2='discrete_VP',
)

In [None]:
### confusion matrix
# MetaVision rows: counts of IV bin (rows) against vasopressor bin (columns)
confusion_matrix(
    MIMIC_data.loc[MIMIC_data['dbsource'] == 'metavision'],
    col1='discrete_IV',
    col2='discrete_VP',
)

##### Define the features used to measure missingness


In [None]:
# Features over which missingness is measured, listed group by group.
dss_features = (
    # binary features
    ['Gender', 'Ventilator']
    # normal. features
    + ['Age', 'Weight', 'HeartRate', 'SYS', 'MAP', 'DIA', 'RespRate', 'Temp', 'FiO2']
    + ['Kalium', 'Natrium', 'Chloride', 'Glucose', 'Magnesium', 'Calcium', 'ANION_GAP']
    + ['HB', 'LEU', 'Trombo', 'APTT', 'Art_PH', 'PaO2', 'PaCO2', 'Height']
    + ['Art_BE', 'Bicarbonaat', 'Lactate', 'Sofa_score', 'Sirs_score', 'Shock_Index']
    + ['PF_ratio', 'Albumine', 'Ion_Ca']
    # log features
    + ['max_VP_prev', 'SpO2', 'Ureum', 'Creat', 'ALAT', 'ASAT', 'Bili', 'INR']
    + ['Running_total_IV', 'total_IV_prev', 'Running_total_UP', 'total_UP']
)


In [None]:
# Fraction of missing values per column, most incomplete columns first.
missing_counts = MIMIC_data.isna().sum()
(missing_counts / MIMIC_data.shape[0]).sort_values(ascending=False)

#### Remove low-resolution patients and data points

In [None]:
# Features (among dss_features) that have at least one missing value.
miss_values = MIMIC_data[dss_features].isna().sum(axis=0).sort_values(ascending=False)
misscols = miss_values.index[miss_values > 0]

# Per row: percentage of those features that is missing.
missing_flags = MIMIC_data[misscols].isna()
MIMIC_data['Row_wise_missing'] = missing_flags.sum(axis=1) / missing_flags.shape[1] * 100

per_stay_missing = MIMIC_data.groupby('icustay_id')['Row_wise_missing']

# Criterion 1: even the most complete row of the stay misses more than 20%.
most_complete_row = per_stay_missing.idxmin()
min_percent_missing_ptlist = MIMIC_data.loc[most_complete_row, ['icustay_id', 'Row_wise_missing']]
discard_min_pt_list = min_percent_missing_ptlist.loc[
    min_percent_missing_ptlist['Row_wise_missing'] > 20, 'icustay_id'
].tolist()

# Criterion 2: the stay misses more than 70% on average over its rows.
mean_NA_per_patient = per_stay_missing.mean()
discard_mean_pt_list = mean_NA_per_patient.index[mean_NA_per_patient > 70].unique().tolist()

# A stay is excluded when either criterion holds.
exclude_ptid_list = discard_min_pt_list + discard_mean_pt_list
print('total icustay to discard ', len(set(exclude_ptid_list)))
rows_to_keep = ~MIMIC_data.icustay_id.isin(exclude_ptid_list)
MIMIC_data = MIMIC_data[rows_to_keep].reset_index(drop=True)

In [None]:
# Keep only rows in which fewer than 80% of the key features are missing.
sufficiently_observed = MIMIC_data['Row_wise_missing'] < 80
MIMIC_data = MIMIC_data.loc[sufficiently_observed].reset_index(drop=True)

In [None]:
# Forward-fill every feature within its own ICU stay, so a value is never carried
# over from one stay into the next. GroupBy.ffill is the built-in equivalent of
# transform(lambda x: x.ffill()) and avoids calling a Python lambda per group.
MIMIC_data[dss_features] = MIMIC_data.groupby('icustay_id')[dss_features].ffill()
# The helper column has served its purpose.
MIMIC_data = MIMIC_data.drop(['Row_wise_missing'], axis=1).reset_index(drop=True)
# Count stays with fewer than 3 rows.
# NOTE(review): these stays are only counted here, not removed. The original
# author notes there were none to discard; if the input data changes, an explicit
# filter on icustay_id would be needed to actually enforce the minimum length.
trajcount = MIMIC_data.icustay_id.value_counts()
discard_p = trajcount[trajcount < 3]
len(discard_p)

In [None]:
### Remove CareVue stays that ever use action 5, 10, 15 or 20
# With discrete_action = 5 * VP bin + IV bin (as computed in the discretization
# cell), these are the actions with a non-zero VP bin and IV bin 0. Per the
# original note, the aim is to keep only stays without them, just like the
# MetaVision data.
excluded_actions = [5, 10, 15, 20]
is_flagged_carevue = MIMIC_data.discrete_action.isin(excluded_actions) & (MIMIC_data.dbsource == 'carevue')
# Indexed by icustay_id; the values are the number of flagged rows per stay.
carevue_id_to_discard = MIMIC_data.loc[is_flagged_carevue, 'icustay_id'].value_counts()
print('total carevue discard', len(carevue_id_to_discard))
# Drop every row of a flagged stay, not just the flagged rows.
is_discarded_stay = MIMIC_data.icustay_id.isin(carevue_id_to_discard.index)
MIMIC_data = MIMIC_data[~is_discarded_stay].reset_index(drop=True)
MIMIC_data.icustay_id.value_counts().shape  # original note: 16512 patients to include in the analysis

In [None]:
# Display (rows, columns) of the data after the exclusions above.
MIMIC_data.shape

#### Cap values and compute the cumulative fluid balance

In [None]:
# Capping table: one row per parameter with its allowed minimum and maximum.
caps = pd.read_csv('../commondata/capping_values.csv', sep=',', decimal='.')

for param, minval, maxval in zip(caps['Parameter'], caps['minval'], caps['maxval']):
    # Write through .loc so the caps land in MIMIC_data itself. The chained form
    # MIMIC_data[param][mask] = value assigns into an intermediate Series, which
    # pandas does not guarantee to propagate to the frame (and which never
    # propagates under Copy-on-Write), so values could silently stay uncapped.
    MIMIC_data.loc[MIMIC_data[param] >= maxval, param] = maxval
    MIMIC_data.loc[MIMIC_data[param] <= minval, param] = minval

# Fluid balance: IV input minus urine output.
# NOTE(review): the column name says cumulative ("cumm"), but it is built from
# total_IV / total_UP rather than Running_total_IV / Running_total_UP — confirm
# which one is intended.
MIMIC_data['cumm_fluid_balance'] = MIMIC_data['total_IV'] - MIMIC_data['total_UP']
# Recalculate BMI from the capped values.
# Presumably Weight is in kg and Height in cm (hence the /100) — TODO confirm.
MIMIC_data['bmi'] = MIMIC_data.Weight / (MIMIC_data.Height / 100) ** 2

In [None]:
# Persist the cleaned and capped data, before splitting, scaling and imputation.
MIMIC_data.to_csv('../commondata/MIMIC_data_clean_traj.csv', index=False)

##### Train/validation split of the data set, grouped by icustay_id

In [None]:
# Single 80/20 split grouped by ICU stay: all rows of a stay end up in the same
# partition. random_state pins the shuffle.
gs_1 = GroupShuffleSplit(n_splits=1, train_size=.80, random_state=42)
# n_splits=1, so the splitter yields exactly one (train, validation) pair.
[(train_ix, val_ix)] = gs_1.split(MIMIC_data, groups=MIMIC_data.icustay_id)
train_set, val_set = (
    MIMIC_data.iloc[row_positions].reset_index(drop=True)
    for row_positions in (train_ix, val_ix)
)
# Number of distinct ICU stays in each partition.
train_ids = train_set.icustay_id.nunique()
val_ids = val_set.icustay_id.nunique()

print('Total patient in trainset is ', train_ids)
print('Total patient in validationset is ', val_ids)

print('Total patient in trainset is ', train_ids, ' total states ', train_set.shape[0])
print('Total patient in valset is ', val_ids, ' total states', val_set.shape[0])

In [None]:
# Snapshot the unscaled splits before train_set / val_set are transformed.
train_rawdata, val_rawdata = train_set.copy(), val_set.copy()

for raw_frame in (train_rawdata, val_rawdata):
    print(raw_frame.shape)

### All the features for Transformation/Normalization

In [None]:
# Features treated as binary (shifted by -0.5, then min-max scaled further down).
binary_fields = ['Gender', 'Ventilator',
                 'qsofa_gcs_score',
                 'qsofa_resprate_score',
                 'qsofa_sysbp_score',
                 'diabetes', 'metastatic_cancer']

# Features that are z-scored directly.
norm_fields = ['Age', 'Weight', 'HeartRate', 'SYS', 'MAP', 'DIA', 'RespRate', 'Temp', 'FiO2',
               'Kalium', 'Natrium', 'Chloride', 'Glucose', 'Magnesium', 'Calcium', 'ANION_GAP',
               'HB', 'LEU', 'Trombo', 'APTT', 'Art_PH', 'PaO2', 'PaCO2', 'Height',
               'Art_BE', 'Bicarbonaat', 'Lactate', 'Sofa_score', 'Sirs_score', 'Shock_Index',
               'PF_ratio', 'Albumine', 'Ion_Ca',
               'mingcs', 'lods', 'elixhauser', 'cumm_fluid_balance', 'bmi', 'qsofa']

# Features that are log-transformed before being z-scored.
log_fields = ['max_VP_prev', 'SpO2', 'Ureum', 'Creat', 'ALAT', 'ASAT', 'Bili', 'INR',
              'Running_total_IV', 'total_IV_prev', 'Running_total_UP', 'total_UP', 'total_IV', 'max_VP']

# Columns that are carried along but not scaled or imputed (judging by their
# names: identifiers, timestamps, outcomes, actions and bookkeeping columns).
# Each name is listed once; 're_admission' previously appeared twice, which
# duplicated it in allfeatures and in any column selection built from this list.
not_used = ['subject_id', 'hadm_id', 'icustay_id', 'interval_start_time',
            'interval_end_time', 'Reward',
            'Discharge', 'discrete_action',
            'morta_90', 'morta_hosp', 're_admission',
            'composite_outcome', 'action', 'dbsource', 'exclude',
            'blood_culture_positive',
            'race_black', 'race_hispanic', 'race_other', 'race_white',
            'elixhauser_hospital',
            'BANDS', 'discrete_IV', 'discrete_VP', 'discrete_action_original']

In [None]:
# Every column name, in the order: not used, binary, normal, log.
allfeatures = [*not_used, *binary_fields, *norm_fields, *log_fields]

In [None]:
# Shift the binary columns down by 0.5 in both splits
# (assuming 0/1 coding this yields -0.5 / +0.5 — TODO confirm the coding).
pd.reset_option('mode.chained_assignment')
with pd.option_context('mode.chained_assignment', None):
    for split_frame in (train_set, val_set):
        split_frame[binary_fields] = split_frame[binary_fields].sub(0.5)
    print("done")

In [None]:
# Z-score the normally distributed fields. Mean and std come from the training
# set only and are reused for the validation set, so the validation data does
# not influence the scaling.
for item in norm_fields:
    av = train_set[item].mean()
    std = train_set[item].std()
    # A training column that is constant (std 0) or has fewer than two observed
    # values (std NaN) would turn into NaN/inf when divided by its std. Such a
    # column is only centred, the same convention scikit-learn's scalers use.
    if not np.isfinite(std) or std == 0:
        std = 1.0
    train_set[item] = (train_set[item] - av) / std
    val_set[item] = (val_set[item] - av) / std


In [None]:
# Log-transform the log-normal fields; the 0.1 offset keeps zero values finite.
train_set[log_fields] = np.log(0.1 + train_set[log_fields])
val_set[log_fields] = np.log(0.1 + val_set[log_fields])

# Z-score the transformed values with training-set statistics only.
for item in log_fields:
    av = train_set[item].mean()
    std = train_set[item].std()
    # A training column that is constant (std 0) or has fewer than two observed
    # values (std NaN) would turn into NaN/inf when divided by its std. Such a
    # column is only centred, the same convention scikit-learn's scalers use.
    if not np.isfinite(std) or std == 0:
        std = 1.0
    train_set[item] = (train_set[item] - av) / std
    val_set[item] = (val_set[item] - av) / std
        

In [None]:
# scale all features: binary, normal and log groups, in that order
# (concatenation builds a new list; the source lists are left untouched)
scalable_fields = binary_fields + norm_fields + log_fields

# min-max normalization with the TRAINING range; validation values outside that
# range therefore fall outside [0, 1]
for col in scalable_fields:
    minimum = np.nanmin(train_set[col])
    maximum = np.nanmax(train_set[col])
    value_range = maximum - minimum
    # A constant training column has range 0, and dividing by it would produce
    # NaN (0/0) for every row. Such a column is only shifted to 0, the same
    # convention scikit-learn's MinMaxScaler uses.
    if not np.isfinite(value_range) or value_range == 0:
        value_range = 1.0
    train_set[col] = (train_set[col] - minimum) / value_range
    val_set[col] = (val_set[col] - minimum) / value_range
        

#### Impute data with KNN

In [None]:
### KNN 
def knn_impute(df1, featureset):
    """
    Impute missing values in ``featureset`` separately for each ICU stay.

    For every distinct ``icustay_id`` the rows of that stay are passed to
    fancyimpute's ``KNN(k=3)`` and the matrix it returns is written back over
    the original values, so a stay is only ever completed from its own rows.
    The average per-column missing fraction is printed before and after.

    Parameters
    ----------
    df1 : pd.DataFrame
        Must contain an ``icustay_id`` column and every column in
        ``featureset``. It is left unmodified; a copy is imputed.
    featureset : list of str
        Names of the columns to impute.

    Returns
    -------
    pd.DataFrame
        Copy of ``df1`` with the ``featureset`` columns replaced by the
        imputer's output.
    """
    df = df1.copy()
    impute_before = (df[featureset].isna().sum()/df[featureset].shape[0]).mean()
    print('total avearge missing before imputation ', impute_before)
    for unique_id in df['icustay_id'].unique():
        # Build the row mask once per stay and use it for both read and write.
        stay_rows = df['icustay_id'] == unique_id
        X_incomplete = df.loc[stay_rows, featureset]
        # df is this function's own copy and is written through .loc, so no
        # chained-assignment option handling is needed here.
        # NOTE(review): how KNN treats a column that is entirely missing within
        # a stay is not visible from this file — verify against fancyimpute.
        df.loc[stay_rows, featureset] = KNN(k=3, verbose=False).fit_transform(X_incomplete)
    impute_after = (df[featureset].isna().sum()/df[featureset].shape[0]).mean()
    print('total avearge missing after imputation ', impute_after)
    return df

#### Impute data based on Default state space

In [None]:
# Columns handed to the imputer: binary, normal and log groups, in that order.
imputefeatures = [*binary_fields, *norm_fields, *log_fields]
print('totalfeatures to be imputed', len(imputefeatures))


In [None]:
# Impute the training and validation sets independently of each other.
# knn_impute works on its own copy of the frame it receives, so the frames are
# passed directly instead of being deep-copied a second time.
train_set_imputed = knn_impute(df1=train_set, featureset=imputefeatures)
val_set_imputed = knn_impute(df1=val_set, featureset=imputefeatures)


### Write output files

In [None]:

# Save raw data to csv files
#train_rawdata.to_csv('../doseprediction/data/sl_train_rawdata.csv', index=False)
#val_rawdata.to_csv('../doseprediction/data/sl_val_rawdata.csv', index=False)
#pd.Series(allfeatures).to_csv('../doseprediction/data/sl_all_features.csv')

In [None]:

# Save the processed (scaled and KNN-imputed) train/validation sets to csv files.
# Original note: these contain 18 states per patient — NOTE(review): that count is
# not checked anywhere in this notebook.
train_set_imputed.to_csv('../doseprediction/data/sl_train.csv', index=False)
val_set_imputed.to_csv('../doseprediction/data/sl_val.csv', index=False)

