In [0]:
import pandas as pd
import numpy as np

In [0]:
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
#from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
import xgboost
from sklearn.model_selection import ParameterGrid

In [3]:
# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from google.colab import drive
from oauth2client.client import GoogleCredentials
auth.authenticate_user()
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
df = pd.read_csv("gdrive/My Drive/cohort_aki_sepsis3_11clock16.csv")

In [0]:
df3 = pd.read_csv("gdrive/My Drive/fully_filtered_rrt.csv")

In [0]:
set1= set(df.columns)
set2 =set(df3.columns)

In [0]:
temp = list(set1-set2)

In [0]:
# > 90, change it to 91.4
df['age_yr'] = df['age_yr'].apply(lambda x: x if x<=90 else 91.4)
tot = len(df['first_wardid'])
diff_list = []
for i in range(tot):
  if df['first_wardid'][i]!=df['last_wardid'][i]:
    diff_list.append(1)
  else:
    diff_list.append(0)
sum(diff_list)
df['Ward_changed'] = pd.Series(diff_list)
# set index
df.set_index('icustay_id',inplace=True)
# race
def race(x):
  if x in ['WHITE', 'WHITE - RUSSIAN', 'WHITE - OTHER EUROPEAN', 'WHITE - BRAZILIAN', 'WHITE - EASTERN EUROPEAN']:
    return "white"
  elif x in ['BLACK/AFRICAN AMERICAN', 'BLACK/CAPE VERDEAN', 'BLACK/HAITIAN' , 'BLACK/AFRICAN', 'CARIBBEAN ISLAND']:
    return "black"
  elif x in ['HISPANIC OR LATINO', 'HISPANIC/LATINO - PUERTO RICAN', 'HISPANIC/LATINO - DOMINICAN', 'HISPANIC/LATINO - GUATEMALAN', 'HISPANIC/LATINO - CUBAN', 'HISPANIC/LATINO - SALVADORAN', 'HISPANIC/LATINO - CENTRAL AMERICAN (OTHER)','HISPANIC/LATINO - MEXICAN', 'HISPANIC/LATINO - COLOMBIAN' , 'HISPANIC/LATINO - HONDURAN']:
    return "hispanic"
  elif x in ['ASIAN', 'ASIAN - CHINESE', 'ASIAN - ASIAN INDIAN', 'ASIAN - VIETNAMESE', 'ASIAN - FILIPINO', 'ASIAN - CAMBODIAN', 'ASIAN - OTHER', 'ASIAN - KOREAN', 'ASIAN - JAPANESE', 'ASIAN - THAI']:
    return "asian"
  elif x in ['AMERICAN INDIAN/ALASKA NATIVE', 'AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE']:
    return "native"
  elif x in ['UNKNOWN/NOT SPECIFIED', 'UNABLE TO OBTAIN', 'PATIENT DECLINED TO ANSWER']:
    return "unknown"
  else:
    return "other"
df['ethnicity'] = df['ethnicity'].apply(race)
# change datatype
check = ['congestive_heart_failure','cardiac_arrhythmias','valvular_disease', 'pulmonary_circulation',
       'peripheral_vascular', 'hypertension', 'paralysis',
       'other_neurological', 'chronic_pulmonary', 'diabetes_uncomplicated',
       'diabetes_complicated', 'hypothyroidism', 'renal_failure',
       'liver_disease', 'peptic_ulcer', 'aids', 'lymphoma',
       'metastatic_cancer', 'solid_tumor', 'rheumatoid_arthritis',
       'coagulopathy', 'obesity', 'fluid_electrolyte',
       'blood_loss_anemia', 'deficiency_anemias', 'alcohol_abuse',
       'drug_abuse', 'psychoses', 'depression']
for column in check:
  df[column] = df[column].astype('int64')

In [0]:
group = ['subject_id','hadm_id']
cat_frs = ['first_careunit','gender','was_ventilated_24h','ethnicity','admission_type','admission_location']
cont_frs = ['age_yr','oasis',
       'height', 'weight', 'heartrate_min', 'heartrate_max', 'heartrate_mean',
       'sysbp_min', 'sysbp_max', 'sysbp_mean', 'diasbp_min', 'diasbp_max',
       'diasbp_mean', 'meanbp_min', 'meanbp_max', 'meanbp_mean',
       'resprate_min', 'resprate_max', 'resprate_mean', 'tempc_min',
       'tempc_max', 'tempc_mean', 'spo2_min', 'spo2_max', 'spo2_mean',
       'glucose_min', 'glucose_max', 'glucose_mean',
       'sapsii_score', 'sofa_score', 'los_preicu', 'max_kdigo','vaso_duration','max_vaso_dosage']
columns_to_drop = ['Unnamed: 0', 'min_ph_icustay', 'sepsis3', 'esrd', 'los', 'row_id','dbsource','last_careunit','first_wardid','last_wardid','intime','outtime','crrt_durations','crrt','dob','dod',
       'dod_hosp', 'expire_flag','admittime','dischtime', 'deathtime','diagnosis', 'hospital_expire_flag','max_kdigo_time']

In [0]:
df['was_ventilated_24h'].fillna('missing', inplace=True)
df['was_ventilated_24h'] = df['was_ventilated_24h'].astype(str)

In [0]:
df2 = df.drop(columns = columns_to_drop,axis = 1)

In [12]:
X = df2.drop(['rrt','subject_id','hadm_id'], axis=1)
y = df2['rrt']
subject_id = df2['subject_id']
print(y.value_counts()/len(y))
classes, counts = np.unique(y,return_counts=True)
print('balance:',np.max(counts/len(y)))

0.0    0.918805
1.0    0.081195
Name: rrt, dtype: float64
balance: 0.918805216659655


In [13]:
cat_indexes = []
for i in range(len(X.columns)):
  if X.columns[i] in cat_frs:
    cat_indexes.append(i)
cat_indexes

[0, 1, 3, 4, 5, 38]

In [14]:
X[cont_frs]

Unnamed: 0_level_0,age_yr,oasis,height,weight,heartrate_min,heartrate_max,heartrate_mean,sysbp_min,sysbp_max,sysbp_mean,diasbp_min,diasbp_max,diasbp_mean,meanbp_min,meanbp_max,meanbp_mean,resprate_min,resprate_max,resprate_mean,tempc_min,tempc_max,tempc_mean,spo2_min,spo2_max,spo2_mean,glucose_min,glucose_max,glucose_mean,sapsii_score,sofa_score,los_preicu,max_kdigo,vaso_duration,max_vaso_dosage
icustay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
217232,77.0,32,157.48,69.0,81.0,113.0,96.444444,82.0,140.0,113.000000,33.0,95.0,47.518519,45.0,103.0,63.370370,12.0,25.0,17.370370,36.444444,38.722222,37.216667,97.0,100.0,99.259259,224.0,326.0,256.800000,44,6,13.769803,2.0,,
262652,24.0,21,142.24,50.0,70.0,116.0,96.807692,105.0,212.0,157.423077,48.0,114.0,86.423077,62.0,134.0,103.884615,9.0,28.0,17.153846,36.333333,38.888889,37.222222,88.0,100.0,97.384615,99.0,119.0,108.875000,24,7,2.425417,1.0,,
256064,75.0,46,172.72,77.0,67.0,103.0,84.352941,102.0,161.0,133.600000,39.0,63.0,47.800000,47.0,98.0,65.382353,9.0,25.0,16.947368,35.222222,36.722222,36.142857,95.0,100.0,99.656250,78.0,135.0,108.000000,73,9,8.757153,2.0,15.116667,212.825073
213646,55.0,46,172.72,80.4,57.0,79.0,68.115385,87.0,211.0,122.217391,55.0,91.0,69.913043,63.0,123.0,83.000000,10.0,21.0,14.031250,35.666667,37.000000,36.277778,100.0,100.0,100.000000,59.0,226.0,146.478261,50,10,0.000787,2.0,,
235675,49.0,25,187.96,139.4,80.0,92.0,85.900000,78.0,139.0,98.916667,46.0,76.0,59.416667,56.0,291.0,75.972222,13.0,28.0,19.378378,,,,94.0,100.0,98.733333,77.0,180.0,135.360000,35,8,5.632234,2.0,20.423611,2.841641
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294627,91.4,40,,59.1,59.0,106.0,75.000000,92.0,147.0,111.391304,37.0,66.0,46.173913,51.0,75.0,61.130435,15.0,27.0,19.307692,35.388889,37.444444,36.252525,92.0,100.0,96.173913,75.0,123.0,104.000000,50,4,0.000660,2.0,,
213159,68.0,41,,75.0,54.0,74.0,64.631579,87.0,175.0,121.392857,54.0,94.0,64.142857,63.0,112.0,79.571429,13.0,24.0,17.454545,34.444444,36.222222,35.328283,92.0,100.0,98.050000,129.0,217.0,166.500000,49,4,0.000961,2.0,,
283653,91.4,32,,85.0,52.0,94.0,71.130435,118.0,162.0,141.090909,37.0,88.0,60.136364,58.0,113.0,80.954545,17.0,27.0,20.043478,36.166667,37.444444,36.935185,92.0,99.0,95.130435,130.0,151.0,140.500000,37,5,0.289074,3.0,,
202802,68.0,47,157.48,109.0,80.0,144.0,103.942857,89.0,140.0,112.857143,46.0,65.0,56.971429,58.0,88.0,73.314286,10.0,34.0,21.783784,35.700000,38.500000,37.534375,91.0,100.0,94.714286,100.0,192.0,139.500000,56,8,0.109838,2.0,7.383333,36.994684


In [15]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE 
from sklearn.impute import SimpleImputer# doctest: +NORMALIZE_WHITESPACE
#X, y = make_classification(n_classes=2, class_sep=2,n_features=73, n_clusters_per_class=1,random_state=10)
#print('Original dataset shape %s' % Counter(y))
#Original dataset shape Counter({1: 900, 0: 100})
from imblearn.over_sampling import SMOTENC
sm = SMOTENC(random_state=42, categorical_features=cat_indexes)
simple_impute = SimpleImputer(missing_values=np.nan,strategy='mean')
#X[cont_frs]}
pd.DataFrame(simple_impute.fit_transform(.),columns=cont_frs)
#imputed_DF = pd.DataFrame(fill_NaN.fit_transform(DF))
#X_res, y_res = sm.fit_resample(X, y)
#print('Resampled dataset shape %s' % Counter(y_res))




Unnamed: 0,age_yr,oasis,height,weight,heartrate_min,heartrate_max,heartrate_mean,sysbp_min,sysbp_max,sysbp_mean,diasbp_min,diasbp_max,diasbp_mean,meanbp_min,meanbp_max,meanbp_mean,resprate_min,resprate_max,resprate_mean,tempc_min,tempc_max,tempc_mean,spo2_min,spo2_max,spo2_mean,glucose_min,glucose_max,glucose_mean,sapsii_score,sofa_score,los_preicu,max_kdigo,vaso_duration,max_vaso_dosage
0,77.0,32.0,157.48000,69.0,81.0,113.0,96.444444,82.0,140.0,113.000000,33.0,95.0,47.518519,45.0,103.0,63.370370,12.0,25.0,17.370370,36.444444,38.722222,37.216667,97.0,100.0,99.259259,224.0,326.0,256.800000,44.0,6.0,13.769803,2.0,14.638987,72.315722
1,24.0,21.0,142.24000,50.0,70.0,116.0,96.807692,105.0,212.0,157.423077,48.0,114.0,86.423077,62.0,134.0,103.884615,9.0,28.0,17.153846,36.333333,38.888889,37.222222,88.0,100.0,97.384615,99.0,119.0,108.875000,24.0,7.0,2.425417,1.0,14.638987,72.315722
2,75.0,46.0,172.72000,77.0,67.0,103.0,84.352941,102.0,161.0,133.600000,39.0,63.0,47.800000,47.0,98.0,65.382353,9.0,25.0,16.947368,35.222222,36.722222,36.142857,95.0,100.0,99.656250,78.0,135.0,108.000000,73.0,9.0,8.757153,2.0,15.116667,212.825073
3,55.0,46.0,172.72000,80.4,57.0,79.0,68.115385,87.0,211.0,122.217391,55.0,91.0,69.913043,63.0,123.0,83.000000,10.0,21.0,14.031250,35.666667,37.000000,36.277778,100.0,100.0,100.000000,59.0,226.0,146.478261,50.0,10.0,0.000787,2.0,14.638987,72.315722
4,49.0,25.0,187.96000,139.4,80.0,92.0,85.900000,78.0,139.0,98.916667,46.0,76.0,59.416667,56.0,291.0,75.972222,13.0,28.0,19.378378,36.001694,37.507275,36.772103,94.0,100.0,98.733333,77.0,180.0,135.360000,35.0,8.0,5.632234,2.0,20.423611,2.841641
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4749,91.4,40.0,169.37502,59.1,59.0,106.0,75.000000,92.0,147.0,111.391304,37.0,66.0,46.173913,51.0,75.0,61.130435,15.0,27.0,19.307692,35.388889,37.444444,36.252525,92.0,100.0,96.173913,75.0,123.0,104.000000,50.0,4.0,0.000660,2.0,14.638987,72.315722
4750,68.0,41.0,169.37502,75.0,54.0,74.0,64.631579,87.0,175.0,121.392857,54.0,94.0,64.142857,63.0,112.0,79.571429,13.0,24.0,17.454545,34.444444,36.222222,35.328283,92.0,100.0,98.050000,129.0,217.0,166.500000,49.0,4.0,0.000961,2.0,14.638987,72.315722
4751,91.4,32.0,169.37502,85.0,52.0,94.0,71.130435,118.0,162.0,141.090909,37.0,88.0,60.136364,58.0,113.0,80.954545,17.0,27.0,20.043478,36.166667,37.444444,36.935185,92.0,99.0,95.130435,130.0,151.0,140.500000,37.0,5.0,0.289074,3.0,14.638987,72.315722
4752,68.0,47.0,157.48000,109.0,80.0,144.0,103.942857,89.0,140.0,112.857143,46.0,65.0,56.971429,58.0,88.0,73.314286,10.0,34.0,21.783784,35.700000,38.500000,37.534375,91.0,100.0,94.714286,100.0,192.0,139.500000,56.0,8.0,0.109838,2.0,7.383333,36.994684


In [16]:
X[cont_frs] = pd.DataFrame(simple_impute.fit_transform(X[cont_frs]),columns=cont_frs)
X[cont_frs]

Unnamed: 0_level_0,age_yr,oasis,height,weight,heartrate_min,heartrate_max,heartrate_mean,sysbp_min,sysbp_max,sysbp_mean,diasbp_min,diasbp_max,diasbp_mean,meanbp_min,meanbp_max,meanbp_mean,resprate_min,resprate_max,resprate_mean,tempc_min,tempc_max,tempc_mean,spo2_min,spo2_max,spo2_mean,glucose_min,glucose_max,glucose_mean,sapsii_score,sofa_score,los_preicu,max_kdigo,vaso_duration,max_vaso_dosage
icustay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
217232,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
262652,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
256064,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
213646,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
235675,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294627,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
213159,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
283653,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
202802,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [0]:
def ML_pipeline_xgb_GridSearchCV(X, y, groups,random_state, n_folds):
    splitter = GroupShuffleSplit(n_splits=4,test_size=0.2,random_state=random_state)
    for i_other,i_test in splitter.split(X, y, groups):
        X_other, y_other, groups_other = X.iloc[i_other], y.iloc[i_other], groups.iloc[i_other]
        X_test, y_test, groups_test = X.iloc[i_test], y.iloc[i_test], groups.iloc[i_test]
    kf = GroupKFold(n_splits=n_folds)
    #imputer = SimpleImputer()
    standard_transformer = Pipeline(steps=[('standard', StandardScaler())])
    onehot_transformer = Pipeline(steps=[('onehot', OneHotEncoder(sparse=False,handle_unknown='ignore'))])
    preprocessor = ColumnTransformer(
    transformers=[
        ('standard', standard_transformer, cont_frs),
        ('onehot', onehot_transformer, cat_frs)])
    XGB = xgboost.XGBClassifier()
    logreg = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', XGB)])       
    
    preprocessor = ColumnTransformer(
    transformers=[
        ('standard', standard_transformer, cont_frs),
        ('onehot', onehot_transformer, cat_frs)])

    param_grid = {'classifier__learning_rate': [0.01],
              'classifier__n_estimators': [100],
              'classifier__random_state': [random_state],
              'classifier__missing': [np.nan], 
              'classifier__max_depth': [3,5,10],
              'classifier__colsample_bytree': [0.75],              
              'classifier__subsample': [0.66]}

    xgb = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', XGB)])

    grid = GridSearchCV(xgb, param_grid=param_grid,scoring = make_scorer(accuracy_score),
                    cv=kf, return_train_score = True,iid=True,n_jobs=-1)
    grid.fit(X_other, y_other, groups_other)
    y_pred = grid.predict(X_test)
    return grid, grid.score(X_test, y_test), y_pred, y_test

In [0]:
grid,score,y_pred, y_true = ML_pipeline_xgb_GridSearchCV(X, y, groups = subject_id,random_state = 42, n_folds=4)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  result = op(x, *args, **kwargs)


In [0]:
best_xgb_model = grid.best_estimator_[1]
best_xgb_model.feature_importances_

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.02340669,
       0.06743795, 0.08141439, 0.04149947, 0.02634412, 0.02336492,
       0.02665498, 0.11800991, 0.04225803, 0.04818384, 0.02195149,
       0.01357245, 0.0544764 , 0.        , 0.0246471 , 0.04271113,
       0.03418704, 0.02011596, 0.00718396, 0.03068252, 0.04732987,
       0.04910535, 0.04240812, 0.08524008, 0.02781428, 0.        ],
      dtype=float32)

In [0]:
score

0.9202072538860103

In [0]:
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.92      1.00      0.96       888
         1.0       0.00      0.00      0.00        77

    accuracy                           0.92       965
   macro avg       0.46      0.50      0.48       965
weighted avg       0.85      0.92      0.88       965



  _warn_prf(average, modifier, msg_start, len(result))
