In [1]:
import pandas as pd
import numpy as np
np.random.seed(0)

import scipy.stats
from decimal import Decimal, getcontext

import eli5
from eli5.sklearn import PermutationImportance
from eli5.permutation_importance import get_score_importances

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split

import xgboost as xgb
from sklearn.metrics import fbeta_score

In [2]:
# Load both splits and replace every missing value with the sentinel -1.
# The `test` column records which split a row belongs to (0 = train, 1 = test).
train = pd.read_hdf('../input/diabetic_train.h5').fillna(-1)
train['test'] = 0

test = pd.read_hdf('../input/diabetic_test.h5').fillna(-1)
# Placeholder target column for the test split; it is set after the fillna,
# so it stays NaN rather than becoming -1.
test['readmitted'] = np.nan
test['test'] = 1

In [3]:
# changing features to numbers
def feature_engineering(df, cat_features):
    """Add integer-coded categoricals and per-patient aggregates to ``df``.

    The frame is modified in place: new columns are appended and the
    ``change`` / ``diabetesMed`` columns are overwritten with int8 flags.
    The same object is returned. All cumulative features follow the current
    row order inside each ``patient_nbr`` group, so any sorting must be done
    by the caller beforehand.

    Parameters
    ----------
    df : pandas.DataFrame
        Encounter-level data. Columns read here: ``patient_nbr``, ``change``,
        ``diabetesMed``, ``diag_1``..``diag_3``, ``num_medications``,
        ``discharge_disposition_id``, ``time_in_hospital``,
        ``number_inpatient``, ``number_outpatient``, ``number_emergency``,
        ``number_diagnoses``, ``num_lab_procedures``, ``num_procedures`` and
        every column named in ``cat_features``.
    cat_features : iterable
        Column names to label-encode; each one gets a ``<name>_cat`` column
        holding the codes produced by ``Series.factorize``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the engineered columns added.
    """
    flagged_codes = ['428', '398', '402', '404', '410']

    def per_patient(column):
        # Build a fresh groupby on every call so that columns created earlier
        # in this function are available for selection.
        return df.groupby('patient_nbr')[column]

    def disposition_digit(code):
        # Code 11 contributes 0; any other code contributes the last
        # character of its string form, converted to int.
        if code == 11:
            return 0
        return int(str(code)[-1])

    def adherence(meds):
        # Running sum of encounter-to-encounter changes in medication count,
        # divided by the running total of medications for the patient.
        return meds.diff().fillna(0).cumsum() / meds.cumsum()

    # Label-encode categorical columns.
    for name in cat_features:
        df['{}_cat'.format(name)] = df[name].factorize()[0]

    # Boolean flags -> 0/1 stored as int8.
    for flag in ('change', 'diabetesMed'):
        df[flag] = df[flag].replace({True: 1, False: 0}).astype('int8')

    # Count how many of the three diagnosis slots hold one of the flagged
    # codes, then accumulate that count over each patient's encounters.
    hits = [
        np.where(df[diag].isin(flagged_codes), 1, 0)
        for diag in ('diag_1', 'diag_2', 'diag_3')
    ]
    df['comorbidity_score'] = hits[0] + hits[1] + hits[2]
    df['comorbidity_score'] = per_patient('comorbidity_score').transform('cumsum')

    df['medication_adherence'] = per_patient('num_medications').transform(adherence)

    # Disposition-derived digit plus the row-wise sum of three count columns.
    row_total = df[['time_in_hospital', 'number_inpatient', 'number_outpatient']].sum(axis=1)
    df['days_in_hospital'] = df['discharge_disposition_id'].apply(disposition_digit)
    df['days_in_hospital'] += row_total
    df['length_of_stay'] = per_patient('days_in_hospital').transform('cumsum')

    # Per-patient running totals. The order of this table fixes the order in
    # which the columns are appended to the frame.
    running_totals = (
        ('num_lab_procedures', 'num_lab_procedures_pp'),
        ('num_procedures', 'num_procedures_pp'),
        ('num_medications', 'num_medications_pp'),
        ('number_outpatient', 'number_outpatient_pp'),
        ('number_emergency', 'number_emergency_pp'),
        ('number_diagnoses', 'number_diagnoses_pp'),
        ('change', 'change_pp'),
        ('diabetesMed', 'diabetesMed_pp'),
        ('number_inpatient', 'number_inpatient_pp_cs'),
    )
    for source, target in running_totals:
        df[target] = per_patient(source).transform('cumsum')

    # Group-wide statistics of the running inpatient total. These use every
    # row of the patient, not only the rows up to the current one.
    for stat in ('mean', 'median', 'std'):
        df['number_inpatient_pp_cs_' + stat] = per_patient('number_inpatient_pp_cs').transform(stat)

    return df
    
def get_feats(df, black_list=None):
    """Return the names of numeric and boolean columns usable as features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    black_list : collection of str, optional
        Column names to leave out. When omitted, ``['id', 'readmitted']``
        is used.

    Returns
    -------
    list
        Column names in the frame's own column order, restricted to numeric
        and bool dtypes and excluding everything in ``black_list``.
    """
    excluded = ['id', 'readmitted'] if black_list is None else black_list

    candidates = df.select_dtypes(include=[np.number, bool]).columns.values
    selected = []
    for name in candidates:
        if name in excluded:
            continue
        selected.append(name)
    return selected

In [4]:
#merge train & test into one dataframe
# Stack both splits without re-sorting the columns, then order the rows by
# patient and encounter id; the per-patient cumulative features computed
# later follow this row order.
dx = pd.concat([train, test], sort=False)
df_all = dx.sort_values(by=['patient_nbr', 'encounter_id'])

# Object-dtype columns are the ones that get label-encoded.
cat_features = df_all.select_dtypes(include=[object]).columns

In [5]:
# Engineer features on a copy (feature_engineering mutates the frame it is
# given), then collect the numeric/bool columns to feed the model.
df_all = feature_engineering(df=df_all.copy(), cat_features=cat_features)
feats = get_feats(df=df_all)

In [6]:
# Split the combined frame back into its two parts using the `test` flag.
df_train = df_all.loc[df_all['test'].eq(0)]
# Copy the test part because a prediction column is written into it later.
df_test = df_all.loc[df_all['test'].eq(1)].copy()

# Plain arrays for the model; column order follows `feats`.
X_train = df_train[feats].to_numpy()
y_train = df_train["readmitted"].to_numpy()
X_test = df_test[feats].to_numpy()

print(X_train.shape, X_test.shape)

(33051, 66) (33170, 66)


In [7]:
# df_all.info()

In [8]:
# Spot-check one patient: running inpatient total next to the raw counts.
df_all.loc[
    df_all.patient_nbr == 88785891,
    ['encounter_id', 'patient_nbr', 'number_inpatient', 'number_inpatient_pp_cs', 'test', 'readmitted'],
]
# df_all[['encounter_id', 'patient_nbr', 'number_inpatient', 'number_inpatient_pp_cs', 'test', 'readmitted']][df_all.readmitted == 0]

Unnamed: 0,encounter_id,patient_nbr,number_inpatient,number_inpatient_pp_cs,test,readmitted
40252,125094312,88785891,2,2,1,
44515,137245596,88785891,5,7,0,1.0
45147,139425576,88785891,6,13,0,1.0
45986,141994242,88785891,7,20,1,
50167,150986298,88785891,9,29,0,1.0
50393,151413846,88785891,9,38,1,
50773,152188656,88785891,10,48,1,
51519,153558456,88785891,11,59,0,1.0
53771,157363182,88785891,13,72,0,1.0
62759,174689286,88785891,11,83,1,


In [9]:
# Probability cut-off above which a row is labelled 1.
threshold = 0.15

xgb_params = {
    'objective': 'binary:logistic',
    'learning_rate': 0.01,
    'n_estimators': 1000,
    'max_depth': 7,
    'gamma': 10,
    'reg_alpha': 0.3,
    'subsample': 0.8,
    'colsample_bytree': 0.4,
    'random_state': 0,
}
xgb_model = xgb.XGBClassifier(**xgb_params)

# Fit on every labelled row.
xgb_model.fit(X_train, y_train)

# Column 1 of predict_proba is taken as the positive-class probability
# (assumes the labels are 0/1 -- confirm against xgb_model.classes_).
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]
y_pred = (y_pred_proba > threshold).astype("int8")

In [10]:
# Store the thresholded predictions on the test frame and write the
# two-column submission file (no index column).
df_test['readmitted'] = y_pred
df_test.to_csv('../output/DP_28E_27.csv', columns=['id', 'readmitted'], index=False)

In [11]:
# Hold out part of the labelled data (default split proportions) so that
# permutation importance is measured on rows the model was not fitted on.
X_sub_train, X_sub_test, y_sub_train, y_sub_test = train_test_split(
    X_train,
    y_train,
    shuffle=True,
    random_state=0,
)

# Same hyper-parameters as the submission model.
model_params = {
    'objective': 'binary:logistic',
    'learning_rate': 0.01,
    'n_estimators': 1000,
    'max_depth': 7,
    'gamma': 10,
    'reg_alpha': 0.3,
    'subsample': 0.8,
    'colsample_bytree': 0.4,
    'random_state': 0,
}
model = xgb.XGBClassifier(**model_params)
model.fit(X_sub_train, y_sub_train)

perm = PermutationImportance(model).fit(
    X_sub_test,
    y_sub_test,
)
eli5.show_weights(perm, feature_names=feats, top=75)

Weight,Feature
0.1747  ± 0.0069,number_inpatient_pp_cs_std
0.0238  ± 0.0015,number_inpatient
0.0208  ± 0.0017,number_inpatient_pp_cs
0.0139  ± 0.0021,number_inpatient_pp_cs_mean
0.0032  ± 0.0009,encounter_id
0.0027  ± 0.0003,number_diagnoses_pp
0.0023  ± 0.0012,number_inpatient_pp_cs_median
0.0012  ± 0.0006,patient_nbr
0.0012  ± 0.0007,discharge_disposition_id
0.0005  ± 0.0003,payer_code_cat
