In [12]:
# import libraries 
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [13]:
# load data 
data = pd.read_csv('./diabetic_data.csv')

In [14]:
# split data into X and y 
X = data.drop('readmitted', axis = 1)
y = data['readmitted']

In [15]:
# make classification binary 
y = y.replace('>30', 'YES')
y = y.replace('<30', 'YES')

In [16]:
# drop columns that we don't need 
columns_to_drop = ['encounter_id', 'patient_nbr', 'weight', 'payer_code', 'medical_specialty', 'examide', 'citoglipton']

X = X.drop(columns_to_drop, axis = 1)

In [17]:
import re 

X['diag_1'] = X['diag_1'].astype(str).apply(lambda x: re.sub( r'\.*$', '', x))

In [18]:
X['diag_1']

0         250.83
1            276
2            648
3              8
4            197
           ...  
101761    250.13
101762       560
101763        38
101764       996
101765       530
Name: diag_1, Length: 101766, dtype: object

In [19]:
# split the data into dev and test set 
from sklearn.model_selection import train_test_split

X_dev, X_test, y_dev, y_test = train_test_split(X, y, stratify = y, test_size = 0.2, random_state = 10)
X_train, X_val, y_train, y_val = train_test_split(X_dev, y_dev, stratify = y_dev, test_size = 0.2, random_state = 10)

In [20]:
# impute missing values for categorical variables 
from sklearn.impute import SimpleImputer 

feature_names = X.columns

imp = SimpleImputer(missing_values = '?', strategy = 'most_frequent')

X_train = pd.DataFrame(imp.fit_transform(X_train), columns = feature_names)
X_val = pd.DataFrame(imp.transform(X_val), columns = feature_names)
X_test = pd.DataFrame(imp.transform(X_test), columns = feature_names)

In [21]:
# label encode target variable 
from sklearn.preprocessing import LabelEncoder 

le = LabelEncoder() 

y_train = pd.Series(le.fit_transform(y_train))
y_val = pd.Series(le.transform(y_val))
y_test = pd.Series(le.transform(y_test))

In [22]:
# create pipeline for preprocessing 
from sklearn.compose import make_column_transformer 
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from category_encoders import TargetEncoder 

te_features = ['race', 'gender', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
              'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 
              'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 
              'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 
              'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 
              'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 
              'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed']

oe_features = ['age']

other_features = []
for i in feature_names: 
    if i not in (te_features + oe_features): 
        other_features.append(i)

preprocess = make_column_transformer((OrdinalEncoder(), oe_features), 
                                    (TargetEncoder(), te_features), remainder = 'passthrough')

In [23]:
# target encode variables 
X_train = preprocess.fit_transform(X_train, y_train)
X_val = preprocess.transform(X_val)
X_test = preprocess.transform(X_test)

In [24]:
# scale the data 
ss = StandardScaler()

X_train = ss.fit_transform(X_train)
X_val = ss.fit_transform(X_val)
X_test = ss.fit_transform(X_test)

In [25]:
pd.DataFrame(X_train, columns = oe_features + te_features + other_features).to_csv('X_train.csv', index = False)
pd.DataFrame(X_val, columns = oe_features + te_features + other_features).to_csv('X_val.csv', index = False)
pd.DataFrame(X_test, columns = oe_features + te_features + other_features).to_csv('X_test.csv', index = False)

y_train.to_csv('y_train.csv', index = False)
y_val.to_csv('y_val.csv', index = False)
y_test.to_csv('y_test.csv', index = False)

Now do it again to make dev and test set!

In [26]:
X_dev, X_test, y_dev, y_test = train_test_split(X, y, stratify = y, test_size = 0.2, random_state = 10)

# impute missing values for categorical variables 
from sklearn.impute import SimpleImputer 

feature_names = X.columns

imp = SimpleImputer(missing_values = '?', strategy = 'most_frequent')

X_dev = pd.DataFrame(imp.fit_transform(X_dev), columns = feature_names)
X_test = pd.DataFrame(imp.transform(X_test), columns = feature_names)

le = LabelEncoder() 

y_dev = pd.Series(le.fit_transform(y_dev))
y_test = pd.Series(le.transform(y_test))

te_features = ['race', 'gender', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
              'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 
              'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 
              'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 
              'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 
              'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 
              'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed']

oe_features = ['age']

preprocess = make_column_transformer((OrdinalEncoder(), oe_features), 
                                    (TargetEncoder(), te_features), remainder = 'passthrough')

X_dev = preprocess.fit_transform(X_dev, y_dev)
X_test = preprocess.transform(X_test)

ss = StandardScaler()

X_dev = ss.fit_transform(X_dev)
X_test = ss.fit_transform(X_test)

pd.DataFrame(X_dev, columns = oe_features + te_features + other_features).to_csv('X_dev_final.csv', index = False)
pd.DataFrame(X_test, columns = oe_features + te_features + other_features).to_csv('X_test_final.csv', index = False)
y_dev.to_csv('y_dev_final.csv', index = False)
y_test.to_csv('y_test_final.csv', index = False)

### XGBoost

In [27]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
import time
from hypopt import GridSearch

In [29]:
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train, y_train, eval_metric=f1_score)



In [44]:
# F1 score (baseline with default hyperparams)
f1_score(y_test, xgb.predict(X_test))

0.5765463499107093

In [31]:
# F1 macro score (baseline with default hyperparams)
f1_score(y_test, xgb.predict(X_test), average='macro')

0.6375364163145089

In [33]:
# Classification accuracy (baseline with default hyperparams)
accuracy_score(y_test, xgb.predict(X_test))

0.641446398742262

In [37]:
# Perform grid search
n_estimators = 50 * np.arange(1, 6) # 5
learning_rate = [0.001, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4] # 7
max_depth = np.arange(3, 11) # 8
val_scores = np.zeros((len(n_estimators), len(learning_rate), len(max_depth)))

t0 = time.time()

for i in range(len(n_estimators)):
    estimators = n_estimators[i]
    for j in range(len(learning_rate)):
        lr = learning_rate[j]
        for k in range(len(max_depth)):
            depth = max_depth[k]
            xgb = XGBClassifier(random_state=42, n_estimators=estimators, learning_rate=lr, max_depth=depth)
            xgb.fit(X_train, y_train, eval_metric=f1_score)
            val_scores[i, j, k] = f1_score(y_val, xgb.predict(X_val), average='macro')  
            
            
t1 = time.time()





















In [38]:
print(f"Grid search time: {(t1 - t0) / 60} minutes")
print(f"Best validation score: {np.max(val_scores):.3f}")
where = np.where(val_scores == val_scores.max())
idx = where[0][0], where[1][0], where[2][0]
print(f"Best n_estimators: {n_estimators[idx[0]]}")
print(f"Best learning_rate: {learning_rate[idx[1]]}")
print(f"Best max_depth: {max_depth[idx[2]]}")

Grid search time: 28.59278609752655 minutes
Best validation score: 0.641
Best n_estimators: 250
Best learning_rate: 0.3
Best max_depth: 3


In [39]:
best_xgb = XGBClassifier(random_state=42, n_estimators=n_estimators[idx[0]],learning_rate=learning_rate[idx[1]], max_depth=max_depth[idx[2]])
best_xgb.fit(X_dev, y_dev) 



In [43]:
# Final Evaluation Scores
print(f"F1 Score: {f1_score(y_test, best_xgb.predict(X_test))}")
print(f"F1 Macro Score: {f1_score(y_test, best_xgb.predict(X_test), average='macro')}")
print(f"Accuracy Score: {accuracy_score(y_test, best_xgb.predict(X_test))}")

F1 Score: 0.5969013926953409
F1 Macro Score: 0.6406032216444422
Accuracy Score: 0.64591726441977
