In [1]:
# import libraries 
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [2]:
# load data 
data = pd.read_csv('./diabetic_data.csv')

In [3]:
# split data into X and y 
X = data.drop('readmitted', axis = 1)
y = data['readmitted']

In [4]:
# make classification binary 
y = y.replace('>30', 'YES')
y = y.replace('<30', 'YES')

In [5]:
# drop columns that we don't need 
columns_to_drop = ['encounter_id', 'patient_nbr', 'weight', 'payer_code', 'medical_specialty', 'examide', 'citoglipton']

X = X.drop(columns_to_drop, axis = 1)

In [6]:
import re 

X['diag_1'] = X['diag_1'].astype(str).apply(lambda x: re.sub( r'\.*$', '', x))

In [7]:
X['diag_1']

0         250.83
1            276
2            648
3              8
4            197
           ...  
101761    250.13
101762       560
101763        38
101764       996
101765       530
Name: diag_1, Length: 101766, dtype: object

In [8]:
# split the data into dev and test set 
from sklearn.model_selection import train_test_split

X_dev, X_test, y_dev, y_test = train_test_split(X, y, stratify = y, test_size = 0.2, random_state = 10)
X_train, X_val, y_train, y_val = train_test_split(X_dev, y_dev, stratify = y_dev, test_size = 0.2, random_state = 10)

In [9]:
# impute missing values for categorical variables 
from sklearn.impute import SimpleImputer 

feature_names = X.columns

imp = SimpleImputer(missing_values = '?', strategy = 'most_frequent')

X_train = pd.DataFrame(imp.fit_transform(X_train), columns = feature_names)
X_val = pd.DataFrame(imp.transform(X_val), columns = feature_names)
X_test = pd.DataFrame(imp.transform(X_test), columns = feature_names)

In [10]:
# label encode target variable 
from sklearn.preprocessing import LabelEncoder 

le = LabelEncoder() 

y_train = pd.Series(le.fit_transform(y_train))
y_val = pd.Series(le.transform(y_val))
y_test = pd.Series(le.transform(y_test))

In [11]:
# create pipeline for preprocessing 
from sklearn.compose import make_column_transformer 
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from category_encoders import TargetEncoder 

te_features = ['race', 'gender', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
              'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 
              'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 
              'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 
              'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 
              'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 
              'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed']

oe_features = ['age']

other_features = []
for i in feature_names: 
    if i not in (te_features + oe_features): 
        other_features.append(i)

preprocess = make_column_transformer((OrdinalEncoder(), oe_features), 
                                    (TargetEncoder(), te_features), remainder = 'passthrough')

In [12]:
# target encode variables 
X_train = preprocess.fit_transform(X_train, y_train)
X_val = preprocess.transform(X_val)
X_test = preprocess.transform(X_test)

In [13]:
# scale the data 
ss = StandardScaler()

X_train = ss.fit_transform(X_train)
X_val = ss.fit_transform(X_val)
X_test = ss.fit_transform(X_test)

In [14]:
pd.DataFrame(X_train, columns = oe_features + te_features + other_features).to_csv('X_train.csv', index = False)
pd.DataFrame(X_val, columns = oe_features + te_features + other_features).to_csv('X_val.csv', index = False)
pd.DataFrame(X_test, columns = oe_features + te_features + other_features).to_csv('X_test.csv', index = False)

y_train.to_csv('y_train.csv', index = False)
y_val.to_csv('y_val.csv', index = False)
y_test.to_csv('y_test.csv', index = False)

Now do it again to make dev and test set!

In [15]:
X_dev, X_test, y_dev, y_test = train_test_split(X, y, stratify = y, test_size = 0.2, random_state = 10)

# impute missing values for categorical variables 
from sklearn.impute import SimpleImputer 

feature_names = X.columns

imp = SimpleImputer(missing_values = '?', strategy = 'most_frequent')

X_dev = pd.DataFrame(imp.fit_transform(X_dev), columns = feature_names)
X_test = pd.DataFrame(imp.transform(X_test), columns = feature_names)

le = LabelEncoder() 

y_dev = pd.Series(le.fit_transform(y_dev))
y_test = pd.Series(le.transform(y_test))

te_features = ['race', 'gender', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
              'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 
              'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 
              'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 
              'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 
              'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 
              'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed']

oe_features = ['age']

preprocess = make_column_transformer((OrdinalEncoder(), oe_features), 
                                    (TargetEncoder(), te_features), remainder = 'passthrough')

X_dev = preprocess.fit_transform(X_dev, y_dev)
X_test = preprocess.transform(X_test)

ss = StandardScaler()

X_dev = ss.fit_transform(X_dev)
X_test = ss.fit_transform(X_test)

pd.DataFrame(X_dev, columns = oe_features + te_features + other_features).to_csv('X_dev_final.csv', index = False)
pd.DataFrame(X_test, columns = oe_features + te_features + other_features).to_csv('X_test_final.csv', index = False)
y_dev.to_csv('y_dev_final.csv', index = False)
y_test.to_csv('y_test_final.csv', index = False)

### Adaboost

In [16]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import f1_score, accuracy_score
abc = AdaBoostClassifier(random_state=42)
abc_model1 = abc.fit(X_train, y_train)
y_pred = abc_model1.predict(X_test)

In [17]:
print('Base Adaboost model accuracy score', accuracy_score(y_test, y_pred))
print('Base Adaboost model F1 score', f1_score(y_test, y_pred))
print('Base Adaboost model F1 Macro score', f1_score(y_test, y_pred, average='macro'))

Base Adaboost model accuracy score 0.6384494448265697
Base Adaboost model F1 score 0.5744028685443293
Base Adaboost model F1 Macro score 0.6300719983922484


In [18]:
# # Adaboost with SVC base estimator
# from sklearn.svm import SVC
# svc = SVC(probability=True, kernel='linear')
# abc_svc = AdaBoostClassifier(base_estimator=svc, random_state=42)

# abc_model2 = abc_svc.fit(X_train, y_train)

In [19]:
# y_pred = abc_model2.predict(X_test)
# print('Adaboost with SVC base estimator accuracy score', accuracy_score(y_test, y_pred))
# print('Adaboost with SVC base estimator F1 score', f1_score(y_test, y_pred))
# print('Adaboost with SVC base estimator F1 Macro score', f1_score(y_test, y_pred, average='macro'))

In [20]:
# grid search hyperparameter tuning
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

abc_model = AdaBoostClassifier(random_state=42)

grid = dict()
grid['n_estimators'] = [50, 100, 500]
grid['learning_rate'] = [0.01, 0.1, 1.0]
# define the evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

grid_search = GridSearchCV(estimator=abc_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy')

grid_result = grid_search.fit(X_dev, y_dev)


print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


Best: 0.653765 using {'learning_rate': 1.0, 'n_estimators': 500}
0.612748 (0.004867) with: {'learning_rate': 0.01, 'n_estimators': 50}
0.621874 (0.004812) with: {'learning_rate': 0.01, 'n_estimators': 100}
0.634133 (0.004820) with: {'learning_rate': 0.01, 'n_estimators': 500}
0.634620 (0.004515) with: {'learning_rate': 0.1, 'n_estimators': 50}
0.643947 (0.004530) with: {'learning_rate': 0.1, 'n_estimators': 100}
0.652005 (0.004642) with: {'learning_rate': 0.1, 'n_estimators': 500}
0.649646 (0.004745) with: {'learning_rate': 1.0, 'n_estimators': 50}
0.652005 (0.004408) with: {'learning_rate': 1.0, 'n_estimators': 100}
0.653765 (0.004331) with: {'learning_rate': 1.0, 'n_estimators': 500}


In [22]:
best_lr, best_n_est = grid_result.best_params_['learning_rate'], grid_result.best_params_['n_estimators']
abc_model = AdaBoostClassifier(n_estimators=best_n_est, learning_rate=best_lr, random_state=42)

abc_model.fit(X_dev, y_dev)

In [23]:
y_pred = abc_model.predict(X_test)
print('Tuned Adaboost model accuracy score', accuracy_score(y_test, y_pred))
print('Tuned Adaboost model F1 score', f1_score(y_test, y_pred))
print('Tuned Adaboost model F1 Macro score', f1_score(y_test, y_pred, average='macro'))

Tuned Adaboost model accuracy score 0.6416429203105041
Tuned Adaboost model F1 score 0.5806599977003565
Tuned Adaboost model F1 Macro score 0.6339003857421746


### HistGradientBoosting

In [24]:
from sklearn.ensemble import HistGradientBoostingClassifier

hgb = HistGradientBoostingClassifier(random_state=42)
hgb_model1 = hgb.fit(X_train, y_train)
y_pred = hgb_model1.predict(X_test)


In [25]:
print('Base HistGradientBoosting model accuracy score', accuracy_score(y_test, y_pred))
print('Base HistGradientBoosting model F1 score', f1_score(y_test, y_pred))
print('Base HistGradientBoosting model F1 Macro score', f1_score(y_test, y_pred, average='macro'))


Base HistGradientBoosting model accuracy score 0.6442959614817726
Base HistGradientBoosting model F1 score 0.6035049288061336
Base HistGradientBoosting model F1 Macro score 0.6404908820794745


In [26]:
# grid search hyperparameter tuning
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

hgb_model = HistGradientBoostingClassifier(random_state=42)

grid = {
    "max_depth": [25, 50, 75],
    "max_iter": [100, 500, 1000],
    "learning_rate": [0.01, 0.05, 0.1, 1],
}
# define the evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

grid_search = GridSearchCV(estimator=hgb_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy')

grid_result = grid_search.fit(X_dev, y_dev)


print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.657594 using {'learning_rate': 0.05, 'max_depth': 25, 'max_iter': 500}
0.643501 (0.005242) with: {'learning_rate': 0.01, 'max_depth': 25, 'max_iter': 100}
0.656259 (0.005116) with: {'learning_rate': 0.01, 'max_depth': 25, 'max_iter': 500}
0.657200 (0.004557) with: {'learning_rate': 0.01, 'max_depth': 25, 'max_iter': 1000}
0.643501 (0.005242) with: {'learning_rate': 0.01, 'max_depth': 50, 'max_iter': 100}
0.656259 (0.005116) with: {'learning_rate': 0.01, 'max_depth': 50, 'max_iter': 500}
0.657200 (0.004557) with: {'learning_rate': 0.01, 'max_depth': 50, 'max_iter': 1000}
0.643501 (0.005242) with: {'learning_rate': 0.01, 'max_depth': 75, 'max_iter': 100}
0.656259 (0.005116) with: {'learning_rate': 0.01, 'max_depth': 75, 'max_iter': 500}
0.657200 (0.004557) with: {'learning_rate': 0.01, 'max_depth': 75, 'max_iter': 1000}
0.656496 (0.004129) with: {'learning_rate': 0.05, 'max_depth': 25, 'max_iter': 100}
0.657594 (0.003803) with: {'learning_rate': 0.05, 'max_depth': 25, 'max_iter':

In [28]:
best_lr, best_max_depth, best_max_iter = grid_result.best_params_['learning_rate'], grid_result.best_params_['max_depth'], grid_result.best_params_['max_iter']
hgb_model = HistGradientBoostingClassifier(max_depth=best_max_depth, learning_rate=best_lr, max_iter=best_max_iter, random_state=42)

hgb_model.fit(X_dev, y_dev)

In [29]:
y_pred = hgb_model.predict(X_test)
print('Tuned Adaboost model accuracy score', accuracy_score(y_test, y_pred))
print('Tuned Adaboost model F1 score', f1_score(y_test, y_pred))
print('Tuned Adaboost model F1 Macro score', f1_score(y_test, y_pred, average='macro'))

Tuned Adaboost model accuracy score 0.6469490026530411
Tuned Adaboost model F1 score 0.604599977990536
Tuned Adaboost model F1 Macro score 0.6428520436682067
