# Final Calibrated XGBoost

In [130]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [131]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.feature_selection import SelectFromModel
import xgboost
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, auc, classification_report

In [132]:
data = pd.read_csv('./data/data_processed.csv')

data['target'] = data['readmitted'].isin(['<30','>30']).astype(int)

data = data.drop(columns='readmitted')

In [133]:
X = data.drop(columns = 'target')
y = data['target']

In [134]:
X['strat_col'] = X['race'].astype(str) + "_" + data['target'].astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 42)
X_train = X_train.drop(columns = ['strat_col' , 'race'])
X_test = X_test.drop(columns = ['strat_col' , 'race'])

In [135]:
#XGB = Pipeline(steps=[
#    ('preprocessor', preprocessor), 
#    ('model', xgboost.XGBClassifier(**best_params, eval_metric='logloss', random_state=43))  
#])
#
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=43)
#
#XGB.fit(X_train, y_train)
#
#probs =XGB.predict_proba(X_test)
#y_pred = (probs[:, 1]>=0.5)
#
#false_pos_rate, true_pos_rate, threshold = roc_curve(y_test, y_pred)
#print(auc(false_pos_rate, true_pos_rate))
#print(accuracy_score(y_test, y_pred))

In [136]:
def custom_fit(alg, train_x, train_y, cv_folds=5, early_stopping_rounds=50):
    
    
    xgb_parameters = alg.get_xgb_params()
    xgb_train = xgboost.DMatrix(train_x.values, label=train_y.values)
    cross_val = xgboost.cv(xgb_parameters, xgb_train, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
        metrics='auc', early_stopping_rounds=early_stopping_rounds)
    alg.set_params(n_estimators=cross_val.shape[0])
    
    #fitting the training data
    alg.fit(train_x, train_y)
        
    y_pred = alg.predict(train_x)
    y_predprobs = alg.predict_proba(train_x)[:,1]
    
    print ("\nModel Report")
    print ("Cassification report:\n", classification_report(train_y, y_pred))
    print ("Accuracy : %.4g" % accuracy_score(train_y, y_pred))
    print ("AUC Score (Train): %f" % roc_auc_score(train_y, y_predprobs))


In [137]:
#features = X_train.columns.tolist()

xgb = XGBClassifier(
 learning_rate =0.1,
 n_estimators=500,
 max_depth=5,
 min_child_weight=3,
 gamma=0.01,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 n_jobs=-1,
 scale_pos_weight=1,
 seed=42)

custom_fit(xgb, X_train, y_train)


Model Report
Cassification report:
               precision    recall  f1-score   support

           0       0.79      0.91      0.85     40814
           1       0.88      0.73      0.80     36872

    accuracy                           0.83     77686
   macro avg       0.84      0.82      0.82     77686
weighted avg       0.83      0.83      0.82     77686

Accuracy : 0.8258
AUC Score (Train): 0.896758


In [None]:
#Probably need to search in a smaller/different range for these
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='roc_auc',n_jobs=-1, cv=5)
gsearch1.fit(X_train,y_train)
#gsearch1.cv_results_, 
gsearch1.best_params_, gsearch1.best_score_

({'max_depth': 7, 'min_child_weight': 5}, np.float64(0.8763136033192191))