# Final Calibrated XGBoost

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.feature_selection import SelectFromModel
import xgboost
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, auc, classification_report

In [3]:
data = pd.read_csv('./data/data_processed.csv')

data['target'] = data['readmitted'].isin(['<30','>30']).astype(int)

data = data.drop(columns='readmitted')

In [4]:
X = data.drop(columns = 'target')
y = data['target']

In [5]:
X['strat_col'] = X['race'].astype(str) + "_" + data['target'].astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 42)
X_train = X_train.drop(columns = ['strat_col' , 'race'])
X_test = X_test.drop(columns = ['strat_col' , 'race'])

In [6]:
def custom_fit(alg, train_x, train_y, cv_folds=5, early_stopping_rounds=50):
    
    
    xgb_parameters = alg.get_xgb_params()
    xgb_train = xgboost.DMatrix(train_x.values, label=train_y.values)
    cross_val = xgboost.cv(xgb_parameters, xgb_train, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
        metrics='auc', early_stopping_rounds=early_stopping_rounds)
    alg.set_params(n_estimators=cross_val.shape[0])
    
    #fitting the training data
    alg.fit(train_x, train_y)
        
    y_pred = alg.predict(train_x)
    y_predprobs = alg.predict_proba(train_x)[:,1]
    
    print ("\nModel Report")
    print ("Cassification report:\n", classification_report(train_y, y_pred))
    print ("Accuracy : %.4g" % accuracy_score(train_y, y_pred))
    print ("AUC Score (Train): %f" % roc_auc_score(train_y, y_predprobs))


In [7]:
#features = X_train.columns.tolist()

xgb = XGBClassifier(
 learning_rate=0.1,
 n_estimators=500,
 max_depth=5,
 min_child_weight=3,
 gamma=0.01,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 n_jobs=-1,
 scale_pos_weight=1,
 seed=42)

custom_fit(xgb, X_train, y_train)


Model Report
Cassification report:
               precision    recall  f1-score   support

           0       0.79      0.91      0.85     40814
           1       0.88      0.73      0.80     36872

    accuracy                           0.83     77686
   macro avg       0.84      0.82      0.82     77686
weighted avg       0.83      0.83      0.82     77686

Accuracy : 0.8258
AUC Score (Train): 0.896758


In [8]:
#Probably need to search in a smaller/different range for these
param_test1 = {
 'max_depth':range(1,10,1),
 'min_child_weight':range(1,7,1)
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=500, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=42), 
 param_grid = param_test1, scoring='roc_auc',n_jobs=-1, cv=5)
gsearch1.fit(X_train,y_train)
#gsearch1.cv_results_, 
gsearch1.best_params_, gsearch1.best_score_

({'max_depth': 4, 'min_child_weight': 6}, np.float64(0.8757363071863793))

In [12]:
%%time
#XGB calibration:

parameters = {
    'max_depth': range(3, 11, 1),
    'min_child_weight': range(1, 11, 1),
    'gamma': np.linspace(0, 1, 10),
    'subsample': np.linspace(0.5, 1, 10),
    'colsample_bytree': np.linspace(0.5, 1, 10),
    'n_estimators': range(200, 1000, 100),
    'learning_rate': np.logspace(-3,0,10),
    'reg_alpha': np.logspace(-4,2,10),
    'reg_lambda': np.logspace(-4,2,10)
}

xgb = XGBClassifier(objective= 'binary:logistic', random_state = 42)

grid_search = RandomizedSearchCV(estimator = xgb, n_iter = 50,
                           param_distributions=parameters,
                           scoring = 'roc_auc',
                           n_jobs=-1, verbose=1, cv=5)

grid_search.fit(X_train, y_train)

params1 = grid_search.best_params_

print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
{'subsample': np.float64(0.8888888888888888), 'reg_lambda': np.float64(0.00046415888336127773), 'reg_alpha': np.float64(4.641588833612772), 'n_estimators': 900, 'min_child_weight': 9, 'max_depth': 5, 'learning_rate': np.float64(0.046415888336127774), 'gamma': np.float64(0.8888888888888888), 'colsample_bytree': np.float64(0.7222222222222222)}
0.8773121393971449
CPU times: total: 7.86 s
Wall time: 2min 54s


In [15]:
xgbCal = XGBClassifier(**params1, object = 'binary:logistic', random_state=42)

custom_fit(xgbCal, X_train, y_train)
#xgbCal.get_params()

Parameters: { "object" } are not used.

Parameters: { "object" } are not used.

Parameters: { "object" } are not used.




Model Report
Cassification report:
               precision    recall  f1-score   support

           0       0.79      0.91      0.84     40814
           1       0.88      0.73      0.80     36872

    accuracy                           0.82     77686
   macro avg       0.83      0.82      0.82     77686
weighted avg       0.83      0.82      0.82     77686

Accuracy : 0.824
AUC Score (Train): 0.895707


In [15]:
np.linspace(0.5,1,5)

array([0.5  , 0.625, 0.75 , 0.875, 1.   ])

In [18]:
np.linspace(0.5, 1, 10)

array([0.5       , 0.55555556, 0.61111111, 0.66666667, 0.72222222,
       0.77777778, 0.83333333, 0.88888889, 0.94444444, 1.        ])

In [26]:
a = [0].extend(np.logspace(-3,3,10).tolist())

In [29]:
print(a)

None


In [30]:
a=[0]

In [33]:
print(a.extend(np.logspace(-3,3,10).tolist()))

None
