In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import metrics
from sklearn import model_selection   #Additional   
from sklearn.model_selection import GridSearchCV   #Perforing grid search

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
# Default figure size (width, height) for every plot in this notebook.
rcParams['figure.figsize'] = (12, 4)

# Load the merged sheet and prepare it in one chain:
#   * drop the "(<=1)" suffix from the two FiO2 column names
#     (presumably because XGBoost rejects "<" in feature names -- TODO confirm),
#   * encode the sex column as 0 for 'M' and 1 for 'F'; any other value maps to NaN.
train = (
    pd.read_csv('all_sheets_merge.csv')
    .rename(columns={
        'ICU APACHEII FiO2(<=1)': 'ICU APACHEII FiO2',
        'RCC APACHEII FiO2(<=1)': 'RCC APACHEII FiO2',
    })
    .assign(**{'性別': lambda frame: frame['性別'].map({'M': 0, 'F': 1})})
)

# Column holding the label to predict, and the row-identifier column that must
# not be used as a feature.
target = '呼吸器成功脫離'
IDcol = 'No'

In [2]:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=60, target_col=None):
    """Fit ``alg`` on ``dtrain``, print training-set metrics and plot feature importances.

    Parameters
    ----------
    alg : XGBClassifier
        Estimator to fit. When ``useTrainCV`` is true its ``n_estimators`` is
        overwritten in place (see below).
    dtrain : pandas.DataFrame
        Frame containing the predictor columns and the label column.
    predictors : list of str
        Names of the feature columns in ``dtrain``.
    useTrainCV : bool, default True
        If true, run ``xgb.cv`` (AUC metric, early stopping) first and set
        ``alg.n_estimators`` to the number of rows in the CV result.
    cv_folds : int, default 5
        Number of folds passed to ``xgb.cv``.
    early_stopping_rounds : int, default 60
        Early-stopping patience passed to ``xgb.cv``.
    target_col : str, optional
        Name of the label column. Defaults to the notebook-level ``target``
        variable, which is what the function used before this argument existed.

    Returns
    -------
    None
        Results are printed and drawn on a bar chart.
    """
    # Resolve the label column once so every step below uses the same one.
    label_col = target if target_col is None else target_col
    features = dtrain[predictors]
    labels = dtrain[label_col]

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(features.values, label=labels.values)
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='auc',
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=None,
        )
        # One row per boosting round that was kept, so the row count is the
        # tree count to use for the final fit.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data.
    # `eval_metric` is intentionally not passed to fit(): no eval_set is given,
    # so the metric was never evaluated, and recent XGBoost releases reject the
    # keyword on fit().
    alg.fit(features, labels)

    # Predict on the training set (these are in-sample scores, not validation scores).
    dtrain_predictions = alg.predict(features)
    dtrain_predprob = alg.predict_proba(features)[:, 1]

    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(labels.values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(labels, dtrain_predprob))

    # `booster` is a string hyper-parameter on current XGBoost estimators, so
    # calling `alg.booster()` fails there; `get_booster()` is the accessor.
    # Fall back to the old method only for releases that predate it.
    booster = alg.get_booster() if hasattr(alg, 'get_booster') else alg.booster()
    feat_imp = pd.Series(booster.get_fscore()).sort_values(ascending=False)
    ax = feat_imp.plot(kind='bar', title='Feature Importances')
    ax.set_ylabel('Feature Importance Score')

In [3]:
# Feature columns: every column of `train` apart from the label and the ID column.
predictors = [col for col in train.columns if col not in (target, IDcol)]

# Baseline classifier with a large tree budget (n_estimators=1000).
xgb1 = XGBClassifier(
    # boosting schedule
    learning_rate=0.1,
    n_estimators=1000,
    # tree shape / split regularisation
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    # row and column sub-sampling per tree
    subsample=0.8,
    colsample_bytree=0.8,
    # learning task
    objective='binary:logistic',
    scale_pos_weight=1,
    # runtime / reproducibility
    nthread=4,
    seed=27,
)

In [4]:
# Grid search 1: tune max_depth (3..9) jointly with min_child_weight (1..5),
# i.e. 7 x 5 = 35 candidates, each scored by 5-fold cross-validated ROC AUC.
param_test1 = {
    'max_depth': range(3, 10, 1),
    'min_child_weight': range(1, 6, 1),
}
# NOTE: `iid=False` is no longer passed. The keyword was removed from
# GridSearchCV in scikit-learn 0.24 (passing it raises TypeError), and the
# unweighted mean over folds that it requested has been the only behaviour
# since 0.22. On scikit-learn < 0.22, add `iid=False` back to keep that mean.
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=200, max_depth=5,
        min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test1, scoring='roc_auc', n_jobs=4, cv=5)
gsearch1.fit(train[predictors], train[target])
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_



({'mean_fit_time': array([4.53242602, 3.40353212, 3.26769142, 2.91561542, 3.21017742,
         4.05996947, 3.90236979, 3.40301023, 3.41241031, 2.96728592,
         4.0619431 , 4.05741067, 3.27107415, 3.17433181, 3.04565396,
         4.28988943, 3.74131985, 3.35063858, 3.31694779, 3.16294413,
         4.48561401, 3.73502898, 3.54653287, 3.16655569, 3.05842013,
         4.40836163, 3.69884605, 3.41887674, 3.28862653, 3.28403945,
         4.55405536, 3.65861468, 3.54957547, 3.29839959, 2.72471929]),
  'std_fit_time': array([0.31553965, 0.31026685, 0.26812609, 0.24683395, 0.12870891,
         0.23404952, 0.06404731, 0.24161583, 0.09811667, 0.2282255 ,
         0.32630569, 0.22134238, 0.18329987, 0.21742931, 0.1645391 ,
         0.1523762 , 0.16258128, 0.22117986, 0.19792969, 0.06882655,
         0.15498939, 0.19755258, 0.137778  , 0.19850168, 0.07775264,
         0.25147969, 0.18496575, 0.10143388, 0.26220423, 0.05651683,
         0.12082679, 0.17321075, 0.19413874, 0.16964184, 0.21517406]

In [5]:
# Grid search 2: with max_depth fixed at 3, extend the min_child_weight search
# to 6..9 (4 candidates), scored by 5-fold cross-validated ROC AUC.
param_test2 = {
    'min_child_weight': range(6, 10, 1),
}
# NOTE: `iid=False` is no longer passed. The keyword was removed from
# GridSearchCV in scikit-learn 0.24 (passing it raises TypeError), and the
# unweighted mean over folds that it requested has been the only behaviour
# since 0.22. On scikit-learn < 0.22, add `iid=False` back to keep that mean.
gsearch2 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=200, max_depth=3,
        min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test2, scoring='roc_auc', n_jobs=4, cv=5)
gsearch2.fit(train[predictors], train[target])
gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_



({'mean_fit_time': array([2.95370049, 2.96447167, 2.80469861, 2.76799674]),
  'std_fit_time': array([0.04368133, 0.11696543, 0.11228992, 0.05538805]),
  'mean_score_time': array([0.07400198, 0.07340379, 0.07340422, 0.06981316]),
  'std_score_time': array([0.00791384, 0.00312859, 0.00312871, 0.00333798]),
  'param_min_child_weight': masked_array(data=[6, 7, 8, 9],
               mask=[False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'min_child_weight': 6},
   {'min_child_weight': 7},
   {'min_child_weight': 8},
   {'min_child_weight': 9}],
  'split0_test_score': array([0.90178571, 0.86607143, 0.87723214, 0.86160714]),
  'split1_test_score': array([0.80357143, 0.77678571, 0.77455357, 0.78571429]),
  'split2_test_score': array([0.80357143, 0.78125   , 0.81919643, 0.828125  ]),
  'split3_test_score': array([0.82366071, 0.81919643, 0.82589286, 0.80803571]),
  'split4_test_score': array([0.88392857, 0.87723214, 0.92857143, 0.90178571]),
  'mean

In [6]:
# Grid search 3: with max_depth=3 and min_child_weight=8 fixed, tune gamma over
# 0.0, 0.1, ..., 0.9 (10 candidates), scored by 5-fold cross-validated ROC AUC.
param_test3 = {
    'gamma': [i / 10.0 for i in range(0, 10)],
}
# NOTE: `iid=False` is no longer passed. The keyword was removed from
# GridSearchCV in scikit-learn 0.24 (passing it raises TypeError), and the
# unweighted mean over folds that it requested has been the only behaviour
# since 0.22. On scikit-learn < 0.22, add `iid=False` back to keep that mean.
gsearch3 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=200, max_depth=3,
        min_child_weight=8, gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test3, scoring='roc_auc', n_jobs=4, cv=5)
gsearch3.fit(train[predictors], train[target])
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_



({'mean_fit_time': array([2.69065976, 2.65250611, 2.71816306, 2.62219863, 2.94363632,
         2.76094275, 2.71444988, 2.74586806, 2.59386253, 2.52069221]),
  'std_fit_time': array([0.04302595, 0.07019053, 0.2002648 , 0.13898019, 0.11504948,
         0.08238975, 0.1525909 , 0.16754723, 0.11813982, 0.28512984]),
  'mean_score_time': array([0.06602354, 0.07041178, 0.07160864, 0.07101035, 0.07061138,
         0.07041202, 0.07250786, 0.07140913, 0.06961408, 0.07041168]),
  'std_score_time': array([0.0031792 , 0.00162026, 0.00277824, 0.00097723, 0.00513996,
         0.00279254, 0.00317774, 0.00214827, 0.0031155 , 0.00391888]),
  'param_gamma': masked_array(data=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
               mask=[False, False, False, False, False, False, False, False,
                     False, False],
         fill_value='?',
              dtype=object),
  'params': [{'gamma': 0.0},
   {'gamma': 0.1},
   {'gamma': 0.2},
   {'gamma': 0.3},
   {'gamma': 0.4},
   {'gamma'

In [7]:
# Grid search 4: coarse search over row and column sub-sampling, each taking
# the values 0.6, 0.7, 0.8, 0.9 (4 x 4 = 16 candidates), scored by 5-fold
# cross-validated ROC AUC. Tree parameters stay at max_depth=3,
# min_child_weight=8, gamma=0.
param_test4 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)],
}
# NOTE: `iid=False` is no longer passed. The keyword was removed from
# GridSearchCV in scikit-learn 0.24 (passing it raises TypeError), and the
# unweighted mean over folds that it requested has been the only behaviour
# since 0.22. On scikit-learn < 0.22, add `iid=False` back to keep that mean.
gsearch4 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=200, max_depth=3,
        min_child_weight=8, gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test4, scoring='roc_auc', n_jobs=4, cv=5)
gsearch4.fit(train[predictors], train[target])
gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_



({'mean_fit_time': array([1.975316  , 2.11406531, 2.21268215, 2.37923656, 2.19953876,
         2.36250234, 2.39542747, 2.61481791, 2.33096571, 2.67772865,
         2.64295955, 2.70780087, 2.41675034, 2.6445272 , 3.09022031,
         2.68880863]),
  'std_fit_time': array([0.02751313, 0.06151393, 0.07479742, 0.09222756, 0.08506114,
         0.08211736, 0.04893031, 0.12192165, 0.09386254, 0.16515029,
         0.16064487, 0.0795245 , 0.05252135, 0.04014473, 0.07210106,
         0.42640664]),
  'mean_score_time': array([0.06602354, 0.06812015, 0.06642256, 0.06662159, 0.0670207 ,
         0.06602345, 0.06721969, 0.07021241, 0.06662149, 0.06761928,
         0.06941481, 0.06861658, 0.07041216, 0.070013  , 0.07340393,
         0.06522555]),
  'std_score_time': array([0.00506196, 0.00432094, 0.00205372, 0.00342003, 0.0029853 ,
         0.0029186 , 0.00279228, 0.00325345, 0.00291798, 0.00230957,
         0.00214833, 0.00132295, 0.00264648, 0.0023088 , 0.00444243,
         0.00814107]),
  'param_c

In [8]:
# Grid search 5: finer search over row and column sub-sampling in steps of
# 0.05, each taking the values 0.75, 0.80, 0.85, 0.90, 0.95 (5 x 5 = 25
# candidates), scored by 5-fold cross-validated ROC AUC.
param_test5 = {
    'subsample': [i / 100.0 for i in range(75, 100, 5)],
    'colsample_bytree': [i / 100.0 for i in range(75, 100, 5)],
}
# NOTE: `iid=False` is no longer passed. The keyword was removed from
# GridSearchCV in scikit-learn 0.24 (passing it raises TypeError), and the
# unweighted mean over folds that it requested has been the only behaviour
# since 0.22. On scikit-learn < 0.22, add `iid=False` back to keep that mean.
gsearch5 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=200, max_depth=3,
        min_child_weight=8, gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test5, scoring='roc_auc', n_jobs=4, cv=5)
gsearch5.fit(train[predictors], train[target])
gsearch5.cv_results_, gsearch5.best_params_, gsearch5.best_score_



({'mean_fit_time': array([2.44486103, 2.48156338, 2.90857201, 2.86864519, 3.05922446,
         2.65290504, 2.82029681, 2.96850019, 2.80769458, 2.99329658,
         2.75084271, 3.11331773, 3.20862613, 3.12405167, 3.27324634,
         3.30005159, 2.96767306, 3.29220552, 3.40479851, 3.33320117,
         3.13746319, 2.9345561 , 3.26898022, 3.45918961, 2.93298154]),
  'std_fit_time': array([0.07345869, 0.09361442, 0.26215887, 0.16573103, 0.14508249,
         0.10430225, 0.05771128, 0.10269144, 0.13235616, 0.12241043,
         0.14369542, 0.07338084, 0.10928906, 0.15539356, 0.09215078,
         0.07592942, 0.11628083, 0.09288074, 0.25542215, 0.18389722,
         0.12044303, 0.13677189, 0.11126614, 0.07342507, 0.38186334]),
  'mean_score_time': array([0.06801834, 0.07001371, 0.07599692, 0.07938771, 0.06981344,
         0.07320414, 0.06921458, 0.07041173, 0.07190967, 0.07340345,
         0.0716085 , 0.07300482, 0.07519894, 0.0728055 , 0.0763957 ,
         0.07400203, 0.07140903, 0.0720078 , 0.