In [1]:
import xgboost as xgb
from xgboost import XGBClassifier
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import pdb
%matplotlib inline

In [2]:
df = pd.read_csv('no_show_prev_adj.csv', index_col=0)

In [3]:
def get_acc(rf, X, y):
    return (rf.predict(X) == y).sum() / len(y)

In [4]:
df.head()

Unnamed: 0,patientid,appointmentid,gender,scheduledday,appointmentday,age,neighbourhood,scholarship,hipertension,diabetes,...,no_show,scheduledday_year,scheduledday_month,scheduledday_dow,appointmentday_year,appointmentday_month,appointmentday_dow,scheduledday_hour,date_diff,prev_no_s
0,29872499824296,5642903,F,2016-04-29 18:38:08,2016-04-29 00:00:00,62,JARDIM DA PENHA,0,1,0,...,0,2016,4,4,2016,4,4,18,0,0.0
2151,725775968562,5521232,M,2016-03-29 11:09:08,2016-04-29 00:00:00,33,MARIA ORTIZ,0,0,0,...,0,2016,3,1,2016,4,4,11,31,0.0
2152,94755722517728,5521230,F,2016-03-29 11:08:52,2016-04-29 00:00:00,50,MARIA ORTIZ,0,0,0,...,0,2016,3,1,2016,4,4,11,31,0.0
2153,35387553979251,5523393,F,2016-03-29 17:04:40,2016-04-29 00:00:00,69,MARIA ORTIZ,0,0,0,...,0,2016,3,1,2016,4,4,17,31,0.0
2154,732498986588399,5642808,F,2016-04-29 17:21:24,2016-04-29 00:00:00,65,MARIA ORTIZ,0,0,0,...,0,2016,4,4,2016,4,4,17,0,0.0


In [5]:
df.isnull().sum()

patientid               0
appointmentid           0
gender                  0
scheduledday            0
appointmentday          0
age                     0
neighbourhood           0
scholarship             0
hipertension            0
diabetes                0
alcoholism              0
handcap                 0
sms_received            0
no_show                 0
scheduledday_year       0
scheduledday_month      0
scheduledday_dow        0
appointmentday_year     0
appointmentday_month    0
appointmentday_dow      0
scheduledday_hour       0
date_diff               0
prev_no_s               0
dtype: int64

In [6]:
df.dtypes

patientid                 int64
appointmentid             int64
gender                   object
scheduledday             object
appointmentday           object
age                       int64
neighbourhood            object
scholarship               int64
hipertension              int64
diabetes                  int64
alcoholism                int64
handcap                   int64
sms_received              int64
no_show                   int64
scheduledday_year         int64
scheduledday_month        int64
scheduledday_dow          int64
appointmentday_year       int64
appointmentday_month      int64
appointmentday_dow        int64
scheduledday_hour         int64
date_diff                 int64
prev_no_s               float64
dtype: object

In [7]:
cols = df.columns
cols = [col.strip().lower().replace('-', '_') for col in cols]
df.columns = cols

In [8]:
df['patientid'] = df.patientid.astype(str)

Parse dates

In [9]:
for col in ['scheduledday', 'appointmentday']:
    df[col] = pd.to_datetime(df[col])
    df[col+'_year'] = df[col].dt.year
    df[col+'_month'] = df[col].dt.month
    df[col+'_dow'] = df[col].dt.dayofweek
df['scheduledday_hour'] = df.scheduledday.dt.hour
df['date_diff'] = (df.appointmentday.dt.date - df.scheduledday.dt.date).dt.days 



Same day

In [10]:
df['same_day'] = (df.appointmentday.dt.date == df.scheduledday.dt.date).astype(int)

In [11]:
df.drop(['patientid', 'appointmentid', 'appointmentday', 'scheduledday'], axis=1, inplace=True)

In [12]:
from sklearn.preprocessing import LabelEncoder
ngb_enc = LabelEncoder()
gen_enc = LabelEncoder()
df['neighbourhood'] = ngb_enc.fit_transform(df.neighbourhood)
df['gender'] = gen_enc.fit_transform(df.gender)

- gender: label encoder
- age: maybe standardize
- neighborhood: dummies
- handcap: dummies
- year dro, month drop
- dow: dummies


In [13]:
tr, test = train_test_split(df, test_size=0.2, random_state=17)

In [14]:
X_tr = tr.drop('no_show', axis=1) 
X_test = test.drop('no_show', axis=1)
y_tr = tr.no_show
y_test = test.no_show

In [15]:
X_tr.shape, y_tr.shape, X_test.shape, y_test.shape

((88421, 19), (88421,), (22106, 19), (22106,))

In [16]:
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

In [62]:
dall = xgb.DMatrix(df.drop('no_show', axis=1).values, label=df.no_show.values)

In [67]:
xgb_param =xgb1.get_xgb_params()
cvresult = xgb.cv(xgb_param, dall, num_boost_round=xgb1.get_params()['n_estimators'], nfold=5, metrics='error', early_stopping_rounds=50)

250 estimators

In [72]:
cvresult[cvresult['test-error-mean'] == cvresult['test-error-mean'].min()]

Unnamed: 0,test-error-mean,test-error-std,train-error-mean,train-error-std
251,0.196707,0.002465,0.188093,0.000624


In [75]:
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=250, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='accuracy',n_jobs=1,iid=False, cv=5, verbose=10)
gsearch1.fit(df.drop('no_show', axis=1),df.no_show)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] max_depth=3, min_child_weight=1 .................................
[CV]  max_depth=3, min_child_weight=1, score=0.8014566181127296, total=   5.5s
[CV] max_depth=3, min_child_weight=1 .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.7s remaining:    0.0s


[CV]  max_depth=3, min_child_weight=1, score=0.8018185108115444, total=   5.5s
[CV] max_depth=3, min_child_weight=1 .................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   11.4s remaining:    0.0s


[CV]  max_depth=3, min_child_weight=1, score=0.794671130009952, total=   6.2s
[CV] max_depth=3, min_child_weight=1 .................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   17.9s remaining:    0.0s


[CV]  max_depth=3, min_child_weight=1, score=0.7995476136620674, total=   6.9s
[CV] max_depth=3, min_child_weight=1 .................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   25.1s remaining:    0.0s


[CV]  max_depth=3, min_child_weight=1, score=0.7984980094100615, total=  11.5s
[CV] max_depth=3, min_child_weight=3 .................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   36.8s remaining:    0.0s


[CV]  max_depth=3, min_child_weight=3, score=0.8016828010494889, total=   6.7s
[CV] max_depth=3, min_child_weight=3 .................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   43.7s remaining:    0.0s


[CV]  max_depth=3, min_child_weight=3, score=0.8008233058898037, total=   6.4s
[CV] max_depth=3, min_child_weight=3 .................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   50.3s remaining:    0.0s


[CV]  max_depth=3, min_child_weight=3, score=0.7953949154075817, total=   7.0s
[CV] max_depth=3, min_child_weight=3 .................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   57.6s remaining:    0.0s


[CV]  max_depth=3, min_child_weight=3, score=0.8009500113096585, total=   8.6s
[CV] max_depth=3, min_child_weight=3 .................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  1.1min remaining:    0.0s


[CV]  max_depth=3, min_child_weight=3, score=0.7986789721317409, total=   7.9s
[CV] max_depth=3, min_child_weight=5 .................................
[CV]  max_depth=3, min_child_weight=5, score=0.8015923278747851, total=   7.1s
[CV] max_depth=3, min_child_weight=5 .................................
[CV]  max_depth=3, min_child_weight=5, score=0.8015470912874333, total=   7.2s
[CV] max_depth=3, min_child_weight=5 .................................
[CV]  max_depth=3, min_child_weight=5, score=0.7957115715190446, total=   6.7s
[CV] max_depth=3, min_child_weight=5 .................................
[CV]  max_depth=3, min_child_weight=5, score=0.7999095227324134, total=   6.5s
[CV] max_depth=3, min_child_weight=5 .................................
[CV]  max_depth=3, min_child_weight=5, score=0.7985884907709012, total=   6.2s
[CV] max_depth=5, min_child_weight=1 .................................
[CV]  max_depth=5, min_child_weight=1, score=0.8006875961277481, total=  10.2s
[CV] max_depth=5, min

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 12.4min finished


([mean: 0.79920, std: 0.00257, params: {'max_depth': 3, 'min_child_weight': 1},
  mean: 0.79951, std: 0.00229, params: {'max_depth': 3, 'min_child_weight': 3},
  mean: 0.79947, std: 0.00219, params: {'max_depth': 3, 'min_child_weight': 5},
  mean: 0.79469, std: 0.00691, params: {'max_depth': 5, 'min_child_weight': 1},
  mean: 0.79435, std: 0.00680, params: {'max_depth': 5, 'min_child_weight': 3},
  mean: 0.79364, std: 0.00776, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: 0.78812, std: 0.01267, params: {'max_depth': 7, 'min_child_weight': 1},
  mean: 0.78879, std: 0.01171, params: {'max_depth': 7, 'min_child_weight': 3},
  mean: 0.78766, std: 0.01291, params: {'max_depth': 7, 'min_child_weight': 5},
  mean: 0.78041, std: 0.01705, params: {'max_depth': 9, 'min_child_weight': 1},
  mean: 0.78285, std: 0.01584, params: {'max_depth': 9, 'min_child_weight': 3},
  mean: 0.78249, std: 0.01642, params: {'max_depth': 9, 'min_child_weight': 5}],
 {'max_depth': 3, 'min_child_weight': 3

max_depth:3, min_child_weight:3

In [76]:
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}

gsearch1 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=250, max_depth=3,
 min_child_weight=3, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test3, scoring='accuracy',n_jobs=1,iid=False, cv=5, verbose=10)
gsearch1.fit(df.drop('no_show', axis=1),df.no_show)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] gamma=0.0 .......................................................
[CV] .............. gamma=0.0, score=0.8016828010494889, total=   5.6s
[CV] gamma=0.0 .......................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.8s remaining:    0.0s


[CV] .............. gamma=0.0, score=0.8008233058898037, total=   5.6s
[CV] gamma=0.0 .......................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   11.6s remaining:    0.0s


[CV] .............. gamma=0.0, score=0.7953949154075817, total=   5.8s
[CV] gamma=0.0 .......................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   17.7s remaining:    0.0s


[CV] .............. gamma=0.0, score=0.8009500113096585, total=   5.8s
[CV] gamma=0.0 .......................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   23.8s remaining:    0.0s


[CV] .............. gamma=0.0, score=0.7986789721317409, total=   5.2s
[CV] gamma=0.1 .......................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   29.2s remaining:    0.0s


[CV] ............... gamma=0.1, score=0.801637564462137, total=   5.2s
[CV] gamma=0.1 .......................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   34.5s remaining:    0.0s


[CV] .............. gamma=0.1, score=0.8008233058898037, total=   5.3s
[CV] gamma=0.1 .......................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   40.1s remaining:    0.0s


[CV] .............. gamma=0.1, score=0.7953949154075817, total=   6.4s
[CV] gamma=0.1 .......................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   46.7s remaining:    0.0s


[CV] .............. gamma=0.1, score=0.8009500113096585, total=   6.0s
[CV] gamma=0.1 .......................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   53.0s remaining:    0.0s


[CV] .............. gamma=0.1, score=0.7988599348534202, total=   6.0s
[CV] gamma=0.2 .......................................................
[CV] .............. gamma=0.2, score=0.8015923278747851, total=   5.8s
[CV] gamma=0.2 .......................................................
[CV] .............. gamma=0.2, score=0.8012756717633222, total=   5.6s
[CV] gamma=0.2 .......................................................
[CV] .............. gamma=0.2, score=0.7953949154075817, total=   5.5s
[CV] gamma=0.2 .......................................................
[CV] .............. gamma=0.2, score=0.8009952499434517, total=   5.5s
[CV] gamma=0.2 .......................................................
[CV] .............. gamma=0.2, score=0.7988599348534202, total=   5.5s
[CV] gamma=0.3 .......................................................
[CV] .............. gamma=0.3, score=0.8015923278747851, total=   5.5s
[CV] gamma=0.3 .......................................................
[CV] .

[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  2.4min finished


([mean: 0.79951, std: 0.00229, params: {'gamma': 0.0},
  mean: 0.79953, std: 0.00227, params: {'gamma': 0.1},
  mean: 0.79962, std: 0.00232, params: {'gamma': 0.2},
  mean: 0.79967, std: 0.00236, params: {'gamma': 0.3},
  mean: 0.79952, std: 0.00230, params: {'gamma': 0.4}],
 {'gamma': 0.3},
 0.79966885655586395)

gamma 0.3

In [18]:
dtrain = xgb.DMatrix(X_tr.values, label=y_tr.values)
dtest =  xgb.DMatrix(X_test.values, label=y_test.values)

In [19]:
param = {'max_depth': 3, 'min_child_weight':3, 'gamma':0.3, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
param['nthread'] = 4
param['eval_metric'] = 'error'
evallist = [(dtrain, 'train'), (dtest, 'test')]


In [20]:
bst = xgb.train(param, dtrain, num_boost_round=250, evals=evallist)

[0]	train-error:0.201253	test-error:0.201755
[1]	train-error:0.198765	test-error:0.198408
[2]	train-error:0.198358	test-error:0.198724
[3]	train-error:0.198166	test-error:0.198046
[4]	train-error:0.197442	test-error:0.197548
[5]	train-error:0.197261	test-error:0.19791
[6]	train-error:0.197521	test-error:0.197865
[7]	train-error:0.197544	test-error:0.198091
[8]	train-error:0.19751	test-error:0.198046
[9]	train-error:0.197363	test-error:0.198724
[10]	train-error:0.19725	test-error:0.198543
[11]	train-error:0.197069	test-error:0.198498
[12]	train-error:0.196967	test-error:0.198589
[13]	train-error:0.196989	test-error:0.198046
[14]	train-error:0.196922	test-error:0.198136
[15]	train-error:0.196741	test-error:0.198815
[16]	train-error:0.1963	test-error:0.199041
[17]	train-error:0.196537	test-error:0.199358
[18]	train-error:0.196627	test-error:0.199629
[19]	train-error:0.196571	test-error:0.199493
[20]	train-error:0.196548	test-error:0.199312
[21]	train-error:0.196571	test-error:0.199493
[22

In [22]:
bst.get_fscore()

{'f0': 25,
 'f1': 333,
 'f10': 58,
 'f11': 66,
 'f13': 30,
 'f14': 82,
 'f15': 168,
 'f16': 296,
 'f17': 93,
 'f18': 15,
 'f2': 346,
 'f3': 24,
 'f4': 23,
 'f5': 14,
 'f6': 17,
 'f7': 34,
 'f8': 27,
 'f9': 1}

In [31]:
list(zip(X_tr.columns, [v for k,v in bst.get_score(importance_type='gain').items()]))

[('gender', 409.98054219999995),
 ('age', 5.29241806306306),
 ('neighbourhood', 4.764208908783789),
 ('scholarship', 11.098701376344085),
 ('hipertension', 3.4863856488095193),
 ('diabetes', 7.553930896551723),
 ('alcoholism', 6.115114814814818),
 ('handcap', 5.154091133333335),
 ('sms_received', 3.4044036473988464),
 ('scheduledday_year', 3.5886207083333335),
 ('scheduledday_month', 4.010998000000001),
 ('scheduledday_dow', 3.7861642608695645),
 ('appointmentday_year', 3.922022071428572),
 ('appointmentday_month', 2.9386040151515136),
 ('appointmentday_dow', 3.757400080000001),
 ('scheduledday_hour', 3.147760695121951),
 ('date_diff', 2.0972743235294113),
 ('prev_no_s', 0.595048)]

In [25]:
X_tr.columns

Index(['gender', 'age', 'neighbourhood', 'scholarship', 'hipertension',
       'diabetes', 'alcoholism', 'handcap', 'sms_received',
       'scheduledday_year', 'scheduledday_month', 'scheduledday_dow',
       'appointmentday_year', 'appointmentday_month', 'appointmentday_dow',
       'scheduledday_hour', 'date_diff', 'prev_no_s', 'same_day'],
      dtype='object')