In [71]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,KFold
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.metrics import roc_auc_score
from xgboost.sklearn import XGBClassifier
from google.colab import files
import io
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier

In [2]:
datafile_train=files.upload()

Saving train.csv to train (1).csv


In [3]:
datafile_test=files.upload()

Saving test_share.csv to test_share (1).csv


In [4]:
data_med= files.upload()

Saving medical_history.csv to medical_history (1).csv


In [5]:
data_dem=files.upload()

Saving demographic_details.csv to demographic_details (1).csv


In [6]:
bd_train=pd.read_csv(io.StringIO(datafile_train["train.csv"].decode("utf-8")))
bd_test=pd.read_csv(io.StringIO(datafile_test["test_share.csv"].decode("utf-8")))
med=pd.read_csv(io.StringIO(data_med["medical_history.csv"].decode("utf-8")))
dem=pd.read_csv(io.StringIO(data_dem["demographic_details.csv"].decode("utf-8")))

In [7]:
comb=pd.merge(med,dem,how='outer',on='PatientId')

In [8]:

train=pd.merge(bd_train,comb,how='left',on='PatientId')
test=pd.merge(bd_test,comb,how='left',on='PatientId')

In [9]:
drop_cols=['PatientId', 'AppointmentID']
train.drop(drop_cols,1,inplace=True)
test.drop(drop_cols,1,inplace=True)

In [10]:

train['data']='train'
test['data']='test'

all_data=pd.concat([train,test],0,sort=False)

In [11]:
all_data.shape

(110344, 13)

In [12]:
all_data.ScheduledDay = pd.to_datetime(all_data.ScheduledDay)
all_data.AppointmentDay = pd.to_datetime(all_data.AppointmentDay)

In [13]:
all_data['target_numeric'] = np.where(all_data['No-show']=="No", 0, 1)
all_data['target_numeric'].astype('int')

# extracting dayOfWeek and hour column 

#all_data['dayOfWeek'] = all_data['AppointmentDay'].dt.day_name()
all_data['dayOfWeek'] = all_data['AppointmentDay'].dt.dayofweek 
all_data['dayOfWeek'].dtype



dtype('int64')

In [14]:
all_data['diff_bookingDays'] = (all_data['AppointmentDay']-all_data['ScheduledDay']).astype('timedelta64[D]')
all_data['diff_bookingDays'].head()
all_data['diff_bookingDays'] = np.where(all_data['diff_bookingDays']< 0, 0, all_data['diff_bookingDays'])
all_data.head()
all_data['diff_bookingDays'].astype('int')


0         0
1         0
2         0
3         0
4         1
         ..
10849     3
10850    34
10851    34
10852    40
10853    40
Name: diff_bookingDays, Length: 110344, dtype: int64

In [15]:
drop_cols=['AppointmentDay', 'ScheduledDay', 'No-show']
all_data.drop(drop_cols,1,inplace=True)

In [16]:
all_data['gender_f']=(all_data['Gender']=='F').astype(int)
del all_data['Gender']

# create dummies for Neighbourhood with cutoff

k=all_data['Neighbourhood'].value_counts()
cats=k[k>2000].index[:-1]

for cat in cats:
    name='Neighbourhood_'+cat
    all_data[name]=(all_data['Neighbourhood']==cat).astype(int)
del all_data['Neighbourhood']

In [17]:
ld_train=all_data[all_data['data']=='train']
del ld_train['data']
ld_test=all_data[all_data['data']=='test']
ld_test.drop(['target_numeric','data'],axis=1,inplace=True)

In [144]:
ld_test.shape

(10854, 30)

In [18]:
ld_train1,ld_train2=train_test_split(ld_train,test_size=0.2,random_state=2)

In [19]:
ld_train1.reset_index(drop=True,inplace=True)
ld_train2.reset_index(drop=True,inplace=True)

In [20]:
x_train1=ld_train1.drop('target_numeric',axis=1)
y_train1=ld_train1['target_numeric']

x_train2=ld_train2.drop('target_numeric',axis=1)
y_train2=ld_train2['target_numeric']

In [21]:
x_train1.shape

(79592, 30)

In [34]:
clf1=clf1=KNeighborsClassifier(n_neighbors=50)
clf2=RandomForestClassifier(n_estimators=100)
clf3=DecisionTreeClassifier()
clf4=GradientBoostingClassifier()
clf5=XGBClassifier(n_estimators=150,objective='binary:logistic',learning_rate=.01)


Algos=[clf1,clf2,clf3,clf4,clf5]

In [35]:
rows=x_train1.shape[0]
rows

79592

In [36]:
layer1=pd.DataFrame({'clf1':np.zeros(rows),'clf2':np.zeros(rows),'clf3':np.zeros(rows),
                    'clf4':np.zeros(rows), 'clf5':np.zeros(rows)})
layer1

Unnamed: 0,clf1,clf2,clf3,clf4,clf5
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...
79587,0.0,0.0,0.0,0.0,0.0
79588,0.0,0.0,0.0,0.0,0.0
79589,0.0,0.0,0.0,0.0,0.0
79590,0.0,0.0,0.0,0.0,0.0


In [37]:
kf=KFold(n_splits=10)

In [38]:
fold=1
for train,left_out_chunk in kf.split(x_train1):
    print('fold number : ', fold)
    
    for i,clf in enumerate(Algos):
        print('Algo number :',i+1)
        
        x_train_train=x_train1.loc[train]
        y_train_train=y_train1[train]
        x_train_left_out_chunk=x_train1.loc[left_out_chunk]
        
        clf.fit(x_train_train,y_train_train)
        p=clf.predict_proba(x_train_left_out_chunk)[:,1]
        
        layer1.iloc[left_out_chunk,i]=p
        
    fold+=1  
    

fold number :  1
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  2
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  3
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  4
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  5
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  6
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  7
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  8
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  9
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  10
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5


In [39]:
layer1

Unnamed: 0,clf1,clf2,clf3,clf4,clf5
0,0.02,0.000000,0.000000,0.044464,0.140969
1,0.08,0.026223,0.000000,0.059088,0.156818
2,0.06,0.243158,0.333333,0.073968,0.183828
3,0.24,0.757500,1.000000,0.284976,0.340848
4,0.28,0.050000,0.000000,0.215908,0.292237
...,...,...,...,...,...
79587,0.02,0.006667,0.000000,0.031801,0.135930
79588,0.04,0.016689,0.000000,0.031633,0.135930
79589,0.10,0.005000,0.000000,0.063669,0.156564
79590,0.20,0.260000,1.000000,0.250275,0.339573


In [42]:
rows=x_train2.shape[0]
layer2_train2=pd.DataFrame({'clf1':np.zeros(rows),'clf2':np.zeros(rows),'clf3':np.zeros(rows),
                    'clf4':np.zeros(rows),'clf5':np.zeros(rows)})

In [44]:
for i,clf in enumerate(Algos):
    print( 'Algo number',i+1)
    clf.fit(x_train1,y_train1)
    p=clf.predict_proba(x_train2)[:,1]
    
    layer2_train2.iloc[:,i]=p


Algo number 1
Algo number 2
Algo number 3
Algo number 4
Algo number 5


In [45]:
layer2_train2

Unnamed: 0,clf1,clf2,clf3,clf4,clf5
0,0.36,0.440000,0.000000,0.242829,0.334869
1,0.22,0.034000,0.000000,0.220025,0.299505
2,0.46,0.150000,0.000000,0.354792,0.401346
3,0.12,0.000000,0.000000,0.075813,0.180482
4,0.20,0.776833,1.000000,0.184613,0.274275
...,...,...,...,...,...
19893,0.38,0.170000,0.000000,0.236142,0.291393
19894,0.34,0.840000,1.000000,0.332438,0.350988
19895,0.22,0.535000,1.000000,0.337614,0.387188
19896,0.22,0.538917,0.500000,0.292418,0.340144


In [64]:
params={'class_weight':['balanced',None],
        'penalty':['l1','l2'],
        'C':np.linspace(0.01,1000,10)}
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

In [72]:
model=LogisticRegression(fit_intercept=True)
model_random=RandomizedSearchCV(estimator=model,param_distributions=params,n_iter=10,scoring="roc_auc",cv=15)


In [73]:
model_random.fit(layer1,y_train1)

RandomizedSearchCV(cv=15, error_score=nan,
                   estimator=LogisticRegression(C=1.0, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=100,
                                                multi_class='auto', n_jobs=None,
                                                penalty='l2', random_state=None,
                                                solver='lbfgs', tol=0.0001,
                                                verbose=0, warm_start=False),
                   iid='deprecated', n_iter=10, n_jobs=None,
                   param_distributions={'C': array([1.0000e-02, 1.1112e+02, 2.2223e+02, 3.3334e+02, 4.4445e+02,
       5.5556e+02, 6.6667e+02, 7.7778e+02, 8.8889e+02, 1.0000e+03]),
                                        'class_weight': ['balanced', None],
                       

In [57]:
logr=model_random.best_estimator_


In [58]:
logr.fit(layer1,y_train1)

LogisticRegression(C=111.12, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [59]:
logr_pre=logr.predict_proba(layer2_train2)[:,1]

In [60]:
roc_auc_score(y_train2,logr_pre)

0.723528364167185

In [68]:
param={"criterion":['gini',"entropy"],
    "max_depth":[None,5,10,15,20],
    "min_samples_split":[2,5,10,15,20],
    "min_samples_leaf":[1,2,5,10,15,20],
   "class_weight":[None,"balanced"],
   'splitter' : ['best', 'random'],
      "max_leaf_nodes":[None,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22]}

In [100]:
clf=DecisionTreeClassifier()
random_search=RandomizedSearchCV(estimator=clf,param_distributions=param,scoring='roc_auc',cv=10)


In [101]:
random_search.fit(layer1,y_train1)

RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    presort='deprecated',
                                                    random_state=None,
         

In [102]:
DC=random_search.best_estimator_

In [103]:
DC.fit(layer1,y_train1)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight='balanced', criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=19,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=15,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [104]:
dc_pre=DC.predict_proba(layer2_train2)[:,1]

In [105]:
roc_auc_score(y_train2,dc_pre)

0.7236769435633978

In [106]:
from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint



RF= RandomForestClassifier(verbose=1,n_jobs=-1)

param_dist = {"n_estimators":[10,100,500,700],
              "max_depth": [3,5, None],
              "max_features": sp_randint(5, 11),
              "min_samples_split": sp_randint(5, 11),
              "min_samples_leaf": sp_randint(5, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}  
# run randomized search
n_iter_search = 100
# n_iter parameter of RandomizedSeacrhCV controls, how many 
# parameter combination will be tried; out of all possible given values

random_search_RF = RandomizedSearchCV(RF, param_distributions=param_dist,
                                  n_iter=n_iter_search,scoring="roc_auc",cv=10)


In [107]:
random_search_RF.fit(layer1,y_train1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 o

RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
              

In [108]:
 random_search_RF.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features=5,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=6, min_samples_split=7,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=1,
                       warm_start=False)

In [109]:
rf_best=random_search_RF.best_estimator_

In [110]:
rf_best.fit(layer1,y_train1)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.0s finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features=5,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=6, min_samples_split=7,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=1,
                       warm_start=False)

In [111]:
rf_pre=rf_best.predict_proba(layer2_train2)[:,1]
roc_auc_score(y_train2,rf_pre)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished


0.7269142604822395

In [112]:
param_dist = {
              "max_depth": [2,3,4,5,6],
              "learning_rate":[0.01,0.05,0.1,0.3,0.5],
    "min_child_weight":[4,5,6],
              "subsample":[i/10.0 for i in range(6,10)],
 "colsample_bytree":[i/10.0 for i in range(6,10)],
               "reg_alpha":[1e-5, 1e-2, 0.1, 1, 100],
              "gamma":[i/10.0 for i in range(0,5)],
    "n_estimators":[100,500,700,1000],
    'scale_pos_weight':[2,3,4,5,6,7,8,9]
    
              }


In [113]:
clf_1=XGBClassifier(objective='binary:logistic')

In [133]:
n_iter=100

random_search_1=RandomizedSearchCV(clf_1,n_jobs=-1,verbose=2,cv=15,n_iter=n_iter,scoring='roc_auc',
                                 param_distributions=param_dist)

In [134]:
random_search_1.fit(layer1,y_train1)

Fitting 15 folds for each of 100 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 13.2min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 32.8min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 56.4min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed: 94.4min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 142.6min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed: 151.9min finished


RandomizedSearchCV(cv=15, error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None,
                                           objective='binary:logistic',
                                           random_state=0, reg_alpha=0,
                                           reg_lambda=1, s...
                                        'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
                                        'learning_rate': [0.01, 0.05, 0.1, 0.3,
                     

In [135]:
random_search_1.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=0.1,
              learning_rate=0.05, max_delta_step=0, max_depth=6,
              min_child_weight=6, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0.1, reg_lambda=1, scale_pos_weight=2, seed=None,
              silent=None, subsample=0.9, verbosity=1)

In [136]:
xgb_best=random_search_1.best_estimator_

In [137]:
xgb_best.fit(layer1,y_train1)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=0.1,
              learning_rate=0.05, max_delta_step=0, max_depth=6,
              min_child_weight=6, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0.1, reg_lambda=1, scale_pos_weight=2, seed=None,
              silent=None, subsample=0.9, verbosity=1)

In [138]:
xgb_pred=xgb_best.predict_proba(layer2_train2)[:,1]


In [139]:
roc_auc_score(y_train2,xgb_pred)

0.7271331948841455

In [140]:
x_train=ld_train.drop('target_numeric',axis=1)
y_train=ld_train['target_numeric']

In [146]:
xgb_best.fit(x_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=0.1,
              learning_rate=0.05, max_delta_step=0, max_depth=6,
              min_child_weight=6, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0.1, reg_lambda=1, scale_pos_weight=2, seed=None,
              silent=None, subsample=0.9, verbosity=1)

In [147]:
train_score=xgb_best.predict_proba(x_train)[:,1]
train_score

array([0.09721357, 0.10929383, 0.20320986, ..., 0.43547404, 0.561932  ,
       0.42070934], dtype=float32)

In [148]:
test_score=xgb_best.predict_proba(ld_test)[:,1]
test_score

array([0.12205452, 0.45004207, 0.15397345, ..., 0.43547404, 0.51224977,
       0.42070934], dtype=float32)

In [149]:
 pd.DataFrame(test_score).to_csv("Predict no-show for medical appoitment final.csv",index=False)
