# IMPORTING LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import plot_importance
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

# IMPORTING DATASET

In [None]:
# Mount Google Drive at /content/drive (Google Colab only); the CSV paths
# used by this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the raw training and test tables from the mounted Drive folder.
train = pd.read_csv('/content/drive/My Drive/Rider Driven/train.csv')
test = pd.read_csv('/content/drive/My Drive/Rider Driven/test.csv')

In [None]:
# Preview the first rows of the raw training table.
train.head()

Unnamed: 0,order_time,order_id,order_date,allot_time,accept_time,pickup_time,delivered_time,rider_id,first_mile_distance,last_mile_distance,alloted_orders,delivered_orders,cancelled,undelivered_orders,lifetime_order_count,reassignment_method,reassignment_reason,reassigned_order,session_time,cancelled_time
0,2021-01-26 02:21:35,556753,2021-01-26 00:00:00,2021-01-26 02:21:59,2021-01-26 02:22:08,2021-01-26 02:32:51,2021-01-26 02:49:47,11696,1.5666,2.65,46.0,46.0,0,0.0,621.0,,,,,
1,2021-01-26 02:33:16,556754,2021-01-26 00:00:00,2021-01-26 02:33:57,2021-01-26 02:34:45,2021-01-26 02:50:25,2021-01-26 03:11:15,18117,2.5207,2.76,8.0,8.0,0,0.0,105.0,,,,3.266667,
2,2021-01-26 02:39:49,556755,2021-01-26 00:00:00,2021-01-26 02:39:57,2021-01-26 02:40:13,2021-01-26 02:56:00,2021-01-26 03:12:46,18623,2.2074,4.8,1.0,1.0,0,0.0,66.0,,,,9.816667,
3,2021-01-26 02:47:53,556756,2021-01-26 00:00:00,2021-01-26 02:48:25,2021-01-26 02:49:06,2021-01-26 03:21:51,2021-01-26 03:41:05,15945,2.1894,6.38,1.0,1.0,0,0.0,127.0,,,,17.533333,
4,2021-01-26 03:06:30,556757,2021-01-26 00:00:00,2021-01-26 03:07:21,2021-01-26 03:07:57,2021-01-26 03:31:38,2021-01-26 04:00:15,17589,2.787,4.01,34.0,34.0,0,0.0,84.0,,,,1.35,


# FEATURE ENGINEERING

In [None]:
def _sort_by_rider_then_time(frame):
    """Return a copy of ``frame`` ordered by rider, then by order time.

    A single multi-key sort replaces ``groupby('rider_id').apply(sort_values)``:
    it yields the same rider-major / time-minor ordering without running a
    Python lambda per rider and without relying on the grouping column being
    passed through ``groupby.apply`` (deprecated in recent pandas releases).
    The index is reset so that row labels are positional (0..n-1).

    NOTE(review): rows with a missing ``rider_id`` are kept (sorted last) rather
    than silently dropped as ``groupby`` would do; the previewed data shows no
    such rows, but confirm for the full files.
    """
    return frame.sort_values(['rider_id', 'order_time'], ascending=True).reset_index(drop=True)


train_ = _sort_by_rider_then_time(train)
test_ = _sort_by_rider_then_time(test)

In [None]:
# Preview the first rows of the rider/time-sorted training table.
train_.head()

In [None]:
# Impute missing session_time with the mean of the same frame.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a column selection is a chained in-place operation that does not update the
# parent frame under pandas Copy-on-Write.
for frame in (train_, test_):
    frame['session_time'] = frame['session_time'].fillna(frame['session_time'].mean())

In [None]:
# Total trip distance = first-mile leg + last-mile leg, for both frames.
for frame in (train_, test_):
    frame['total_distance'] = frame['first_mile_distance'] + frame['last_mile_distance']

In [None]:
def _session_time_step(session_time):
    """Per-row step from this row's ``session_time`` to the next row's.

    For row ``i`` the result is ``s[i+1] - s[i]`` when the next value is
    strictly larger, otherwise ``s[i]`` itself. The last row has no successor
    (the shifted value is NaN, and comparisons with NaN are False), so it
    keeps its own value.

    This is a vectorised equivalent of the previous implementation, which
    handed ``Series.apply`` a generator of lambdas: that only worked through
    pandas' list-of-functions fallback and made one Python-level call per row.

    NOTE(review): the step is taken between consecutive rows of the whole
    frame, so it also spans the boundary between two different riders —
    confirm whether a per-rider difference was intended.
    """
    following = session_time.shift(-1)
    return (following - session_time).where(following > session_time, session_time)


train_['session_time_diff'] = _session_time_step(train_['session_time'])
test_['session_time_diff'] = _session_time_step(test_['session_time'])

In [None]:
# Fill remaining gaps, assigning the result back to the column (a chained
# fillna(inplace=True) on a column selection does not update the parent frame
# under pandas Copy-on-Write):
#   * session_time_diff -> the session_time of the frame's last row
#   * reassigned_order  -> 0 (missing is treated as "not reassigned")
for frame in (train_, test_):
    last_session_time = frame['session_time'].iloc[-1]
    frame['session_time_diff'] = frame['session_time_diff'].fillna(last_session_time)
    frame['reassigned_order'] = frame['reassigned_order'].fillna(0)

In [None]:
# feat_1: minute of the day (hour * 60 + minute) at which the order was placed.
for frame in (train_, test_):
    order_ts = pd.to_datetime(frame['order_time'])
    frame['feat_1'] = order_ts.dt.hour * 60 + order_ts.dt.minute

In [None]:
# feat_2: minute of the day (hour * 60 + minute) at which the order was allotted.
for frame in (train_, test_):
    allot_ts = pd.to_datetime(frame['allot_time'])
    frame['feat_2'] = allot_ts.dt.hour * 60 + allot_ts.dt.minute

In [None]:
# feat_3: minute of the day (hour * 60 + minute) at which the rider accepted.
# Rows without an accept_time yield NaN here.
for frame in (train_, test_):
    accept_ts = pd.to_datetime(frame['accept_time'])
    frame['feat_3'] = accept_ts.dt.hour * 60 + accept_ts.dt.minute

In [None]:
# Impute missing feat_3 with the mean of the same frame.
# Assigned back to the column because a chained fillna(inplace=True) on a
# column selection does not update the parent frame under pandas Copy-on-Write.
for frame in (train_, test_):
    frame['feat_3'] = frame['feat_3'].fillna(frame['feat_3'].mean())

In [None]:
# feat_4: feat_2 minus feat_1 (both are minute-of-day values).
# NOTE(review): because both inputs are minute-of-day values, a pair that
# straddles midnight produces a large negative number — confirm that is acceptable.
for frame in (train_, test_):
    frame['feat_4'] = frame['feat_2'] - frame['feat_1']

In [None]:
# feat_5: feat_3 minus feat_2 (both are minute-of-day values).
for frame in (train_, test_):
    frame['feat_5'] = frame['feat_3'] - frame['feat_2']

In [None]:
def _allot_minute_gap(allot_minute):
    """Per-row gap from this row's ``feat_2`` to the next row's.

    For row ``i`` the result is ``m[i+1] - m[i]`` when the next value is
    strictly larger, otherwise 0. The last row has no successor and is left as
    NaN so that it can be imputed afterwards, exactly as before.

    This is a vectorised equivalent of the previous implementation, which
    handed ``Series.apply`` a generator of lambdas: that only worked through
    pandas' list-of-functions fallback and made one Python-level call per row.

    NOTE(review): the gap is taken between consecutive rows of the whole
    frame, so it also spans the boundary between two different riders —
    confirm whether a per-rider gap was intended.
    """
    following = allot_minute.shift(-1)
    gap = (following - allot_minute).where(following > allot_minute, 0)
    # Slice assignment (rather than iloc[-1]) is also safe on an empty frame.
    gap.iloc[-1:] = np.nan
    return gap


train_['allot_time_diff'] = _allot_minute_gap(train_['feat_2'])
test_['allot_time_diff'] = _allot_minute_gap(test_['feat_2'])

In [None]:
# Impute missing allot_time_diff with the mean of the same frame.
# Assigned back to the column because a chained fillna(inplace=True) on a
# column selection does not update the parent frame under pandas Copy-on-Write.
for frame in (train_, test_):
    frame['allot_time_diff'] = frame['allot_time_diff'].fillna(frame['allot_time_diff'].mean())

In [None]:
# Impute missing lifetime_order_count with the median of the same frame.
# Assigned back to the column because a chained fillna(inplace=True) on a
# column selection does not update the parent frame under pandas Copy-on-Write.
for frame in (train_, test_):
    frame['lifetime_order_count'] = frame['lifetime_order_count'].fillna(
        frame['lifetime_order_count'].median()
    )

In [None]:
# Impute missing delivered_orders with the median of the same frame.
# Assigned back to the column because a chained fillna(inplace=True) on a
# column selection does not update the parent frame under pandas Copy-on-Write.
for frame in (train_, test_):
    frame['delivered_orders'] = frame['delivered_orders'].fillna(
        frame['delivered_orders'].median()
    )

In [None]:
# Impute missing alloted_orders with the median of the same frame.
# Assigned back to the column because a chained fillna(inplace=True) on a
# column selection does not update the parent frame under pandas Copy-on-Write.
for frame in (train_, test_):
    frame['alloted_orders'] = frame['alloted_orders'].fillna(
        frame['alloted_orders'].median()
    )

In [None]:
# Impute missing undelivered_orders with the median of the same frame.
# Assigned back to the column because a chained fillna(inplace=True) on a
# column selection does not update the parent frame under pandas Copy-on-Write.
for frame in (train_, test_):
    frame['undelivered_orders'] = frame['undelivered_orders'].fillna(
        frame['undelivered_orders'].median()
    )

In [None]:
# feat_6: delivered-to-allotted order ratio; the +1 on both sides keeps the
# denominator non-zero when alloted_orders is 0.
for frame in (train_, test_):
    delivered_plus_one = frame['delivered_orders'] + 1
    alloted_plus_one = frame['alloted_orders'] + 1
    frame['feat_6'] = delivered_plus_one / alloted_plus_one

In [None]:
# feat_7: session_time_diff relative to session_time, with +1 added to both
# the numerator and the denominator.
for frame in (train_, test_):
    diff_plus_one = frame['session_time_diff'] + 1
    session_plus_one = frame['session_time'] + 1
    frame['feat_7'] = diff_plus_one / session_plus_one

In [None]:
# Columns fed to the model; the training frame additionally carries the
# 'cancelled' target as its last column.
feature_columns = [
    'first_mile_distance',
    'last_mile_distance',
    'total_distance',
    'session_time_diff',
    'reassigned_order',
    'feat_4',
    'feat_5',
    'allot_time_diff',
    'lifetime_order_count',
    'feat_6',
    'feat_7',
]
df_train = train_[feature_columns + ['cancelled']]
df_test = test_[feature_columns]

# TRAIN-TEST SPLIT

In [None]:
# Separate the target column from the feature matrix.
train_Y = df_train['cancelled']
train_X = df_train.drop(columns=['cancelled'])

In [None]:
# Hold out 20% of the labelled rows, stratified on the target so that both
# parts keep the same class balance; the fixed random_state makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_X,
    train_Y,
    test_size=0.2,
    random_state=7,
    stratify=train_Y.to_numpy(),
)

In [None]:
# Evaluation set for XGBoost: the held-out 20% split, used to monitor the
# metric during boosting and to drive early stopping.
eval_set=[(X_test, y_test)]

# HYPERPARAMETER TUNING

In [None]:
# Baseline model: fixed starting hyperparameters with early stopping on the
# held-out split.
# eval_metric and early_stopping_rounds are given to the constructor because
# XGBoost 2.x no longer accepts them as fit() arguments (constructor support
# exists since XGBoost 1.6).
xgbc_1 = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=85,
    seed=27,
    eval_metric='auc',
    early_stopping_rounds=100,
)
xgbc_1.fit(X_train, y_train, eval_set=eval_set, verbose=True)
predictions = xgbc_1.predict(X_test)
# NOTE(review): scale_pos_weight=85 suggests a heavily imbalanced target, for
# which plain accuracy is a weak indicator; the later cells use ROC AUC.
accuracy_score(y_test, predictions)

In [None]:
# Plot the feature importances of the fitted baseline model.
plot_importance(xgbc_1)
plt.show()

In [None]:
# Grid search step 1: coarse search over tree depth and min_child_weight,
# scored by 5-fold cross-validated ROC AUC.
xgbc_2 = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=85,
    seed=27,
)
param_test2 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2),
}
gsearch2 = GridSearchCV(
    estimator=xgbc_2,
    param_grid=param_test2,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
)
# No eval_metric is forwarded to fit(): without an eval_set it has no effect,
# and XGBoost 2.x rejects it as a fit() argument. Candidates are ranked by the
# GridSearchCV `scoring` above.
gsearch2.fit(X_train, y_train)
gsearch2.best_params_

In [None]:
# Mean cross-validated ROC AUC of the best parameter combination in gsearch2.
gsearch2.best_score_

In [None]:
# Grid search step 2: finer search over max_depth and min_child_weight around
# max_depth=3 / min_child_weight=5, scored by 5-fold cross-validated ROC AUC.
xgbc_3 = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    min_child_weight=5,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=85,
    seed=27,
)
param_test3 = {
    'max_depth': [2, 3, 4],
    'min_child_weight': [4, 5, 6],
}
gsearch3 = GridSearchCV(
    estimator=xgbc_3,
    param_grid=param_test3,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
# No eval_metric is forwarded to fit(): without an eval_set it has no effect,
# and XGBoost 2.x rejects it as a fit() argument.
gsearch3.fit(X_train, y_train)
gsearch3.best_params_

In [None]:
# Mean cross-validated ROC AUC of the best parameter combination in gsearch3.
gsearch3.best_score_

In [None]:
# Grid search step 3: tune gamma over 0.0 .. 0.4 with depth and child weight
# fixed at 4, scored by 5-fold cross-validated ROC AUC.
xgbc_4 = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=4,
    min_child_weight=4,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=85,
    seed=27,
)
param_test4 = {
    'gamma': [i / 10.0 for i in range(0, 5)],
}
gsearch4 = GridSearchCV(
    estimator=xgbc_4,
    param_grid=param_test4,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
# No eval_metric is forwarded to fit(): without an eval_set it has no effect,
# and XGBoost 2.x rejects it as a fit() argument.
gsearch4.fit(X_train, y_train)
gsearch4.best_params_

In [None]:
# Mean cross-validated ROC AUC of the best parameter combination in gsearch4.
gsearch4.best_score_

In [None]:
# Grid search step 4: tune row subsampling and per-tree column subsampling
# (0.6 .. 0.9 each) with gamma fixed at 0.4, scored by 5-fold ROC AUC.
xgbc_5 = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=4,
    min_child_weight=4,
    gamma=0.4,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=85,
    seed=27,
)
param_test5 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)],
}
gsearch5 = GridSearchCV(
    estimator=xgbc_5,
    param_grid=param_test5,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
# No eval_metric is forwarded to fit(): without an eval_set it has no effect,
# and XGBoost 2.x rejects it as a fit() argument.
gsearch5.fit(X_train, y_train)
gsearch5.best_params_

In [None]:
# Mean cross-validated ROC AUC of the best parameter combination in gsearch5.
gsearch5.best_score_

In [None]:
# Grid search step 5: coarse, log-spaced search over the L1 regularisation
# strength reg_alpha, scored by 5-fold cross-validated ROC AUC.
xgbc_6 = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=4,
    min_child_weight=4,
    gamma=0.4,
    subsample=0.9,
    colsample_bytree=0.6,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=85,
    seed=27,
)
param_test6 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
}
gsearch6 = GridSearchCV(
    estimator=xgbc_6,
    param_grid=param_test6,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
# No eval_metric is forwarded to fit(): without an eval_set it has no effect,
# and XGBoost 2.x rejects it as a fit() argument.
gsearch6.fit(X_train, y_train)
gsearch6.best_params_

In [None]:
# Mean cross-validated ROC AUC of the best parameter combination in gsearch6.
gsearch6.best_score_

In [None]:
# Grid search step 6: fine search over reg_alpha in 95 .. 100, scored by
# 5-fold cross-validated ROC AUC.
xgbc_7 = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=4,
    min_child_weight=4,
    gamma=0.4,
    subsample=0.9,
    colsample_bytree=0.6,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=85,
    seed=27,
)
param_test7 = {
    'reg_alpha': [95, 96, 97, 98, 99, 100],
}
gsearch7 = GridSearchCV(
    estimator=xgbc_7,
    param_grid=param_test7,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
# No eval_metric is forwarded to fit(): without an eval_set it has no effect,
# and XGBoost 2.x rejects it as a fit() argument.
gsearch7.fit(X_train, y_train)
gsearch7.best_params_

In [None]:
# Mean cross-validated ROC AUC of the best parameter combination in gsearch7.
gsearch7.best_score_

In [None]:
# Tuned model with a low learning rate and a large tree budget; early stopping
# on the held-out split decides how many boosting rounds are actually used.
# eval_metric and early_stopping_rounds are given to the constructor because
# XGBoost 2.x no longer accepts them as fit() arguments (constructor support
# exists since XGBoost 1.6).
xgbc_9 = xgb.XGBClassifier(
    learning_rate=0.01,
    n_estimators=5000,
    max_depth=4,
    min_child_weight=4,
    gamma=0.4,
    subsample=0.9,
    colsample_bytree=0.6,
    reg_alpha=100,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=85,
    seed=27,
    eval_metric='auc',
    early_stopping_rounds=100,
)
xgbc_9.fit(X_train, y_train, eval_set=eval_set, verbose=True)
predictions = xgbc_9.predict_proba(X_test)
# Column 1 holds the probability of the positive class (cancelled == 1).
# NOTE(review): this AUC is measured on the same split that drove early
# stopping, so it is an optimistic estimate.
roc_auc_score(y_test, predictions[:, 1])

[1061]	validation_0-auc:0.78906
[1062]	validation_0-auc:0.78908
[1063]	validation_0-auc:0.78911
[1064]	validation_0-auc:0.78912
[1065]	validation_0-auc:0.78913
[1066]	validation_0-auc:0.78913
[1067]	validation_0-auc:0.78913
[1068]	validation_0-auc:0.78912
[1069]	validation_0-auc:0.78910
[1070]	validation_0-auc:0.78910
[1071]	validation_0-auc:0.78911
[1072]	validation_0-auc:0.78912
[1073]	validation_0-auc:0.78912
[1074]	validation_0-auc:0.78912
[1075]	validation_0-auc:0.78912
[1076]	validation_0-auc:0.78910
[1077]	validation_0-auc:0.78909
[1078]	validation_0-auc:0.78909
[1079]	validation_0-auc:0.78908
[1080]	validation_0-auc:0.78907
[1081]	validation_0-auc:0.78906
[1082]	validation_0-auc:0.78911
[1083]	validation_0-auc:0.78912
[1084]	validation_0-auc:0.78913
[1085]	validation_0-auc:0.78912
[1086]	validation_0-auc:0.78912
[1087]	validation_0-auc:0.78913
[1088]	validation_0-auc:0.78912
[1089]	validation_0-auc:0.78911
[1090]	validation_0-auc:0.78916
[1091]	validation_0-auc:0.78916
[1092]	v

0.7894581724418754

# FINAL PREDICTION

In [None]:
# Final model: the tuned hyperparameters, refit on all labelled rows with a
# fixed number of boosting rounds (no early stopping, so no eval_set).
xgbc_10 = xgb.XGBClassifier(
    learning_rate=0.01,
    n_estimators=1200,
    max_depth=4,
    min_child_weight=4,
    gamma=0.4,
    subsample=0.9,
    colsample_bytree=0.6,
    reg_alpha=100,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=85,
    seed=27,
)
# No eval_metric is passed to fit(): without an eval_set it has no effect, and
# XGBoost 2.x rejects it as a fit() argument.
xgbc_10.fit(train_X, train_Y)

In [None]:
# ROC AUC of the final model on the rows it was trained on. This is a
# training-set figure, not a generalisation estimate.
predict = xgbc_10.predict_proba(train_X)
train_auc = roc_auc_score(train_Y, predict[:, 1])
train_auc

In [None]:
# Plot the feature importances of the final model.
plot_importance(xgbc_10)
plt.show()

In [None]:
# Class labels (not probabilities) predicted by the final model for the test
# features; rows are in the same order as test_.
predictions=xgbc_10.predict(df_test)

In [None]:
# Submission table: one 'cancelled' prediction per test order, indexed by
# order_id. `predictions` is positionally aligned with the rows of test_.
Df = (
    test_[['order_id']]
    .assign(cancelled=predictions)
    .set_index('order_id')
)

In [None]:
# Write the submission to the current working directory; order_id (the index)
# becomes the first CSV column.
Df.to_csv('submission21.csv')

In [None]:
# Show the test orders that the model predicts as cancelled.
Df[Df['cancelled']==1]