In [1]:
import os
import math
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime, date

import lightgbm as lgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedKFold

import warnings
warnings.filterwarnings('ignore')

In [2]:
def seed_everything(seed: int = 42) -> None:
    """Apply one seed to the global random state used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and seeds
    both Python's ``random`` module and NumPy's legacy global generator.

    Args:
        seed: Integer seed used for every call (defaults to 42).
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Single seed for the whole notebook; `seed` is also passed to the CV splitter
# as its random_state further down.
seed = 3407
seed_everything(seed)

In [3]:
# Raw training data; the path is relative to the notebook's working directory.
df_train = pd.read_csv("../input/cascade-cup-22/train.csv")

In [4]:
# Raw test data, plus the sample submission that is filled with predictions
# and written to disk at the end of the notebook.
df_test = pd.read_csv("../input/cascade-cup-22/test.csv")
sample_sub = pd.read_csv("../input/cascade-cup-22/sample_submission.csv")

## Data Preprocessing

In [5]:
# Column names of the raw training frame.
df_train.columns

In [6]:
# Column names of the raw test frame (compare against the training columns above).
df_test.columns

In [7]:
# Missing-value count per column in the raw training frame.
df_train.isna().sum()

In [8]:
# Missing-value count per column in the raw test frame.
df_test.isna().sum()

In [9]:
def daypart(hour):
    """Translate an hour of the day into a coarse part-of-day label.

    Args:
        hour: Hour value, compared by equality against the integers 2..21.

    Returns:
        "dawn" (2-5), "morning" (6-9), "noon" (10-13), "afternoon" (14-17),
        "evening" (18-21), or "midnight" for every other value (22, 23, 0, 1
        and anything that matches none of the listed hours, e.g. NaN).
    """
    buckets = (
        ("dawn", (2, 3, 4, 5)),
        ("morning", (6, 7, 8, 9)),
        ("noon", (10, 11, 12, 13)),
        ("afternoon", (14, 15, 16, 17)),
        ("evening", (18, 19, 20, 21)),
    )
    for label, hours in buckets:
        if hour in hours:
            return label
    # Fallback bucket: late night / early hours and unmatched values.
    return "midnight"

In [10]:
def create_order_no_column(data):
    """Number each rider's orders within a run of same-day allotments.

    For every rider, rows are walked in the order they appear in ``data``. The
    counter starts at 1 and increases by 1 while the allot *date* equals the
    previous row's date for that rider; it restarts at 1 whenever the date
    changes. The input is therefore expected to be in chronological order per
    rider for the number to mean "n-th order of the day".

    Args:
        data: Frame with a ``rider_id`` column and an ``allot_time`` column
            holding ``'%Y-%m-%d %H:%M:%S'`` strings (already-parsed datetimes
            are accepted as well).

    Returns:
        A new frame with the same columns plus ``order_number_for_this_day``
        (int), ``allot_time`` converted to datetime, rows in input order and a
        fresh 0..n-1 index. Rows whose ``rider_id`` is missing are dropped, as
        they were by the per-rider loop this replaces. ``data`` itself is not
        modified.
    """
    # Filter + reset_index yields a new frame, so the caller's data stays untouched.
    numbered = data[data['rider_id'].notna()].reset_index(drop=True)
    numbered['allot_time'] = pd.to_datetime(numbered['allot_time'], format='%Y-%m-%d %H:%M:%S')

    rider = numbered['rider_id']
    allot_day = numbered['allot_time'].dt.normalize()

    # A run starts on a rider's first row (previous day is NaT) or when the
    # date differs from that rider's previous row.
    previous_day = allot_day.groupby(rider).shift()
    starts_new_run = allot_day.ne(previous_day)
    run_id = starts_new_run.astype('int64').groupby(rider).cumsum()

    # Position inside the (rider, run) group, 1-based.
    numbered['order_number_for_this_day'] = numbered.groupby([rider, run_id]).cumcount() + 1

    return numbered

In [11]:
def create_time_deltas_train(df, max_pickup_mins = 34.75):
    """Add order->allot and allot->accept gaps to a training frame, in place.

    Only the clock-time part (``HH:MM:SS``) of each timestamp string is used,
    and the gap is read from ``timedelta.seconds``. A later event that falls
    just after midnight therefore yields the short positive gap, while
    timestamps that are genuinely out of order yield a value close to 1440.

    Steps:
      1. ``TD_1_mins`` = allot - order and ``TD_2_mins`` = accept - allot, in
         minutes rounded to 2 decimals (NaN when either side is missing).
      2. Rows with ``cancelled == 0`` whose order->pickup gap exceeds
         ``max_pickup_mins`` are dropped from ``df``.
      3. Remaining NaNs in ``TD_2_mins`` are filled with that column's median,
         computed after the drop.

    Args:
        df: Frame with string columns ``order_time``, ``allot_time``,
            ``accept_time``, ``pickup_time`` (format ``'<date> %H:%M:%S'``,
            missing values allowed) and a ``cancelled`` column. Modified in place.
        max_pickup_mins: Order->pickup cutoff in minutes for step 2.

    Returns:
        The string ``"Done"``.
    """
    def clock_time(stamp):
        # Missing timestamps stay NaN; otherwise keep only the time-of-day part.
        if pd.isna(stamp):
            return np.nan
        return datetime.strptime(stamp.split()[1], '%H:%M:%S').time()

    def minutes_between(later, earlier):
        # isinstance (not type()==float) so np.float64 NaNs count as missing too.
        if isinstance(later, float) or isinstance(earlier, float):
            return np.nan
        gap = datetime.combine(date.min, later) - datetime.combine(date.min, earlier)
        return round(gap.seconds / 60, 2)

    order_t = [clock_time(v) for v in df['order_time'].values]
    allot_t = [clock_time(v) for v in df['allot_time'].values]
    accept_t = [clock_time(v) for v in df['accept_time'].values]
    pickup_t = [clock_time(v) for v in df['pickup_time'].values]

    # Plain arrays are assigned by position, so the result does not depend on
    # df having a 0..n-1 index.
    df['TD_1_mins'] = np.array([minutes_between(a, o) for a, o in zip(allot_t, order_t)], dtype=float)
    df['TD_2_mins'] = np.array([minutes_between(c, a) for c, a in zip(accept_t, allot_t)], dtype=float)
    order_to_pickup = np.array([minutes_between(p, o) for p, o in zip(pickup_t, order_t)], dtype=float)

    # NaN gaps compare False, so rows without a pickup time are never dropped here.
    late_pickup = (order_to_pickup > max_pickup_mins) & (df['cancelled'].to_numpy() == 0)
    df.drop(index=df.index[late_pickup], inplace=True)

    # Column assignment instead of chained in-place fillna.
    df['TD_2_mins'] = df['TD_2_mins'].fillna(df['TD_2_mins'].median())

    return "Done"

In [12]:
def create_time_deltas_test(df):
    """Compute order->allot and allot->accept gaps without touching ``df``.

    Only the clock-time part (``HH:MM:SS``) of each timestamp string is used,
    and the gap is read from ``timedelta.seconds``. A later event that falls
    just after midnight therefore yields the short positive gap, while
    timestamps that are genuinely out of order yield a value close to 1440.

    Args:
        df: Frame with string columns ``order_time``, ``allot_time`` and
            ``accept_time`` (format ``'<date> %H:%M:%S'``; missing values allowed).

    Returns:
        ``(time_delta_1, time_delta_2)``: two lists with one entry per row of
        ``df``, in row order. ``time_delta_1`` is allot - order and
        ``time_delta_2`` is accept - allot, in minutes rounded to 2 decimals;
        an entry is NaN when either timestamp is missing.
    """
    def clock_time(stamp):
        # Missing timestamps stay NaN; otherwise keep only the time-of-day part.
        if pd.isna(stamp):
            return np.nan
        return datetime.strptime(stamp.split()[1], '%H:%M:%S').time()

    def minutes_between(later, earlier):
        # isinstance (not type()==float) so np.float64 NaNs count as missing too.
        if isinstance(later, float) or isinstance(earlier, float):
            return np.nan
        gap = datetime.combine(date.min, later) - datetime.combine(date.min, earlier)
        return round(gap.seconds / 60, 2)

    # Work on plain lists: no temporary columns, so no copy of df is needed and
    # the result does not depend on df's index.
    order_t = [clock_time(v) for v in df['order_time'].values]
    allot_t = [clock_time(v) for v in df['allot_time'].values]
    accept_t = [clock_time(v) for v in df['accept_time'].values]

    time_delta_1 = [minutes_between(a, o) for a, o in zip(allot_t, order_t)]
    time_delta_2 = [minutes_between(c, a) for c, a in zip(accept_t, allot_t)]

    return time_delta_1, time_delta_2

In [13]:
def preprocess(df, typee = "train"):
    """Turn a raw train/test frame into the model-ready feature frame.

    Relies on the notebook-level helpers ``create_time_deltas_test``,
    ``create_order_no_column`` and ``daypart``.

    Steps (each announced by a progress print):
      1. Fill the rider order counters: rows with all three counters missing
         become 0/0/0; rows with only ``alloted_orders`` known get
         ``delivered_orders = 0`` and ``undelivered_orders = alloted_orders``.
         For ``typee == "train"`` the ``delivered_time`` / ``cancelled_time``
         columns are removed.
      2. Add ``TD_1_mins`` / ``TD_2_mins`` and fill missing ``TD_2_mins`` with
         that column's median (only that column is filled).
      3. Fill ``lifetime_order_count``, ``session_time`` and
         ``reassigned_order`` with 0.
      4. Add ``order_number_for_this_day``.
      5. Parse the three timestamp columns; for ``"train"`` drop rows without
         an ``accept_time``.
      6-7. Derive month / hour / weekday / day-part / weekend features.
      8. For ``"test"`` fill missing ``accept_day`` with "Sunday" and
         ``accept_hour`` with 2; cast the label columns to categoricals with
         fixed category lists so train and test share the same codes.
      9. Drop raw columns that are not model inputs.

    Args:
        df: Raw frame as read from the CSV (expected to carry a default
            0..n-1 index). It is copied, not modified.
        typee: ``"train"`` enables the train-only steps; ``"test"`` enables the
            test-only fills. Any other value behaves like neither.

    Returns:
        ``(features, order_ids)``: the processed frame and the ``order_id``
        Series of the rows that were kept.
    """
    print("Preprocessing Started : ", typee)

    # Copy first so the caller's raw frame survives and the cell can be re-run.
    df = df.copy()

    # Vectorised version of the per-row rule; masks are computed before any
    # assignment so the two cases cannot overlap.
    order_cols = ["alloted_orders", "delivered_orders", "undelivered_orders"]
    outcome_missing = df["delivered_orders"].isna() & df["undelivered_orders"].isna()
    allot_missing = df["alloted_orders"].isna()
    nothing_known = outcome_missing & allot_missing
    only_allot_known = outcome_missing & ~allot_missing

    df.loc[nothing_known, order_cols] = 0
    df.loc[only_allot_known, "delivered_orders"] = 0
    df.loc[only_allot_known, "undelivered_orders"] = df.loc[only_allot_known, "alloted_orders"]

    if typee == "train":
        df = df.drop(columns = ['delivered_time', 'cancelled_time'])

    print("Step - 1 Completed !!!")

    time_delta_1, time_delta_2 = create_time_deltas_test(df)
    # Arrays are assigned by position (a DataFrame would be aligned on index).
    df['TD_1_mins'] = np.array(time_delta_1, dtype=float)
    df['TD_2_mins'] = np.array(time_delta_2, dtype=float)
    # Fill ONLY TD_2_mins. A frame-wide fillna here would write the median into
    # every column and turn all the targeted fills / filters below into no-ops.
    df['TD_2_mins'] = df['TD_2_mins'].fillna(df['TD_2_mins'].median())

    print("Step - 2 Completed !!!")

    for col in ("lifetime_order_count", "session_time", "reassigned_order"):
        df[col] = df[col].fillna(0)

    print("Step - 3 Completed !!!")

    df = create_order_no_column(df)

    print("Step - 4 Completed !!!")

    for col in ("order_time", "allot_time", "accept_time"):
        df[col] = pd.to_datetime(df[col])

    print("Step - 5 Completed !!!")

    if typee == "train":
        # .copy() so the column assignments below act on an independent frame.
        df = df[df["accept_time"].notna()].copy()

    df["order_month"] = df["order_time"].dt.month
    df["order_hour"] = df["order_time"].dt.hour
    df["allot_hour"] = df["allot_time"].dt.hour
    df["order_day"] = df["order_time"].dt.day_name()

    print("Step - 6 Completed !!!")

    df["order_dayparts"] = df["order_hour"].apply(daypart)
    df["allot_dayparts"] = df["allot_hour"].apply(daypart)
    df["is_weekend"] = df["order_day"].isin(['Saturday', 'Sunday']).astype("int64")

    df["accept_day"] = df["accept_time"].dt.day_name()
    df["accept_hour"] = df["accept_time"].dt.hour

    print("Step - 7 Completed !!!")

    if typee == "test":
        df["accept_day"] = df["accept_day"].fillna("Sunday")
        df["accept_hour"] = df["accept_hour"].fillna(2)

    ordr_id = df.order_id

    # Fixed, alphabetically sorted category lists: identical to what
    # astype("category") infers when every label is present, but stable when a
    # split happens to miss one, so category codes agree between train and test.
    day_dtype = pd.CategoricalDtype(categories = sorted([
        "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"
    ]))
    daypart_dtype = pd.CategoricalDtype(categories = sorted({daypart(hour) for hour in range(24)}))

    df["order_day"] = df["order_day"].astype(day_dtype)
    df["order_dayparts"] = df["order_dayparts"].astype(daypart_dtype)
    df["allot_dayparts"] = df["allot_dayparts"].astype(daypart_dtype)
    df["accept_day"] = df["accept_day"].astype(day_dtype)

    print("Step - 8 Completed !!!")

    cols_to_rem = [
        "reassignment_method", "reassignment_reason", "reassigned_order",
        "order_time", "order_date", "allot_time", "accept_time", "order_id"
    ]

    if typee == "train":
        cols_to_rem += ['pickup_time']

    df = df.drop(columns = cols_to_rem)

    print("Step - 9 Completed !!!")

    print("!!! DONE !!!")

    return df, ordr_id

In [14]:
# Rebinds df_train to the processed frame; order ids come back separately
# because preprocess drops the order_id column.
# NOTE: re-running this cell needs the raw CSV to be reloaded first.
df_train, train_order_id = preprocess(df_train, "train")

In [15]:
# Same pipeline in "test" mode; rebinds df_test, so reload the raw CSV before re-running.
df_test, test_order_id = preprocess(df_test, "test")

In [16]:
# Row/column counts after preprocessing.
print("Train : ", df_train.shape)
print("Test : ", df_test.shape)

In [17]:
# Columns of the processed training frame.
df_train.columns

In [18]:
# Columns of the processed test frame.
df_test.columns

In [19]:
# Remaining missing values per column in the processed training frame.
df_train.isna().sum()

In [20]:
# Remaining missing values per column in the processed test frame.
df_test.isna().sum()

In [21]:
# Column dtypes of the processed training frame.
df_train.dtypes

In [22]:
# Column dtypes of the processed test frame.
df_test.dtypes

In [23]:
# Columns treated as categorical. Only order_day, order_dayparts,
# allot_dayparts and accept_day are cast to pandas "category" in preprocess;
# the rest stay numeric.
# NOTE(review): this list is not referenced again in the visible cells.
cat_features = [
    "allot_hour", "order_day", "order_dayparts", 
    "allot_dayparts", "is_weekend", "accept_day",
    "accept_hour", "order_month", "order_hour",
]

# Numeric columns; these are the ones passed through the scaler below.
num_features = [
    'first_mile_distance', 'last_mile_distance',
    'alloted_orders', 'delivered_orders', 'undelivered_orders', 
    'lifetime_order_count', 'session_time', 'TD_1_mins', 'TD_2_mins',
    'order_number_for_this_day'
]

## Modelling

In [24]:
# Target is the `cancelled` column; X is every remaining column of the processed frame.
y = df_train.cancelled
X = df_train.drop('cancelled', axis = 1)

In [25]:
# Sanity check: y and X must have the same number of rows.
print(y.shape)
print(X.shape)

In [26]:
# Test frame shape, for comparison with X's column count above.
print(df_test.shape)

In [27]:
# Fit the scaler on the training features only, then apply the same fitted
# transform to the test set so both share one scaling.
# NOTE: both frames are overwritten in place, so re-running this cell scales twice.
rs = RobustScaler()
X[num_features] = rs.fit_transform(X[num_features])
df_test[num_features] = rs.transform(df_test[num_features])

### XGBoost

In [28]:
# 10-fold stratified CV. Each fold trains a fresh XGBClassifier, scores it on
# the held-out fold and predicts the full test set; the per-fold test
# predictions are collected for averaging in a later cell.
kfold = StratifiedKFold(n_splits = 10, shuffle = True, random_state = seed)

test_preds_xgb = []

for fold, (trn_ind, val_ind) in enumerate(kfold.split(X, y)):
    print(f"=====================fold: {fold + 1}=====================")
    
    # Positional split of features/target for this fold.
    X_train, y_train = X.iloc[trn_ind], y.iloc[trn_ind]
    X_valid, y_valid = X.iloc[val_ind], y.iloc[val_ind]
    
    # NOTE(review): tree_method="gpu_hist" presumably needs a CUDA device; it
    # and use_label_encoder are version-dependent XGBoost options - confirm
    # against the installed release.
    model = XGBClassifier(
        n_estimators = 800, eval_metric = 'auc', tree_method="gpu_hist",
        enable_categorical = True, use_label_encoder = False
    )
    
    # Early stopping is monitored on this fold's validation split.
    # NOTE(review): newer XGBoost releases expect early_stopping_rounds on the
    # constructor rather than fit() - confirm against the installed release.
    model.fit(X_train, y_train, early_stopping_rounds = 150, eval_set = [(X_valid, y_valid)], verbose = 100)
    
    preds_valid = model.predict_proba(X_valid)
    
    # AUC on the same split that drove early stopping, so it may read slightly optimistic.
    roc = roc_auc_score(y_valid, preds_valid[:,1])
    
    # Column 1 of predict_proba is used as the positive-class score.
    test_pred = model.predict_proba(df_test)[:,1]
    test_preds_xgb.append(test_pred)
    
    print(F'fold {fold + 1}: ROC AUC {roc}')

In [29]:
# Per-fold test predictions: a list with one array per fold.
test_preds_xgb

In [30]:
# Stack the fold predictions into one array and average over the fold axis
# (axis 0), giving a single score per test row.
test_preds_xgb = np.array(test_preds_xgb)

final_preds_xgb = np.mean(test_preds_xgb, axis = 0)

In [31]:
# Fold-averaged test predictions.
final_preds_xgb

In [32]:
# Preview the submission template before filling it.
sample_sub.head()

In [33]:
# Positional assignment of the averaged predictions.
# NOTE(review): assumes sample_sub rows are in the same order as the processed
# test frame; test_order_id is not used for alignment in the visible cells - confirm.
sample_sub["cancelled"] = final_preds_xgb

In [34]:
# Write the submission file to the working directory, without the index column.
sample_sub.to_csv('final-xgb-2.csv', index = False)