# Step 1: import required libraries

In [19]:
import pandas as pd
import numpy as np
import seaborn as sns

# !pip install category-encoders
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

# Step 2: import dataset

In [20]:
df = pd.read_csv("./booking_cleaned.csv")
df.head()

Unnamed: 0,Booking_Status,Vehicle_Type,Pickup_Location,Drop_Location,V_TAT,C_TAT,Booking_Value,Payment_Method,Ride_Distance,Driver_Ratings,...,weekday,date,booking_hour,meridiem,day_type,V_TAT_IND,C_TAT_IND,Driver_Ratings_Ind,Customer_Rating_Ind,Payment_Method_Ind
0,Canceled by Driver,Prime Sedan,Tumkur Road,RT Nagar,0.0,0.0,444,Not Applicable,0,0.0,...,4,26,14,PM,weekday,0,0,0,0,0
1,Success,Bike,Magadi Road,Varthur,203.0,30.0,158,Cash,13,4.1,...,3,25,22,PM,weekday,1,1,1,1,1
2,Success,Prime SUV,Sahakar Nagar,Varthur,238.0,130.0,386,UPI,40,4.2,...,1,30,19,PM,weekday,1,1,1,1,1
3,Canceled by Customer,eBike,HSR Layout,Vijayanagar,0.0,0.0,384,Not Applicable,0,0.0,...,0,22,3,AM,weekday,0,0,0,0,0
4,Success,Mini,Rajajinagar,Chamarajpet,252.0,80.0,822,Credit Card,45,4.0,...,1,2,9,AM,weekday,1,1,1,1,1


# Step 3: separate x and y(target)

In [21]:
x = df.drop("Booking_Status",axis=1)
y = df["Booking_Status"]

# Step 4: one hot encoding to low cardinality columns

In [22]:
low_card_cols = ["Vehicle_Type", "Payment_Method", "meridiem", "day_type"]
x = pd.get_dummies(x,columns=low_card_cols,drop_first=True)

# Step 5: train test split

In [23]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,stratify=y,random_state=42)

# Step 6: target encoding to medium cardinality columns

In [24]:
te = TargetEncoder(cols=["Pickup_Location","Drop_Location"])

x_train[["Pickup_Location", "Drop_Location"]] = te.fit_transform(
    x_train[["Pickup_Location", "Drop_Location"]],
    y_train
)

x_test[["Pickup_Location", "Drop_Location"]] = te.transform(
    x_test[["Pickup_Location", "Drop_Location"]]
)

# Step 7: build decision tree model with class imbalance focused

In [25]:
dt_model = DecisionTreeClassifier(
    max_depth=6,
    min_samples_leaf=30,
    class_weight="balanced",
    random_state=42
)
dt_model.fit(x_train,y_train)

# Step 8: K-Fold Cross Validation

In [26]:
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

cv_scores = []

for train_idx, val_idx in skf.split(x_train, y_train):
    x_tr, x_val = x_train.iloc[train_idx], x_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model = DecisionTreeClassifier(
        max_depth=6,
        min_samples_leaf=30,
        class_weight="balanced",
        random_state=42
    )

    model.fit(x_tr, y_tr)
    y_val_pred = model.predict(x_val)

    cv_scores.append(accuracy_score(y_val, y_val_pred))

In [27]:
print("Mean CV Accuracy:", np.mean(cv_scores))
print("Std CV Accuracy:", np.std(cv_scores))

Mean CV Accuracy: 0.7543405408406267
Std CV Accuracy: 0.01535018619529151


# Step 8: model prediction on train data

In [28]:
y_train_pred = dt_model.predict(x_train)

# Step 9: model evaluation on train data

In [29]:
print(classification_report(y_train,y_train_pred))

                      precision    recall  f1-score   support

Canceled by Customer       0.29      0.46      0.36      8399
  Canceled by Driver       0.49      0.31      0.38     14747
    Driver Not Found       0.29      0.31      0.30      8099
             Success       1.00      1.00      1.00     51174

            accuracy                           0.75     82419
           macro avg       0.52      0.52      0.51     82419
        weighted avg       0.77      0.75      0.76     82419



# Step 10: accuracy check

In [30]:
train_score = accuracy_score(y_train,y_train_pred)
train_score

0.7545226222108979

# Step 11: prediction on test data

In [31]:
y_test_pred = dt_model.predict(x_test)

In [32]:
test_score = accuracy_score(y_test,y_test_pred)
test_score

0.7436059208929872

In [33]:
print(classification_report(y_test,y_test_pred))

                      precision    recall  f1-score   support

Canceled by Customer       0.26      0.41      0.32      2100
  Canceled by Driver       0.47      0.30      0.36      3687
    Driver Not Found       0.27      0.28      0.27      2025
             Success       1.00      1.00      1.00     12793

            accuracy                           0.74     20605
           macro avg       0.50      0.50      0.49     20605
        weighted avg       0.76      0.74      0.75     20605



# Step 12: overfitting check

In [34]:
round(train_score*100,2),round(test_score*100,2),round(round(train_score*100,2)-round(test_score*100,2),2)

(75.45, 74.36, 1.09)

# Step 13: extra information(feature importance who drives the values)

In [35]:
feature_importance = pd.DataFrame({
    "Feature": x.columns,
    "Importance": dt_model.feature_importances_
}).sort_values(by="Importance", ascending=False)

feature_importance

Unnamed: 0,Feature,Importance
12,V_TAT_IND,0.989741
4,Booking_Value,0.003226
1,Drop_Location,0.002461
11,booking_hour,0.001283
10,date,0.001069
0,Pickup_Location,0.000798
20,Vehicle_Type_Prime SUV,0.000567
19,Vehicle_Type_Prime Plus,0.000481
9,weekday,0.000374
8,Month,0.0
