# Step 1: import required libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

# !pip install category-encoders
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

# Step 2: import dataset

In [70]:
# Read the cleaned bookings export into memory; the trailing expression
# displays (rows, columns) as a quick sanity check on the load.
df = pd.read_csv(filepath_or_buffer="./booking_cleaned.csv")
df.shape

(103024, 22)

# Step 3: separate the features (x) from the target (y)

In [4]:
# Target first, then every remaining column as a feature.
y = df["Booking_Status"]
x = df.drop(columns=["Booking_Status"])

# Step 4: one-hot encode the low-cardinality columns

In [5]:
# Columns expanded into indicator (dummy) columns.
low_card_cols = [
    "Vehicle_Type",
    "Payment_Method",
    "meridiem",
    "day_type",
]

# drop_first=True drops one level per column so the indicators are not
# perfectly collinear. Note: this rebinds `x`, so the cell expects the
# un-encoded `x` produced by the previous step.
x = pd.get_dummies(
    data=x,
    columns=low_card_cols,
    drop_first=True,
)

# Step 5: train test split

In [6]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Step 6: target-encode the medium-cardinality columns

In [7]:
# Columns whose categories are replaced by target-derived numeric values.
target_enc_cols = ["Pickup_Location", "Drop_Location"]

# train_test_split may return frames that pandas still regards as slices of
# `x`; take explicit copies so the column assignments below write to
# independent frames instead of raising SettingWithCopyWarning.
x_train = x_train.copy()
x_test = x_test.copy()

# NOTE(review): the classification reports show four string classes in
# Booking_Status, while category_encoders documents TargetEncoder for
# binomial/continuous targets -- confirm this encoding is meaningful for a
# multiclass target (the library offers PolynomialWrapper for that case).
te = TargetEncoder(cols=target_enc_cols)

# Fit on the training rows only so no information from the test labels
# reaches the encoder, then reuse the learned mapping for the test rows.
# NOTE(review): this overwrites the raw location columns in place, so the
# cell presumably should not be re-run without re-running the split -- confirm.
x_train[target_enc_cols] = te.fit_transform(x_train[target_enc_cols], y_train)
x_test[target_enc_cols] = te.transform(x_test[target_enc_cols])

# Step 7: build a random forest model that accounts for class imbalance

In [54]:
# Hyperparameters for the final forest, gathered in one mapping.
rf_settings = {
    "n_estimators": 300,          # number of trees in the ensemble
    "max_depth": 5,               # shallow trees to limit overfitting
    "max_features": 5,            # features considered at each split
    "class_weight": "balanced",   # weight classes to offset the imbalance
    "random_state": 42,           # fixed seed for repeatable results
    "n_jobs": -1,                 # use all CPU cores
}

rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(x_train, y_train)

# Step 8: K-Fold Cross Validation

In [55]:
# Five stratified, shuffled folds with a fixed seed so the partitions are
# repeatable and each fold keeps the overall class proportions.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Take the hyperparameters from the Step 7 model instead of repeating them,
# so the cross-validated configuration cannot drift from the final model
# when the settings are tuned.
rf_params = rf_model.get_params()

# One validation accuracy per fold.
cv_scores = []

# NOTE: the location columns of x_train were target-encoded in Step 6 using
# all of y_train, so each validation fold's labels have already influenced
# its own features. These scores are therefore likely somewhat optimistic;
# a leak-free estimate would fit the encoder inside each fold.
for train_idx, val_idx in skf.split(x_train, y_train):
    x_tr, x_val = x_train.iloc[train_idx], x_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Fresh, unfitted forest for every fold.
    model = RandomForestClassifier(**rf_params)
    model.fit(x_tr, y_tr)
    y_val_pred = model.predict(x_val)

    cv_scores.append(accuracy_score(y_val, y_val_pred))

In [56]:
# Summarise the per-fold accuracies: the mean is the headline estimate and
# the standard deviation shows how stable it is across folds.
cv_mean, cv_std = np.mean(cv_scores), np.std(cv_scores)
print("Mean CV Accuracy:", cv_mean)
print("Std CV Accuracy:", cv_std)

Mean CV Accuracy: 0.7476795649376193
Std CV Accuracy: 0.0018073112434274197


# Step 9: model prediction on train data

In [57]:
# Predictions on the rows the model was fitted on (used to gauge overfitting).
y_train_pred = rf_model.predict(X=x_train)

# Step 10: model evaluation on train data

In [58]:
# Per-class precision / recall / F1 on the training data.
train_report = classification_report(y_true=y_train, y_pred=y_train_pred)
print(train_report)

                      precision    recall  f1-score   support

Canceled by Customer       0.31      0.43      0.36      8399
  Canceled by Driver       0.52      0.33      0.40     14747
    Driver Not Found       0.30      0.39      0.34      8099
             Success       1.00      1.00      1.00     51174

            accuracy                           0.76     82419
           macro avg       0.53      0.54      0.53     82419
        weighted avg       0.77      0.76      0.76     82419



# Step 11: accuracy check

In [64]:
# Overall accuracy on the training data (fraction in [0, 1]).
train_score = accuracy_score(y_true=y_train, y_pred=y_train_pred)
train_score

0.761050243269149

In [65]:
# Predictions on the held-out test rows.
y_test_pred = rf_model.predict(X=x_test)

In [66]:
# Overall accuracy on the held-out test data (fraction in [0, 1]).
test_score = accuracy_score(y_true=y_test, y_pred=y_test_pred)
test_score

0.7447706867265227

In [67]:
# Per-class precision / recall / F1 on the held-out test data.
test_report = classification_report(y_true=y_test, y_pred=y_test_pred)
print(test_report)

                      precision    recall  f1-score   support

Canceled by Customer       0.26      0.34      0.30      2100
  Canceled by Driver       0.48      0.30      0.37      3687
    Driver Not Found       0.26      0.36      0.30      2025
             Success       1.00      1.00      1.00     12793

            accuracy                           0.74     20605
           macro avg       0.50      0.50      0.49     20605
        weighted avg       0.76      0.74      0.75     20605



# Step 12: overfitting check

In [68]:

# Train and test accuracy as percentages, plus the gap between them; a
# large positive gap indicates the model fits the training data better
# than it generalises.
train_pct = round(train_score * 100, 2)
test_pct = round(test_score * 100, 2)
train_pct, test_pct, round(train_pct - test_pct, 2)


(76.11, 74.48, 1.63)

In [None]:
# Hyperparameter tuning log (accuracy in %, diff = train - test):
# n_estimators=300, max_depth=5, max_features=5, train=76.13, test=74.11, diff=2.02  good
# n_estimators=300, max_depth=6, max_features=5, train=77.09, test=75.54, diff=1.55  X  (originally logged as diff=2.55; 77.09 - 75.54 = 1.55)
# n_estimators=300, max_depth=6, max_features=6, train=77.04, test=74.68, diff=2.36  X
# n_estimators=300, max_depth=6, max_features=4, train=77.13, test=74.64, diff=2.49  X

# Step 13: extra information (feature importance: which features drive the predictions)

In [42]:
# Pair every column the model was fitted on with the importance the forest
# assigned to it, most important first. The labels come from x_train (the
# frame passed to rf_model.fit) so names and importances stay aligned.
feature_importance = pd.DataFrame({
    "Feature": x_train.columns,
    "Importance": rf_model.feature_importances_,
}).sort_values(by="Importance", ascending=False)

# Express the importances as percentages with two decimals, using
# vectorised Series operations instead of a per-element lambda.
feature_importance["Importance"] = feature_importance["Importance"].mul(100).round(2)
feature_importance

Unnamed: 0,Feature,Importance
6,Driver_Ratings,11.69
7,Customer_Rating,10.93
16,Payment_Method_Ind,10.44
14,Driver_Ratings_Ind,10.35
13,C_TAT_IND,9.95
12,V_TAT_IND,9.73
3,C_TAT,9.38
2,V_TAT,8.62
15,Customer_Rating_Ind,7.76
5,Ride_Distance,7.5


In [None]:
# Better result at the end from the random forest model
# (note: the model cells above currently use max_depth=5, not 6):
# n_estimators=300, max_depth=6, max_features=5, train=77.09, test=75.54, diff=1.55, macro_f1=0.55
