# Step 1: import required libraries

In [163]:
import pandas as pd
import numpy as np
import seaborn as sns

# !pip install category-encoders
from xgboost import XGBClassifier
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score, f1_score

# Step 2: import dataset

In [111]:
df = pd.read_csv("./booking_cleaned.csv")

# Step 3: separate x and y(target)

In [112]:
X = df.drop("Booking_Status", axis=1)
y = df["Booking_Status"]

# Step 4: one hot encoding to low cardinality columns

In [113]:
low_card_cols = ["Vehicle_Type", "Payment_Method", "meridiem", "day_type"]
X = pd.get_dummies(X, columns=low_card_cols, drop_first=True)

# Step 5: train test split

In [114]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Step 6: Target Encoding (NO LEAKAGE)

In [115]:
te = TargetEncoder(cols=["Pickup_Location", "Drop_Location"])

X_train[["Pickup_Location", "Drop_Location"]] = te.fit_transform(
    X_train[["Pickup_Location", "Drop_Location"]],
    y_train
)

X_test[["Pickup_Location", "Drop_Location"]] = te.transform(
    X_test[["Pickup_Location", "Drop_Location"]]
)

# Step 7: label encoding to y variable to use xgboost

In [116]:
le = LabelEncoder()

y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# Step 8: Compute balanced sample weights

In [117]:
sample_weights = compute_sample_weight(
    class_weight="balanced",
    y=y_train_enc
)

# Step 9: build xgboost classfication model

In [134]:
xgb_model = XGBClassifier(
    objective="multi:softprob",
    num_class=4,

    n_estimators=600,
    learning_rate=0.03,

    max_depth=3,
    min_child_weight=15,

    subsample=0.7,
    colsample_bytree=0.7,
    
    gamma=0.1,
    reg_alpha=1, # sparsity introduce karta hai aur irrelevant features ka impact kam karta hai
    reg_lambda=3, # model weights ko smooth karta hai aur overfitting control karta hai.

    eval_metric="mlogloss",
    random_state=42,
    n_jobs=-1
)

# Step 9: Fit the model on train data

In [135]:
xgb_model.fit(
    X_train,
    y_train_enc,
    sample_weight=sample_weights,
    eval_set=[(X_train, y_train_enc), (X_test, y_test_enc)],
    verbose=False
)

In [145]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = []

for train_idx, val_idx in skf.split(X_train, y_train_enc):
    X_tr = X_train.iloc[train_idx]
    X_val = X_train.iloc[val_idx]

    y_tr = y_train_enc[train_idx]
    y_val = y_train_enc[val_idx]

    model = XGBClassifier(
        objective="multi:softprob",
        num_class=4,
        n_estimators=600,
        learning_rate=0.03,
        max_depth=3,
        min_child_weight=15,
        subsample=0.7,
        colsample_bytree=0.7,
        gamma=0.1,
        reg_alpha=1,
        reg_lambda=3,
        eval_metric="mlogloss",
        random_state=42,
        n_jobs=-1
    )

    model.fit(X_tr, y_tr)

    y_val_pred = model.predict(X_val)

    f1_scores.append(
        f1_score(y_val, y_val_pred, average="macro")
    )

print("Mean CV Macro F1:", np.mean(f1_scores))
print("Std CV Macro F1:", np.std(f1_scores))

Mean CV Macro F1: 0.4108402477795378
Std CV Macro F1: 0.00043189406119468164


# Step 10: train data evaluation

In [155]:
train_preds = np.argmax(xgb_model.predict_proba(X_train), axis=1)
train_labels = le.inverse_transform(train_preds)


[1 3 3 ... 3 3 3]


# Step 11: train data confusion matrix

In [137]:
print(confusion_matrix(y_train, train_labels))

[[ 3625  2291  2483     0]
 [ 4486  5761  4500     0]
 [ 2363  2285  3451     0]
 [    0     0     0 51174]]


# Step 12: train data classification report

In [138]:
print(classification_report(y_train,train_labels))

                      precision    recall  f1-score   support

Canceled by Customer       0.35      0.43      0.38      8399
  Canceled by Driver       0.56      0.39      0.46     14747
    Driver Not Found       0.33      0.43      0.37      8099
             Success       1.00      1.00      1.00     51174

            accuracy                           0.78     82419
           macro avg       0.56      0.56      0.55     82419
        weighted avg       0.79      0.78      0.78     82419



# Step 13: test data evaluation

In [139]:
test_preds = np.argmax(xgb_model.predict_proba(X_test), axis=1)
test_labels = le.inverse_transform(test_preds)

# test data classification report

In [140]:
print(classification_report(y_test, test_labels))

                      precision    recall  f1-score   support

Canceled by Customer       0.27      0.33      0.30      2100
  Canceled by Driver       0.48      0.33      0.39      3687
    Driver Not Found       0.25      0.33      0.29      2025
             Success       1.00      1.00      1.00     12793

            accuracy                           0.75     20605
           macro avg       0.50      0.50      0.49     20605
        weighted avg       0.76      0.75      0.75     20605



In [141]:
train_accuracy = accuracy_score(y_train,train_labels)
test_accuracy = accuracy_score(y_test,test_labels)
train_accuracy*100,test_accuracy*100

(0.7766534415608052, 0.7471487503033244)