# 1. Import required libraries

In [1]:
import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from category_encoders import TargetEncoder

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.utils.class_weight import compute_sample_weight

In [2]:
import pickle

# 2. Import dataset

In [4]:
df = pd.read_csv("../data/cleaned/booking_cleaned.csv")

X = df.drop("Booking_Status", axis=1)
y = df["Booking_Status"]


In [6]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103024 entries, 0 to 103023
Data columns (total 21 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Vehicle_Type         103024 non-null  object 
 1   Pickup_Location      103024 non-null  object 
 2   Drop_Location        103024 non-null  object 
 3   V_TAT                103024 non-null  float64
 4   C_TAT                103024 non-null  float64
 5   Booking_Value        103024 non-null  int64  
 6   Payment_Method       103024 non-null  object 
 7   Ride_Distance        103024 non-null  int64  
 8   Driver_Ratings       103024 non-null  float64
 9   Customer_Rating      103024 non-null  float64
 10  Month                103024 non-null  int64  
 11  weekday              103024 non-null  int64  
 12  date                 103024 non-null  int64  
 13  booking_hour         103024 non-null  int64  
 14  meridiem             103024 non-null  object 
 15  day_type         

# 3. Encode target variable

In [10]:
le = LabelEncoder()
y_enc = le.fit_transform(y)

# 4. Column groups

In [11]:
low_card_cols = ["Vehicle_Type", "Payment_Method", "meridiem", "day_type"]
target_enc_cols = ["Pickup_Location", "Drop_Location"]
num_cols = [c for c in X.columns if c not in low_card_cols + target_enc_cols]

# 5. Preprocessing with Columntransformer

In [12]:
preprocessor = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore"), low_card_cols),
        ("target", TargetEncoder(), target_enc_cols),
        ("num", "passthrough", num_cols)
    ],
    remainder="drop"
)

# 6. XGBoost Model

In [13]:
xgb_model = XGBClassifier(
    objective="multi:softprob",
    num_class=4, n_estimators=600, learning_rate=0.03, max_depth=3,
    min_child_weight=15,
    subsample=0.7, colsample_bytree=0.7,
    gamma=0.1, reg_alpha=1, reg_lambda=3,
    eval_metric="mlogloss",
    random_state=42,
    n_jobs=-1
)

# 7. Build pipeline

In [14]:
pipeline = Pipeline(
    steps=[("preprocess", preprocessor),("model", xgb_model)]
)

# 8. Train test split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y_enc,test_size=0.2,stratify=y_enc,random_state=42)

# 9. Handle class imbalance (sample weights) and fit with pipeline

In [16]:
sample_weights = compute_sample_weight(class_weight="balanced",y=y_train)

pipeline.fit(X_train,y_train,model__sample_weight=sample_weights)


# 10. Train data evaluation and classification report

In [17]:
train_preds = pipeline.predict(X_train)
print(classification_report(le.inverse_transform(y_train),le.inverse_transform(train_preds)))

                      precision    recall  f1-score   support

Canceled by Customer       0.35      0.43      0.39      8399
  Canceled by Driver       0.56      0.39      0.46     14747
    Driver Not Found       0.33      0.43      0.38      8099
             Success       1.00      1.00      1.00     51174

            accuracy                           0.78     82419
           macro avg       0.56      0.56      0.55     82419
        weighted avg       0.79      0.78      0.78     82419



# 11. Test data evaluation and classification report

In [18]:
test_preds = pipeline.predict(X_test)
print(classification_report(le.inverse_transform(y_test),le.inverse_transform(test_preds)))

                      precision    recall  f1-score   support

Canceled by Customer       0.27      0.33      0.30      2100
  Canceled by Driver       0.48      0.33      0.39      3687
    Driver Not Found       0.26      0.34      0.29      2025
             Success       1.00      1.00      1.00     12793

            accuracy                           0.75     20605
           macro avg       0.50      0.50      0.50     20605
        weighted avg       0.76      0.75      0.75     20605



# 12.Train and test accuracy

In [19]:
print("Train Accuracy:", accuracy_score(y_train, train_preds))
print("Test Accuracy:", accuracy_score(y_test, test_preds))

Train Accuracy: 0.7769325034276077
Test Accuracy: 0.7472458141227857


# 13. Cross validation with macro f1 score

In [22]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = cross_val_score(
    pipeline,
    X,
    y_enc,
    scoring="f1_macro",
    cv=skf,
    n_jobs=-1
)

print("Mean CV Macro F1:", cv_scores.mean())
print("Std CV Macro F1:", cv_scores.std())


Mean CV Macro F1: 0.4104908858236159
Std CV Macro F1: 0.00016626133430598753


# 14. Save this model(last step)

In [20]:
with open("../models/xgb_pipeline.pkl", "wb") as f:
    pickle.dump(pipeline, f)

# we have to make custom column transformer then update the pipeline and then we will update api.py file for fastapi