# 1.Import required libraries

In [10]:
import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from category_encoders import TargetEncoder

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.base import BaseEstimator, TransformerMixin
import pickle

In [11]:
import sys
sys.path.append("..")

In [12]:
from api.pipeline_utils import DataCleaner, FlagVariableGenerator, DateTimeExtractor

# 2. Import dataset

In [13]:
df= pd.read_csv("../data/raw/Bookings.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103024 entries, 0 to 103023
Data columns (total 21 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Date                        103024 non-null  object 
 1   Time                        103024 non-null  object 
 2   Booking_ID                  103024 non-null  object 
 3   Booking_Status              103024 non-null  object 
 4   Customer_ID                 103024 non-null  object 
 5   Vehicle_Type                103024 non-null  object 
 6   Pickup_Location             103024 non-null  object 
 7   Drop_Location               103024 non-null  object 
 8   V_TAT                       63967 non-null   float64
 9   C_TAT                       63967 non-null   float64
 10  Canceled_Rides_by_Customer  10499 non-null   object 
 11  Canceled_Rides_by_Driver    18434 non-null   object 
 12  Incomplete_Rides            63967 non-null   object 
 13  Incomplete_Rid

# 3. Remove redundant variables

In [14]:
df = df.drop(["Time","Booking_ID","Customer_ID","Vehicle Images","Unnamed: 20","Canceled_Rides_by_Customer","Incomplete_Rides","Incomplete_Rides_Reason","Canceled_Rides_by_Driver"],axis=1)

# 4. Separate X and y variable

In [15]:
X = df.drop("Booking_Status", axis=1)
y = df["Booking_Status"]


In [16]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103024 entries, 0 to 103023
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Date             103024 non-null  object 
 1   Vehicle_Type     103024 non-null  object 
 2   Pickup_Location  103024 non-null  object 
 3   Drop_Location    103024 non-null  object 
 4   V_TAT            63967 non-null   float64
 5   C_TAT            63967 non-null   float64
 6   Booking_Value    103024 non-null  int64  
 7   Payment_Method   63967 non-null   object 
 8   Ride_Distance    103024 non-null  int64  
 9   Driver_Ratings   63967 non-null   float64
 10  Customer_Rating  63967 non-null   float64
dtypes: float64(4), int64(2), object(5)
memory usage: 8.6+ MB


# 5. Label encode target variable

In [17]:
le = LabelEncoder()
y_enc = le.fit_transform(y)

# 9. Preprocessing with column transformer

In [18]:
preprocessor = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore"), ["Vehicle_Type", "Payment_Method", "meridiem", "day_type"]),
        ("num", "passthrough", ['V_TAT','C_TAT','Booking_Value','Ride_Distance','Driver_Ratings','Customer_Rating'])
    ],
    remainder="drop"
)

# 10. XGBoost model

In [19]:
xgb_model = XGBClassifier(
    objective="multi:softprob",
    num_class=4, n_estimators=600, learning_rate=0.03, max_depth=3,
    min_child_weight=15,
    subsample=0.7, colsample_bytree=0.7,
    gamma=0.1, reg_alpha=1, reg_lambda=3,
    eval_metric="mlogloss",
    random_state=42,
    n_jobs=-1
)

# 11. Building pipeline

In [20]:
pipeline = Pipeline(steps=[
    ("cleaner", DataCleaner()),
    ("datetime_features", DateTimeExtractor()),
    ("flag_features", FlagVariableGenerator()),
    ("target_enc", TargetEncoder(cols=["Pickup_Location", "Drop_Location"],smoothing=0.3)),
    ("preprocess", preprocessor),
    ("model", xgb_model)
])

# 12. Train test split 

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,y_enc,test_size=0.2,stratify=y_enc,random_state=42)

# 13. Handle class imbalance (sample weights) and fit with pipeline

In [22]:
sample_weights = compute_sample_weight(class_weight="balanced",y=y_train)

pipeline.fit(X_train,y_train,model__sample_weight=sample_weights)

# 14. Train data classfication report

In [23]:
train_preds = pipeline.predict(X_train)
print(classification_report(le.inverse_transform(y_train),le.inverse_transform(train_preds)))

                      precision    recall  f1-score   support

Canceled by Customer       0.31      0.38      0.34      8399
  Canceled by Driver       0.53      0.39      0.45     14747
    Driver Not Found       0.31      0.39      0.34      8099
             Success       1.00      1.00      1.00     51174

            accuracy                           0.77     82419
           macro avg       0.54      0.54      0.53     82419
        weighted avg       0.78      0.77      0.77     82419



# 15. Test data classification report

In [24]:
test_preds = pipeline.predict(X_test)
print(classification_report(le.inverse_transform(y_test),le.inverse_transform(test_preds)))

                      precision    recall  f1-score   support

Canceled by Customer       0.26      0.32      0.29      2100
  Canceled by Driver       0.49      0.36      0.41      3687
    Driver Not Found       0.27      0.33      0.30      2025
             Success       1.00      1.00      1.00     12793

            accuracy                           0.75     20605
           macro avg       0.50      0.50      0.50     20605
        weighted avg       0.76      0.75      0.75     20605



# 16. Train accuracy and test accuracy 

In [25]:
print("Train Accuracy:", accuracy_score(y_train, train_preds))
print("Test Accuracy:", accuracy_score(y_test, test_preds))

Train Accuracy: 0.7671653380895181
Test Accuracy: 0.7498665372482407


# 17. save the model in the models folder 

In [26]:
artifact = {"pipeline":pipeline,"le":le}

with open("../api/models/xgb_pipeline.pkl", "wb") as f:
    pickle.dump(artifact, f)


2.8.1
