In [1]:

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Model Building

In [3]:
#importing the necessary packages.
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import f1_score, classification_report
#from scikitplot.metrics import plot_confusion_matrix
#from scikitplot.classifiers import plot_feature_importances

#importing model packages.
import xgboost as xgb
import lightgbm as lgb
import mlflow

In [3]:
# Select, by name, the MLflow experiment that the runs started below are
# recorded under. Kept as the cell's last expression so the experiment
# details are still rendered as output.
EXPERIMENT_NAME = "XGBoost_GridSearchCV_Experiment"
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='file:///Users/madhuri/Documents/instacart_analysis/mlruns/409450851093224200', creation_time=1741387367950, experiment_id='409450851093224200', last_update_time=1741387367950, lifecycle_stage='active', name='XGBoost_GridSearchCV_Experiment', tags={}>

In [5]:
# Read the training CSV; the path is relative to the notebook's working directory.
TRAIN_DATA_PATH = './data/train_data.csv'
data_train = pd.read_csv(TRAIN_DATA_PATH)

In [6]:
# Label vector: the `reordered` column.
y = data_train['reordered']

# Feature matrix: every remaining column except `uxp_ratio_last_five`.
# NOTE(review): the X_test preview in this notebook shows a leading
# 'Unnamed: 0' header - if that is a real column (e.g. a row id written by an
# earlier to_csv) it is being fed to the model as a feature; confirm.
X = data_train.drop(columns=['reordered', 'uxp_ratio_last_five'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

Why is the column `uxp_ratio_last_five` dropped?

In [7]:
# Preview the held-out feature frame; as the cell's last expression it is
# rendered by the notebook.
X_test

Unnamed: 0,uxp_times_bought,uxp_reorder_ratio,uxp_last_five,u_num_of_orders,dow_u_most_orders,hod_u_most_orders,u_reorder_ratio,avg_days_between_orders,u_avg_prd,u_total_iems_bought,p_num_of_times,p_reorder_ratio,p_avg_cart_position,aisle_id,department_id
4002577,1,0.500000,1.0,60,6,15,0.792233,4.323301,17.166667,1030,11165,0.701120,7.856068,0.130793,0.110109
1829944,1,0.043478,0.0,25,3,12,0.693642,14.568401,20.760000,519,5943,0.683661,7.991082,0.103269,0.128506
2402250,1,0.111111,0.0,18,0,23,0.675000,10.518182,24.444444,440,2222,0.389289,9.414041,0.103269,0.128506
82405,5,0.500000,2.0,19,1,18,0.663462,15.858974,16.421053,312,9113,0.337101,9.369911,0.038712,0.037791
5425927,1,0.111111,0.0,9,4,14,0.466102,19.296610,13.111111,118,2652,0.659879,8.028281,0.109664,0.104196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7950376,1,0.043478,0.0,26,5,13,0.771676,13.101156,13.307692,346,345,0.356522,9.568116,0.099790,0.114196
1917199,2,0.285714,1.0,7,5,18,0.384615,27.538462,13.000000,91,6572,0.626141,8.305234,0.118036,0.091813
3406030,1,0.166667,0.0,15,1,12,0.517483,13.195804,9.533333,143,1221,0.425880,8.938575,0.066931,0.086078
3265577,1,0.019608,0.0,81,0,20,0.697743,5.524043,12.580247,1019,492,0.638211,7.154472,0.081223,0.110109


In [6]:
# Base XGBoost classifier handed to the grid search. Only the seed is fixed
# here (for repeatability); every other setting is left at the library default.
model = xgb.XGBClassifier(
    random_state=42,
)

In [7]:
# Search space for the grid search: two candidate values for each of five
# XGBoost hyperparameters, i.e. 2**5 = 32 combinations in total.
param_grid = {
    "max_depth": [3, 5],              # tree depth
    "learning_rate": [0.01, 0.1],     # boosting step size
    "n_estimators": [100, 150],       # number of boosting rounds
    "subsample": [0.8, 1.0],          # row sampling fraction
    "colsample_bytree": [0.8, 1.0],   # per-tree column sampling fraction
}

In [8]:
# Run the hyperparameter search inside a single MLflow run so that the chosen
# parameters, both F1 scores and the fitted model are tracked together.
with mlflow.start_run() as run:
    # Keep the run ID: it is the only handle needed to reload the logged model
    # (`runs:/<run_id>/best_model`) and otherwise has to be looked up by hand.
    run_id = run.info.run_id

    # Exhaustive search over `param_grid`, 5-fold cross-validation, F1 scoring.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='f1',
        cv=5,
        verbose=1,
        n_jobs=-1
    )

    # Fit every candidate on the training split only.
    grid_search.fit(X_train, y_train)

    # Record the winning configuration and its mean cross-validated F1.
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    mlflow.log_params(best_params)
    mlflow.log_metric("best_cv_f1_score", best_score)

    # Score the refitted best estimator on the held-out split. predict()
    # returns hard class labels; no custom probability cutoff is applied here.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    test_f1_score = f1_score(y_test, y_pred)

    mlflow.log_metric("test_f1_score", test_f1_score)

    # Persist the best model as a run artifact under the name "best_model".
    mlflow.xgboost.log_model(best_model, "best_model")

    print(f"Best CV F1 Score: {best_score}")
    print(f"Test F1 Score: {test_f1_score}")
    print(f"Best Parameters: {best_params}")
    # Surface the identifiers needed to reload this exact model later.
    print(f"MLflow run ID: {run_id}")
    print(f"Logged model URI: runs:/{run_id}/best_model")

Fitting 5 folds for each of 32 candidates, totalling 160 fits




Best CV F1 Score: 0.2679153847536591
Test F1 Score: 0.26590361025816533
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}


In [None]:
# Resolve which MLflow run to load the model from. A run ID copied by hand only
# exists in one local `mlruns` store and goes stale after every retraining, so
# prefer the most recent run started in this session and fall back to the
# pinned ID when no run has been started in this kernel.
# NOTE(review): this assumes the most recent run in the session is the grid
# search run that logged "best_model" - confirm if other runs are started in between.
PINNED_RUN_ID = '8cd01ee0e7634f509fe6e1b669b7b426'
last_run = mlflow.last_active_run()
model_run_id = last_run.info.run_id if last_run is not None else PINNED_RUN_ID
logged_model = f'runs:/{model_run_id}/best_model'

# Load the model directly as an XGBoost model
xgboost_model = mlflow.xgboost.load_model(logged_model)

# Predict probabilities on the test set; column index 1 is taken as the
# positive-class probability.
y_proba = xgboost_model.predict_proba(X_test)[:, 1]

# Apply a custom cutoff to turn probabilities into binary predictions.
# NOTE(review): if 0.21 was tuned on this same test split, the scores printed
# below are optimistic - consider choosing the cutoff on a validation split.
threshold = 0.21
y_pred = (y_proba >= threshold).astype('int')

# Evaluation (argument order is y_true, y_pred)
print('F1 Score: {}'.format(f1_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))

F1 Score: 0.4308267960906295
              precision    recall  f1-score   support

         0.0       0.94      0.91      0.93   2294232
         1.0       0.38      0.50      0.43    248167

    accuracy                           0.87   2542399
   macro avg       0.66      0.70      0.68   2542399
weighted avg       0.89      0.87      0.88   2542399



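Why is the threshold set to 0.21? Possibly a business objective - TODO: document how it was chosen.
But why do we consider only the positive-class probabilities? For a binary classifier the two columns returned by `predict_proba` sum to 1 (see the output below), so the positive-class column alone determines the prediction.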

In [12]:
# Show the full probability matrix: one row per test sample and one column per
# class. The thresholding cell above keeps only column index 1.
xgboost_model.predict_proba(X_test)

array([[0.877651  , 0.12234903],
       [0.9868439 , 0.0131561 ],
       [0.9753787 , 0.0246213 ],
       ...,
       [0.97221863, 0.02778138],
       [0.99433416, 0.00566586],
       [0.9684475 , 0.03155249]], dtype=float32)