In [1]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from pycaret.classification import *
import mlflow
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 500)
import numpy as np
import xgboost

In [2]:
dataset = pd.read_parquet('processed_data.parquet')

In [3]:
dataset.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,pickup_hour,dropoff_hour,pickup_day,dropoff_day,pickup_date,dropoff_date,pickup_drop_of,median_trip_distance,long_trip
0,1,2020-03-01 00:31:13,2020-03-01 01:01:42,1.0,4.7,1.0,N,88,255,1,22.0,3.0,0.5,2.0,0.0,0.3,27.8,2.5,0.0,0,1,1,1,2020-03-01,2020-03-01,88_255,4.8,0
1,2,2020-03-01 01:06:21,2020-03-01 01:33:20,3.0,6.51,1.0,N,88,255,1,23.5,0.5,0.5,0.0,0.0,0.3,27.3,2.5,0.0,1,1,1,1,2020-03-01,2020-03-01,88_255,4.8,0
2,2,2020-03-01 19:07:20,2020-03-01 19:26:55,1.0,5.89,1.0,N,88,255,1,19.5,0.0,0.5,4.56,0.0,0.3,27.36,2.5,0.0,19,19,1,1,2020-03-01,2020-03-01,88_255,4.8,0
3,1,2020-03-02 15:33:53,2020-03-02 16:14:23,2.0,7.3,1.0,N,88,255,1,31.5,2.5,0.5,8.65,0.0,0.3,43.45,2.5,0.0,15,16,2,2,2020-03-02,2020-03-02,88_255,4.8,0
4,2,2020-03-02 19:47:15,2020-03-02 20:07:43,5.0,4.81,1.0,N,88,255,1,18.0,1.0,0.5,4.46,0.0,0.3,26.76,2.5,0.0,19,20,2,2,2020-03-02,2020-03-02,88_255,4.8,0


## Filtered payment_type == 1

In [4]:
dataset = dataset[dataset.payment_type == 1].reset_index(drop=True)

In [5]:
dataset['tip'] = dataset['tip_amount'].map(lambda x: 1 if x>0 else 0)

In [6]:
x_columns = [
    'passenger_count', 'PULocationID', 'DOLocationID',
    'congestion_surcharge', 'pickup_day', 'pickup_hour', 'fare_amount', 'extra', 'mta_tax', 'tolls_amount', 'airport_fee',   'tip']

In [7]:
data = dataset[x_columns]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(data, data['tip'],
                                                    stratify=data['tip'], 
                                                    test_size=0.15, random_state=42)

In [9]:
X_train['tip'] = y_train
X_test['tip'] = y_test

In [10]:
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [11]:
X_train.shape

(14655215, 12)

In [12]:
X_test.shape

(2586215, 12)

In [13]:
X_train['tip'].value_counts() 

1    14190870
0      464345
Name: tip, dtype: int64

In [14]:
X_train.shape

(14655215, 12)

In [15]:
X_train_sample = pd.concat([
    X_train[X_train.tip == 0], 
    X_train[X_train.tip == 1].sample(2000000),
])

In [16]:
X_train_sample.shape

(2464345, 12)

In [17]:
clf = setup(
    X_train_sample,
    target='tip',
    log_experiment = True,
    experiment_name = 'yellow_taxis_tip',
    silent=True)

Unnamed: 0,Description,Value
0,session_id,6830
1,Target,tip
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(2464345, 12)"
5,Missing Values,False
6,Numeric Features,10
7,Categorical Features,1
8,Ordinal Features,False
9,High Cardinality Features,False


In [18]:
add_metric('prauc', 'PR AUC', metrics.average_precision_score, greater_is_better = True)

Name                                                            PR AUC
Display Name                                                    PR AUC
Score Function       <function average_precision_score at 0x7fe6dca...
Scorer                            make_scorer(average_precision_score)
Target                                                            pred
Args                                                                {}
Greater is Better                                                 True
Multiclass                                                        True
Custom                                                            True
Name: prauc, dtype: object

In [19]:
best_models = compare_models(['lightgbm'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,PR AUC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.8268,0.7054,0.9819,0.8341,0.902,0.2003,0.2679,0.8337,6.531


In [20]:
lightgbm_model = create_model('lightgbm',fold=10)
tuned_lightgbm = tune_model(lightgbm_model, optimize = 'prauc',fold=10)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,PR AUC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.8285,0.7105,0.9776,0.838,0.9024,0.2273,0.2865,0.8374
1,0.8304,0.7188,0.9778,0.8396,0.9034,0.239,0.2986,0.8389
2,0.8293,0.7142,0.9781,0.8384,0.9029,0.2306,0.2908,0.8378
3,0.8302,0.7131,0.9782,0.8391,0.9034,0.2363,0.2968,0.8385
4,0.8306,0.7176,0.979,0.8391,0.9036,0.2364,0.2983,0.8385
5,0.8293,0.7128,0.9776,0.8387,0.9029,0.2326,0.2919,0.8381
6,0.829,0.7115,0.9776,0.8384,0.9027,0.2304,0.2897,0.8378
7,0.8289,0.7146,0.9777,0.8383,0.9026,0.2295,0.289,0.8377
8,0.8288,0.7145,0.9776,0.8383,0.9026,0.2292,0.2885,0.8377
9,0.83,0.7206,0.9782,0.839,0.9033,0.2353,0.2957,0.8384


In [21]:
preds_lightgbm = predict_model(tuned_lightgbm, data = X_test, raw_score=True)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,PR AUC
0,Light Gradient Boosting Machine,0.9529,0.7158,0.9778,0.9737,0.9757,0.1816,0.182,0.9736


In [22]:
final_model = finalize_model(tuned_lightgbm)
save_model(final_model, 'predict_tip')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=False, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='tip',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strateg...
                                 colsample_bytree=1.0, feature_fraction=0.7,
                                 importance_type='split', learning_rate=0.3,
                                 max_depth=-1, min_child_samples=56,
                                 min_child_weight=0

In [51]:
xgboost_model = create_model('xgboost',fold=10)
tuned_xgboost = tune_model(xgboost_model, optimize = 'prauc',fold=10)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,PR AUC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.8152,0.6926,0.9982,0.8154,0.8976,0.0417,0.1253,0.8154
1,0.8158,0.6949,0.9982,0.8159,0.8979,0.047,0.1355,0.8159
2,0.8146,0.6975,0.9986,0.8147,0.8973,0.0346,0.115,0.8147
3,0.8156,0.6956,0.9982,0.8157,0.8978,0.0447,0.1315,0.8157
4,0.8159,0.6966,0.998,0.8161,0.8979,0.0484,0.1367,0.8161
5,0.8161,0.6923,0.9982,0.8161,0.898,0.0491,0.1392,0.8161
6,0.8154,0.6929,0.9986,0.8154,0.8977,0.0416,0.1288,0.8154
7,0.8152,0.6959,0.9981,0.8155,0.8976,0.0423,0.126,0.8155
8,0.8149,0.6986,0.9986,0.815,0.8975,0.0372,0.1199,0.8149
9,0.8161,0.6994,0.998,0.8162,0.898,0.0499,0.139,0.8162


In [52]:
preds_xgboost = predict_model(tuned_xgboost, data = X_test, raw_score=True)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,PR AUC
0,Extreme Gradient Boosting,0.9677,0.6985,0.9986,0.969,0.9836,0.0427,0.0885,0.969




In [57]:
tuned_xgboost

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.3, max_delta_step=0, max_depth=10,
              min_child_weight=4, missing=nan, monotone_constraints='()',
              n_estimators=220, n_jobs=-1, num_parallel_tree=1,
              objective='binary:logistic', random_state=6830, reg_alpha=0.01,
              reg_lambda=0.4, scale_pos_weight=37.4, subsample=0.2,
              tree_method='auto', use_label_encoder=True, validate_parameters=1,
              verbosity=0)

In [None]:
final_model_xgboost = finalize_model(tuned_xgboost)
save_model(final_model_xgboost, 'predict_tip_xgboost')

In [None]:
saved_model = load_model('predict_tip_xgboost')

In [60]:
X_test_sample = X_test.sample(100000).reset_index(drop=True)

In [61]:
X_test_sample_pipe = saved_model[:-1].transform(X_test_sample)

In [62]:
explainer = shap.TreeExplainer(saved_model.named_steps["trained_model"])
shap_values = explainer.shap_values(X_test_sample_pipe)

In [None]:
idx = 0
shap.force_plot(explainer.expected_value, shap_values[idx,:], X_test_sample_pipe.iloc[idx,:])