In [1]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from pycaret.classification import *
import mlflow
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 500)
import numpy as np
import xgboost

In [2]:
# Load the processed dataset from the parquet file next to the notebook.
PROCESSED_DATA_PATH = 'processed_data.parquet'
dataset = pd.read_parquet(PROCESSED_DATA_PATH)

In [3]:
# Preview the first rows to check which columns were loaded.
dataset.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,pickup_hour,dropoff_hour,pickup_day,dropoff_day,pickup_date,dropoff_date,pickup_drop_of,median_trip_distance,long_trip
0,1,2020-03-01 00:31:13,2020-03-01 01:01:42,1.0,4.7,1.0,N,88,255,1,22.0,3.0,0.5,2.0,0.0,0.3,27.8,2.5,0.0,0,1,1,1,2020-03-01,2020-03-01,88_255,4.8,0
1,2,2020-03-01 01:06:21,2020-03-01 01:33:20,3.0,6.51,1.0,N,88,255,1,23.5,0.5,0.5,0.0,0.0,0.3,27.3,2.5,0.0,1,1,1,1,2020-03-01,2020-03-01,88_255,4.8,0
2,2,2020-03-01 19:07:20,2020-03-01 19:26:55,1.0,5.89,1.0,N,88,255,1,19.5,0.0,0.5,4.56,0.0,0.3,27.36,2.5,0.0,19,19,1,1,2020-03-01,2020-03-01,88_255,4.8,0
3,1,2020-03-02 15:33:53,2020-03-02 16:14:23,2.0,7.3,1.0,N,88,255,1,31.5,2.5,0.5,8.65,0.0,0.3,43.45,2.5,0.0,15,16,2,2,2020-03-02,2020-03-02,88_255,4.8,0
4,2,2020-03-02 19:47:15,2020-03-02 20:07:43,5.0,4.81,1.0,N,88,255,1,18.0,1.0,0.5,4.46,0.0,0.3,26.76,2.5,0.0,19,20,2,2,2020-03-02,2020-03-02,88_255,4.8,0


In [4]:
# Binary target: 1 when a tip was recorded (tip_amount > 0), otherwise 0.
# A vectorised comparison replaces the per-row Python lambda; a missing
# tip_amount compares False and therefore still maps to 0, as before.
dataset['tip'] = (dataset['tip_amount'] > 0).astype('int64')

In [5]:
# Model inputs, in the order they are selected from the dataset.
feature_columns = [
    'VendorID',
    'RatecodeID',
    'passenger_count',
    'PULocationID',
    'DOLocationID',
    'congestion_surcharge',
    'pickup_day',
    'pickup_hour',
    'fare_amount',
    'extra',
    'mta_tax',
    'tolls_amount',
    'airport_fee',
]

# The target column travels with the features in the same frame.
x_columns = feature_columns + ['tip']

In [6]:
# Keep only the modelling columns (features plus the 'tip' target).
data = dataset.loc[:, x_columns]

In [7]:
# Stratified 85/15 split on the 'tip' label with a fixed seed.
# `data` still contains the 'tip' column, so X_train / X_test carry the target too.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    data['tip'],
    test_size=0.15,
    stratify=data['tip'],
    random_state=42,
)

In [8]:
# X_train / X_test already contain 'tip' (it is part of x_columns), so writing the
# column again is redundant: on the frames returned by train_test_split it can raise
# SettingWithCopyWarning, and if this cell is re-run after the index reset below the
# assignment would align on mismatched indices. Verify consistency by position instead.
assert np.array_equal(X_train['tip'].to_numpy(), y_train.to_numpy()), \
    "X_train['tip'] is out of sync with y_train"
assert np.array_equal(X_test['tip'].to_numpy(), y_test.to_numpy()), \
    "X_test['tip'] is out of sync with y_test"

In [9]:
# Replace the shuffled original row labels with a clean 0..n-1 index on both frames.
X_train, X_test = (
    frame.reset_index(drop=True) for frame in (X_train, X_test)
)

In [10]:
# (rows, columns) of the training frame; the column count includes the 'tip' target.
X_train.shape

(20629038, 14)

In [11]:
# (rows, columns) of the held-out test frame.
X_test.shape

(3640419, 14)

In [12]:
# Class balance of the target in the training frame (count of 1s vs 0s).
X_train['tip'].value_counts() 

1    14604290
0     6024748
Name: tip, dtype: int64

In [13]:
# Work on a 1M-row subsample of the training frame to keep the experiment tractable.
# The draw is seeded so that a restart-and-run-all yields the same rows.
X_train_sample = X_train.sample(n=1_000_000, random_state=42)

In [None]:
# Initialise the PyCaret classification experiment on the sampled training frame.
# session_id pins PyCaret's internal seed; without it the experiment's random
# choices differ from run to run and logged results cannot be reproduced.
clf = setup(
    X_train_sample,
    target='tip',
    session_id=42,
    log_experiment = True,                 # log runs to MLflow
    experiment_name = 'yellow_taxis_tip',
    silent=True)                           # skip the interactive dtype confirmation prompt

In [None]:
# Register average precision (area under the precision-recall curve) as a custom metric.
# average_precision_score must be fed predicted scores, not hard 0/1 labels; the
# default target ('pred') would pass labels, so request probabilities explicitly.
add_metric(
    'prauc',
    'PR AUC',
    metrics.average_precision_score,
    target='pred_proba',
    greater_is_better = True,
)

In [None]:
# Cross-validated baseline restricted to LightGBM only.
best_models = compare_models(include=['lightgbm'])

In [None]:
# Fit LightGBM with 10-fold cross-validation ...
lightgbm_model = create_model('lightgbm', fold=10)

# ... then search hyper-parameters, ranking candidates by the custom 'prauc' metric.
tuned_lightgbm = tune_model(
    lightgbm_model,
    optimize = 'prauc',
    fold=10,
)

In [None]:
# Score the held-out test frame with the tuned model, requesting raw scores.
preds_lightgbm = predict_model(
    tuned_lightgbm,
    data = X_test,
    raw_score=True,
)

In [None]:
# Finalize the tuned estimator and persist it under the name 'predict_tip'
# (the same name is passed to load_model further down).
final_model = finalize_model(tuned_lightgbm)
save_model(final_model, 'predict_tip')

In [14]:
import shap

In [15]:
# Reload the artefact written by save_model(final_model, 'predict_tip').
saved_model = load_model('predict_tip')

Transformation Pipeline and Model Successfully Loaded


In [16]:
# Apply every pipeline step except the last one to the test frame, so the result
# is the transformed feature matrix (presumably the last step is the trained model).
preprocessing_steps = saved_model[:-1]
test_pipe = preprocessing_steps.transform(X_test)

In [17]:
# Inspect the transformed test features produced by the preprocessing steps.
test_pipe

Unnamed: 0,RatecodeID,PULocationID,DOLocationID,congestion_surcharge,pickup_day,pickup_hour,fare_amount,extra,mta_tax,tolls_amount,VendorID_2,VendorID_5,VendorID_6,passenger_count_0.0,passenger_count_1.0,passenger_count_2.0,passenger_count_3.0,passenger_count_4.0,passenger_count_5.0,passenger_count_6.0,passenger_count_9.0,passenger_count_not_available,airport_fee_1.0
0,1.0,107.0,162.0,2.5,16.0,16.0,8.0,0.0,0.5,0.00,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,246.0,170.0,2.5,21.0,18.0,18.0,1.0,0.5,0.00,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,211.0,211.0,2.5,1.0,2.0,4.0,0.5,0.5,0.00,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,68.0,71.0,2.5,11.0,19.0,40.0,0.0,0.5,6.12,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,164.0,211.0,2.5,9.0,12.0,9.5,2.5,0.5,0.00,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3640414,1.0,239.0,239.0,2.5,21.0,11.0,6.5,0.0,0.5,0.00,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3640415,1.0,186.0,143.0,2.5,15.0,17.0,11.0,1.0,0.5,0.00,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3640416,1.0,239.0,238.0,2.5,8.0,19.0,7.5,1.0,0.5,0.00,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3640417,1.0,239.0,143.0,2.5,12.0,9.0,7.0,0.0,0.5,0.00,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Pull the fitted estimator out of the pipeline by step name and wrap it in a SHAP tree explainer.
trained_estimator = saved_model.named_steps["trained_model"]
explainer = shap.TreeExplainer(trained_estimator)

In [None]:
# SHAP values for every row of the transformed test frame.
# NOTE(review): test_pipe holds the full test split, so this is likely slow and
# memory-hungry — consider explaining a subset; confirm before running.
shap_values = explainer.shap_values(test_pipe)

In [None]:
# Load the JS bundle so the interactive force plot can render in the notebook.
shap.initjs()

idx = 0  # positional row of test_pipe to explain

# Depending on the shap release, the explanation of a binary classifier is either a
# list [class 0, class 1], a 3-D array (rows, features, classes) or a single 2-D
# array. Indexing with shap_values[idx, :] only works for the last form, so pick
# the positive-class explanation explicitly.
if isinstance(shap_values, list):
    positive_shap = shap_values[-1]
elif np.ndim(shap_values) == 3:
    positive_shap = shap_values[:, :, -1]
else:
    positive_shap = shap_values

# expected_value is a scalar or holds one entry per class; take the matching (last) one.
base_value = np.ravel(explainer.expected_value)[-1]

shap.force_plot(base_value, positive_shap[idx, :], test_pipe.iloc[idx, :])