In [1]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from pycaret.classification import *
import mlflow
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 500)
import numpy as np
import xgboost

In [2]:
dataset = pd.read_parquet('processed_data.parquet')

In [3]:
dataset.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,pickup_hour,dropoff_hour,pickup_day,dropoff_day,pickup_date,dropoff_date,pickup_drop_of,median_trip_distance,long_trip
0,1,2020-03-01 00:31:13,2020-03-01 01:01:42,1.0,4.7,1.0,N,88,255,1,22.0,3.0,0.5,2.0,0.0,0.3,27.8,2.5,0.0,0,1,1,1,2020-03-01,2020-03-01,88_255,4.8,0
1,2,2020-03-01 01:06:21,2020-03-01 01:33:20,3.0,6.51,1.0,N,88,255,1,23.5,0.5,0.5,0.0,0.0,0.3,27.3,2.5,0.0,1,1,1,1,2020-03-01,2020-03-01,88_255,4.8,0
2,2,2020-03-01 19:07:20,2020-03-01 19:26:55,1.0,5.89,1.0,N,88,255,1,19.5,0.0,0.5,4.56,0.0,0.3,27.36,2.5,0.0,19,19,1,1,2020-03-01,2020-03-01,88_255,4.8,0
3,1,2020-03-02 15:33:53,2020-03-02 16:14:23,2.0,7.3,1.0,N,88,255,1,31.5,2.5,0.5,8.65,0.0,0.3,43.45,2.5,0.0,15,16,2,2,2020-03-02,2020-03-02,88_255,4.8,0
4,2,2020-03-02 19:47:15,2020-03-02 20:07:43,5.0,4.81,1.0,N,88,255,1,18.0,1.0,0.5,4.46,0.0,0.3,26.76,2.5,0.0,19,20,2,2,2020-03-02,2020-03-02,88_255,4.8,0


In [4]:
x_columns = [
    'VendorID', 'RatecodeID', 'passenger_count', 'PULocationID', 'DOLocationID',
    'congestion_surcharge', 'pickup_day', 'pickup_hour', 'long_trip']

In [5]:
data = dataset[x_columns]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(data, data['long_trip'],
                                                    stratify=data['long_trip'], 
                                                    test_size=0.15, random_state=42)

In [7]:
X_train['long_trip'] = y_train
X_test['long_trip'] = y_test

In [8]:
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [9]:
X_train.long_trip.value_counts()

0    20617970
1       11068
Name: long_trip, dtype: int64

In [10]:
X_train_sample = pd.concat([
    X_train[X_train.long_trip == 1], 
    X_train[X_train.long_trip == 0].sample(1000000),
])

In [11]:
X_train_sample.long_trip.value_counts()

0    1000000
1      11068
Name: long_trip, dtype: int64

In [12]:
clf = setup(
    X_train_sample,
    target='long_trip',
    log_experiment = True,
    experiment_name = 'yellow_taxis_long_short',
    silent=True)

Unnamed: 0,Description,Value
0,session_id,8660
1,Target,long_trip
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(1011068, 9)"
5,Missing Values,True
6,Numeric Features,6
7,Categorical Features,2
8,Ordinal Features,False
9,High Cardinality Features,False


In [13]:
add_metric('prauc', 'PR AUC', metrics.average_precision_score, greater_is_better = True)

Name                                                            PR AUC
Display Name                                                    PR AUC
Score Function       <function average_precision_score at 0x7fc58c1...
Scorer                            make_scorer(average_precision_score)
Target                                                            pred
Args                                                                {}
Greater is Better                                                 True
Multiclass                                                        True
Custom                                                            True
Name: prauc, dtype: object

In [14]:
best_models = compare_models(['lightgbm'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,PR AUC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.995,0.9848,0.7218,0.8016,0.7595,0.7569,0.7581,0.5816,4.197


In [15]:
lightgbm_model = create_model('lightgbm',fold=10)
tuned_lightgbm = tune_model(lightgbm_model, optimize = 'prauc',fold=10)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,PR AUC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.9956,0.9935,0.7179,0.8615,0.7832,0.781,0.7843,0.6216
1,0.9954,0.9933,0.7372,0.8297,0.7807,0.7784,0.7798,0.6146
2,0.9955,0.9954,0.7359,0.8404,0.7847,0.7825,0.7842,0.6214
3,0.9956,0.9945,0.7218,0.8609,0.7852,0.783,0.7861,0.6244
4,0.9955,0.9951,0.7298,0.8432,0.7824,0.7802,0.7823,0.6184
5,0.9956,0.9917,0.7414,0.8416,0.7883,0.7861,0.7877,0.6268
6,0.9959,0.9925,0.7503,0.8618,0.8022,0.8001,0.8021,0.6494
7,0.9955,0.9961,0.7282,0.844,0.7818,0.7796,0.7817,0.6176
8,0.9956,0.9953,0.741,0.8426,0.7885,0.7863,0.788,0.6272
9,0.9955,0.9933,0.7449,0.8324,0.7862,0.7839,0.7852,0.6228


In [16]:
preds_lightgbm = predict_model(tuned_lightgbm, data = X_test, raw_score=True)



Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,PR AUC
0,Light Gradient Boosting Machine,0.9984,0.9945,0.7179,0.2105,0.3255,0.325,0.3882,0.1512
