In [30]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, Ridge
from sklearn.linear_model import LassoCV, RidgeCV
from utils import *
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import root_mean_squared_error

In [50]:
df_train = pd.read_parquet("data/nyc-taxis-tips/train_preprocessed_no_hot.parquet")
X = df_train.drop("tip_amount", axis=1)
Y = df_train['tip_amount']

columns_to_norm = [
    'passenger_count',
    'trip_distance', 'fare_amount', 'extra', 'mta_tax', 'tolls_amount',
    'improvement_surcharge', 'congestion_surcharge', 'Airport_fee','duration'
]
categorical_cols = [
    "VendorID", "RatecodeID", "store_and_fwd_flag", "payment_type", "PU_borough", "DO_borough"
]
cols_to_exclude = [
    'PU_location_lat', 'PU_location_lon', 'DO_location_lat',
    'DO_location_lon', 'PU_loc', 'DO_loc',
]
X = X.drop(cols_to_exclude, axis=1)
X[categorical_cols] = X[categorical_cols].astype('category')
display(X.columns)

scaler = StandardScaler()

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=42)
X_train[columns_to_norm] = scaler.fit_transform(X_train[columns_to_norm])
X_val[columns_to_norm] = scaler.transform(X_val[columns_to_norm])
X_test[columns_to_norm] = scaler.transform(X_test[columns_to_norm])

print(X.shape, Y.shape)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
print(X_val.shape, y_val.shape)

Index(['VendorID', 'passenger_count', 'trip_distance', 'RatecodeID',
       'store_and_fwd_flag', 'payment_type', 'fare_amount', 'extra', 'mta_tax',
       'tolls_amount', 'improvement_surcharge', 'congestion_surcharge',
       'Airport_fee', 'is_rush_hour', 'duration', 'PU_borough', 'DO_borough'],
      dtype='object')

(100000, 17) (100000,)
(72250, 17) (72250,)
(15000, 17) (15000,)
(12750, 17) (12750,)


In [39]:
print("Starting training...")
# train
gbm = lgb.LGBMRegressor(num_leaves=31, learning_rate=0.05, n_estimators=20)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric="l1", callbacks=[lgb.early_stopping(5)])

print("Starting predicting...")
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
print(f"The RMSE of prediction is: {rmse_test}")

# feature importances
print(f"Feature importances: {list(gbm.feature_importances_)}")


# self-defined eval metric
# f(y_true: array, y_pred: array) -> name: str, eval_result: float, is_higher_better: bool
# Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    return "RMSLE", np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False


print("Starting training with custom eval function...")
# train
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric=rmsle, callbacks=[lgb.early_stopping(5)])


# another self-defined eval metric
# f(y_true: array, y_pred: array) -> name: str, eval_result: float, is_higher_better: bool
# Relative Absolute Error (RAE)
def rae(y_true, y_pred):
    return "RAE", np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(np.mean(y_true) - y_true)), False


print("Starting training with multiple custom eval functions...")
# train
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric=[rmsle, rae], callbacks=[lgb.early_stopping(5)])

print("Starting predicting...")
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
rmsle_test = rmsle(y_test, y_pred)[1]
rae_test = rae(y_test, y_pred)[1]
print(f"The RMSLE of prediction is: {rmsle_test}")
print(f"The RAE of prediction is: {rae_test}")

# other scikit-learn modules
estimator = lgb.LGBMRegressor(num_leaves=31)

param_grid = {"n_estimators": [35, 40, 50, 70, 100], "num_leaves": [5, 10, 15, 20, 31, 45]}

gbm = GridSearchCV(estimator, param_grid, cv=3)
gbm.fit(X_train, y_train)

print(f"Best parameters found by grid search are: {gbm.best_params_}")

Starting training...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002757 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 870
[LightGBM] [Info] Number of data points in the train set: 72250, number of used features: 16
[LightGBM] [Info] Start training from score 3.424159
Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[20]	valid_0's l1: 1.38789	valid_0's l2: 5.45149
Starting predicting...
The RMSE of prediction is: 2.334842475988417
Feature importances: [9, 6, 37, 40, 0, 40, 272, 52, 4, 64, 0, 11, 4, 0, 52, 0, 9]
Starting training with custom eval function...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003974 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=t

In [40]:
gbm = lgb.LGBMRegressor(num_leaves=10, learning_rate=0.1, n_estimators=50)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric="l1", callbacks=[lgb.early_stopping(5)])

# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
rmsle_test = rmsle(y_test, y_pred)[1]
rae_test = rae(y_test, y_pred)[1]
print(f"The RMSLE of prediction is: {rmsle_test}")
print(f"The RAE of prediction is: {rae_test}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005008 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 870
[LightGBM] [Info] Number of data points in the train set: 72250, number of used features: 16
[LightGBM] [Info] Start training from score 3.424159
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[40]	valid_0's l1: 1.01776	valid_0's l2: 4.16857
The RMSLE of prediction is: 0.39422214536374506
The RAE of prediction is: 0.4235607546255953


In [41]:
# Calcul du Mean Squared Error (MSE) et du R2 Score
mse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Affichage des résultats
print("Mean Squared Error (MSE):", mse)
print("Coefficient de détermination (R2 Score):", r2)

Mean Squared Error (MSE): 2.041706596879121
Coefficient de détermination (R2 Score): 0.7061151554054528


In [51]:
# test data for Kaggle submission
test = pd.read_parquet("data/nyc-taxis-tips/test_preprocessed_no_hot.parquet")

# test.drop(columns=["RatecodeID_6.0"], inplace=True)
test[categorical_cols] = test[categorical_cols].astype('category')

X_sub = test.drop(cols_to_exclude, axis=1)
X_sub[columns_to_norm] = scaler.transform(X_sub[columns_to_norm])

#predictions = model_ridge_CV.predict(X_sub)
predictions = gbm.predict(X_sub)

df_pred = pd.DataFrame(predictions, columns=["tip_amount"]).reset_index().rename(columns={"index": "row_ID"})
display(df_pred.head(15))
print(sum(df_pred["tip_amount"]<0))
df_pred.loc[df_pred.tip_amount < 0, "tip_amount"] = 0
#df_pred.to_parquet("submission/nyc-taxis-tips/regression_sub_1bis.parquet", index=False)
df_pred.to_parquet("submission/nyc-taxis-tips/boosting_sub_1.parquet", index=False)

Unnamed: 0,row_ID,tip_amount
0,0,2.07627
1,1,2.685867
2,2,0.038594
3,3,2.07627
4,4,0.075244
5,5,6.892487
6,6,4.211918
7,7,0.090765
8,8,3.112959
9,9,9.692842


13982


In [26]:
test.columns

Index(['passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax',
       'tolls_amount', 'improvement_surcharge', 'congestion_surcharge',
       'Airport_fee', 'PU_location_lat', 'PU_location_lon', 'DO_location_lat',
       'DO_location_lon', 'is_rush_hour', 'duration', 'PU_loc', 'DO_loc',
       'VendorID_1', 'VendorID_2', 'RatecodeID_1.0', 'RatecodeID_2.0',
       'RatecodeID_3.0', 'RatecodeID_4.0', 'RatecodeID_5.0', 'RatecodeID_99.0',
       'store_and_fwd_flag_N', 'store_and_fwd_flag_Y', 'payment_type_1',
       'payment_type_2', 'payment_type_3', 'payment_type_4', 'PU_borough_7',
       'PU_borough_1', 'PU_borough_3', 'PU_borough_4', 'PU_borough_2',
       'PU_borough_6', 'PU_borough_5', 'DO_borough_7', 'DO_borough_1',
       'DO_borough_3', 'DO_borough_4', 'DO_borough_2', 'DO_borough_6',
       'DO_borough_5'],
      dtype='object')

In [46]:
test["RatecodeID"].unique()

array([ 1.,  2.,  3., 99.,  5.,  4.,  6.])

In [47]:
X["RatecodeID"].unique()

[1.0, 2.0, 3.0, 5.0, 99.0, 4.0]
Categories (6, float64): [1.0, 2.0, 3.0, 4.0, 5.0, 99.0]