## LightGBM with Feature Engineering

In [67]:
# load the data
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

# read the pre-built tabular dataset from the transformed-data directory
tabular_data_path = TRANSFORMED_DATA_DIR / 'tabular_data.parquet'
df = pd.read_parquet(tabular_data_path)

In [68]:
# preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,...,2.0,0.0,1.0,0.0,0.0,0.0,0.0,2022-01-29,1,0.0
1,0.0,0.0,0.0,0.0,0.0,4.0,1.0,2.0,1.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2022-01-30,1,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,1.0,2.0,0.0,0.0,0.0,0.0,2022-01-31,1,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,2.0,1.0,0.0,1.0,1.0,0.0,0.0,2022-02-01,1,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-02-02,1,0.0


The first 3 pickup locations have almost no rides, so I want to remove them from the data. Note that the next cell only previews `df`; no rows are actually removed there.

In [69]:
# NOTE(review): this only previews df again; no locations are filtered out here
df.head(n=5)

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,...,2.0,0.0,1.0,0.0,0.0,0.0,0.0,2022-01-29,1,0.0
1,0.0,0.0,0.0,0.0,0.0,4.0,1.0,2.0,1.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2022-01-30,1,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,1.0,2.0,0.0,0.0,0.0,0.0,2022-01-31,1,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,2.0,1.0,0.0,1.0,1.0,0.0,0.0,2022-02-01,1,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-02-02,1,0.0


In [70]:
# train and test sets
from src.data_split import train_test_split
from datetime import datetime

# split df into training and testing sets around a fixed cutoff date
# (presumably rows before the cutoff become the training set; confirm in src.data_split)
split_cutoff = datetime(2022, 6, 1, 0, 0, 0)
X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=split_cutoff,
    target_column_name='target_rides_next_hour',
)

# report the dimensions of each split
print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')


X_train.shape=(32595, 674)
y_train.shape=(32595,)
X_test.shape=(56710, 674)
y_test.shape=(56710,)


We are going to use feature engineering and scikit-learn pipelines for our data. The first feature engineering step is to add a feature/column with the average number of rides over the previous four weeks (the rides from 7, 14, 21, and 28 days ago). This average was also used as a baseline model. Because we want this step to run inside the pipeline, we wrap the function in a **FunctionTransformer**.

In [71]:
def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of X with one extra column, 'average_rides_last_4_weeks',
    holding the average of the rides from
    - 7 days ago
    - 14 days ago
    - 21 days ago
    - 28 days ago

    The input frame is left untouched, so calling this function (directly or
    through a pipeline step) never changes the caller's data as a side effect.

    Parameters
    ----------
    X : pd.DataFrame
        Must contain the columns 'rides_previous_168_hour',
        'rides_previous_336_hour', 'rides_previous_504_hour' and
        'rides_previous_672_hour'.

    Returns
    -------
    pd.DataFrame
        A new frame with the same rows and columns as X plus the average column.

    Raises
    ------
    KeyError
        If any of the four lag columns is missing from X.
    """
    hours_per_week = 7 * 24

    # the four lag columns are exactly 1, 2, 3 and 4 weeks back
    four_week_average = 0.25 * (
        X[f'rides_previous_{1 * hours_per_week}_hour']
        + X[f'rides_previous_{2 * hours_per_week}_hour']
        + X[f'rides_previous_{3 * hours_per_week}_hour']
        + X[f'rides_previous_{4 * hours_per_week}_hour']
    )

    # assign() builds a new DataFrame instead of writing into X
    return X.assign(average_rides_last_4_weeks=four_week_average)

In [72]:
from sklearn.preprocessing import FunctionTransformer

# wrap the four-week-average function so it can be used as a pipeline step;
# the function is named average_rides_last_4_weeks (not ..._four_weeks)
add_feature_previous_4_average = FunctionTransformer(average_rides_last_4_weeks, validate=False)

Now we will use the **Custom Transformer** capability from sklearn to create a column for the hour of the day and the day of the week.

*Note: the column for the hour of the day will always be hour 0 because we sliced every 24 hours. We include it in our analysis in case we want to change the slicing in the future to predict every hour.*

In [77]:
from sklearn.base import BaseEstimator, TransformerMixin

class TemporalFeatures(BaseEstimator, TransformerMixin):
    """
    Transformer that derives 'hour' and 'day' (day of week) columns from the
    'pickup_hour' datetime column and then removes 'pickup_hour'.
    """

    def fit(self, X, y=None):
        """Nothing to learn: the transformer has no parameters."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with 'hour' and 'day' added and 'pickup_hour' dropped."""
        pickup_times = X['pickup_hour']

        # assign() works on a copy, so the caller's frame is not modified
        with_temporal = X.assign(
            hour=pickup_times.dt.hour,
            day=pickup_times.dt.dayofweek,
        )

        # using lightGBM so the datetime column has to be dropped
        return with_temporal.drop(columns=['pickup_hour'])

In [78]:
# implement previous two steps, feeding the output of the first into the second
# so the preview does not depend on X_train having been modified in place
X_train_with_average = add_feature_previous_4_average.fit_transform(X_train)
add_temporal_features = TemporalFeatures()
add_temporal_features.fit_transform(X_train_with_average)

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_location_id,average_rides_last_4_weeks,hour,day
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1,0.00,0,5
1,0.0,0.0,0.0,0.0,0.0,4.0,1.0,2.0,1.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1,0.00,0,6
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,1.0,2.0,0.0,0.0,0.0,0.0,1,0.00,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,1.0,0.0,0.0,1,0.00,0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0.00,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32590,10.0,7.0,4.0,5.0,10.0,4.0,9.0,15.0,15.0,13.0,...,12.0,8.0,8.0,4.0,5.0,7.0,265,7.00,0,4
32591,6.0,5.0,5.0,0.0,6.0,4.0,1.0,3.0,3.0,10.0,...,20.0,9.0,5.0,6.0,7.0,2.0,265,7.25,0,5
32592,8.0,6.0,2.0,5.0,4.0,3.0,2.0,3.0,15.0,14.0,...,6.0,5.0,3.0,3.0,11.0,5.0,265,8.75,0,6
32593,4.0,2.0,5.0,3.0,1.0,7.0,15.0,18.0,17.0,14.0,...,8.0,13.0,11.0,5.0,4.0,4.0,265,6.00,0,0


In [79]:
# put all steps together with a pipeline
from sklearn.pipeline import Pipeline
import lightgbm as lgb

# chain the two feature-engineering steps and the LightGBM model into one estimator
pipeline_steps = [
    ('prev_4', add_feature_previous_4_average),
    ('temporal_features', TemporalFeatures()),
    ('lgb', lgb.LGBMRegressor()),
]
pipeline = Pipeline(steps=pipeline_steps)

# fit the whole pipeline on the training data
pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.111565 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 154527
[LightGBM] [Info] Number of data points in the train set: 32595, number of used features: 675
[LightGBM] [Info] Start training from score 11.571069


In [80]:
from sklearn.metrics import mean_absolute_error
# predict on the test set and score the predictions with mean absolute error
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)

# print the results
print(f'{test_mae=:.4f}')

test_mae=2.5947
