## LightGBM with Feature Engineering

In [1]:
# load the data
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

# read the tabular dataset (parquet file) stored in the transformed-data directory
tabular_data_path = TRANSFORMED_DATA_DIR / 'tabular_data.parquet'
df = pd.read_parquet(tabular_data_path)

In [2]:
# preview the first rows: hourly lag features (rides_previous_<N>_hour),
# pickup_hour, pickup_location_id and the target_rides_next_hour column
df.head()

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,...,2.0,0.0,1.0,0.0,0.0,0.0,0.0,2022-01-29,1,0.0
1,0.0,0.0,0.0,0.0,0.0,4.0,1.0,2.0,1.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2022-01-30,1,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,1.0,2.0,0.0,0.0,0.0,0.0,2022-01-31,1,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,2.0,1.0,0.0,1.0,1.0,0.0,0.0,2022-02-01,1,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-02-02,1,0.0


In [4]:
# train and test sets
from src.data_split import train_test_split
from datetime import datetime

# split the tabular data at the cutoff date into train/test features and targets
cutoff_date = datetime(2022, 6, 1, 0, 0, 0)
X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=cutoff_date,
    target_column_name='target_rides_next_hour',
)

# report the dimensions of every split as a quick sanity check
print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')


X_train.shape=(32595, 674)
y_train.shape=(32595,)
X_test.shape=(56710, 674)
y_test.shape=(56710,)


We are going to use feature engineering and scikit-learn pipelines for our data. The first feature engineering step is to add a feature (column) holding the average number of rides observed at the same hour 1, 2, 3 and 4 weeks earlier. The same four-week average was also used as a baseline model. Because we want this step to be part of the pipeline, we wrap the function in a **FunctionTransformer**.

In [6]:
# add average rides last 4 weeks
def average_rides_last_four_weeks(X: pd.DataFrame) -> pd.DataFrame:
    '''Return a copy of ``X`` with an ``average_last_four_weeks`` column added.

    The new column is the mean of the ride counts observed exactly 1, 2, 3
    and 4 weeks (168, 336, 504 and 672 hours) before the prediction hour.

    Parameters
    ----------
    X : pd.DataFrame
        Frame containing the lag columns ``rides_previous_<N>_hour`` for
        N in (168, 336, 504, 672). The un-suffixed spelling
        ``rides_previous_<N>`` is accepted as a fallback.

    Returns
    -------
    pd.DataFrame
        A new frame with every column of ``X`` plus
        ``average_last_four_weeks``. ``X`` itself is left untouched, so the
        function can be applied repeatedly (e.g. inside a pipeline) without
        side effects on the caller's data.

    Raises
    ------
    KeyError
        If one of the four weekly lag columns is missing from ``X``.
    '''
    weekly_lags = [weeks * 7 * 24 for weeks in (1, 2, 3, 4)]

    lag_columns = []
    for lag in weekly_lags:
        # The tabular data names its lags ``rides_previous_<N>_hour``; the
        # un-suffixed form is only kept for backward compatibility.
        hourly_name = f'rides_previous_{lag}_hour'
        legacy_name = f'rides_previous_{lag}'
        if hourly_name in X.columns:
            lag_columns.append(hourly_name)
        elif legacy_name in X.columns:
            lag_columns.append(legacy_name)
        else:
            raise KeyError(
                f"missing lag column '{hourly_name}' needed for the four-week average"
            )

    one_week, two_weeks, three_weeks, four_weeks = (X[name] for name in lag_columns)
    average = (one_week + two_weeks + three_weeks + four_weeks) / 4

    # ``assign`` builds a new DataFrame instead of writing into the input.
    return X.assign(average_last_four_weeks=average)

In [7]:
from sklearn.preprocessing import FunctionTransformer

# wrap the feature function so it can be used as a step in a scikit-learn pipeline;
# validate=False skips input validation before the function is called
add_feature_previous_4_average = FunctionTransformer(
    func=average_rides_last_four_weeks,
    validate=False,
)