In [2]:
from datetime import datetime

import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

from src.data_splitting import train_test_split
from src.feature_engineering import (
    TemporalFeatureEngineeringStarts,
    TemporalFeatureEngineeringStops,
    average_trips_last_4_weeks,
)
from src.paths import TRANSFORMED_DATA

## Train-test Splits

### Starts

In [3]:
# Read the start table (parquet) stored under TRANSFORMED_DATA into a DataFrame.
starts = pd.read_parquet(TRANSFORMED_DATA / "start_table.parquet")

In [4]:
# Split the start table into train/test features and targets around 1 June 2023,
# using "trips_next_hour" as the target column.
# NOTE(review): which side of the cutoff becomes "train" is decided inside
# src.data_splitting.train_test_split. In the recorded run the test split has more
# rows than the train split -- confirm the cutoff date is the intended one.
start_x_train, start_y_train, start_x_test, start_y_test = train_test_split(
    data=starts,
    scenario="start",
    cutoff_date=datetime(2023, 6, 1),  # midnight: the time fields default to 0
    target_column="trips_next_hour",
)

# Report the dimensions of every split as a quick sanity check.
print(f"{start_x_train.shape=}")
print(f"{start_y_train.shape=}")
print(f"{start_x_test.shape=}")
print(f"{start_y_test.shape=}")

start_x_train.shape=(225090, 674)
start_y_train.shape=(225090,)
start_x_test.shape=(389790, 674)
start_y_test.shape=(389790,)


### Stops

In [3]:
# Read the stop table (parquet) stored under TRANSFORMED_DATA into a DataFrame.
stops = pd.read_parquet(TRANSFORMED_DATA / "stop_table.parquet")

In [4]:
# Split the stop table into train/test features and targets around 1 June 2023,
# using "trips_next_hour" as the target column.
# NOTE(review): which side of the cutoff becomes "train" is decided inside
# src.data_splitting.train_test_split. In the recorded run the test split has more
# rows than the train split -- confirm the cutoff date is the intended one.
stop_x_train, stop_y_train, stop_x_test, stop_y_test = train_test_split(
    data=stops,
    scenario="stop",
    cutoff_date=datetime(2023, 6, 1),  # midnight: the time fields default to 0
    target_column="trips_next_hour",
)

# Report the dimensions of every split as a quick sanity check.
print(f"{stop_x_train.shape=}")
print(f"{stop_y_train.shape=}")
print(f"{stop_x_test.shape=}")
print(f"{stop_y_test.shape=}")

stop_x_train.shape=(152520, 674)
stop_y_train.shape=(152520,)
stop_x_test.shape=(265360, 674)
stop_y_test.shape=(265360,)


## Feature Engineering, Model Training, and Evaluation

### Starts

In [6]:
# Feature-engineering steps used by the model pipelines below.
# FunctionTransformer is imported in the notebook's top import cell.

# Wrap the project function average_trips_last_4_weeks as a pipeline step.
# validate=False passes the input to the function without scikit-learn's
# input validation.
add_feature_average_last_4_weeks = FunctionTransformer(
    func=average_trips_last_4_weeks,
    validate=False,
)

# Scenario-specific temporal feature transformers from src.feature_engineering.
add_temporal_features_to_starts = TemporalFeatureEngineeringStarts()
add_temporal_features_to_stops = TemporalFeatureEngineeringStops()

In [8]:
# Starts model: the two feature-engineering steps followed by a LightGBM regressor
# with default hyperparameters.
# lightgbm (lgb) and make_pipeline are imported in the notebook's top import cell.
start_pipeline = make_pipeline(
    add_feature_average_last_4_weeks,
    add_temporal_features_to_starts,
    lgb.LGBMRegressor(),
)

# Fit every step in order on the starts training split.
start_pipeline.fit(start_x_train, start_y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.553762 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11926
[LightGBM] [Info] Number of data points in the train set: 225090, number of used features: 675
[LightGBM] [Info] Start training from score 0.068897


In [11]:
# Score the fitted starts pipeline on the test split.
# mean_absolute_error is imported in the notebook's top import cell.
start_predictions = start_pipeline.predict(start_x_test)

start_test_mae = mean_absolute_error(y_true=start_y_test, y_pred=start_predictions)
start_test_mae  # last expression of the cell, so the notebook displays the score

0.1505828405528988

### Stops

In [10]:
# Stops model: same layout as the starts pipeline, with the stops-specific temporal
# feature step, followed by a LightGBM regressor with default hyperparameters.
# NOTE(review): add_feature_average_last_4_weeks is the same object that is a step of
# start_pipeline, so fitting here refits that shared instance. This is presumably
# harmless if the wrapped function keeps no state -- confirm.
stop_pipeline = make_pipeline(
    add_feature_average_last_4_weeks,
    add_temporal_features_to_stops,
    lgb.LGBMRegressor(),
)

# Fit every step in order on the stops training split.
stop_pipeline.fit(X=stop_x_train, y=stop_y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.362057 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14003
[LightGBM] [Info] Number of data points in the train set: 152520, number of used features: 675
[LightGBM] [Info] Start training from score 0.110792


In [12]:
# Score the fitted stops pipeline on the test split.
# mean_absolute_error is already in scope from an earlier import, so the duplicate
# import that used to sit in this cell has been dropped.
stop_predictions = stop_pipeline.predict(stop_x_test)

stop_test_mae = mean_absolute_error(y_true=stop_y_test, y_pred=stop_predictions)
stop_test_mae  # last expression of the cell, so the notebook displays the score

0.2279303243757683