In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import polars as pl 
from pathlib import Path
from datetime import datetime


from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error

from src.plots import plot_ts
from src.paths import *
from src.config import TODAY_IS, TS_INDEX
from src.dwh import run_database_operation
from src.train import split_into_train_and_test
from src.features import aggregate_to_daily, LagTransformer, get_time_lags

# Load from DWH

In [3]:
# Fetch the pickup records for a single pickup location from the DWH and pass
# them straight through aggregate_to_daily before splitting.
df = run_database_operation(
    operation="fetch_pickup_data",
    from_date=datetime(2022, 1, 1),
    to_date=TODAY_IS,
    pickup_locations=[43],
).pipe(aggregate_to_daily)

# Split the aggregated frame into the train and test sets used below.
train, test = split_into_train_and_test(df)

In [4]:
# Check the size of each split and preview the first training rows.
train.shape, test.shape, train.head()

((426, 3),
 (90, 3),
 shape: (5, 3)
 ┌────────────────────┬──────────────────────┬────────────┐
 │ pickup_location_id ┆ pickup_datetime_hour ┆ num_pickup │
 │ ---                ┆ ---                  ┆ ---        │
 │ i16                ┆ datetime[μs]         ┆ i64        │
 ╞════════════════════╪══════════════════════╪════════════╡
 │ 43                 ┆ 2022-01-01 00:00:00  ┆ 864        │
 │ 43                 ┆ 2022-01-02 00:00:00  ┆ 991        │
 │ 43                 ┆ 2022-01-03 00:00:00  ┆ 1246       │
 │ 43                 ┆ 2022-01-04 00:00:00  ┆ 1334       │
 │ 43                 ┆ 2022-01-05 00:00:00  ┆ 1306       │
 └────────────────────┴──────────────────────┴────────────┘)

## Baseline model

- Features: the number of pickups 1, 7, 14 and 28 days before the target day
- Prediction: the average of those lag features
- Forecast horizon: the next day

In [5]:




class MeanLagPredictor(BaseEstimator, RegressorMixin):
    """Baseline regressor that predicts the row-wise average of its features.

    Every column of ``X`` other than the time-series index column is treated
    as a feature. The estimator learns nothing from the data.

    Args:
        index_ts (str): Name of the time-series index column in ``X``.
    """

    def __init__(self, index_ts: str = TS_INDEX):
        # scikit-learn's get_params()/set_params()/clone() look up every
        # __init__ argument as an attribute with the same name, so the value
        # has to be stored, unmodified, under ``index_ts``.
        self.index_ts = index_ts

    @property
    def ts_index(self) -> str:
        """Alias of ``index_ts``, kept for code that still uses ``ts_index``."""
        return self.index_ts

    @ts_index.setter
    def ts_index(self, value: str) -> None:
        self.index_ts = value

    def fit(self, X: pl.DataFrame, y=None):
        """Do nothing and return the estimator itself.

        Args:
            X (pl.DataFrame): Ignored.
            y: Ignored.

        Returns:
            MeanLagPredictor: ``self``.
        """
        return self

    def predict(self, X: pl.DataFrame) -> pl.DataFrame:
        """Average all non-index columns of ``X`` row by row.

        Args:
            X (pl.DataFrame): Frame holding the index column named by
                ``index_ts`` plus one column per feature.

        Returns:
            pl.DataFrame: Two columns: the index column and ``"prediction"``,
            the horizontal sum of the feature columns divided by the number
            of feature columns.

        Raises:
            ValueError: If ``X`` has no column besides the index column.
        """
        feature_columns = [name for name in X.columns if name != self.index_ts]
        if not feature_columns:
            raise ValueError(
                f"X must contain at least one feature column besides '{self.index_ts}'."
            )

        # NOTE(review): the divisor is always the number of feature columns,
        # whether or not a row contains nulls; presumably the upstream lag
        # step removes such rows - confirm.
        return X.select(
            pl.col(self.index_ts),
            (
                pl.sum_horizontal(pl.col(feature_columns)) / len(feature_columns)
            ).alias("prediction"),
        )


In [6]:

# Lag offsets, in days, used as the baseline model's features.
LAGS = [1,7,14,28]

def get_feature_names(lags: list[int], ts_index:str = TS_INDEX) -> list[str]:
    """List the lag feature column names, followed by the index column name.

    Args:
        lags (list[int]): Lag offsets; each one yields ``num_pickup_<lag>d_ago``.
        ts_index (str): Name of the index column appended at the end.

    Returns:
        list[str]: One name per lag, in the given order, then ``ts_index``.
    """
    return [*(f"num_pickup_{lag}d_ago" for lag in lags), ts_index]

# Manual version of the baseline: build the lag columns, keep only those plus
# the index, and average the lag columns row by row.
(
    get_time_lags(train, LAGS)
    .select(get_feature_names(LAGS))
    .select(
        pl.col(TS_INDEX),
        (
            pl.sum_horizontal(pl.exclude([TS_INDEX])) / len(LAGS)
        ).alias("prediction"),
    )
)



pickup_datetime_hour,prediction
datetime[μs],f64
2022-01-29 00:00:00,1303.5
2022-01-30 00:00:00,950.0
2022-01-31 00:00:00,1226.0
2022-02-01 00:00:00,1415.25
2022-02-02 00:00:00,1455.5
2022-02-03 00:00:00,1508.25
2022-02-04 00:00:00,1546.5
2022-02-05 00:00:00,1250.75
2022-02-06 00:00:00,1318.5
2022-02-07 00:00:00,1329.0


In [16]:
# Baseline pipeline: build the lag features, then average them.
pipeline = Pipeline(
    steps=[
        ("lag_transformer", LagTransformer(LAGS)),
        ("mean_predictor", MeanLagPredictor()),
    ]
)

# Pipeline.fit returns the pipeline itself, so fitting and predicting on the
# training set can be chained.
predictions = pipeline.fit(train).predict(train)

# NOTE(review): the lags for `test` are presumably computed from `test` alone,
# so its earliest rows may end up without a prediction - confirm against
# LagTransformer.
test_predictions = pipeline.predict(test)


In [18]:
# Attach the predictions to the observed values; the inner join keeps only the
# index values present on both sides.
train_with_predicitions = train.join(
    predictions,
    on=TS_INDEX,
    how="inner",
)
test_with_predictions = test.join(
    test_predictions,
    on=TS_INDEX,
    how="inner",
)

# Plot observed vs. predicted pickups for each split.
plot_ts(train_with_predicitions, ["num_pickup", "prediction"])
plot_ts(test_with_predictions, ["num_pickup", "prediction"])

# Mean absolute error of the baseline on each split.
train_mae = mean_absolute_error(
    y_true=train_with_predicitions["num_pickup"],
    y_pred=train_with_predicitions["prediction"],
)
test_mae = mean_absolute_error(
    y_true=test_with_predictions["num_pickup"],
    y_pred=test_with_predictions["prediction"],
)

print(f"Train MAE: {train_mae:.2f}, Test MAE: {test_mae:.2f}")




Train MAE: 187.63, Test MAE: 153.41
