In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import polars as pl 
from pathlib import Path
from datetime import datetime
import plotly.express as px

from src.paths import *
from src.logger import get_logger
from src.dwh import run_database_operation


# Notebook-wide logger, created through the project's get_logger helper under the name "dwh".
logger = get_logger("dwh")

# Load from DWH

In [3]:
# Run the "fetch_pickup_data" DWH operation for the window 2022-01-01 .. 2023-02-01.
# NOTE(review): whether from_date/to_date are inclusive is decided inside
# run_database_operation — confirm there.
# NOTE(review): pickup_locations=[] presumably means "no location filter"
# (the preview of df shows several location ids) — confirm against the helper.
df = run_database_operation(
    operation="fetch_pickup_data",
    from_date=datetime(2022, 1, 1),
    to_date=datetime(2023, 2, 1),
    pickup_locations=[]
)

# Plots

In [4]:
# Preview the first rows together with the frame's (rows, columns) shape.
df.head(), df.shape

(shape: (5, 4)
 ┌─────────────────────────────────┬──────────────────────┬────────────────────┬────────────┐
 │ key                             ┆ pickup_datetime_hour ┆ pickup_location_id ┆ num_pickup │
 │ ---                             ┆ ---                  ┆ ---                ┆ ---        │
 │ str                             ┆ datetime[μs]         ┆ i16                ┆ i16        │
 ╞═════════════════════════════════╪══════════════════════╪════════════════════╪════════════╡
 │ 2022-01-01 00:00:00.000000000-1 ┆ 2022-01-01 00:00:00  ┆ 1                  ┆ 0          │
 │ 2022-01-01 00:00:00.000000000-2 ┆ 2022-01-01 00:00:00  ┆ 2                  ┆ 0          │
 │ 2022-01-01 00:00:00.000000000-3 ┆ 2022-01-01 00:00:00  ┆ 3                  ┆ 0          │
 │ 2022-01-01 00:00:00.000000000-4 ┆ 2022-01-01 00:00:00  ┆ 4                  ┆ 11         │
 │ 2022-01-01 00:00:00.000000000-5 ┆ 2022-01-01 00:00:00  ┆ 5                  ┆ 0          │
 └─────────────────────────────────┴─────────

In [22]:
import plotly.graph_objects as go 


def plot_ts(
    ts_data: pl.DataFrame,
    series: list[str] | None = None,
    locations: list[int] | None = None,
    plot_from: datetime | None = None,
):
    """
    Plot one or more columns of the pickup frame as time-series lines.

    Parameters:
    - ts_data (pl.DataFrame): Frame with at least the columns
      'pickup_datetime_hour', 'pickup_location_id' and every name in `series`.
    - series (list[str] | None): Columns to draw. Defaults to ["num_pickup"].
    - locations (list[int] | None): Location ids to keep. None or an empty
      list means "do not filter by location".
    - plot_from (datetime | None): When given, only rows with
      'pickup_datetime_hour' strictly greater than this value are drawn.

    Returns:
    - None. The figure is rendered with `fig.show()`.
    """
    # A list literal as default argument is shared between calls; build it here instead.
    if series is None:
        series = ["num_pickup"]

    ts_data_to_plot = ts_data
    if locations:
        ts_data_to_plot = ts_data_to_plot.filter(pl.col("pickup_location_id").is_in(locations))
    if plot_from is not None:
        ts_data_to_plot = ts_data_to_plot.filter(pl.col("pickup_datetime_hour").gt(plot_from))

    # Order rows so that every line is drawn left to right in time.
    ts_data_to_plot = (
        ts_data_to_plot
        .sort(["pickup_location_id", "pickup_datetime_hour"])
        .to_pandas()
    )

    fig = go.Figure()

    # Drawing several locations as one trace produces a line that jumps between
    # locations at every timestamp, so each location gets its own trace.
    several_locations = ts_data_to_plot["pickup_location_id"].nunique() > 1
    for location_id, location_rows in ts_data_to_plot.groupby("pickup_location_id", sort=False):
        for serie in series:
            # Keep the plain column name as legend entry when only one location is shown.
            trace_name = f"{serie} (location {location_id})" if several_locations else serie
            fig.add_trace(go.Scatter(
                x=location_rows["pickup_datetime_hour"],
                y=location_rows[serie],
                mode="lines",
                name=trace_name,
            ))

    fig.update_layout(xaxis_title="pickup_datetime_hour", yaxis_title="pickups")
    fig.show()
    

In [24]:
# Plot num_pickup (the default series) for location 43, keeping only hours after 2023-01-01 00:00.
plot_ts(df, locations=[43], plot_from=datetime(2023,1,1))

# Feature engineering

From this point onwards we need the whole dataset, across all batches, to generate correct time features. If we build the features month by month, the rows at the beginning of each batch will always be lost, because they have no earlier rows to take their lags from. This should not be the case.

In [5]:
def get_time_lags(df: pl.DataFrame, n_lags: list[int]) -> pl.DataFrame:
    """
    Add lagged copies of 'num_pickup' for every pickup location.

    Each entry of `n_lags` produces one new column named
    'num_pickup_{lag}h_ago' that holds the value of 'num_pickup' found `lag`
    rows earlier within the same 'pickup_location_id'. For example:
    - 1 means 1 hour ago
    - 24 -> 24 hours ago
    - 7*24 -> same hour 7 days ago.

    NOTE(review): the shift counts rows, not hours. The "hours ago" reading
    assumes exactly one row per location per hour with no gaps — confirm
    against the DWH query.

    The frame is sorted by 'pickup_datetime_hour' and 'pickup_location_id'
    before shifting, so within each location the rows are in time order and
    every lag lands on the row it belongs to.

    Parameters:
    - df (pl.DataFrame): Frame with the columns 'pickup_datetime_hour',
      'pickup_location_id' and 'num_pickup'.
    - n_lags (list[int]): The lags to generate, one new column per entry.

    Returns:
    - pl.DataFrame: The input columns plus one column per lag, ordered by hour
      and then location. Rows containing a null in any column are dropped,
      which removes the first rows of each location that have no lag history.
    """
    return (
        df
        # Sort the frame itself. Sorting inside the window expression
        # (`sort_by(...).shift(i).over(...)`) reorders only the values; `over`
        # then writes them back in the frame's original row order, which
        # misaligns the lags whenever the input is not already time-sorted.
        .sort(["pickup_datetime_hour", "pickup_location_id"])
        .with_columns([
            pl.col("num_pickup")
            .shift(lag)
            .over("pickup_location_id")
            .alias(f"num_pickup_{lag}h_ago")
            for lag in n_lags
        ])
        .drop_nulls()
    )

# Example: get_time_lags(df, [1, 24, 7*24]).head()

In [6]:
# Lags in hours: previous hour, previous day, one week back and two weeks back.
df_with_lags = get_time_lags(df, [1, 24, 7*24, 2*7*24])

df_with_lags.head()

key,pickup_datetime_hour,pickup_location_id,num_pickup,num_pickup_1h_ago,num_pickup_24h_ago,num_pickup_168h_ago,num_pickup_336h_ago
str,datetime[μs],i16,i16,i16,i16,i16,i16
"""2022-01-15 00:…",2022-01-15 00:00:00,1,0,0,0,0,0
"""2022-01-15 00:…",2022-01-15 00:00:00,2,0,0,0,0,0
"""2022-01-15 00:…",2022-01-15 00:00:00,3,0,0,0,0,0
"""2022-01-15 00:…",2022-01-15 00:00:00,4,29,16,7,16,11
"""2022-01-15 00:…",2022-01-15 00:00:00,5,0,0,0,0,0


In [27]:
# Compare the target with each of its lagged versions for location 43, from 2023-01-01 onwards.
lagged_series = [
    "num_pickup",
    "num_pickup_1h_ago",
    "num_pickup_24h_ago",
    "num_pickup_168h_ago",
    "num_pickup_336h_ago",
]
plot_ts(
    df_with_lags,
    series=lagged_series,
    locations=[43],
    plot_from=datetime(2023, 1, 1),
)

## Baseline model

Average of lags features:
- 1hr ago
- 24hr
- 7 days ago
- 14 days ago

Forecast horizon:
- Next 24 hours

In [29]:
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.utils.validation import check_is_fitted, check_X_y
from sklearn.pipeline import Pipeline

class LagTransformer(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that turns the pickup frame into lag features.

    For every value in `lags` it emits a column 'num_pickup_{lag}h_ago' holding
    the value of 'num_pickup' found `lag` rows earlier within the same
    'pickup_location_id'.

    NOTE(review): the shift counts rows, so the "hours ago" naming assumes one
    row per location per hour with no gaps — confirm against the data source.

    NOTE(review): the output contains only the lag columns and drops the rows
    that lack lag history, so it has fewer rows than the input and no key to
    join predictions back to it.
    """

    def __init__(self, lags:list[int]):
        # Stored untouched so BaseEstimator.get_params / clone can round-trip it.
        self.lags = lags

    def fit(self, X:pl.DataFrame, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, X: pl.DataFrame):
        """
        Build the lag columns.

        Parameters:
        - X (pl.DataFrame): Frame with the columns 'pickup_datetime_hour',
          'pickup_location_id' and 'num_pickup'.

        Returns:
        - pl.DataFrame: One column per lag, ordered by hour and then location,
          with rows containing a null lag removed.
        """
        return (
            X
            # Sort the frame itself. Sorting inside the window expression
            # (`sort_by(...).shift(i).over(...)`) reorders only the values;
            # `over` then writes them back in the original row order, which
            # misaligns the lags when X is not already time-sorted.
            .sort(["pickup_datetime_hour", "pickup_location_id"])
            .select([
                pl.col("num_pickup")
                .shift(lag)
                .over("pickup_location_id")
                .alias(f"num_pickup_{lag}h_ago")
                for lag in self.lags
            ])
            .drop_nulls()
        )

    def get_feature_names(self) -> list[str]:
        """Names of the columns produced by `transform`, in output order."""
        return [f"num_pickup_{lag}h_ago" for lag in self.lags]
        


class MeanLagPredictor(BaseEstimator, RegressorMixin):
    """
    Baseline regressor: the prediction for a row is the sum of all its columns
    divided by the number of columns.
    """

    def fit(self, X:pl.DataFrame, y=None):
        """Nothing is learned; returns self."""
        return self

    def predict(self, X:pl.DataFrame) -> pl.DataFrame:
        """
        Average every row of X across all of its columns.

        Returns a single-column pl.DataFrame. The column is not renamed here;
        in this notebook it shows up under the first input column's name.
        """
        _, n_columns = X.shape
        row_total = pl.sum_horizontal(pl.all())
        return X.select(row_total / n_columns)


In [30]:
# Lags in hours: previous hour, previous day, one week back and two weeks back.
LAGS = [1, 24, 7*24, 2*7*24]

pipeline = Pipeline(
    steps=[
        ("lag_transformer", LagTransformer(lags=LAGS)),
        ("mean_predictor", MeanLagPredictor()),
    ]
)

# Pipeline.fit returns the pipeline itself, so fit and predict can be chained.
pipeline.fit(df).predict(df)

num_pickup_1h_ago
f64
0.0
0.0
0.0
12.5
0.0
0.0
2.5
0.0
0.0
0.75
