In [1]:
%reload_ext autoreload
%autoreload 2

In [51]:
import polars as pl 
from pathlib import Path
from datetime import datetime

from src.paths import *
from src.logger import get_logger
from src.data import read_file


logger = get_logger()

# Load processed file

In [3]:
# def read_file(folder:Path, year:int, month:int) -> pl.DataFrame:
#     """
#     Reads a parquet file for a given year and month from a specified folder.

#     This function constructs the file path using the provided folder, year, and month. It then reads the parquet file
#     located at that path into a Polars DataFrame and returns it.

#     Parameters:
#     - folder (Path): The folder where the parquet file is located.
#     - year (int): The year part of the file to be read.
#     - month (int): The month part of the file to be read.

#     Returns:
#     - pl.DataFrame: The Polars DataFrame containing the data from the parquet file.
#     """
#     return pl.read_parquet(folder / FILE_PATTERN.format(year=year, month=month))

df = read_file(PROCESSED_DATA_DIR, 2020, 1)

In [4]:
df.head()

pickup_datetime,pickup_location_id
datetime[ns],i64
2020-01-01 00:28:15,238
2020-01-01 00:35:39,239
2020-01-01 00:47:41,238
2020-01-01 00:55:23,238
2020-01-01 00:01:58,193


In [20]:
def generate_hourly_datetimes_with_ranges(year: int, month: int) -> pl.DataFrame:
    """
    Generates a Polars DataFrame with a single column containing datetimes for every hour in the specified month
    using the pl.datetime_ranges function.

    Parameters:
    - year (int): The year of the month for which to generate hourly datetimes.
    - month (int): The month for which to generate hourly datetimes.

    Returns:
    - pl.DataFrame: A DataFrame with a single column named 'datetime', containing hourly datetimes for the specified month.
    """
    # Calculate the start datetime of the month
    start_date = datetime(year, month, 1)
    # Handle December separately to avoid month overflow
    if month == 12:
        end_date = datetime(year + 1, 1, 1)
    else:
        end_date = datetime(year, month + 1, 1)
    
    # Create a DataFrame from the datetime range
    df = pl.DataFrame({
        "pickup_datetime_hour": pl.datetime_range(
            start=start_date, 
            end=end_date, 
            interval="1h", 
            eager=True, 
            time_unit="ns",
            closed="left")
    })
    
    return df

hourly_df = generate_hourly_datetimes_with_ranges(2020,1)
hourly_df.head()

pickup_datetime_hour
datetime[ns]
2020-01-01 00:00:00
2020-01-01 01:00:00
2020-01-01 02:00:00
2020-01-01 03:00:00
2020-01-01 04:00:00


In [32]:
def aggregate_pickups_into_hourly_data(df: pl.DataFrame, year: int, month: int) -> pl.DataFrame:
    """
    Aggregates the number of pickups for each location and hour in the provided DataFrame.

    This function groups the DataFrame by the pickup location ID and the pickup hour, and then counts the number of
    pickups for each group. To ensure that the resulting DataFrame contains all hours in the month, the function first
    generates a DataFrame containing hourly datetimes for the specified month and year, and then performs a left join
    with the aggregated pickup data.

    Parameters:
    - year (int): The year of the month for which to aggregate the pickup data.
    - month (int): The month for which to aggregate the pickup data.
    - df (pl.DataFrame): The DataFrame containing the pickup data to be aggregated.

    Returns:
    - pl.DataFrame: The DataFrame containing the aggregated pickup data.
    """
    # Truncate the pickup datetime to the nearest hour and group by the pickup location ID
    hourly_pickups = (
        df
        .group_by([
            pl.col("pickup_datetime").dt.truncate("1h").alias("pickup_datetime_hour"),
            pl.col("pickup_location_id")
        ])
        .agg(
            pl.col("pickup_location_id").count().alias("num_pickups")
        )
    )
    
    hourly_df = generate_hourly_datetimes_with_ranges(year, month)
    
    
    return ( hourly_df
            .join(hourly_pickups, on="pickup_datetime_hour", how="left")
            .with_columns(
                pl.col("num_pickups").fill_null(pl.lit(0))
            )
    )


hourly_df_complete = aggregate_pickups_into_hourly_data(df, 2020, 1)
hourly_df_complete.head()

pickup_datetime_hour,pickup_location_id,num_pickups
datetime[ns],i64,u32
2020-01-01 00:00:00,143,128
2020-01-01 00:00:00,50,183
2020-01-01 00:00:00,142,488
2020-01-01 00:00:00,79,721
2020-01-01 00:00:00,229,320


In [56]:
def get_time_lags(df: pl.DataFrame, n_lags: int) -> pl.DataFrame:
    """
    Generates time-lagged features for the number of pickups.

    This function takes a DataFrame and an integer n_lags to generate n_lags new columns in the DataFrame. Each new column represents the number of pickups n hours ago, where n ranges from 1 to n_lags. The function sorts the DataFrame by 'pickup_location_id' and 'pickup_datetime_hour' before shifting to ensure that the lagged values are correctly aligned with the corresponding times and locations.

    Parameters:
    - df (pl.DataFrame): The DataFrame containing the pickup data.
    - n_lags (int): The number of lagged time periods to generate.

    Returns:
    - pl.DataFrame: The original DataFrame with n_lags new columns added, each representing the number of pickups n hours ago.
    """
    return (
        df
        .with_columns([
            pl.col("num_pickups").sort_by(["pickup_location_id", "pickup_datetime_hour"]).shift(i).over("pickup_location_id").alias(f"num_pickups_{i}h_ago") for i in range(1, n_lags+1)
        ])
        .drop_nulls()
    )
    
# time_lags = get_time_lags(hourly_df_complete, 3).head()

In [57]:
def generate_ts_features_for_file(year: int, month: int, n_lags: int) -> pl.DataFrame:
    """
    Generates time-lagged features for the number of pickups in a given month.

    This function reads the pickup data for the specified year and month, aggregates it into hourly data, and then generates time-lagged features using the get_time_lags function.

    Parameters:
    - year (int): The year of the month for which to generate time-lagged features.
    - month (int): The month for which to generate time-lagged features.
    - n_lags (int): The number of lagged time periods to generate.

    Returns:
    - pl.DataFrame: The DataFrame containing the time-lagged features.
    """
    return (
        read_file(PROCESSED_DATA_DIR, year, month)
        .pipe(aggregate_pickups_into_hourly_data, year, month)
        .pipe(get_time_lags, n_lags)
        .write_parquet(TRANSFORMED_DATA_DIR / FILE_PATTERN.format(year=year, month=month))
    )