## Setup

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

import pyarrow as pa
import polars as pl

import os

## Data

These data come from NYC Open Data; specifically, the parquet files can be obtained from [toddwschneider/nyc-taxi-data](https://github.com/toddwschneider/nyc-taxi-data) on GitHub. The URLs for the specific files can be found [here](https://github.com/toddwschneider/nyc-taxi-data/blob/master/setup_files/raw_data_urls.txt).

In [None]:
#os.listdir(os.path.expanduser("~/Scratch/nyc_taxi/"))

In [None]:
# Eagerly read every parquet file matching the 2022 yellow-taxi glob into a
# single in-memory frame.
df_eager = pl.read_parquet("~/Scratch/nyc_taxi/yellow_tripdata_2022-*.parquet")
# Bare expression on the cell's last line so the notebook displays the frame.
df_eager

## Demo

## Lazy data

In [None]:
# Set up a lazy scan over the same parquet glob used for the eager read; the
# query is only executed when it is collected.
df_lazy  = pl.scan_parquet("~/Scratch/nyc_taxi/yellow_tripdata_2022-*.parquet")
# Draw the query plan of the lazy frame.
df_lazy.show_graph()

## Performance

In [None]:
def lazy():
    """Return the mean tip fraction per weekday and pickup hour (lazy path).

    Builds the query on the module-level ``df_lazy``, collects it, and pivots
    the collected frame so that each row is a ``wday`` and each column is an
    ``hour``.

    Only trips with a positive ``fare_amount`` and a positive ``tip_amount``
    are kept. ``tip_perc`` is ``tip_amount / (total_amount - tip_amount)``,
    i.e. a fraction rather than a value scaled to 0-100.

    Returns:
        The pivoted frame of ``mean_tip_perc`` values rounded to 3 decimals.
        Returning it (instead of discarding it) lets callers inspect or plot
        the table; timing the call is unaffected.
    """
    pickup = pl.col("tpep_pickup_datetime")
    tip_perc = (
        pl.col("tip_amount") / (pl.col("total_amount") - pl.col("tip_amount"))
    ).alias("tip_perc")

    return df_lazy.filter(
        (pl.col("fare_amount") > 0) &
        (pl.col("tip_amount") > 0)
    ).select([
        "tpep_pickup_datetime", "tip_amount", "fare_amount", "total_amount",
        tip_perc,
        pickup.dt.hour().alias("hour"),
        pickup.dt.weekday().alias("wday"),
    ]).groupby(
        ["hour", "wday"]
    ).agg([
        pl.mean("tip_perc").alias("mean_tip_perc")
    ]).with_columns([
        pl.col("mean_tip_perc").round(3)
    ]).with_columns([
        # Reorder every column by (wday, hour) so the pivot sees ordered rows.
        pl.all().sort_by(["wday", "hour"])
    ]).collect(
        # Pivot is applied to the collected frame, after the lazy query runs.
    ).pivot(
        values="mean_tip_perc", index="wday", columns="hour"
    )

def eager():
    """Return the mean tip fraction per weekday and pickup hour (eager path).

    Runs the same pipeline as the lazy variant, but step by step on the
    module-level, already-loaded ``df_eager`` frame, then pivots the result so
    that each row is a ``wday`` and each column is an ``hour``.

    Only trips with a positive ``fare_amount`` and a positive ``tip_amount``
    are kept. ``tip_perc`` is ``tip_amount / (total_amount - tip_amount)``,
    i.e. a fraction rather than a value scaled to 0-100.

    Returns:
        The pivoted frame of ``mean_tip_perc`` values rounded to 3 decimals.
        Returning it (instead of discarding it) lets callers inspect or plot
        the table; timing the call is unaffected.
    """
    pickup = pl.col("tpep_pickup_datetime")
    tip_perc = (
        pl.col("tip_amount") / (pl.col("total_amount") - pl.col("tip_amount"))
    ).alias("tip_perc")

    return df_eager.filter(
        (pl.col("fare_amount") > 0) &
        (pl.col("tip_amount") > 0)
    ).select([
        "tpep_pickup_datetime", "tip_amount", "fare_amount", "total_amount",
        tip_perc,
        pickup.dt.hour().alias("hour"),
        pickup.dt.weekday().alias("wday"),
    ]).groupby(
        ["hour", "wday"]
    ).agg([
        pl.mean("tip_perc").alias("mean_tip_perc")
    ]).with_columns([
        pl.col("mean_tip_perc").round(3)
    ]).with_columns([
        # Reorder every column by (wday, hour) so the pivot sees ordered rows.
        pl.all().sort_by(["wday", "hour"])
    ]).pivot(
        values="mean_tip_perc", index="wday", columns="hour"
    )

In [None]:
# Time the lazy pipeline; the parquet scan is executed inside lazy() when the
# query is collected, so it is part of what gets timed here.
%timeit lazy()

In [None]:
# Time the eager pipeline; df_eager was read in an earlier cell, so reading
# the parquet files is NOT part of what gets timed here.
%timeit eager()