In [None]:
from shapelets.data import sandbox

# One-off preparation step: export the January 2009 taxi parquet files to a
# single CSV ('sample.csv') so the pandas sections below can read it.
parquet_sources = ["../Benchmarks/nyc-taxi/2009/01/*.parquet"]
taxi_relation = sandbox().from_parquet("taxis", parquet_sources)
taxi_relation.to_csv('sample.csv')

### Pandas

In [1]:
import pandas as pd 

In [2]:
# Load the CSV export with the default parser, convert the pickup column to
# datetimes and make it the index so the grouping keys can be derived from it.
df = pd.read_csv('sample.csv')
df = df.assign(pickup_at=pd.to_datetime(df['pickup_at'])).set_index('pickup_at')

# Mean passenger count per (pickup calendar date, pickup hour).
pickup_index = df.index
df.groupby([pickup_index.date, pickup_index.hour])['passenger_count'].mean()

### Pandas with PyArrow engine

In [None]:
import pandas as pd 

In [None]:
# Same pipeline as the plain pandas section, but the CSV is parsed with the
# PyArrow engine instead of the default one.
df = pd.read_csv('sample.csv', engine='pyarrow')

# Ensure the pickup column holds datetimes, then index the frame by it.
pickup_times = pd.to_datetime(df['pickup_at'])
df['pickup_at'] = pickup_times
df = df.set_index('pickup_at')

# Mean passenger count per (pickup calendar date, pickup hour).
by_date_and_hour = df.groupby([df.index.date, df.index.hour])
by_date_and_hour['passenger_count'].mean()

### Polars

In [None]:
import polars as pl

In [None]:
# Build a lazy scan over the parquet files; nothing is read until .collect().
data = pl.scan_parquet('../Benchmarks/nyc-taxi/2009/01/*.parquet')

# Grouping keys: calendar date and hour of day of the pickup timestamp.
pickup_date = pl.col("pickup_at").cast(pl.Date).alias("pickup_at_date")
pickup_hour = pl.col("pickup_at").dt.hour().alias("pickup_at_hour")

# Mean passenger count per (date, hour) group.
mean_passengers = pl.col("passenger_count").mean()

data.group_by([pickup_date, pickup_hour]).agg(mean_passengers).collect()

### Shapelets

In [None]:
from shapelets.data import sandbox

In [None]:
playground = sandbox()

# Load the January 2009 parquet files under the name "taxis", which is the
# name the SQL statement below selects from.
taxis = playground.from_parquet("taxis", ["../Benchmarks/nyc-taxi/2009/01/*.parquet"])

# Mean passenger count per (pickup calendar date, pickup hour), i.e. the same
# aggregation the pandas and Polars sections compute. The grouping uses
# pickup_at (not dropoff_at) and the full date (not the day-of-month), and the
# keys are returned so rows can be compared across engines.
# NOTE(review): CAST(... AS DATE) is standard SQL; confirm the sandbox's SQL
# dialect accepts it for this timestamp column.
result = playground.from_sql("""
    SELECT
        CAST(pickup_at AS DATE)          AS pickup_at_date,
        extract('hour' from pickup_at)   AS pickup_at_hour,
        AVG(passenger_count)             AS avg_passenger_count
    FROM taxis
        GROUP BY CAST(pickup_at AS DATE), extract('hour' from pickup_at)
""").execute()

result.to_pandas()