# One Month Scenario
## Pandas


In [None]:
import pandas as pd 

# Load every parquet file found under the 2009/01 directory into one frame.
df = pd.read_parquet('../nyc-taxi/2009/01')

# Ensure the pickup timestamps are datetimes, then index the frame by them so
# the date and hour components can be read straight off the index.
df['pickup_at'] = pd.to_datetime(df['pickup_at'])
df = df.set_index(keys='pickup_at')

# Mean passenger count for each (calendar date, hour of day) bucket.
# The final expression is what the cell displays.
pickup_times = df.index
by_date_and_hour = df.groupby([pickup_times.date, pickup_times.hour])
by_date_and_hour['passenger_count'].mean()


## Polars

In [None]:
import polars as pl
# Eagerly read every parquet file matching the 2009/01 glob into one DataFrame.
data = pl.read_parquet('../nyc-taxi/2009/01/*.parquet')

# Grouping keys derived from the pickup timestamp: the date rendered with the
# "%D" format string, and the hour of day.
pickup_ts = pl.col("pickup_at")
group_keys = [
    pickup_ts.dt.strftime("%D").alias("pickup_at_date"),
    pickup_ts.dt.hour().alias("pickup_at_hour"),
]

# Mean passenger count per (date, hour) group; displayed as the cell output.
# NOTE(review): newer Polars releases rename `groupby` to `group_by` --
# confirm against the Polars version this notebook is pinned to.
data.groupby(group_keys).agg(pl.mean("passenger_count"))


## Shapelets

In [None]:
import shapelets as sh
# NOTE(review): `minute` is imported here but never used in this cell.
from shapelets.data.functions import avg, getDate, hour, minute

# Create the sandbox object through which the data is loaded and queried below.
playground = sh.data.sandbox()

# Register the parquet files matching the 2009/01 glob (the one-month data set).
taxis = playground.from_parquet("../nyc-taxi/2009/01/*.parquet")

# For each row, emit (date of pickup, hour of pickup, avg of passenger_count).
# NOTE(review): presumably Shapelets interprets this generator expression as a
# query and aggregates avg() per distinct (date, hour) pair, mirroring the
# Pandas and Polars cells -- confirm against the Shapelets documentation.
# Keep the generator's shape as-is; the library may depend on its exact form.
result = playground.map(
    (getDate(row.pickup_at), hour(row.pickup_at), avg(row.passenger_count)) 
    for row in taxis
)

# Convert the result through to_pandas(); as the last expression of the cell
# it is also what gets displayed.
result.to_pandas()


# One Year Scenario

## Pandas (Large memory consumption)

In [None]:
import pandas as pd 

# Load every parquet file found under the 2009 directory into one frame.
# The whole data set is held in memory at once (see the section heading).
df = pd.read_parquet('../nyc-taxi/2009')

# Ensure the pickup timestamps are datetimes, then index the frame by them so
# the date and hour components can be read straight off the index.
df['pickup_at'] = pd.to_datetime(df['pickup_at'])
df = df.set_index(keys='pickup_at')

# Mean passenger count for each (calendar date, hour of day) bucket.
# The final expression is what the cell displays.
pickup_times = df.index
by_date_and_hour = df.groupby([pickup_times.date, pickup_times.hour])
by_date_and_hour['passenger_count'].mean()


## Polars (Large memory consumption)

In [None]:
import polars as pl
# Eagerly read every parquet file matching the recursive 2009 glob into one
# DataFrame; the whole data set is held in memory (see the section heading).
data = pl.read_parquet('../nyc-taxi/2009/**/*.parquet')

# Grouping keys derived from the pickup timestamp: the date rendered with the
# "%D" format string, and the hour of day.
pickup_ts = pl.col("pickup_at")
group_keys = [
    pickup_ts.dt.strftime("%D").alias("pickup_at_date"),
    pickup_ts.dt.hour().alias("pickup_at_hour"),
]

# Mean passenger count per (date, hour) group; displayed as the cell output.
# NOTE(review): newer Polars releases rename `groupby` to `group_by` --
# confirm against the Polars version this notebook is pinned to.
data.groupby(group_keys).agg(pl.mean("passenger_count"))

## Shapelets

In [None]:
import shapelets as sh
# NOTE(review): this import binds `sum` at notebook scope, shadowing Python's
# builtin `sum` for every cell run afterwards; neither `sum` nor `minute` is
# used in this cell.
from shapelets.data.functions import sum, getDate, hour, minute, avg

# Create the sandbox object through which the data is loaded and queried below.
playground = sh.data.sandbox()

# Register the parquet files matching the recursive 2009 glob (the one-year
# data set).
taxis = playground.from_parquet("../nyc-taxi/2009/**/*.parquet")

# For each row, emit (date of pickup, hour of pickup, avg of passenger_count).
# NOTE(review): presumably Shapelets interprets this generator expression as a
# query and aggregates avg() per distinct (date, hour) pair, mirroring the
# Pandas and Polars cells -- confirm against the Shapelets documentation.
# Keep the generator's shape as-is; the library may depend on its exact form.
result = playground.map(
    (getDate(row.pickup_at), hour(row.pickup_at), avg(row.passenger_count)) 
    for row in taxis
)

# Convert the result through to_pandas(); as the last expression of the cell
# it is also what gets displayed.
result.to_pandas()