In [None]:
from shapelets.data import sandbox

# Convert parquet to CSV
sandbox().from_parquet("taxis", ["../Benchmarks/nyc-taxi/2009/01/*.parquet"]).to_csv('sample.csv')

### Pandas

In [1]:
import pandas as pd 

In [2]:
df = pd.read_csv('sample.csv')

df['pickup_at'] = pd.to_datetime(df['pickup_at'])
df = df.set_index('pickup_at')

df.groupby([df.index.date,df.index.hour])['passenger_count'].mean()

            pickup_at
2009-01-01  0            1.714221
            1            1.723661
            2            1.679692
            3            1.644623
            4            1.566899
                           ...   
2009-01-31  19           1.834419
            20           1.841117
            21           1.874803
            22           1.901640
            23           1.937456
Name: passenger_count, Length: 744, dtype: float64

### Pandas with PyArrow engine

In [1]:
import pandas as pd 

In [2]:
df = pd.read_csv('sample.csv', engine='pyarrow')

df['pickup_at'] = pd.to_datetime(df['pickup_at'])
df = df.set_index('pickup_at')

df.groupby([df.index.date,df.index.hour])['passenger_count'].mean()

            pickup_at
2009-01-01  0            1.714221
            1            1.723661
            2            1.679692
            3            1.644623
            4            1.566899
                           ...   
2009-01-31  19           1.834419
            20           1.841117
            21           1.874803
            22           1.901640
            23           1.937456
Name: passenger_count, Length: 744, dtype: float64

### Polars

In [8]:
import polars as pl

In [11]:
data = pl.scan_csv('sample.csv', try_parse_dates=True)

data.group_by(
        [
            pl.col("pickup_at").cast(pl.Date).alias("pickup_at_date"),
            pl.col("pickup_at").dt.hour().alias("pickup_at_hour"),
        ]
    ).agg(pl.mean("passenger_count")).collect()

pickup_at_date,pickup_at_hour,passenger_count
date,i8,f64
2009-01-08,20,1.691188
2009-01-26,14,1.599633
2009-01-18,20,1.838339
2009-01-14,18,1.632807
2009-01-01,9,1.420493
…,…,…
2009-01-03,5,1.719294
2009-01-09,20,1.756514
2009-01-14,0,1.666934
2009-01-30,13,1.618202


### Shapelets

In [1]:
from shapelets.data import sandbox

In [6]:
playground = sandbox()

playground.from_csv("taxis", ["sample.csv"])

result = playground.from_sql("""
    SELECT
        AVG(passenger_count)                                                    
    FROM taxis
        GROUP BY extract('day' from dropoff_at), extract('hour' from dropoff_at)                                                                           
""").execute()

result.to_pandas()

Unnamed: 0,avg(passenger_count)
0,1.781688
1,1.731868
2,1.690869
3,1.657708
4,1.586778
...,...
739,1.825804
740,1.837693
741,1.867162
742,1.903295
