In [None]:
from shapelets.data import sandbox

# One-off preparation step: read the January 2009 NYC-taxi parquet files through
# a shapelets sandbox (under the name "taxis") and write them out as sample.csv,
# the input file read by every section below.
parquet_glob = "../Benchmarks/nyc-taxi/2009/01/*.parquet"
taxis_relation = sandbox().from_parquet("taxis", [parquet_glob])
taxis_relation.to_csv('sample.csv')

### Pandas

In [1]:
import pandas as pd 

In [2]:
# Load the CSV, turn the pickup timestamp into a real datetime column and use
# it as the index, all in one chain so no intermediate frame stays referenced.
df = (
    pd.read_csv('sample.csv')
    .assign(pickup_at=lambda frame: pd.to_datetime(frame['pickup_at']))
    .set_index('pickup_at')
)

# Group by (calendar date, hour of day) taken from the datetime index and
# display the mean passenger count per group as the cell output.
pickup_index = df.index
date_hour_keys = [pickup_index.date, pickup_index.hour]
df.groupby(date_hour_keys)['passenger_count'].mean()

            pickup_at
2009-01-01  0            1.714221
            1            1.723661
            2            1.679692
            3            1.644623
            4            1.566899
                           ...   
2009-01-31  19           1.834419
            20           1.841117
            21           1.874803
            22           1.901640
            23           1.937456
Name: passenger_count, Length: 744, dtype: float64

### Pandas with PyArrow engine

In [1]:
import pandas as pd 

In [2]:
# Same pipeline as the plain-pandas section, except the CSV is parsed with the
# PyArrow engine. The pickup timestamp is still passed through pd.to_datetime
# before becoming the index.
df = (
    pd.read_csv('sample.csv', engine='pyarrow')
    .assign(pickup_at=lambda frame: pd.to_datetime(frame['pickup_at']))
    .set_index('pickup_at')
)

# Mean passenger count per (calendar date, hour of day) of the pickup time.
pickup_times = df.index
by_date_and_hour = [pickup_times.date, pickup_times.hour]
df.groupby(by_date_and_hour)['passenger_count'].mean()

            pickup_at
2009-01-01  0            1.714221
            1            1.723661
            2            1.679692
            3            1.644623
            4            1.566899
                           ...   
2009-01-31  19           1.834419
            20           1.841117
            21           1.874803
            22           1.901640
            23           1.937456
Name: passenger_count, Length: 744, dtype: float64

### Polars

In [25]:
import polars as pl

In [26]:
# Lazily scan the CSV; nothing is read until .collect() is called below.
data = pl.scan_csv('sample.csv', try_parse_dates=True)

# Build the lazy query: mean passenger count per (pickup date, pickup hour).
pickup_at = pl.col("pickup_at")
hourly_mean_query = data.group_by(
    [
        pickup_at.cast(pl.Date).alias("pickup_at_date"),
        pickup_at.dt.hour().alias("pickup_at_hour"),
    ]
).agg(pl.col("passenger_count").mean())

# Execute the query and show the resulting frame as the cell output.
hourly_mean_query.collect()

pickup_at_date,pickup_at_hour,passenger_count
date,i8,f64
2009-01-31,5,1.663196
2009-01-12,9,1.524339
2009-01-03,11,1.809372
2009-01-01,19,1.950497
2009-01-20,5,1.549116
…,…,…
2009-01-22,10,1.565153
2009-01-18,12,1.765568
2009-01-27,17,1.645984
2009-01-03,14,1.848984


### Shapelets

In [9]:
from shapelets.data import sandbox

In [14]:
# Obtain a shapelets sandbox object; the CSV load here and the SQL query later
# in this cell are both issued through it.
playground = sandbox()

# Load sample.csv under the name "taxis" — the SQL query in this cell reads
# from it as `FROM taxis`.
playground.from_csv("taxis", ["sample.csv"])

# Same aggregation as the Pandas and Polars sections, expressed in SQL: the
# average passenger_count per (pickup date, pickup hour) over the "taxis" data.
# NOTE(review): the query has no ORDER BY, so the row order of the result is
# presumably not guaranteed — compare with the other sections by key, not by
# row position.
result = playground.from_sql("""
    SELECT
        CAST(pickup_at as DATE) as pickup_at_date,
        EXTRACT('hour' from pickup_at) as pickup_at_hour,                      
        AVG(passenger_count) as passenger_count                                                   
    FROM taxis
    GROUP BY 
        pickup_at_date, pickup_at_hour                                                                            
""").execute()

# Last expression of the cell: shown as the cell output (presumably a pandas
# DataFrame, judging by the method name — confirm against the shapelets docs).
result.to_pandas()

Unnamed: 0,pickup_at_date,pickup_at_hour,passenger_count
0,2009-01-22,8,1.552967
1,2009-01-02,9,1.664195
2,2009-01-27,19,1.634457
3,2009-01-15,2,1.679269
4,2009-01-15,10,1.563418
...,...,...,...
739,2009-01-23,22,1.839678
740,2009-01-27,2,1.660264
741,2009-01-08,3,1.637116
742,2009-01-06,2,1.671460
