## Pandas

In [1]:
from datetime import datetime
from glob import glob

from pandas import concat
from pandas import read_parquet

start = datetime.now()

df = concat(map(read_parquet, glob("./data/yellow_tripdata_202*-*.parquet")))

print(f"DF has {len(df)} rows.")

res = (
    df.groupby(["PULocationID", "DOLocationID"])
    .agg(
        {
            "total_amount": "sum",
            "fare_amount": "sum",
            "tolls_amount": "sum",
            "tip_amount": "sum",
            "congestion_surcharge": "sum",
            "trip_distance": "mean",
        }
    )
    .sort_values(by="fare_amount", ascending=False)
    .head(10)
)
print(res)
print(f"Runtime: {datetime.now() - start}")

DF has 63908140 rows.
                           total_amount  fare_amount  tolls_amount  \
PULocationID DOLocationID                                            
132          265             7844322.75   6630563.49     331608.93   
             230             6711888.08   4896347.02     580221.42   
264          264             6138857.16   4620833.45     115205.80   
132          48              5030967.64   3698064.12     432742.37   
             132             3632006.46   2979093.82      59908.99   
237          236             5148521.76   2885930.33        157.61   
132          164             3897581.91   2801727.08     345591.71   
138          230             4160577.10   2743316.75     488300.84   
236          237             4623748.88   2676135.43        233.43   
132          170             3561579.84   2563033.74     315306.64   

                           tip_amount  congestion_surcharge  trip_distance  
PULocationID DOLocationID                                   

## Polars

### Note: Restarted the kernel here.

In [1]:
import polars as pl
from datetime import datetime

start = datetime.now()

df = pl.read_parquet("./data/yellow_tripdata_202*-*.parquet")

print(f"DF has {len(df)} rows.")

res = (
    df.groupby(["DOLocationID","PULocationID"],False)
    .agg([
        pl.col("total_amount").sum(),
        pl.col("fare_amount").sum(),
        pl.col("tolls_amount").sum(),
        pl.col("tip_amount").sum(),
        pl.col("congestion_surcharge").sum(),
        pl.col("trip_distance").mean()
    ])
    .sort("fare_amount", reverse=True)
    .limit(10)
)
print(res)
print(f"Runtime: {datetime.now() - start}")

DF has 63908140 rows.
shape: (10, 8)
┌────────────┬────────────┬────────────┬───────────┬────────────┬──────────┬────────────┬────────────┐
│ DOLocation ┆ PULocation ┆ total_amou ┆ fare_amou ┆ tolls_amou ┆ tip_amou ┆ congestion ┆ trip_dista │
│ ID         ┆ ID         ┆ nt         ┆ nt        ┆ nt         ┆ nt       ┆ _surcharge ┆ nce        │
│ ---        ┆ ---        ┆ ---        ┆ ---       ┆ ---        ┆ ---      ┆ ---        ┆ ---        │
│ i64        ┆ i64        ┆ f64        ┆ f64       ┆ f64        ┆ f64      ┆ f64        ┆ f64        │
╞════════════╪════════════╪════════════╪═══════════╪════════════╪══════════╪════════════╪════════════╡
│ 265        ┆ 132        ┆ 7.8443e6   ┆ 6.6306e6  ┆ 331608.93  ┆ 705419.0 ┆ 7492.5     ┆ 22.037511  │
│            ┆            ┆            ┆           ┆            ┆ 3        ┆            ┆            │
│ 230        ┆ 132        ┆ 6.7119e6   ┆ 4.8963e6  ┆ 580221.42  ┆ 731915.4 ┆ 232930.25  ┆ 18.296969  │
│            ┆            ┆         

## PySpark Pandas

### Note: Restarted Kernel here.

In [1]:
from datetime import datetime

from pyspark.pandas import read_parquet

start = datetime.now()

df = read_parquet("./data/yellow_tripdata_202*-*.parquet")

print(f"DF has {len(df)} rows.")

res = (
    df.groupby(["PULocationID", "DOLocationID"])
    .agg(
        {
            "total_amount": "sum",
            "fare_amount": "sum",
            "tolls_amount": "sum",
            "tip_amount": "sum",
            "congestion_surcharge": "sum",
            "trip_distance": "mean",
        }
    )
    .sort_values(by="fare_amount", ascending=False)
    .head(10)
)
print(res)
print(f"Runtime: {datetime.now() - start}")



                           total_amount  fare_amount  tolls_amount  tip_amount  congestion_surcharge  trip_distance
PULocationID DOLocationID                                                                                          
132          265           7.844323e+06   6630563.49     331608.93   705419.03               7492.50      22.037511
             230           6.711888e+06   4896347.02     580221.42   731915.43             232930.25      18.296969
264          264           6.138857e+06   4620833.45     115205.80   778817.41             319950.00       3.175256
132          48            5.030968e+06   3698064.12     432742.37   525715.76             175940.00      18.537273
             132           3.632006e+06   2979093.82      59908.99   391418.84               8122.50       2.439654
237          236           5.148522e+06   2885930.33        157.61   697629.42            1069527.50       1.092710
132          164           3.897582e+06   2801727.08     345591.71   464

                                                                                