# Prepare NYC taxi data

In [None]:
import numpy as np
import scipp as sc
import h5py

In [None]:
# Names of the table columns to extract from the HDF5 file.
load = [
    "dropoff_datetime",
    "dropoff_latitude",
    "dropoff_longitude",
    "pickup_datetime",
    "pickup_latitude",
    "pickup_longitude",
    "fare_amount",
    "trip_distance",
    "tip_amount",
]

# Maps each column name to a 1-D scipp variable along the "row" dimension.
ds = {}

# Open the file explicitly read-only and inside a context manager so the
# handle is released once the columns have been read. `[...]` reads each
# column into an in-memory array, so `ds` does not depend on the file staying
# open afterwards.
with h5py.File("yellow_taxi_2015_f32s.hdf5", "r") as f:
    for key in load:
        # Read the full column, then keep every 8th row.
        ds[key] = sc.array(dims=["row"], values=f[f"table/columns/{key}/data"][...][::8])

# One event per row, each carrying a weight of one count, with every loaded
# column attached as a coordinate.
da = sc.DataArray(
    data=sc.ones(sizes=ds[load[0]].sizes, unit="counts"),
    coords=dict(ds),
)

# The raw timestamps are labelled as nanosecond offsets.
for stamp_name in ("dropoff_datetime", "pickup_datetime"):
    da.coords[stamp_name].unit = "ns"

# Positions: convert to float64 and label as degrees.
for position_name in (
    "dropoff_latitude",
    "dropoff_longitude",
    "pickup_latitude",
    "pickup_longitude",
):
    da.coords[position_name] = da.coords.pop(position_name).to(dtype='float64')
    da.coords[position_name].unit = "degrees"

# Amounts and distances: convert to float64 and attach their units.
for quantity_name, quantity_unit in (
    ("fare_amount", "dollars"),
    ("tip_amount", "dollars"),
    ("trip_distance", "miles"),
):
    da.coords[quantity_name] = da.coords[quantity_name].to(dtype='float64')
    da.coords[quantity_name].unit = quantity_unit

# Adding the epoch turns the nanosecond offsets into absolute datetimes.
epoch_ns = sc.epoch(unit="ns")
for stamp_name in ("dropoff_datetime", "pickup_datetime"):
    da.coords[stamp_name] = da.coords[stamp_name] + epoch_ns

# Add hour-of-day coords: truncate each timestamp to whole hours and to whole
# days; the difference between the two is the number of hours since midnight.
for source_name, hour_name in (
    ("dropoff_datetime", "dropoff_hour"),
    ("pickup_datetime", "pickup_hour"),
):
    stamps = da.coords[source_name].values
    hours_since_midnight = stamps.astype("M8[h]") - stamps.astype("datetime64[D]")
    da.coords[hour_name] = sc.array(
        dims=da.dims,
        values=hours_since_midnight.astype(int),
    )

# Convert both datetime coords to a unit of seconds.
for stamp_name in ("dropoff_datetime", "pickup_datetime"):
    da.coords[stamp_name] = da.coords[stamp_name].to(unit='s')

# Display the prepared data array as the cell output.
da

In [None]:
# Square window in longitude/latitude: lower corner plus a common side length.
dx = 0.32
lon_min, lat_min = -74.05, 40.595
lon_max, lat_max = lon_min + dx, lat_min + dx

# Two edges per coordinate, i.e. exactly one bin along each dimension.
lat_edges = sc.linspace("dropoff_latitude", lat_min, lat_max, 2, unit="degrees")
lon_edges = sc.linspace("dropoff_longitude", lon_min, lon_max, 2, unit="degrees")
distance_edges = sc.linspace("trip_distance", 0.01, 80, 2, unit="miles")

# Bin by drop-off position and trip distance using the single-bin edges above.
binned = da.bin(
    dropoff_latitude=lat_edges,
    dropoff_longitude=lon_edges,
    trip_distance=distance_edges,
)
binned

In [None]:
# Show the contents of the first element of the binned values.
first_bin = binned.values[0]
first_bin

In [None]:
# Write the contents of the first bin to a standalone HDF5 file.
# NOTE(review): newer scipp releases may prefer `save_hdf5`; confirm against
# the scipp version this notebook targets before switching.
small_sample = binned.values[0]
small_sample.to_hdf5('nyc_taxi_data_2015_small.h5')

In [None]:
# Rebind `da` to an independent copy of the first bin's contents and show it.
first_bin = binned.values[0]
da = first_bin.copy()
da

In [None]:
# Keep only rows where both the tip and the fare are strictly positive, which
# also ensures the ratio below never divides by a zero fare.
zero = sc.scalar(0., unit='dollar')
tip_is_positive = da.coords['tip_amount'] > zero
fare_is_positive = da.coords['fare_amount'] > zero
sel = tip_is_positive & fare_is_positive
da = da[sel]

# Tip as a fraction of the fare, stored as a new coordinate.
tips = da.coords['tip_amount']
fares = da.coords['fare_amount']
da.coords['tip_fraction'] = tips / fares

# Group into 24 bins of drop-off hour and show the per-bin tip fractions.
hourly = da.bin(dropoff_hour=24)
hourly.bins.coords['tip_fraction']

In [None]:
# Plot the mean tip fraction within each of the 24 drop-off hour bins.
hourly = da.bin(dropoff_hour=24)
mean_tip_fraction = hourly.bins.coords['tip_fraction'].bins.mean()
mean_tip_fraction.plot()