In [None]:
import numpy as np
import dask.dataframe as dd

import hvplot.dask, hvplot.pandas
import panel as pn
import pandas as pd

import warnings
# Suppress every warning category for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation warnings from dask/pandas/hvplot.
warnings.simplefilter("ignore")

In [None]:
from dask.distributed import Client, wait

# Create a dask.distributed Client with default arguments; later .compute() calls run through it.
client = Client()
# Bare expression so the notebook displays the client's summary.
client

In [None]:
# Open the 2017-2019 NYC taxi parquet dataset as a Dask DataFrame using the pyarrow engine.
ddf = dd.read_parquet(
    "s3://saturn-titan/data/nyc-taxi/taxi_2017_2019/",
    engine="pyarrow",
    assume_missing=True,
)
# Bare expression so the notebook shows the Dask DataFrame's structure.
ddf

In [None]:
# Tip as a percentage of the total amount charged.
# Zero totals are masked to NaN before dividing so that a non-zero tip on a zero
# total yields NaN (ignored by mean) instead of +/-inf, which would otherwise turn
# every aggregate that includes such a row into inf.
total_amount = ddf["total_amount"]
ddf["tip%"] = (ddf["tip_amount"] / total_amount.where(total_amount != 0)) * 100

In [None]:
# Selector for the column whose mean is shown in the heatmap.
widget = pn.widgets.Select(options=["tip%", "tip_amount", "trip_distance", "fare_amount"])


@pn.depends(value=widget)
def heatmap(value=widget):
    """Build a day-of-week by hour-of-day heatmap of the mean of one column of ``ddf``.

    The function is registered as depending on ``widget``, so when it is placed in a
    Panel layout it is re-evaluated with the widget's current selection each time the
    selection changes.

    Parameters
    ----------
    value : str or panel widget
        Name of the ``ddf`` column to average. When called directly without
        arguments the default is the ``widget`` object itself, in which case its
        current selection is used.

    Returns
    -------
    The aggregated hvplot heatmap with toolbar disabled and x tick labels rotated.
    """
    # A direct call such as ``heatmap()`` receives the widget object as the default;
    # unwrap it so a column name is always passed on to hvplot.
    if isinstance(value, pn.widgets.Widget):
        value = value.value

    # The (8, 12) colour range was chosen for tip%. Applying it to the other metrics
    # would saturate the colour map, so they are left to autoscale.
    color_limits = {"tip%": (8, 12)}
    clim_opts = {"clim": color_limits[value]} if value in color_limits else {}

    return ddf.hvplot.heatmap(
        x="pickup_datetime.dt.dayofweek",
        y="pickup_datetime.dt.hour",
        C=value,
        title=value,
        xlabel="Day of Week",
        ylabel="Hour of Day",
        xticks=[(0, 'Mon'), (1, 'Tues'), (2, 'Wed'), (3, 'Thur'), (4, 'Fri'), (5, 'Sat'), (6, 'Sun')],
        height=800, width=300, colorbar=False, cmap="coolwarm",
        **clim_opts,
    ).aggregate(function=np.mean).opts(toolbar=None, xrotation=90)


# Show the selector together with the plot; without it the metric cannot be changed.
pn.Column(widget, heatmap)

In [None]:
# Derived time columns used as group-by keys in later cells.
ddf["pickup_weekday"] = ddf["pickup_datetime"].dt.weekday
for side in ("pickup", "dropoff"):
    ddf[f"{side}_hour"] = ddf[f"{side}_datetime"].dt.hour

In [None]:
# Mean tip% for every (weekday, hour) pair, pivoted so that weekdays are rows and
# hours are columns, then written to CSV.
weekday_hour_keys = ["pickup_weekday", "pickup_hour"]
gb = ddf[weekday_hour_keys + ["tip%"]].groupby(weekday_hour_keys)
tip = gb["tip%"].mean().compute().unstack()
tip.to_csv("data/tip_percent.csv")

In [None]:
# Datashaded scatter of tip amount against fare amount, restricted to
# fares in [0, 50] and tips in [0, 5].
fare_tip = ddf.hvplot.scatter(
    x="fare_amount",
    y="tip_amount",
    xlim=(0, 50),
    ylim=(0, 5),
    datashade=True,
    x_sampling=.5,
    y_sampling=.1,
    cmap="fire_r",
    width=300,
)
fare_tip

In [None]:
# Hourly mean tip%: index by pickup time, resample into 1-hour bins, average,
# and materialise the result as a pandas DataFrame.
tips = (
    ddf[["pickup_datetime", "tip%"]]
    .set_index("pickup_datetime")
    .resample('1H')
    .mean()
    .compute()
)

# Keep only the span between the pickup times of the first and last rows of ddf.
# NOTE(review): this assumes ddf is ordered by pickup time -- TODO confirm.
start = ddf.head(1)["pickup_datetime"].values[0]
end = ddf.tail(1)["pickup_datetime"].values[0]
trimmed = tips[start:end]

trimmed.to_csv("./data/tip_timeseries.csv")

In [None]:
# Line plot of the hourly mean tip%, with the x axis limited to Jan-Oct 2018
# and the y axis limited to 9-13.
plot_window = (pd.Timestamp("2018-01"), pd.Timestamp("2018-10"))
tips_timeseries = tips.hvplot(y="tip%", xlim=plot_window, ylim=(9, 13))
tips_timeseries

In [None]:
# Histogram of per-trip tip% over the full Dask DataFrame, drawn with inverted axes.
ddf.hvplot.hist(
    "tip%",
    invert=True,
    width=300,
)

In [None]:
import geopandas as gpd
# Taxi-zone shapes read from the local shapefile and reprojected to EPSG:4326.
zones = gpd.read_file("./data/taxi_zones.shp").to_crs("epsg:4326")

# Mean tip% per pickup zone, computed in Dask and collected into pandas.
pickup_zone = (
    ddf[["pickup_taxizone_id", "tip%"]]
    .groupby("pickup_taxizone_id")
    .mean()
    .compute()
)

# Attach the per-zone mean to each shape by matching LocationID against the zone id index.
joined = zones.join(pickup_zone, on="LocationID")

# TODO: put the dropoff/pickup on a widget with same clim
pickup_map = joined.hvplot(
    c="tip%",
    geo=True,
    tiles="CartoLight",
    alpha=0.6,
    cmap="reds",
    clim=(0, 20),
    hover_cols=["zone", "borough"],
    title="pickups",
    height=600,
    width=800,
)
pickup_map

In [None]:
# Write one CSV per trip end ("pickup" / "dropoff") holding the mean tip% for each
# taxi zone (rows) and hour of day (columns).
# The pickup_hour / dropoff_hour columns are the ones added to ddf earlier in the
# notebook, so dropoff_hour is not re-assigned here.
for value in ["pickup", "dropoff"]:
    zone_col, hour_col = f"{value}_taxizone_id", f"{value}_hour"
    gb = ddf[[zone_col, hour_col, "tip%"]].groupby([zone_col, hour_col])
    tip_by_zone_hour = gb["tip%"].mean().compute().unstack()
    # "total" is the unweighted mean of the hourly means in each zone's row.
    tip_by_zone_hour["total"] = tip_by_zone_hour.mean(axis=1)
    tip_by_zone_hour.to_csv(f"./data/{value}_tip.csv")

In [None]:
# Assemble the dashboard and mark it servable for `panel serve`.
# Left column: description, pickup-zone map, then the time series beside a histogram
# of the hourly means. Right column: the metric selector above the heatmap it controls.
pn.Row(
    pn.Column(
        """
        ## NY City Taxi Data
        
        Explore the publicly available NYC Taxi dataset and a linear regression model 
        that can predict the fare amount of a taxi ride using attributes related to rider pickup.
        """,
        pickup_map,
        pn.Row(tips_timeseries, tips.hvplot.hist("tip%", invert=True, width=300)),
    ),
    # The selector must be part of the layout, otherwise the heatmap metric cannot be changed.
    pn.Column(widget, heatmap)
).servable()