In [None]:
import numpy as np
import dask.dataframe as dd

import hvplot.dask, hvplot.pandas
import panel as pn
import pandas as pd

import warnings
warnings.simplefilter("ignore")

In [None]:
from dask.distributed import Client, wait

# Connect to a Dask scheduler. No address is passed here, so which cluster is
# used depends on Dask's defaults/configuration in this environment.
client = Client()
# Bare expression: renders the client summary as the cell output.
client

In [None]:
## TODO: use proper import as in other notebooks

# Location of the 2017-2019 taxi trips, stored as a parquet dataset on S3.
taxi_parquet_uri = "s3://saturn-titan/data/nyc-taxi/taxi_2017_2019/"

# Lazily open the dataset; nothing is read until a computation is triggered.
# `assume_missing=True` is forwarded as-is to the reader.
ddf = dd.read_parquet(
    taxi_parquet_uri,
    engine="pyarrow",
    assume_missing=True,
)
# Bare expression: show the (lazy) dataframe structure as the cell output.
ddf

### Optional: downsample

To run this notebook quickly and get a sense of the results you can downsample and persist.

In [None]:
# Uncomment to continue with a 1% random sample of the rows (frac=0.01) and
# persist it, so later cells reuse the sampled data instead of re-reading it.
# NOTE(review): no random_state is given, so the sample differs between runs.
# ddf = ddf.sample(frac=0.01).persist()

## Augment data

We'll distill some features out of the datetime component of the data. This is similar to the feature engineering that is done in other places in this demo, but we'll only create the features that'll be most useful in the visuals. 

In [None]:
# Hour-of-day and day-of-week for both ends of each trip; these are the
# grouping keys used by the aggregations further down.
ddf["pickup_hour"] = ddf.pickup_datetime.dt.hour
ddf["dropoff_hour"] = ddf.dropoff_datetime.dt.hour
ddf["pickup_weekday"] = ddf.pickup_datetime.dt.weekday
ddf["dropoff_weekday"] = ddf.dropoff_datetime.dt.weekday

# Tip expressed as a percentage of the fare. Rows with a zero fare are masked
# to NaN before dividing: otherwise they produce +/-inf (or NaN for 0/0), and a
# single inf turns every mean/sum it participates in into inf. NaN values are
# skipped by those aggregations instead.
ddf["percent_tip"] = (
    ddf["tip_amount"] / ddf["fare_amount"].where(ddf["fare_amount"] != 0)
) * 100

## Timeseries datasets

We'll resample to an hourly timestep so that we don't have to pass around so much data later on.

In [None]:
# Hourly mean tip percentage, indexed by pickup time and materialised as a
# pandas object via .compute().
tips = (
    ddf[["pickup_datetime", "percent_tip"]]
    .set_index("pickup_datetime")
    .resample('1H')
    .mean()
    .compute()
)

# make sure to only include real values: keep only the span between the pickup
# times of the first and last rows of the dataset.
# NOTE(review): this assumes ddf is ordered by pickup_datetime -- confirm.
start = ddf.head(1)["pickup_datetime"].values[0]
end = ddf.tail(1)["pickup_datetime"].values[0]
trimmed = tips.loc[start:end]

trimmed.to_csv("./data/pickup_average_percent_tip_timeseries.csv")

In [None]:
# Hourly mean fare, indexed by pickup time and materialised as a pandas object
# via .compute().
fare = (
    ddf[["pickup_datetime", "fare_amount"]]
    .set_index("pickup_datetime")
    .resample('1H')
    .mean()
    .compute()
)

# make sure to only include real values: keep only the span between the pickup
# times of the first and last rows of the dataset.
# NOTE(review): this assumes ddf is ordered by pickup_datetime -- confirm.
start = ddf.head(1)["pickup_datetime"].values[0]
end = ddf.tail(1)["pickup_datetime"].values[0]
trimmed = fare.loc[start:end]

trimmed.to_csv("./data/pickup_average_fare_timeseries.csv")

## Aggregate datasets

Since our data is rather large and will mostly be viewed in grouped aggregates, we can do some aggregation now and save it off for use in plots later. 

In [None]:
# Statistics computed for every (zone, hour, weekday) group.
agg_spec = {
    "fare_amount": ["mean", "count", "sum"],
    "trip_distance": ["mean", "sum"],
    "percent_tip": ["mean", "count", "sum"],
}

# Flat, readable names for the (column, statistic) pairs produced by .agg().
flat_names = {
    ("fare_amount", "mean"): "average_fare",
    ("fare_amount", "count"): "total_rides",
    ("fare_amount", "sum"): "total_fare",
    ("trip_distance", "sum"): "total_trip_distance",
    ("trip_distance", "mean"): "average_trip_distance",
    ("percent_tip", "mean"): "average_percent_tip",
    ("percent_tip", "count"): "total_tips",
    ("percent_tip", "sum"): "total_percent_tip",
}

# Build one grouped table per trip endpoint and write each to CSV.
# After the loop, `data` holds the *dropoff* table; the example cells below
# read it.
for endpoint in ("pickup", "dropoff"):
    group_keys = [
        f"{endpoint}_taxizone_id",
        f"{endpoint}_hour",
        f"{endpoint}_weekday",
    ]
    data = ddf.groupby(group_keys).agg(agg_spec).compute()

    # Collapse the two-level column index into tuples, then rename them.
    data.columns = data.columns.to_flat_index()
    data = data.rename(columns=flat_names)

    # Move hour and weekday out of the index; the zone id stays as the index.
    data = data.reset_index(level=[1, 2])
    data.to_csv(f"data/{endpoint}_grouped.csv")

## Examples

To make use of the new datasets, we can visualize all the data at once using a grouped heatmap.

In [None]:
# Weekday x hour heatmap of ride counts, grouped by dropoff zone.
# `data` is the dropoff table left behind by the aggregation loop.
data.hvplot.heatmap(
    x="dropoff_weekday",
    y="dropoff_hour",
    C="total_rides",
    groupby="dropoff_taxizone_id",
    cmap="viridis",
    colorbar=False,
    responsive=True,
    min_height=600,
)

Or we can aggregate the data along various axes to derive new meaning.

In [None]:
# Collapse the hour/weekday buckets into one row per dropoff zone.
aggregated = data.groupby("dropoff_taxizone_id")[["total_fare", "total_rides"]].sum()

# Mean fare per ride, in the same units as fare_amount. This is a plain ratio,
# not a percentage, so it is not scaled by 100 -- that keeps it consistent with
# the `average_fare` column of the grouped tables.
aggregated["average_fare"] = aggregated.total_fare / aggregated.total_rides

aggregated.head()

This aggregated dataset can be paired with other information such as geography.

In [None]:
import geopandas as gpd

# Load the taxi-zone shapes and reproject them to EPSG:4326.
zones = gpd.read_file('./data/taxi_zones.shp').to_crs('epsg:4326')
# Attach the per-zone aggregates: the shapefile's `LocationID` column is
# matched against the index of `aggregated` (the dropoff zone id).
joined = zones.join(aggregated, on="LocationID")

# Map of zones colored by average fare.
# - The call is now closed and `cmap` is passed only once (a repeated keyword
#   argument is a SyntaxError); "viridis" matches the heatmap above.
# - The title names the quantity actually plotted (`average_fare`).
# NOTE(review): `logz=True` together with a lower color limit of 0 may not be
# meaningful on a log scale -- confirm the intended `clim`.
joined.hvplot(
    x="longitude", y="latitude", c="average_fare", logz=True,
    geo=True, alpha=0.5, cmap="viridis", hover_cols=["zone", "borough"],
    title="Average fare by dropoff location", height=600, width=800,
    clim=(0, 100),
)

### Payment Type Pie Chart

Other visualizations can be constructed straight from the raw data and saved for embedding in the dashboard later.

In [None]:
# Number of trips per payment_type code, computed on the cluster and returned
# as a pandas Series.
payment_type = ddf.payment_type.value_counts().compute()

In [None]:
# Human-readable label for each payment_type code.
# NOTE(review): the keys are strings; this presumes the payment_type values are
# stored as strings too -- confirm against the dataset's dtype.
payment_labels = {
    "1": "Credit card",
    "2": "Cash",
    "3": "No charge",
    "4": "Dispute",
    "5": "Unknown",
    "6": "Voided trip",
}

# Translate the codes in the index to labels and store them as a categorical.
new_index = payment_type.index.map(payment_labels).astype("category")

payment_type.index = new_index

In [None]:
# Name the Series and its index so that reset_index() in the plotting cell
# yields columns called "payment_type" and "value".
payment_type.name = "value"
payment_type.index.name = "payment_type"

In [None]:
from math import pi

import pandas as pd

from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.transform import cumsum

output_notebook()

# One row per payment type. A dedicated name is used so this cell does not
# overwrite `data`, the aggregate table that the heatmap and `aggregated`
# cells read.
pie_data = payment_type.reset_index()
# Wedge angle proportional to each type's share of all trips.
pie_data['angle'] = pie_data['value'] / pie_data['value'].sum() * 2*pi
# Display strings: trip count in millions and share of the total.
pie_data["label"] = pie_data.value.apply(lambda x: f"{x/1e6: .0f} M")
pie_data["frac"] = pie_data.angle.apply(lambda x: f"{x / (2*pi): .0%}")

# Keep only the first two rows. Their angles were computed against the total
# over *all* payment types, so the two wedges need not close the full circle.
# `.copy()` makes this an independent frame, so adding the color column below
# does not write into a slice of `pie_data` (SettingWithCopyWarning).
pie_data = pie_data.iloc[:2].copy()
pie_data['color'] = ["thistle", "lightblue"]


# NOTE(review): `plot_height`/`plot_width` are deprecated in recent Bokeh
# releases and removed in Bokeh 3; use `height`/`width` when upgrading.
p = figure(plot_height=350, plot_width=350, toolbar_location=None,
           x_range=(-.5, .5), y_range=(0, 2), title="Payment Type")

# Each wedge starts where the previous one ended (running sum of the angles).
p.wedge(x=0, y=1, radius=0.4,
        start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
        line_color="white", fill_color='color', source=pie_data)

# One hand-placed annotation per wedge: "<type>:\n  <count>\n  <share>".
p.text(x=[-0.2, 0.07], y=[1.4, 0.7],
       text=pie_data["payment_type"].astype(str) + ":\n  " + pie_data["label"] + "\n  " + pie_data["frac"],
       text_align="left", text_baseline="top", text_font_size="15px")


# Strip axes, grid and outline so only the pie and its title remain.
p.title.text_font_size = "20px"
p.axis.axis_label = None
p.axis.visible = False
p.grid.grid_line_color = None
p.outline_line_width = 0

show(p)

In [None]:
from bokeh.io import export_svgs

# Switch the figure from the previous cell to the SVG backend and write it to
# pie_chart.svg in the current working directory.
# NOTE(review): Bokeh's export functions typically need selenium plus a
# browser driver installed -- confirm for this environment.
p.output_backend = "svg"
export_svgs(p, filename="pie_chart.svg")