In [1]:
import s3fs
import dask.dataframe as dd
import numpy as np
import pandas as pd
import os
import datetime
import hvplot.dask, hvplot.pandas

import warnings
warnings.simplefilter("ignore")

In [2]:
from dask.distributed import Client
from dask_saturn import SaturnCluster
import time

cluster = SaturnCluster(n_workers=10, scheduler_size='2xlarge', worker_size='2xlarge', nthreads=8)
client = Client(cluster)

cluster

[2020-08-07 21:46:38] INFO - dask-saturn | Cluster is ready


VBox(children=(HTML(value='<h2>SaturnCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n   …

In [3]:
data_path = "s3://saturn-titan/data/nyc-taxi/taxi_2017_2019/"
taxi = dd.read_parquet(data_path, engine='pyarrow', assume_missing=True)

In [4]:
# only use zones for ML tasks
# change to spatial join to pull in old data if there's time
ddf = taxi[(taxi.pickup_taxizone_id.notnull() & taxi.dropoff_taxizone_id.notnull()) &
           (taxi.pickup_datetime >= datetime.datetime(2016, 7, 1)) & 
           (taxi.pickup_datetime < datetime.datetime(2020, 1, 1))]
ddf

Unnamed: 0_level_0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,rate_code_id,store_and_fwd_flag,pickup_taxizone_id,dropoff_taxizone_id,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
npartitions=50,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
,object,datetime64[ns],datetime64[ns],object,float64,object,object,float64,float64,float64,float64,float64,float64,object,float64,float64,float64,float64,float64,float64,float64,float64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


### Optional: downsample

To run this notebook quickly on a smaller cluster you can downsample and persist.

In [5]:
# ddf = ddf.sample(frac=0.01).persist()

## Augment data

We'll distill some features out of the datetime component of the data. This is similar to the feature engineering that is done in other places in this demo, but we'll only create the features that'll be most useful in the visuals. 

In [6]:
ddf["pickup_hour"] = ddf.pickup_datetime.dt.hour
ddf["dropoff_hour"] = ddf.dropoff_datetime.dt.hour
ddf["pickup_weekday"] = ddf.pickup_datetime.dt.weekday
ddf["dropoff_weekday"] = ddf.dropoff_datetime.dt.weekday
ddf["percent_tip"] = (ddf["tip_amount"] / ddf["fare_amount"]).replace([np.inf, -np.inf], np.nan) * 100

We'll take out the extreme high values since they disrupt the mean

In [8]:
ddf["percent_tip"] = ddf["percent_tip"].apply(lambda x: np.nan if x > 1000 else x)

Since the dataset is now small enough to fit in worker memory, we'll drop any unneeded columns and repartition and persist for easy access later.

In [9]:
ddf = ddf.drop([
    'vendor_id', 'rate_code_id', 'store_and_fwd_flag','pickup_latitude',
    'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude', 'extra', 'mta_tax',
    'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge'
], axis=1)
ddf = ddf.repartition(npartitions=80).persist()

In [10]:
ddf.columns

Index(['pickup_datetime', 'dropoff_datetime', 'passenger_count',
       'trip_distance', 'pickup_taxizone_id', 'dropoff_taxizone_id',
       'payment_type', 'fare_amount', 'tip_amount', 'pickup_hour',
       'dropoff_hour', 'pickup_weekday', 'dropoff_weekday', 'percent_tip'],
      dtype='object')

## Timeseries datasets

We'll resample to an hourly timestep so that we don't have to pass around so much data later on.

In [11]:
tip_ddf = ddf[["pickup_datetime", "percent_tip"]].set_index("pickup_datetime").dropna()
tips = tip_ddf.resample('1H').mean().compute()

tips.to_csv("./data/pickup_average_percent_tip_timeseries.csv")

In [12]:
fare_ddf = ddf[["pickup_datetime", "fare_amount"]].set_index("pickup_datetime").dropna()
fare = fare_ddf.resample('1H').mean().compute()

fare.to_csv("./data/pickup_average_fare_timeseries.csv")

## Aggregate datasets

Since our data is rather large and will mostly be viewed in grouped aggregates, we can do some aggregation now and save it off for use in plots later. 

In [13]:
for value in ["pickup", "dropoff"]:
    data = (ddf
            .groupby([
                f"{value}_taxizone_id", 
                f"{value}_hour",  
                f"{value}_weekday",
            ])
            .agg({
                "fare_amount": ["mean", "count", "sum"],
                "trip_distance": ["mean"],
                "percent_tip": ["mean"],
            })
            .compute()
           )
    data.columns = data.columns.to_flat_index()
    data = data.rename({
        ("fare_amount", "mean"): "average_fare",
        ("fare_amount", "count"): "total_rides",
        ("fare_amount", "sum"): "total_fare",
        ("trip_distance", "mean"): "average_trip_distance",
        ("percent_tip", "mean"): "average_percent_tip",
        
    }, axis=1).reset_index(level=[1, 2])
    data.to_csv(f"data/{value}_grouped_by_zone_and_time.csv")

grouped_zone_and_time = data

In [14]:
for value in ["pickup", "dropoff"]:
    data = (ddf
            .groupby([
                f"{value}_taxizone_id", 
            ])
            .agg({
                "fare_amount": ["mean", "count", "sum"],
                "trip_distance": ["mean"],
                "percent_tip": ["mean"],
            })
            .compute()
           )
    data.columns = data.columns.to_flat_index()
    data = data.rename({
        ("fare_amount", "mean"): "average_fare",
        ("fare_amount", "count"): "total_rides",
        ("fare_amount", "sum"): "total_fare",
        ("trip_distance", "mean"): "average_trip_distance",
        ("percent_tip", "mean"): "average_percent_tip",
        
    }, axis=1)
    data.to_csv(f"data/{value}_grouped_by_zone.csv")

grouped_zone = data

In [28]:
grouped_time = ddf.groupby(["pickup_hour", "pickup_weekday"])[["percent_tip"]].mean().compute().rename({"percent_tip": "average_percent_tip"})
grouped_time.to_csv("data/pickup_grouped_by_time.csv")

## Examples

To make use of the new datasets we can visualize all the data at once using a grouped heatmap

In [20]:
grouped_zone_and_time.hvplot.heatmap(
    x="dropoff_weekday", 
    y="dropoff_hour", 
    C="average_percent_tip",
    groupby="dropoff_taxizone_id", 
    responsive=True, min_height=600, cmap="viridis", clim=(0, 20),
    colorbar=False,
)

This dataset that is only grouped by zone can be paired with other information such as geography.

In [21]:
import geopandas as gpd

zones = gpd.read_file('./data/taxi_zones.shp').to_crs('epsg:4326')
joined = zones.join(grouped_zone, on="LocationID")

joined.hvplot(x="longitude", y="latitude", c="average_fare", 
              geo=True, tiles="CartoLight", cmap="fire", alpha=0.5,
              hover_cols=["zone", "borough"], 
              title="Average fare by dropoff location",
              height=600, width=800, clim=(0, 100))

### Payment Type Pie Chart

Other vizualisations can be contructed straight from the raw data and saved for embedding in the dashboard later.

In [15]:
payment_type = ddf.payment_type.value_counts().compute()

In [16]:
new_index = payment_type.index.map({
    1: "Credit card", 
    2: "Cash", 
    3: "No charge", 
    4: "Dispute", 
    5: "Unknown",
}).astype("category")

payment_type.index = new_index
payment_type.name = "value"
payment_type.index.name = "payment_type"
payment_type

payment_type
Credit card    248354074
Cash           111391849
No charge        1861534
Dispute           602493
Unknown           246646
Name: value, dtype: int64

In [17]:
from math import pi

from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.transform import cumsum

output_notebook()

data = payment_type.reset_index()
data['angle'] = data['value']/data['value'].sum() * 2*pi
data["label"] = data.value.apply(lambda x: f"{x/1e6: .0f} M")
data["frac"] = data.angle.apply(lambda x: f"{x / (2*pi): .0%}")

data = data[:2]
data['color'] = ["thistle", "lightblue"]


p = figure(plot_height=350, plot_width=350, toolbar_location=None,
           x_range=(-.5, .5), y_range=(0, 2), title="Payment Type")

p.wedge(x=0, y=1, radius=0.4,
        start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
        line_color="white", fill_color='color', source=data)

p.text(x=[-0.2, 0.07], y=[1.4, 0.7], text=data["payment_type"].astype(str) + ":\n  " + data["label"] + "\n  " + data["frac"],
       text_align="left", text_baseline="top", text_font_size="15px")


p.title.text_font_size = "20px"
p.axis.axis_label=None
p.axis.visible=False
p.grid.grid_line_color = None
p.outline_line_width = 0

show(p)

In [18]:
from bokeh.io import export_svgs

p.output_backend = "svg"
export_svgs(p, filename="pie_chart.svg")

RuntimeError: Neither firefox and geckodriver nor a variant of chromium browser and chromedriver are available on system PATH. You can install the former with 'conda install -c conda-forge firefox geckodriver'.