In [None]:
import s3fs
import dask.dataframe as dd
import numpy as np
import pandas as pd
import os
import datetime
import hvplot.dask, hvplot.pandas

import warnings
# Suppress every warning for the remainder of the session so library warnings
# do not clutter the cell outputs.
# NOTE(review): this also hides genuinely useful warnings (deprecations,
# chained-assignment notices); consider narrowing the filter.
warnings.simplefilter("ignore")

In [None]:
from dask.distributed import Client, wait
from dask_saturn import SaturnCluster
import time

# Sizing for the Dask cluster used throughout this notebook.
cluster_options = dict(
    n_workers=10,
    scheduler_size='xlarge',
    worker_size='8xlarge',
    nthreads=32,
)

# Create the cluster, attach a client to it, and show the cluster object
# as the cell output.
cluster = SaturnCluster(**cluster_options)
client = Client(cluster)
cluster

In [None]:
# The notebook reads from and writes to the S3 location named by TAXI_S3,
# so fail fast with an actionable message when the variable is missing.
if os.environ.get('TAXI_S3') is None:
    raise ValueError('Set TAXI_S3 environment variable to an S3 location that you have read/write access to')

taxi_path = os.environ['TAXI_S3']
# Location of the parquet dataset underneath the configured S3 root.
data_path = "/".join([taxi_path, "data", "taxi_parquet"])

In [None]:
# Open the parquet dataset as a Dask DataFrame using the pyarrow engine.
taxi = dd.read_parquet(
    data_path,
    engine='pyarrow',
    assume_missing=True,
)

In [None]:
# TODO: change to spatial join to pull in older data
# Keep only trips that have both taxi zone ids and whose pickup falls in the
# half-open window [2017-01-01, 2020-01-01).
window_start = datetime.datetime(2017, 1, 1)
window_end = datetime.datetime(2020, 1, 1)

has_both_zones = taxi.pickup_taxizone_id.notnull() & taxi.dropoff_taxizone_id.notnull()
in_window = (taxi.pickup_datetime >= window_start) & (taxi.pickup_datetime < window_end)

ddf = taxi[has_both_zones & in_window]
ddf

### Optional: downsample

To run this notebook quickly on a smaller cluster you can downsample and persist.

In [None]:
# ddf = ddf.sample(frac=0.01).persist()

## Augment data

We'll distill some features out of the datetime component of the data. This is similar to the feature engineering that is done in other places in this demo, but we'll only create the features that'll be most useful in the visuals. 

In [None]:
# Derive hour-of-day and day-of-week features for both ends of each trip,
# plus the tip expressed as a percentage of the fare.
# Division by a zero fare produces +/-inf, which is converted to NaN before
# scaling to percent.
tip_ratio = ddf["tip_amount"] / ddf["fare_amount"]
ddf = ddf.assign(
    pickup_hour=ddf.pickup_datetime.dt.hour,
    dropoff_hour=ddf.dropoff_datetime.dt.hour,
    pickup_weekday=ddf.pickup_datetime.dt.weekday,
    dropoff_weekday=ddf.dropoff_datetime.dt.weekday,
    percent_tip=tip_ratio.replace([np.inf, -np.inf], np.nan) * 100,
)

We'll replace extremely high values (tips above 1000% of the fare) with missing values, since they distort the mean.

In [None]:
# Replace implausible tips (more than 1000% of the fare) with NaN so they do
# not skew the averages computed later. `mask` is vectorized, unlike a
# per-element Python lambda, and needs no `meta` hint from Dask.
# Values that are already NaN compare False against the threshold and are
# left untouched, so the result matches the element-wise version.
ddf["percent_tip"] = ddf["percent_tip"].mask(ddf["percent_tip"] > 1000)

Since the dataset is now small enough to fit in worker memory, we'll drop any unneeded columns, then repartition and persist the result for easy access later.

In [None]:
%%time
# Columns that none of the later aggregations or plots use.
unused_columns = [
    'vendor_id', 'rate_code_id', 'store_and_fwd_flag', 'pickup_latitude',
    'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude', 'extra', 'mta_tax',
    'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge',
]

trimmed = ddf.drop(unused_columns, axis=1)
# Rebalance into 80 partitions and keep the result in cluster memory.
ddf = trimmed.repartition(npartitions=80).persist()
# Block until the persisted partitions are ready so the cell timing is meaningful.
_ = wait(ddf)

In [None]:
# Display the columns that remain on the persisted frame.
ddf.columns

## Timeseries datasets

We'll resample to an hourly timestep so that we don't have to pass around so much data later on.

In [None]:
# This is the first cell that writes into ./data, so make sure the local
# output directory exists; pandas does not create missing directories.
os.makedirs("./data", exist_ok=True)

# Index the tip percentages by pickup time and discard rows without a tip value.
tip_ddf = ddf[["pickup_datetime", "percent_tip"]].set_index("pickup_datetime").dropna()
# Hourly mean tip percentage, computed on the cluster and collected as pandas.
tips = tip_ddf.resample('1H').mean().compute()

tips.to_csv("./data/pickup_average_percent_tip_timeseries.csv")

In [None]:
# Fare amounts indexed by pickup time, without rows that lack a fare.
fare_by_pickup = ddf[["pickup_datetime", "fare_amount"]].set_index("pickup_datetime")
fare_ddf = fare_by_pickup.dropna()

# Hourly mean fare, collected from the cluster as a pandas object.
hourly_mean_fare = fare_ddf.resample('1H').mean()
fare = hourly_mean_fare.compute()

fare.to_csv("./data/pickup_average_fare_timeseries.csv")

## Aggregate datasets

Since our data is rather large and will mostly be viewed in grouped aggregates, we can do some aggregation now and save it off for use in plots later. 

In [None]:
# Statistics computed for every (zone, hour, weekday) group.
zone_time_aggregations = {
    "fare_amount": ["mean", "count", "sum"],
    "trip_distance": ["mean"],
    "percent_tip": ["mean"],
}
# Friendly names for the flattened (column, statistic) tuples.
zone_time_column_names = {
    ("fare_amount", "mean"): "average_fare",
    ("fare_amount", "count"): "total_rides",
    ("fare_amount", "sum"): "total_fare",
    ("trip_distance", "mean"): "average_trip_distance",
    ("percent_tip", "mean"): "average_percent_tip",
}

# One aggregated frame per trip end, keyed by "pickup" / "dropoff".
zone_and_time_by_end = {}
for value in ["pickup", "dropoff"]:
    group_keys = [f"{value}_taxizone_id", f"{value}_hour", f"{value}_weekday"]
    data = ddf.groupby(group_keys).agg(zone_time_aggregations).compute()
    data.columns = data.columns.to_flat_index()
    # Keep the zone id as the index; hour and weekday become regular columns.
    data = data.rename(zone_time_column_names, axis=1).reset_index(level=[1, 2])
    data.to_csv(f"data/{value}_grouped_by_zone_and_time.csv")
    zone_and_time_by_end[value] = data

# The heatmap example later in the notebook plots the dropoff_* columns, so
# select the dropoff frame explicitly instead of relying on it being the last
# loop iteration.
grouped_zone_and_time = zone_and_time_by_end["dropoff"]

In [None]:
# Statistics computed for every taxi zone.
zone_aggregations = {
    "fare_amount": ["mean", "count", "sum"],
    "trip_distance": ["mean"],
    "percent_tip": ["mean"],
}
# Friendly names for the flattened (column, statistic) tuples.
zone_column_names = {
    ("fare_amount", "mean"): "average_fare",
    ("fare_amount", "count"): "total_rides",
    ("fare_amount", "sum"): "total_fare",
    ("trip_distance", "mean"): "average_trip_distance",
    ("percent_tip", "mean"): "average_percent_tip",
}

# One aggregated frame per trip end, keyed by "pickup" / "dropoff".
zone_by_end = {}
for value in ["pickup", "dropoff"]:
    data = ddf.groupby([f"{value}_taxizone_id"]).agg(zone_aggregations).compute()
    data.columns = data.columns.to_flat_index()
    data = data.rename(zone_column_names, axis=1)
    data.to_csv(f"data/{value}_grouped_by_zone.csv")
    zone_by_end[value] = data

# The map example later in the notebook is titled "by dropoff location", so
# select the dropoff frame explicitly instead of relying on it being the last
# loop iteration.
grouped_zone = zone_by_end["dropoff"]

In [None]:
# Aggregate pickups by hour of day and day of week only (no zone dimension).
value = "pickup"
group_keys = [f"{value}_hour", f"{value}_weekday"]

stats = {
    "fare_amount": ["mean", "count", "sum"],
    "trip_distance": ["mean"],
    "percent_tip": ["mean"],
}
# Friendly names for the flattened (column, statistic) tuples.
flat_to_friendly = {
    ("fare_amount", "mean"): "average_fare",
    ("fare_amount", "count"): "total_rides",
    ("fare_amount", "sum"): "total_fare",
    ("trip_distance", "mean"): "average_trip_distance",
    ("percent_tip", "mean"): "average_percent_tip",
}

data = ddf.groupby(group_keys).agg(stats).compute()
data.columns = data.columns.to_flat_index()
data = data.rename(columns=flat_to_friendly)

data.to_csv(f"data/{value}_grouped_by_time.csv")
grouped_time = data

## ML data

In [None]:
# Load the ML test split and index it by trip id.
test_ddf = (
    dd.read_parquet(f"{taxi_path}/data/ml/tip_test", engine="pyarrow")
    .set_index("id")
)

In [None]:
# Start the prediction table from the pickup time and the tip_fraction column
# of the test split, then keep it in cluster memory in 10 partitions.
actuals = test_ddf[["pickup_datetime", "tip_fraction"]]
prediction_timeseries = actuals.repartition(npartitions=10).persist()

In [None]:
import s3fs

# Glob pattern matching every saved tip-prediction output under the S3 root.
prediction_glob = f"{taxi_path}/ml_results/predictions/tip*"

fs = s3fs.S3FileSystem()
prediction_files = fs.glob(prediction_glob)

In [None]:
%%time
# Attach each model's predictions as a column named after its result file.
for s3_key in prediction_files:
    name = os.path.basename(s3_key)
    # The keys returned by the glob are prefixed with the s3:// scheme before reading.
    prediction = dd.read_parquet(f"s3://{s3_key}", engine="pyarrow")
    if "predicted" not in prediction.columns:
        # Skip outputs that carry no `predicted` column.
        continue
    print(f"Joining with {name}")
    prediction = prediction.set_index("id").repartition(npartitions=10)
    joined_predictions = prediction_timeseries.join(prediction[["predicted"]])
    prediction_timeseries = joined_predictions.rename(columns={"predicted": name})

# Re-index by pickup time and average every column per hour.
ml_ddf = prediction_timeseries.set_index("pickup_datetime")
hourly_predictions = ml_ddf.resample("1H").mean()
ml_df = hourly_predictions.compute()

In [None]:
# Display the hourly frame of tip_fraction and per-model prediction means.
ml_df

In [None]:
# Save the hourly prediction timeseries next to the other dashboard CSVs.
ml_output_path = "./data/ml_prediction_timeseries.csv"
ml_df.to_csv(ml_output_path)

## Get shape files for dashboard

In [None]:
import zipfile

# Open the taxi zone archive from S3 and unpack its contents into the local
# data directory.
with fs.open('s3://nyc-tlc/misc/taxi_zones.zip') as remote_zip, zipfile.ZipFile(remote_zip) as archive:
    archive.extractall('data')

In [None]:
# Upload the whole local data directory to the dashboard location on S3.
dashboard_path = f'{taxi_path}/data/dashboard'
fs.put('data', dashboard_path, recursive=True)

## Examples

To make use of the new datasets, we can visualize all the data at once using a grouped heatmap.

In [None]:
# Weekday-by-hour heatmap of the average tip percentage, with one view per
# dropoff zone selected through the groupby option.
heatmap_options = dict(
    x="dropoff_weekday",
    y="dropoff_hour",
    C="average_percent_tip",
    groupby="dropoff_taxizone_id",
    responsive=True,
    min_height=600,
    cmap="viridis",
    clim=(0, 20),
    colorbar=False,
)
grouped_zone_and_time.hvplot.heatmap(**heatmap_options)

The dataset that is grouped only by zone can be paired with other information, such as geography.

In [None]:
import geopandas as gpd

# Read the taxi zone shapes, reproject them to EPSG:4326, and attach the
# per-zone aggregates by matching LocationID against the aggregate index.
zones = gpd.read_file('./data/taxi_zones.shp').to_crs('epsg:4326')
joined = zones.join(grouped_zone, on="LocationID")

map_options = dict(
    x="longitude",
    y="latitude",
    c="average_fare",
    geo=True,
    tiles="CartoLight",
    cmap="fire",
    alpha=0.5,
    hover_cols=["zone", "borough"],
    title="Average fare by dropoff location",
    height=600,
    width=800,
    clim=(0, 100),
)
joined.hvplot(**map_options)

### Payment Type Pie Chart

Other visualizations can be constructed straight from the raw data and saved for embedding in the dashboard later.

In [None]:
# Count the trips for each payment type code and collect the result locally.
payment_counts = ddf["payment_type"].value_counts()
payment_type = payment_counts.compute()

In [None]:
# Human-readable labels for the numeric payment codes. Code 6 ("Voided trip")
# is part of the TLC data dictionary and was previously unmapped, which would
# have turned its label into NaN.
payment_labels = {
    1: "Credit card",
    2: "Cash",
    3: "No charge",
    4: "Dispute",
    5: "Unknown",
    6: "Voided trip",
}
new_index = payment_type.index.map(payment_labels).astype("category")

# Relabel the counts so downstream code sees a `payment_type` index and a
# `value` column after reset_index().
payment_type.index = new_index
payment_type.name = "value"
payment_type.index.name = "payment_type"
payment_type

In [None]:
from math import pi

from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.transform import cumsum

output_notebook()

# One row per payment type, with its wedge angle (share of a full circle) and
# the text shown next to the wedge (count in millions, share as a percentage).
data = payment_type.reset_index()
data['angle'] = data['value']/data['value'].sum() * 2*pi
data["label"] = data.value.apply(lambda x: f"{x/1e6: .0f} M")
data["frac"] = data.angle.apply(lambda x: f"{x / (2*pi): .0%}")

# Only the first two rows are drawn (presumably the two most frequent types,
# given value_counts' default descending order -- verify).
# Take an explicit copy so that adding the colour column modifies an
# independent frame rather than a slice of the full table; the original
# chained assignment was only silent because warnings are suppressed.
data = data[:2].copy()
data['color'] = ["thistle", "lightblue"]


p = figure(plot_height=350, plot_width=350, toolbar_location=None,
           x_range=(-.5, .5), y_range=(0, 2), title="Payment Type")

# Each wedge starts where the previous one ended (cumulative angle).
p.wedge(x=0, y=1, radius=0.4,
        start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
        line_color="white", fill_color='color', source=data)

# Label positions are hand-placed for the two wedges drawn above.
p.text(x=[-0.2, 0.07], y=[1.4, 0.7], text=data["payment_type"].astype(str) + ":\n  " + data["label"] + "\n  " + data["frac"],
       text_align="left", text_baseline="top", text_font_size="15px")


# Strip axes, grid and outline so only the pie and its title remain.
p.title.text_font_size = "20px"
p.axis.axis_label=None
p.axis.visible=False
p.grid.grid_line_color = None
p.outline_line_width = 0

show(p)

In [None]:
# Note this doesn't work on Saturn at this time.

from bokeh.io import export_svgs

# Switch the figure to the SVG backend before exporting it to a file.
svg_filename = "pie_chart.svg"
p.output_backend = "svg"
export_svgs(p, filename=svg_filename)