In [1]:
import os

import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from dask.distributed import Client
import datashader as ds
import holoviews as hv
from holoviews import opts
from holoviews.operation.datashader import datashade
import pandas as pd
# Load the HoloViews 'bokeh' plotting extension.
hv.extension('bokeh')

In [2]:
# Create a dask.distributed Client with default arguments; it is shut down
# via client.shutdown() later in the notebook.
client = Client()

In [3]:
# Glob matching the NYC taxi CSV partitions. Set the NYC_TAXI_DATA environment
# variable to point at a different copy of the data; when it is unset, the
# original local path is used so existing setups keep working unchanged.
NYC_TAXI_DATA = os.environ.get(
    'NYC_TAXI_DATA',
    '/home/spotter/Documents/job/coiled_assessment/data/nyc-taxi/part-*.csv',
)

# Task 1: Load & Clean Data; Visualize

In [4]:
# Read every CSV partition matched by NYC_TAXI_DATA into one Dask DataFrame.
# The store_and_fwd_flag column is explicitly read as str.
df = dd.read_csv(
    NYC_TAXI_DATA,
    dtype={'store_and_fwd_flag': str},
)
# Show the first rows as a quick sanity check.
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1.0,2019-01-01 00:46:40,2019-01-01 00:53:20,1.0,1.5,1.0,N,151,239,1.0,7.0,0.5,0.5,1.65,0.0,0.3,9.95,
1,1.0,2019-01-01 00:59:47,2019-01-01 01:18:59,1.0,2.6,1.0,N,239,246,1.0,14.0,0.5,0.5,1.0,0.0,0.3,16.3,
2,2.0,2018-12-21 13:48:30,2018-12-21 13:52:40,3.0,0.0,1.0,N,236,236,1.0,4.5,0.5,0.5,0.0,0.0,0.3,5.8,
3,2.0,2018-11-28 15:52:25,2018-11-28 15:55:45,5.0,0.0,1.0,N,193,193,2.0,3.5,0.5,0.5,0.0,0.0,0.3,7.55,
4,2.0,2018-11-28 15:56:57,2018-11-28 15:58:33,5.0,0.0,2.0,N,193,193,2.0,52.0,0.0,0.5,0.0,0.0,0.3,55.55,


## Basic filters

In [5]:
# Negative fares and tips are clearly unreasonable, so keep only rows whose
# tip and total are strictly positive (note: this also drops zero-valued tips).
# The 10k USD cap on the total fare is an intuitive guess at a safe upper bound.
mask = (
    (df['tip_amount'] > 0)
    & (df['total_amount'] > 0)
    & (df['total_amount'] < 10000)
)
filtered = df[mask]

In [6]:
# Summary statistics for the two amount columns after filtering.
(
    filtered[['total_amount', 'tip_amount']]
    .describe()
    .compute()
)

Unnamed: 0,total_amount,tip_amount
count,58232670.0,58232670.0
mean,20.15605,3.179025
std,15.25548,2.885673
min,0.11,0.01
25%,12.66,1.95
50%,16.56,2.66
75%,83.1975,12.1725
max,1199.16,787.25


## Let's see what the data looks like

### Tip Amount vs. Total Amount

One could reasonably expect tips to correlate linearly with the total fare amount. Let's see how well that holds.

In [7]:
# Shared plot geometry: height is the width divided by 1.2, floored to an int.
plot_width = 750
plot_height = int(plot_width // 1.2)
plot_options = hv.Options(width=plot_width, height=plot_height)

# Register the default options applied to Points elements.
opts.defaults(
    opts.Points(
        width=plot_width,
        height=plot_height,
        size=5,
        color='blue',
    )
)

In [8]:
# Draw a small random subset (0.001% of the filtered rows) and compute it for
# plotting. A fixed random_state keeps the sample, and therefore the plot that
# follows, identical across kernel restarts.
sample = filtered.sample(frac=1e-5, random_state=42).compute()

In [9]:
# Scatter of tip amount against total amount for the sampled rows.
hv.Points(sample, kdims=['total_amount', 'tip_amount'])

That sample is probably too sparse. Let's try again with a larger fraction.

In [10]:
# Draw a larger random subset (0.1% of the filtered rows) and compute it for
# plotting. A fixed random_state keeps the sample, and therefore the plots that
# follow, identical across kernel restarts.
sample = filtered.sample(frac=1e-3, random_state=42).compute()

In [11]:
# Scatter of tip amount against total amount for the larger sample.
hv.Points(sample, kdims=['total_amount', 'tip_amount'])

Okay, so our linear correlation hypothesis _kind of_ holds, but there are definitely other things at work here.

Let's look at two more (though there are other avenues to explore).

1. Tip amount vs Travel distance
2. Tip amount vs Pick up time

### Tip amount vs Travel distance

In [12]:
# Scatter of tip amount against trip distance for the sampled rows.
hv.Points(sample, kdims=['trip_distance', 'tip_amount'])

Hmm, this is even _less_ linearly correlated.

### Tip amount vs Pick up time

In [15]:
# `sample` has already been computed, so parse the pickup timestamps with
# pandas directly rather than through dask's to_datetime wrapper, then keep
# only the hour of day.
sample['pickup_hour'] = pd.to_datetime(sample['tpep_pickup_datetime']).dt.hour
# Scatter of tip amount against pickup hour.
hv.Points(sample, ['pickup_hour', 'tip_amount'])

# TODO - Visualize fully filtered dataset with Datashader

In [16]:
# Shut down the distributed client created near the top of the notebook.
client.shutdown()