<img src="http://dask.readthedocs.io/en/latest/_images/dask_horizontal.svg"
     align="right"
     width="30%"
     alt="Dask logo">

DataFrames on a Cluster
=======================

<img src="http://pandas.pydata.org/_static/pandas_logo.png"
     align="left"
     width="30%"
     alt="Pandas logo">



## Read a single dataframe with Pandas and cuDF

In [None]:
import pandas as pd

In [None]:
%%time
df = pd.read_csv('/datasets/nyc-taxi/2016/yellow_tripdata_2016-01.csv', 
                 parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])

In [None]:
import cudf

In [None]:
%%time
cdf = cudf.read_csv('/datasets/nyc-taxi/2016/yellow_tripdata_2016-01.csv', 
                 parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])

## Parallelize with Dask

In [None]:
import dask.dataframe as dd
import dask_cudf
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster

In [None]:
# Start a local CUDA cluster and connect a distributed client to it.
cluster = LocalCUDACluster()
client = Client(cluster)
# Bare expression on the last line so the notebook renders the client summary.
client

In [None]:
# Dask is lazy: this only describes the read, so it returns fairly quickly.
# Note that `df` is rebound here from the pandas frame to the dask_cudf one.
datetime_columns = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']
df = dask_cudf.read_csv(
    '/datasets/nyc-taxi/2016/yellow_tripdata_2016-01.csv',
    parse_dates=datetime_columns,
)

In [None]:
%%time
# Materialize the lazily defined dataframe on the cluster and wait for it,
# so that the %%time measurement covers the actual CSV load rather than
# just the scheduling of it.
df = df.persist()
_ = wait(df)  # result is unused; bound to `_` so the cell shows no output

In [None]:
# Total passenger count across the dataframe; .compute() triggers
# execution and returns the concrete result to the notebook.
# NOTE(review): presumably evaluated on the GPU workers since `df` comes from
# dask_cudf — confirm.
df.passenger_count.sum().compute()

### Tip Fraction, grouped by day-of-week and hour-of-day

In [None]:
# Keep only trips with a positive tip and a positive fare (which also avoids
# dividing by zero), then add the tip-to-fare ratio as a new column.
has_tip = df.tip_amount > 0
has_fare = df.fare_amount > 0
df2 = df[has_tip & has_fare]
df2['tip_fraction'] = df2.tip_amount / df2.fare_amount

In [None]:
# `dask.persist` below needs the top-level `dask` name. The notebook's import
# cell only binds `dd`, `dask_cudf`, `Client`, `wait` and `LocalCUDACluster`,
# so without this import the cell fails with NameError.
import dask

# Mean tip fraction of df2, grouped by pickup day-of-week and by pickup hour.
dayofweek = df2.groupby(df2.tpep_pickup_datetime.dt.weekday).tip_fraction.mean()
hour = df2.groupby(df2.tpep_pickup_datetime.dt.hour).tip_fraction.mean()

# Persist both aggregations in a single call so they are submitted together,
# then pull the concrete results back into the notebook.
dayofweek, hour = dask.persist(dayofweek, hour)
h = hour.compute()
dow = dayofweek.compute()

### Plot results

This requires matplotlib to be installed.

In [None]:
%matplotlib inline

In [None]:
# Convert the computed hourly result to pandas so it can be drawn with
# the pandas/matplotlib plotting API.
hourly_tip_fraction = h.to_pandas()
hourly_tip_fraction.plot(title='Tip Fraction by Hour', figsize=(10, 6))

In [None]:
# test with rmm