# NYC Taxi dashboard with cuxfilter and dask_cudf

## Import dask_cuda, dask

In [None]:
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster

# Launch a local CUDA cluster first, then attach a distributed client to it.
cuda_cluster = LocalCUDACluster()
client = Client(cuda_cluster)

### Display total available GPU resources

In [2]:
# Worker entries as reported by the cluster's scheduler info; each entry is read
# below for its 'memory_limit' and its 'gpu' record ('name', 'memory-total').
gpu_info = client.cluster.scheduler_info['workers'].values()


def _as_gb(num_bytes):
    """Format a byte count as a two-decimal string in units of 1024**3 bytes."""
    return f"{num_bytes/(1024 **3):.2f} GB"


print(f"Total GPUs: {len(gpu_info)}\n")
for worker in gpu_info:
    device = worker['gpu']
    assigned = _as_gb(worker['memory_limit'])
    total = _as_gb(device['memory-total'])
    print(f"{device['name']}:\n\tMemory assigned: {assigned}, Total Memory: {total}")

Total GPUs: 2

NVIDIA TITAN RTX:
	Memory assigned: 15.63 GB, Total Memory: 24.00 GB
Quadro GV100:
	Memory assigned: 15.63 GB, Total Memory: 32.00 GB


## Import cuxfilter

In [None]:
import cuxfilter
from bokeh import palettes
from cuxfilter.layouts import feature_and_double_base
from pyproj import Proj, Transformer
import dask_cudf, cudf

In [None]:
# Directory that holds the datasets; change it if you downloaded them elsewhere.
DATA_DIR = "../data"

## Download required datasets

In [None]:
! curl https://data.rapids.ai/viz-data/nyc_taxi.tar.gz --create-dirs -o $DATA_DIR/nyc_taxi.tar.gz

In [None]:
from cuxfilter.sampledata import datasets_check
# Run cuxfilter's dataset check for 'nyc_taxi' against DATA_DIR.
# NOTE(review): presumably this also fetches/extracts the data when it is missing — confirm in the cuxfilter docs.
datasets_check(
    "nyc_taxi",
    base_dir=DATA_DIR,
)

## Load required datasets using dask_cudf

Calling `dask_cudf.DataFrame.persist()` before passing the dataframe to `cuxfilter.DataFrame` turns the lazy Dask collection into a Dask collection with the same metadata, but with the results fully computed or actively computing in the background.

Passing a `dask_cudf.DataFrame` without `persist()` is also supported.

## Preprocess the data

In [25]:
# Build the path from DATA_DIR so that a custom download location is honoured
# (the path was previously hard-coded to '../data').
cudf_df = dask_cudf.read_csv(f'{DATA_DIR}/nyc_taxi.csv')

def apply_transformation(
    df,
    x_range=(-8239910.23, -8229529.24),
    y_range=(4968481.34, 4983152.92),
):
    """Prepare one partition of the taxi data for plotting.

    Steps, in order:

    1. Transform the drop-off latitude/longitude from EPSG:4326 to EPSG:3857
       and store the results as ``dropoff_x`` / ``dropoff_y``.
    2. Drop the original ``dropoff_latitude`` / ``dropoff_longitude`` columns.
    3. Drop every row that contains a null value.
    4. Keep only rows whose drop-off point lies strictly inside
       ``x_range`` x ``y_range``.
    5. Parse ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime`` with
       ``cudf.to_datetime`` using the ``"%Y-%m-%d"`` format.

    Parameters
    ----------
    df : dataframe partition
        Must expose ``.values_host`` on its columns (a ``cudf.DataFrame``
        when invoked through ``dask_cudf``'s ``map_partitions``).
    x_range : tuple of (min, max), optional
        Exclusive bounds on ``dropoff_x``. Defaults to the window the
        original code labelled "Manhattan".
    y_range : tuple of (min, max), optional
        Exclusive bounds on ``dropoff_y``. Same default window.

    Returns
    -------
    dataframe
        A new, filtered dataframe. The input partition is not modified.
    """
    transform_4326_to_3857 = Transformer.from_crs('epsg:4326', 'epsg:3857')
    # Latitude is passed first, then longitude; keep this argument order.
    dropoff_x, dropoff_y = transform_4326_to_3857.transform(
        df['dropoff_latitude'].values_host,
        df['dropoff_longitude'].values_host,
    )

    # assign() returns a new frame, so the partition handed in by
    # map_partitions is never mutated in place.
    df = df.assign(dropoff_x=dropoff_x, dropoff_y=dropoff_y)

    df = df.drop(['dropoff_latitude', 'dropoff_longitude'], axis=1)
    df = df.dropna(axis=0)

    # Filter over the requested window (Manhattan by default)
    (x_min, x_max), (y_min, y_max) = x_range, y_range
    df = df[
        (df.dropoff_x > x_min) & (df.dropoff_x < x_max)
        & (df.dropoff_y > y_min) & (df.dropoff_y < y_max)
    ]

    # df is a fresh frame produced by the filter above, so assigning into it
    # does not touch the caller's data.
    df['tpep_pickup_datetime'] = cudf.to_datetime(df['tpep_pickup_datetime'], format="%Y-%m-%d")
    df['tpep_dropoff_datetime'] = cudf.to_datetime(df['tpep_dropoff_datetime'], format="%Y-%m-%d")
    return df


# Describe the per-partition preprocessing lazily, then persist the result.
transformed = cudf_df.map_partitions(apply_transformation)
cudf_df = transformed.persist()

# Preview the first rows of the preprocessed data.
cudf_df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,dropoff_x,dropoff_y
0,2,2015-01-15,2015-01-15,1,1.59,-73.993896,40.750111,1,N,1,12.0,1.0,0.5,3.25,0.0,0.3,17.05,-8234835.0,4975627.0
1,1,2015-01-10,2015-01-10,1,3.3,-74.001648,40.724243,1,N,1,14.5,0.5,0.5,2.0,0.0,0.3,17.8,-8237021.0,4976875.0
3,1,2015-01-10,2015-01-10,1,0.5,-74.009087,40.713818,1,N,2,3.5,0.5,0.5,0.0,0.0,0.3,4.8,-8238124.0,4971127.0
4,1,2015-01-10,2015-01-10,1,3.0,-73.971176,40.762428,1,N,2,15.0,0.5,0.5,0.0,0.0,0.3,16.3,-8238108.0,4974457.0
5,1,2015-01-10,2015-01-10,1,9.0,-73.874374,40.774048,1,N,1,27.0,0.5,0.5,6.7,5.33,0.3,40.33,-8236193.0,4976740.0


## Read the dataset

In [None]:
# Wrap the preprocessed dataframe in a cuxfilter DataFrame, from which the dashboard is built.
cux_df = cuxfilter.DataFrame.from_dataframe(
    cudf_df,
)

## Define charts

In [None]:
# Scatter chart over dropoff_x / dropoff_y, configured to aggregate
# 'payment_type' with 'mean'. The x/y ranges use the same numeric window
# as the filter applied during preprocessing.
chart1 = cuxfilter.charts.scatter(
    x='dropoff_x',
    y='dropoff_y',
    aggregate_fn='mean',
    aggregate_col='payment_type',
    pixel_shade_type='log',
    legend_position='top_right',
    tile_provider="CartoDark",
    x_range=(-8239910.23, -8229529.24),
    y_range=(4968481.34, 4983152.92),
)

# Bar charts on passenger count (9 data points) and on pickup datetime.
chart2 = cuxfilter.charts.bar(
    'passenger_count',
    data_points=9,
)
chart3 = cuxfilter.charts.bar('tpep_pickup_datetime')

# Date-range slider on the drop-off datetime column.
chart4 = cuxfilter.charts.date_range_slider('tpep_dropoff_datetime')

## Create a dashboard object

In [None]:
# Assemble the dashboard: three charts in the main layout, the date slider in the sidebar.
d = cux_df.dashboard(
    [chart1, chart2, chart3],
    sidebar=[chart4],
    layout=feature_and_double_base,
    theme=cuxfilter.themes.dark,
    title='NYC TAXI DATASET',
)

## Starting the dashboard

1. `d.show('current_notebook_url:current_notebook_port')`: remote dashboard

2. `d.app()`: inline within the notebook cell

In case you need to stop the server:

- `d.stop()`

In [None]:
# d.show() # notebook_url="http://localhost:8888"
# d.app() # run the dashboard within the notebook cell

![nyc-taxi](../../docs/_images/nyc-taxi.png)

## Export the queried data into a dataframe

In [None]:
# Export the dashboard's currently queried data as a dataframe.
queried_df = (
    d.export()
)