# Random Forest

## Dask + RAPIDS 

 <img src="https://images.exxactcorp.com/CMS/landing-page/resource-center/supported-software/deep-learning/rapids/Rapids-Logo-lg.png" width="400" />
 
**Hardware**: 20 nodes, g4dn.xlarge (4 CPU, 16GB RAM; 1 GPU, 16GB GPU RAM)

# Load data

In [1]:
from dask.distributed import Client, wait
from dask import persist
from dask_saturn import SaturnCluster

# Start a Saturn-managed Dask cluster: 20 GPU workers plus one scheduler.
cluster = SaturnCluster(
    n_workers=20,
    scheduler_size='xlarge',
    worker_size='g4dnxlarge',
)

# Attach a client to the cluster so later Dask calls in this notebook run on it.
client = Client(cluster)

# Leave the cluster object as the last expression so its widget is displayed.
cluster

[2020-07-30 14:58:17] INFO - dask-saturn | Cluster is ready


VBox(children=(HTML(value='<h2>SaturnCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n   …

In [2]:
from IPython.display import display, HTML

# Build a small HTML snippet linking to the GPU memory and GPU utilization
# pages, both addressed relative to the dashboard URL reported by the client.
gpu_links = f'''
<b>GPU Dashboard links</b>
<ul>
<li><a href="{client.dashboard_link}/individual-gpu-memory" target="_blank">GPU memory</a></li>
<li><a href="{client.dashboard_link}/individual-gpu-utilization" target="_blank">GPU utilization</a></li>
</ul>
'''

# Wrap the markup so the notebook renders it as HTML instead of plain text.
gpu_links_html = HTML(gpu_links)
display(gpu_links_html)

In [3]:
import dask_cudf
import s3fs

In [4]:
# Anonymous access to the bucket is enough to list the files.
fs = s3fs.S3FileSystem(anon=True)

# Keep only the "yellow" files whose path mentions one of these years.
years = ('2017', '2018', '2019')
files = [
    f"s3://{key}"
    for key in fs.ls('s3://nyc-tlc/trip data/')
    if 'yellow' in key and any(year in key for year in years)
]

# Columns to read from each CSV file.
cols = [
    'VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
    'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
    'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
    'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
    'total_amount',
]

# Define the GPU-backed Dask dataframe over all selected files.
# parse_dates indices 1 and 2 line up with the pickup/dropoff timestamp
# entries in `cols`.
taxi = dask_cudf.read_csv(
    files,
    assume_missing=True,
    parse_dates=[1, 2],
    usecols=cols,
    storage_options={'anon': True},
)

In [5]:
%%time
# Count the rows of the full taxi dataframe; %%time reports how long the
# count takes (presumably this is where the CSV files are actually read,
# since nothing has been computed before this cell -- confirm).
len(taxi)

CPU times: user 131 ms, sys: 5.79 ms, total: 137 ms
Wall time: 25.5 s


300700143

# Feature engineering

In [6]:
# Derive calendar features from the pickup timestamp.
pickup_ts = taxi['tpep_pickup_datetime']
taxi['pickup_weekday'] = pickup_ts.dt.weekday
taxi['pickup_hour'] = pickup_ts.dt.hour
taxi['pickup_minute'] = pickup_ts.dt.minute

# Combined position within the week: weekday * 24 + hour.
taxi['pickup_week_hour'] = taxi['pickup_weekday'] * 24 + taxi['pickup_hour']

# Turn the 'Y' flag into a float column (comparison result cast to float).
is_store_and_fwd = taxi['store_and_fwd_flag'] == 'Y'
taxi['store_and_fwd_flag'] = is_store_and_fwd.astype(float)

# Replace any remaining missing values with the sentinel -1.
taxi = taxi.fillna(-1)

In [7]:
# Columns used as model inputs: the engineered pickup-time features plus
# a handful of raw trip attributes.
features = ['pickup_weekday', 'pickup_hour', 'pickup_minute',
            'pickup_week_hour', 'passenger_count', 'VendorID', 
            'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 
            'DOLocationID']

X = taxi[features].astype('float32')  # convert to float32 for GPUs

# Cast the target to float32 as well so that X and y share one dtype.
# cuML's Dask RandomForestRegressor.fit() defaults to convert_dtype=False,
# so it will not convert y to match X on its own.
y = taxi['total_amount'].astype('float32')

In [8]:
%%time
# Ask the cluster to compute X and y and keep the results in worker memory;
# the names are rebound to the persisted collections.
X, y = persist(X, y)
# Block until both persisted collections have finished computing, so the
# timing above covers the full load + feature pipeline. The return value is
# not needed and is discarded into `_`.
_ = wait([X, y])
# Row count of the persisted feature matrix (displayed as the cell output).
len(X)

CPU times: user 614 ms, sys: 30.2 ms, total: 644 ms
Wall time: 23.1 s


300700143

# Train random forest!

In [9]:
from cuml.dask.ensemble import RandomForestRegressor
# Configure the distributed random forest regressor.
rf = RandomForestRegressor(
    n_estimators=100,  # number of trees in the forest
    max_depth=10,      # maximum depth of each tree
    seed=42,           # fixed seed for repeatable runs
)

In [11]:
%%time
# Train the forest on the persisted features X and target y. The return value
# is assigned to `_` so that it is not echoed as the cell's output.
_ = rf.fit(X, y)

CPU times: user 54.8 ms, sys: 441 µs, total: 55.2 ms
Wall time: 1.02 s
