# Hyperparameter tuning

## Dask + scikit-learn

<img src="https://docs.dask.org/en/latest/_images/dask_horizontal.svg" alt="Dask logo" width="500">

In [1]:
from dask.distributed import Client
from dask_saturn import SaturnCluster

# Cluster sizing is collected in one place so it is easy to tweak.
cluster_options = dict(
    scheduler_size='2xlarge',
    worker_size='2xlarge',
    nthreads=8,
    n_workers=3,
)

# Start the Saturn-managed Dask cluster and attach a client to it.
cluster = SaturnCluster(**cluster_options)
client = Client(cluster)

# Leave the cluster as the last expression so its widget is displayed.
cluster

[2020-12-04 17:54:56] INFO - dask-saturn | Cluster is ready


VBox(children=(HTML(value='<h2>SaturnCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n   …

# Load data and feature engineering

In [2]:
import numpy as np
import datetime
import dask.dataframe as dd

# Read the January 2019 yellow-taxi trips from the public S3 bucket
# (anonymous access) and keep a 10% sample without replacement.
# A fixed random_state makes the sample identical on every run, so the
# grid-search timings below are measured on the same rows each time.
taxi = dd.read_csv(
    's3://nyc-tlc/trip data/yellow_tripdata_2019-01.csv',
    parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'],
    storage_options={'anon': True},
).sample(frac=0.1, replace=False, random_state=42)

In [3]:
# Calendar features derived from the pickup timestamp.
taxi['pickup_weekday'] = taxi.tpep_pickup_datetime.dt.weekday
# NOTE(review): `dt.weekofyear` is deprecated in pandas >= 1.1 and removed in
# 2.0; on a newer stack switch to `dt.isocalendar().week` - confirm that the
# installed dask version supports it first.
taxi['pickup_weekofyear'] = taxi.tpep_pickup_datetime.dt.weekofyear
taxi['pickup_hour'] = taxi.tpep_pickup_datetime.dt.hour
taxi['pickup_minute'] = taxi.tpep_pickup_datetime.dt.minute

# Seconds elapsed since 2019-01-01 00:00:00. `.dt.seconds` only returns the
# within-day component of the timedelta (0-86399) and silently drops the days,
# so `.dt.total_seconds()` is required to get the full offset.
taxi['pickup_year_seconds'] = (
    taxi.tpep_pickup_datetime - datetime.datetime(2019, 1, 1, 0, 0, 0)
).dt.total_seconds()

# Hour of the week: weekday * 24 + hour.
taxi['pickup_week_hour'] = (taxi.pickup_weekday * 24) + taxi.pickup_hour

# Missing passenger counts become -1; missing categorical values get an
# explicit 'missing' level.
taxi['passenger_count'] = taxi.passenger_count.astype(float).fillna(-1)
taxi = taxi.fillna(value={'VendorID': 'missing', 'RatecodeID': 'missing', 'store_and_fwd_flag': 'missing' })

# Materialize the engineered frame on the cluster so each grid-search fit
# reuses it instead of re-reading and re-deriving the data.
taxi = taxi.persist()

# Run grid search

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from dask_ml.compose import ColumnTransformer
from dask_ml.preprocessing import StandardScaler, DummyEncoder, Categorizer
from dask_ml.model_selection import GridSearchCV

# Feature groups: numeric columns are standardized, categorical columns are
# categorized and one-hot encoded.
numeric_feat = ['pickup_weekday',  'pickup_weekofyear', 'pickup_hour', 'pickup_minute', 
                'pickup_year_seconds', 'pickup_week_hour', 'passenger_count']
categorical_feat = ['VendorID', 'RatecodeID', 'store_and_fwd_flag', 
                    'PULocationID', 'DOLocationID']
features = numeric_feat + categorical_feat
y_col = 'total_amount'

# Preprocessing + model pipeline. The `normalize` keyword is intentionally not
# passed to ElasticNet: it was removed in scikit-learn 1.2 (raising TypeError),
# and its former default was False, so omitting it keeps the same behavior on
# older versions. Scaling is handled by the StandardScaler step instead.
pipeline = Pipeline(steps=[
    ('categorize', Categorizer(columns=categorical_feat)),
    ('onehot', DummyEncoder(columns=categorical_feat)),
    ('scale', ColumnTransformer(
        transformers=[('num', StandardScaler(), numeric_feat)], 
        remainder='passthrough',
    )),
    ('clf', ElasticNet(max_iter=100)),
])

# 101 l1_ratio values x 4 alpha values = 404 candidates; with cv=3 that is
# 1212 pipeline fits. linspace is used instead of arange with a float step so
# the endpoints are exactly 0.0 and 1.0.
# NOTE(review): scikit-learn advises against alpha=0 for ElasticNet (plain
# least squares via coordinate descent) - confirm it is wanted in the grid.
params = {
    'clf__l1_ratio': np.linspace(0, 1, 101),
    'clf__alpha': [0, 0.5, 1, 2],
}

grid_search = GridSearchCV(pipeline, params, cv=3)

## 3 nodes

In [5]:
# Request three workers and wait for them to join before timing the fit.
target_workers = 3
cluster.scale(target_workers)
client.wait_for_workers(target_workers)

In [6]:
%%time
_ = grid_search.fit(taxi[features], taxi[y_col])

CPU times: user 5.4 s, sys: 578 ms, total: 5.98 s
Wall time: 1h 3min 54s


## Scale up to 10 nodes

In [7]:
# Request ten workers and wait for them to join before timing the fit.
target_workers = 10
cluster.scale(target_workers)
client.wait_for_workers(target_workers)

In [8]:
%%time
_ = grid_search.fit(taxi[features], taxi[y_col])

CPU times: user 3.14 s, sys: 275 ms, total: 3.41 s
Wall time: 19min 49s


## Scale up to 20 nodes

In [9]:
# Request twenty workers and wait for them to join before timing the fit.
target_workers = 20
cluster.scale(target_workers)
client.wait_for_workers(target_workers)

In [10]:
%%time
_ = grid_search.fit(taxi[features], taxi[y_col])

CPU times: user 2.57 s, sys: 257 ms, total: 2.83 s
Wall time: 10min 48s
