# Hyperparameter tuning

## Dask cluster

<img src="https://docs.dask.org/en/latest/_images/dask_horizontal.svg" width="400">

In [1]:
import os
import numpy as np

MODEL_PATH = 'models'
if not os.path.exists(MODEL_PATH):
    os.makedirs(MODEL_PATH)
    
numeric_feat = [
    'pickup_weekday', 
    'pickup_weekofyear', 
    'pickup_hour', 
    'pickup_week_hour', 
    'pickup_minute', 
    'passenger_count',
]
categorical_feat = [
    'pickup_taxizone_id', 
    'dropoff_taxizone_id',
]
features = numeric_feat + categorical_feat
y_col = 'tip_fraction'

# Initialize Dask Cluster

In [2]:
from dask.distributed import Client, wait
from dask_saturn import SaturnCluster
import time

n_workers = 3
cluster = SaturnCluster(n_workers=n_workers, scheduler_size='medium', worker_size='large', nthreads=2)
client = Client(cluster)
cluster

[2020-09-11 20:24:18] INFO - dask-saturn | Cluster is ready


VBox(children=(HTML(value='<h2>SaturnCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n   …

Open the dashboard (link above ^) and watch it when you execute some commands, you'll see which tasks are running across the cluster.

If you created your cluster here in this notebook, it might take a few minutes for all your nodes to become available. You can run the chunk below to block until all nodes are ready.

>**Pro tip**: Create and/or start your cluster from the "Dask" page in Saturn if you want to get a head start!

In [3]:
while len(client.scheduler_info()['workers']) < n_workers:
    print('Waiting for workers, got', len(client.scheduler_info()['workers']))
    time.sleep(30)
print('Done!')

Done!


# Load data and feature engineering

Load a sample from a single month for this exercise. Note we are loading the data with Dask now (`dd.read_csv` vs. `pd.read_csv`)

In [4]:
import numpy as np
import datetime
import pandas as pd
import dask.dataframe as dd
import s3fs
import warnings
warnings.simplefilter("ignore")

In [5]:
import yaml
import snowflake.connector

creds = yaml.full_load(open('/home/jovyan/snowflake_creds.yml'))

# get connection info
conn_info = {
    'warehouse': 'COMPUTE_WH',
    'database': 'NYC_TAXI',
    'schema': 'PUBLIC',
    **creds,
}
conn = snowflake.connector.connect(**conn_info)
q = "select DISTINCT(DATE(PICKUP_DATETIME)) as date from taxi_yellow"
cur = conn.cursor().execute(q)
dates = cur.fetch_pandas_all()['DATE'].tolist()

In [6]:
from dask import delayed


query = """
SELECT
    pickup_taxizone_id,
    dropoff_taxizone_id,
    passenger_count,
    DIV0(TIP_AMOUNT, FARE_AMOUNT) as TIP_FRACTION,
    DAYOFWEEKISO(PICKUP_DATETIME) - 1 as PICKUP_WEEKDAY,
    WEEKOFYEAR(PICKUP_DATETIME) as PICKUP_WEEKOFYEAR,
    HOUR(PICKUP_DATETIME) as PICKUP_HOUR,
    (PICKUP_WEEKDAY * 24) + PICKUP_HOUR as PICKUP_WEEK_HOUR,
    MINUTE(PICKUP_DATETIME) as PICKUP_MINUTE
FROM taxi_yellow
WHERE
    date(pickup_datetime) = %s
"""


@delayed
def load(conn_info, query, day, frac=0.01):
    # q = query % str(day)
    conn = snowflake.connector.connect(**conn_info)
    cur = conn.cursor().execute(query, str(day))
    taxi = cur.fetch_pandas_all().sample(frac=0.01, replace=True)
    taxi.columns = [x.lower() for x in taxi.columns]
    return taxi

    

In [7]:
import datetime as dt
_dates = [x for x in dates if x is not None and x >= dt.date(2019, 1, 1) and x < dt.date(2019, 2, 1)]

In [8]:
import dask.dataframe as dd
taxi = dd.from_delayed([load(conn_info, query, day) for day in _dates])

In [9]:
taxi_train = taxi[features + [y_col]].astype(float).fillna(-1)

In [10]:
taxi_train.head()

Unnamed: 0,pickup_weekday,pickup_weekofyear,pickup_hour,pickup_week_hour,pickup_minute,passenger_count,pickup_taxizone_id,dropoff_taxizone_id,tip_fraction
28719,6.0,2.0,2.0,146.0,16.0,1.0,234.0,48.0,0.24
129730,6.0,2.0,15.0,159.0,14.0,1.0,262.0,236.0,0.226667
113329,6.0,2.0,13.0,157.0,12.0,1.0,186.0,48.0,0.0
61998,6.0,2.0,9.0,153.0,18.0,1.0,164.0,246.0,0.22
180926,6.0,2.0,18.0,162.0,5.0,1.0,162.0,164.0,0.22


# Run grid search

We use the preprocessing and `GridSearchCV` classes from dask-ml, but still use the scikit-learn `ElasticNet` model.

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from dask_ml.compose import ColumnTransformer
from dask_ml.preprocessing import StandardScaler, DummyEncoder, Categorizer
from dask_ml.model_selection import GridSearchCV

pipeline = Pipeline(steps=[
    ('categorize', Categorizer(columns=categorical_feat)),
    ('onehot', DummyEncoder(columns=categorical_feat)),
    ('scale', ColumnTransformer(transformers=[('num', StandardScaler(), numeric_feat)])),
    ('clf', ElasticNet(normalize=False, max_iter=100)),
])

params = {
    'clf__l1_ratio': np.arange(0, 1.1, 0.1),
    'clf__alpha': [0, 0.5, 1, 2],
}

grid_search = GridSearchCV(pipeline, params, cv=3, scoring='neg_mean_squared_error')

Open up the Dask dashboard after you run the cell below, you'll see the grid search in action!

In [12]:
%%time
_ = grid_search.fit(taxi_train[features], taxi_train[y_col])
grid_search.best_score_

CPU times: user 411 ms, sys: 28.5 ms, total: 440 ms
Wall time: 28.4 s


-0.021441720427451495

In [13]:
grid_search.best_params_

{'clf__alpha': 0, 'clf__l1_ratio': 0.0}

## Save model

`GridSearchCV` automatically fits the best paramemters to the full data and stores in `best_estimator_`

In [14]:
import cloudpickle

with open(f'{MODEL_PATH}/elastic_net_dask.pkl', 'wb') as f:
    cloudpickle.dump(grid_search.best_estimator_, f)

## Calculate metrics on test set

Use a different month for test set

In [27]:
import datetime as dt
_dates = [x for x in dates if x is not None and x >= dt.date(2019, 2, 1) and x < dt.date(2019, 3, 1)]
taxi_test = dd.from_delayed([load(conn_info, query, day) for day in _dates])

In [28]:
from dask.distributed import wait

In [29]:
taxi_test = taxi_test.persist()
_ = wait(taxi_test)

In [30]:
from sklearn.metrics import mean_squared_error

preds = grid_search.predict(taxi_test[features])
mean_squared_error(taxi_test[y_col], preds, squared=False)

7.871446346944487