# Hyperparameter tuning

## Single-node scikit-learn

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/0/05/Scikit_learn_logo_small.svg/1200px-Scikit_learn_logo_small.svg.png" width="300">


In [1]:
import os

# Directory that fitted models are written to.
MODEL_PATH = 'models'
# exist_ok=True keeps the cell idempotent on re-runs and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(MODEL_PATH, exist_ok=True)

# Columns treated as numeric model inputs.
numeric_feat = [
    'pickup_weekday',
    'pickup_weekofyear',
    'pickup_hour',
    'pickup_week_hour',
    'pickup_minute',
    'passenger_count',
]
# Columns treated as categorical model inputs (taxi zone identifiers).
categorical_feat = [
    'pickup_taxizone_id',
    'dropoff_taxizone_id',
]
# Full ordered list of model inputs, and the name of the target column.
features = numeric_feat + categorical_feat
y_col = 'tip_fraction'

# Load data and feature engineering

Load a 1% random sample of a single month (January 2019) of yellow-taxi trips for this exercise.

In [9]:
import os
import numpy as np
import datetime
import pandas as pd
import s3fs
import warnings
warnings.simplefilter("ignore")

import yaml
import snowflake.connector

# Connection keyword arguments for Snowflake are read from a local YAML file so
# that no credentials appear in the notebook. The `with` block guarantees the
# file handle is closed once the contents are parsed.
# NOTE(review): yaml.safe_load is the more restrictive loader and should be
# enough if the file only holds plain key/value pairs -- confirm and switch.
with open('/home/jovyan/snowflake_creds.yml') as creds_file:
    creds = yaml.full_load(creds_file)

conn = snowflake.connector.connect(
    warehouse='COMPUTE_WH',
    database='NYC_TAXI',
    schema='PUBLIC',
    **creds,
)

# Feature engineering happens in SQL: the query returns the raw zone ids and
# passenger count plus the derived tip fraction and pickup-time features, for
# trips whose pickup falls in January 2019.
query = """
SELECT 
    pickup_taxizone_id,
    dropoff_taxizone_id,
    passenger_count,
    DIV0(TIP_AMOUNT, FARE_AMOUNT) as TIP_FRACTION,
    DAYOFWEEKISO(PICKUP_DATETIME) - 1 as PICKUP_WEEKDAY,
    WEEKOFYEAR(PICKUP_DATETIME) as PICKUP_WEEKOFYEAR,
    HOUR(PICKUP_DATETIME) as PICKUP_HOUR,
    (PICKUP_WEEKDAY * 24) + PICKUP_HOUR as PICKUP_WEEK_HOUR,
    MINUTE(PICKUP_DATETIME) as PICKUP_MINUTE
FROM taxi_yellow
WHERE
    date_trunc('MONTH', pickup_datetime) = '2019-01-01'
"""
cur = conn.cursor().execute(query)
taxi = cur.fetch_pandas_all()
# Lower-case the returned column names so they match the names in `features` / `y_col`.
taxi.columns = [x.lower() for x in taxi.columns]
# Keep a 1% sample; the fixed seed makes the sample (and everything trained on
# it) reproducible across Restart & Run All.
taxi = taxi.sample(frac=0.01, replace=False, random_state=42)

In [10]:
# Report how many rows were sampled and how much memory the frame occupies
# (deep=True also measures the contents of object columns).
n_rows = len(taxi)
size_mb = taxi.memory_usage(deep=True).sum() / 1e6
print(f'Num rows: {n_rows}, Size: {size_mb} MB')

Num rows: 76676, Size: 2.146928 MB


In [11]:
# Model-ready frame: the feature columns plus the target, cast to float,
# with missing values replaced by the sentinel -1.
model_columns = [*features, y_col]
taxi_train = taxi.loc[:, model_columns].astype(float).fillna(-1)

In [12]:
# Peek at the first five rows of the model-ready frame.
taxi_train.head(n=5)

Unnamed: 0,pickup_weekday,pickup_weekofyear,pickup_hour,pickup_week_hour,pickup_minute,passenger_count,pickup_taxizone_id,dropoff_taxizone_id,tip_fraction
7354688,2.0,2.0,20.0,68.0,13.0,2.0,238.0,166.0,0.11625
444654,1.0,4.0,12.0,36.0,57.0,2.0,237.0,237.0,0.368571
4551081,0.0,3.0,14.0,14.0,14.0,1.0,234.0,231.0,0.0
6489136,0.0,5.0,14.0,14.0,6.0,1.0,237.0,239.0,0.163333
3674344,2.0,4.0,21.0,69.0,48.0,2.0,161.0,229.0,0.123636


# Run grid search

Setting `n_jobs=-1` tells scikit-learn to use all available cores on this machine to train models.

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name, so try the new spelling first and fall back to
# the old one on releases that do not know it yet.
try:
    one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Preprocess (scale numeric columns, one-hot encode categorical columns) and
# then fit an elastic-net regressor.
# `normalize=False` is intentionally not passed to ElasticNet: False was the
# default, and the argument was removed in scikit-learn 1.2, so omitting it
# keeps the same behavior on old releases and avoids a TypeError on new ones.
pipeline = Pipeline(steps=[
    ('preprocess', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_feat),
        ('cat', one_hot, categorical_feat),
    ])),
    ('clf', ElasticNet(max_iter=100)),
])

# Search grid: 11 l1_ratio values (0.0 to 1.0 in steps of 0.1) x 4 alpha values.
# NOTE(review): the ElasticNet documentation advises against alpha=0 with this
# solver -- confirm whether that grid point is wanted.
params = {
    'clf__l1_ratio': np.arange(0, 1.1, 0.1),
    'clf__alpha': [0, 0.5, 1, 2],
}

# 3-fold cross-validation over every grid point, scored by negative MSE
# (higher is better), using all available cores.
grid_search = GridSearchCV(pipeline, params, cv=3, n_jobs=-1, verbose=1, scoring='neg_mean_squared_error')

In [14]:
%%time
_ = grid_search.fit(taxi_train[features], taxi_train[y_col])
grid_search.best_score_

Fitting 3 folds for each of 44 candidates, totalling 132 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 132 out of 132 | elapsed:  5.3min finished


CPU times: user 10.7 s, sys: 522 ms, total: 11.2 s
Wall time: 5min 28s


-0.021593900019805292

In [None]:
# Hyperparameter combination (clf__alpha, clf__l1_ratio) selected by the grid search.
grid_search.best_params_

## Save model

`GridSearchCV` automatically refits a model with the best parameters on the full training data and stores it in `best_estimator_`.

In [None]:
import cloudpickle

# Serialize the refitted best pipeline into the models directory.
model_file = f'{MODEL_PATH}/elastic_net_scikit.pkl'
best_model = grid_search.best_estimator_
with open(model_file, 'wb') as model_out:
    cloudpickle.dump(best_model, model_out)

## Calculate metrics on test set

Use a different month (February 2019) as the test set.

In [None]:
# Build the test set from a different month (February 2019) through the same
# Snowflake connection and feature query used for training. This cell
# previously referenced `s3` and `prep_df`, neither of which is defined in this
# notebook, so it failed with NameError on a fresh kernel.
test_query = query.replace("'2019-01-01'", "'2019-02-01'")
# Guard against the training-month literal changing without this cell being updated.
assert test_query != query, "training month literal not found in query"

taxi_test = conn.cursor().execute(test_query).fetch_pandas_all()
# Lower-case the returned column names so they match `features` / `y_col`.
taxi_test.columns = [x.lower() for x in taxi_test.columns]
# Same 1% sampling and the same float cast / -1 fill that produce `taxi_train`.
taxi_test = taxi_test.sample(frac=0.01, replace=False, random_state=42)
taxi_test = taxi_test[features + [y_col]].astype(float).fillna(-1)

In [None]:
from sklearn.metrics import mean_squared_error

# Predict with the refitted best pipeline and report RMSE on the test set.
preds = grid_search.predict(taxi_test[features])
# Take the square root explicitly: `squared=False` was deprecated in
# scikit-learn 1.4 and removed in 1.6, while np.sqrt(MSE) gives the same value
# on every release.
np.sqrt(mean_squared_error(taxi_test[y_col], preds))