# Hyperparameter tuning

## Plain scikit-learn

**Hardware**: r5.8xlarge (32 CPU, 256 GB RAM)

In [1]:
from ml_utils import MLUtils

# Notebook-wide project helper. The three labels passed here reappear as the
# ml_task / tool / model columns of the metrics row written in the final cell.
ml_utils = MLUtils(ml_task='tip', tool='scikit', model='elastic_net')

# Load data and feature engineering

In [2]:
import numpy as np
import pandas as pd

In [3]:
%%time
tip_train = ml_utils.read_parquet_dir(f'{ml_utils.taxi_path}/data/ml/tip_train_sample')
tip_train.shape

CPU times: user 4.97 s, sys: 4.45 s, total: 9.42 s
Wall time: 27.6 s


(10994913, 10)

In [4]:
# Peek at the first five rows to sanity-check the loaded columns.
tip_train.head(n=5)

Unnamed: 0,id,pickup_taxizone_id,dropoff_taxizone_id,pickup_weekday,pickup_weekofyear,pickup_hour,pickup_minute,pickup_week_hour,passenger_count,tip_fraction
0,28a18fa5fa2f44f29ffd98fc9159829d,238.0,132.0,0,29,7,19,7,1.0,0.199616
1,a6578145ff824f5fb94e90457b040883,236.0,246.0,6,28,9,16,153,1.0,0.130435
2,91726ecac3b44e8bbfea68d725f35556,90.0,148.0,6,28,22,44,166,6.0,0.166667
3,a3b0d14ad1644dd6b90f1af6be002e55,141.0,186.0,6,28,9,34,153,1.0,0.152299
4,70aa5a0c6bc147dd8553e201b63ba0fe,100.0,142.0,6,28,22,7,166,1.0,0.169231


<br>
Let's take a sample so we can parallelize the grid search (otherwise it would take quite a while)

In [5]:
# Draw a 10% random subsample (without replacement) of the training rows;
# the fixed random_state makes the draw repeatable across runs.
sample = tip_train.sample(
    frac=0.1,
    replace=False,
    random_state=42,
)
sample.shape

(1099491, 10)

# Run grid search

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV

# Feature list, target column, per-type column lists and the parameter grid all
# come from the shared ml_utils.tip_vars configuration.
features = ml_utils.tip_vars.features
y_col = ml_utils.tip_vars.y_col

# Preprocessing and model live in one Pipeline so that scaling / encoding are
# re-fit on the training portion of every CV fold.
pipeline = Pipeline(steps=[
    ('preprocess', ColumnTransformer(transformers=[
        ('num', StandardScaler(), ml_utils.tip_vars.numeric_feat),
        # NOTE(review): `sparse=` was renamed `sparse_output=` in scikit-learn 1.2;
        # switch the keyword when the environment is upgraded.
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse=False), ml_utils.tip_vars.categorical_feat),
    ])),
    # `normalize=False` is intentionally not passed: False was the default, and the
    # parameter was deprecated in scikit-learn 1.0 and removed in 1.2, so omitting
    # it yields the same model on old versions and avoids a TypeError on new ones.
    # The step is named 'clf' because the parameter grid addresses it as 'clf__...'.
    ('clf', ElasticNet(max_iter=100)),
])

params = ml_utils.tip_vars.elastic_net_grid_search_params

# 3-fold CV over the grid, using all available cores; candidates are ranked by
# negative MSE (scikit-learn scorers are "higher is better").
grid_search = GridSearchCV(
    pipeline,
    params,
    cv=3,
    n_jobs=-1,
    verbose=1,
    scoring='neg_mean_squared_error',
)

In [7]:
%%time
_ = grid_search.fit(sample[features], sample[y_col])
grid_search.best_score_

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed:   17.6s remaining:    8.8s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:   17.7s finished


CPU times: user 12.4 s, sys: 6.64 s, total: 19 s
Wall time: 32.9 s


-0.00290953839481807

In [8]:
# Parameter combination that achieved the best mean cross-validated score.
grid_search.best_params_

{'clf__alpha': 0.5}

## Save model

`GridSearchCV` automatically refits the pipeline with the best parameters on all of the data passed to `fit` (here, the 10% sample) and stores the result in `best_estimator_`.

In [9]:
# best_estimator_ is the pipeline refit with the winning parameters; hand it to
# the project helper for persistence.
best_model = grid_search.best_estimator_
ml_utils.write_model(best_model)

## Predict on test set

Predict on the test set and calculate metrics. Save the predictions and metrics to S3.

In [10]:
import s3fs
# List the test-set parquet files through s3fs.
# NOTE(review): neither `fs` nor `test_files` is referenced by the later cells
# visible in this notebook — confirm whether this cell is still needed.
fs = s3fs.S3FileSystem()
test_glob = f'{ml_utils.taxi_path}/data/ml/tip_test/*.parquet'
test_files = fs.glob(test_glob)

In [11]:
%%time

amt_test = ml_utils.read_parquet_dir(f'{ml_utils.taxi_path}/data/ml/tip_test')
preds = amt_test[['id', y_col]].copy()
preds.columns = ['id', 'actual']
preds['predicted'] = grid_search.predict(amt_test[features])

CPU times: user 1min 42s, sys: 1min 40s, total: 3min 23s
Wall time: 3min 46s


In [12]:
# NOTE(review): in the recorded output all five rows have the same predicted
# value (0.153381). This may mean the selected model is predicting a constant —
# worth checking the fitted coefficients.
preds.head(n=5)

Unnamed: 0,id,actual,predicted
0,8e8109754e3e4cb7879c4e9ee216d58d,0.097087,0.153381
1,a30e7c87866f417ab15dee5617f272a0,0.166667,0.153381
2,1a7a611d0809489d99a5120727e0476a,0.12,0.153381
3,736e84ca12a640cc858c210bd58f744c,0.089474,0.153381
4,f2c24299d9a34ce986b7a271c5cc80b2,0.0,0.153381


In [13]:
%%time
# Persist the id / actual / predicted frame through the project helper.
ml_utils.write_predictions(preds)

CPU times: user 10 s, sys: 2.46 s, total: 12.5 s
Wall time: 1min 3s


In [14]:
from sklearn.metrics import mean_squared_error

# RMSE computed as sqrt(MSE) rather than `mean_squared_error(..., squared=False)`:
# the `squared` argument is deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form gives the same value on every version. `np` is imported in
# the notebook's second cell.
rmse = np.sqrt(mean_squared_error(preds['actual'], preds['predicted']))

# Record the metric through the project helper; the returned frame is displayed
# as the cell output.
ml_utils.write_metric_df('rmse', rmse)

Unnamed: 0,ml_task,tool,model,metric,value
0,tip,scikit,elastic_net,rmse,0.052238
