# Hyperparameter tuning

## Plain scikit-learn

# Load data and feature engineering

We train on a sample of the training data because this notebook uses single-node Python (plain scikit-learn).

In [1]:
# Identifiers for this run: they are interpolated into the predictions output
# path and passed to write_metric_df when the metrics are recorded.
ML_TASK, TOOL, MODEL = 'tip', 'scikit', 'elastic_net'

In [2]:
import numpy as np
import datetime
import pandas as pd
from ml_utils import get_taxi_path, read_parquet_dir

# Base location that every input and output path in this notebook is built from.
# NOTE(review): presumably an S3 URI, since the test files are later listed via s3fs — confirm in ml_utils.
taxi_path = get_taxi_path()

In [3]:
# Load the pre-built training sample (a directory of parquet files) into one DataFrame.
train_sample_dir = f'{taxi_path}/data/ml/tip_train_sample'
tip_train = read_parquet_dir(train_sample_dir)

# (rows, columns) of the loaded sample, as a quick sanity check.
tip_train.shape

(659674, 10)

In [4]:
# Preview the first rows to check the available columns before selecting features.
tip_train.head()

Unnamed: 0,id,pickup_taxizone_id,dropoff_taxizone_id,pickup_weekday,pickup_weekofyear,pickup_hour,pickup_minute,pickup_week_hour,passenger_count,tip_fraction
0,28a18fa5fa2f44f29ffd98fc9159829d,238.0,132.0,0,29,7,19,7,1.0,0.199616
1,a6578145ff824f5fb94e90457b040883,236.0,246.0,6,28,9,16,153,1.0,0.130435
2,91726ecac3b44e8bbfea68d725f35556,90.0,148.0,6,28,22,44,166,6.0,0.166667
3,a3b0d14ad1644dd6b90f1af6be002e55,141.0,186.0,6,28,9,34,153,1.0,0.152299
4,70aa5a0c6bc147dd8553e201b63ba0fe,100.0,142.0,6,28,22,7,166,1.0,0.169231


# Run grid search

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV

# Feature columns, grouped by how the ColumnTransformer below preprocesses them.
numeric_feat = [
    'pickup_weekday',
    'pickup_weekofyear',
    'pickup_hour',
    'pickup_week_hour',
    'pickup_minute',
    'passenger_count',
]
categorical_feat = ['pickup_taxizone_id', 'dropoff_taxizone_id']
features = numeric_feat + categorical_feat
y_col = 'tip_fraction'

# Standardize the numeric columns, one-hot encode the taxi-zone ids, then fit
# an elastic-net regressor on the combined feature matrix.
pipeline = Pipeline(steps=[
    ('preprocess', ColumnTransformer(transformers=[
        ('num', StandardScaler(), numeric_feat),
        # handle_unknown='ignore': zone ids not seen during fit are encoded as
        # all zeros at predict time instead of raising.
        # NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2
        # and removed in 1.4 — switch the keyword when the environment is upgraded.
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse=False), categorical_feat),
    ])),
    # `normalize` is deliberately not passed: False was its default, and the
    # argument was deprecated in scikit-learn 1.0 and removed in 1.2.
    # NOTE(review): max_iter=100 is below the library default of 1000; raise it
    # if the fit emits convergence warnings.
    ('clf', ElasticNet(max_iter=100)),
])

# Search grid: 11 evenly spaced l1_ratio values from 0.0 to 1.0 inclusive,
# crossed with 3 alpha values (33 candidates). linspace states the endpoint and
# point count explicitly instead of relying on a float-step arange overshoot.
params = {
    'clf__l1_ratio': np.linspace(0, 1, 11),
    'clf__alpha': [0.5, 1, 2],
}

# 3-fold cross-validated grid search on all available cores, scored by negated
# MSE so that higher (closer to zero) is better.
grid_search = GridSearchCV(pipeline, params, cv=3, n_jobs=-1, verbose=1, scoring='neg_mean_squared_error')

In [6]:
%%time
_ = grid_search.fit(tip_train[features], tip_train[y_col])
grid_search.best_score_

Fitting 3 folds for each of 33 candidates, totalling 99 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  99 out of  99 | elapsed:  5.6min finished


CPU times: user 1min 55s, sys: 3.33 s, total: 1min 58s
Wall time: 6min 9s


  positive)


-0.002940300894976634

In [7]:
# Hyperparameter combination that achieved the best cross-validated score.
grid_search.best_params_

{'clf__alpha': 0.5, 'clf__l1_ratio': 0.0}

## Predict on test set

Predict on the test set, calculate metrics, and save both the predictions and the metrics to S3. Prediction is done in batches (one test file at a time) because predicting on the whole test set at once takes too much memory.

In [8]:
import s3fs
# List the parquet files that make up the test set so they can be scored one
# at a time. The prediction loop re-adds the `s3://` prefix to each entry, so
# the listed paths are presumably returned without the scheme.
test_file_pattern = f'{taxi_path}/data/ml/tip_test/*.parquet'
fs = s3fs.S3FileSystem()
test_files = fs.glob(test_file_pattern)

In [9]:
%%time

out = []
for f in test_files:
    df = pd.read_parquet(f's3://{f}')
    pred = df[['id', y_col]].copy()
    pred.columns = ['id', 'actual']
    pred['predicted'] = grid_search.predict(df[features])
    out.append(pred)

CPU times: user 1min 58s, sys: 1min 41s, total: 3min 40s
Wall time: 3min 45s


In [10]:
%%time

preds = pd.concat(out)
preds.to_parquet(f'{taxi_path}/ml_results/predictions/{ML_TASK}__{TOOL}__{MODEL}/0.parquet', index=False)

CPU times: user 12.5 s, sys: 3.04 s, total: 15.5 s
Wall time: 1min 23s


In [11]:
from ml_utils import write_metric_df
from sklearn.metrics import mean_squared_error

# RMSE over all test predictions. The square root is taken explicitly instead
# of passing `squared=False`, which scikit-learn deprecated in 1.4 and removed
# in 1.6; the result is the same value.
rmse = np.sqrt(mean_squared_error(preds.actual, preds.predicted))

# Record the metric for this task/tool/model via the project helper; the frame
# it returns is displayed as the cell output.
# NOTE(review): presumably this also persists the metric under taxi_path — confirm in ml_utils.
write_metric_df(taxi_path, ML_TASK, TOOL, MODEL, rmse)

Unnamed: 0,ml_task,tool,model,rmse
0,tip,scikit,elastic_net,0.05221
