<img style="float: right" src="img/saturn.png" width="300" />

# Scaling Machine Learning in Python

## Hyperparameters

- joblib
- `GridSearchCV`

In [1]:
import s3fs
import pandas as pd
import numpy as np
import warnings
# Suppress every warning category for the rest of the session. Note that this
# also hides deprecation notices raised by the libraries used below.
warnings.simplefilter("ignore")

# Anonymous S3 filesystem handle: no AWS credentials are supplied.
s3 = s3fs.S3FileSystem(anon=True)

In [2]:
# Load the January 2019 yellow-taxi trip CSV from S3 into a pandas DataFrame.
# The remote file is opened in a `with` block so the handle is closed as soon
# as parsing finishes instead of staying open for the life of the kernel.
with s3.open(
    's3://nyc-tlc/trip data/yellow_tripdata_2019-01.csv',
    mode='rb',
) as taxi_file:
    taxi = pd.read_csv(
        taxi_file,
        # Parse both timestamp columns up front; prep_df uses the pickup
        # column through the `.dt` accessor.
        parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'],
    )

In [3]:
# Columns that prep_df keeps from the raw trip records.
raw_features = ['tpep_pickup_datetime', 'passenger_count', 'tip_amount', 'fare_amount']

# Model inputs: the pickup-time features derived in prep_df, followed by the
# passenger count taken straight from the raw data.
features = [
    'pickup_weekday', 'pickup_weekofyear', 'pickup_hour',
    'pickup_week_hour', 'pickup_minute',
    'passenger_count',
]

# Regression target: tip_amount / fare_amount, computed in prep_df.
label = 'tip_fraction'

In [4]:
def prep_df(taxi_df):
    '''
    Generate model features from a raw taxi dataframe.

    Rows with a non-positive ``fare_amount`` are dropped, the tip is expressed
    as a fraction of the fare, and calendar/time features are derived from
    ``tpep_pickup_datetime``. The input frame is not modified.

    Parameters
    ----------
    taxi_df : pandas.DataFrame
        Must contain the columns listed in the module-level ``raw_features``;
        ``tpep_pickup_datetime`` must be a datetime column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the ``features`` columns followed by ``label``,
        all cast to float, with missing values replaced by -1.
    '''
    # Keep only positive fares (avoids divide-by-zero in the tip fraction) and
    # select rows and columns in one .loc call instead of chained indexing.
    df = taxi_df.loc[taxi_df.fare_amount > 0, raw_features].copy()
    df[label] = df.tip_amount / df.fare_amount

    pickup = df.tpep_pickup_datetime.dt
    df['pickup_weekday'] = pickup.weekday
    # Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week is its documented replacement (ISO week number).
    df['pickup_weekofyear'] = pickup.isocalendar().week
    df['pickup_hour'] = pickup.hour
    # Hour within the week: 0 for Monday 00:xx up to 167 for Sunday 23:xx.
    df['pickup_week_hour'] = (df.pickup_weekday * 24) + df.pickup_hour
    df['pickup_minute'] = pickup.minute

    return df[features + [label]].astype(float).fillna(-1)

In [5]:
# Build the modelling frame (feature columns plus the tip_fraction label) from the raw trips.
taxi_feat = prep_df(taxi)

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Standardise the features, then fit an elastic-net regressor.
# `normalize` is intentionally not passed: False was its default, and the
# argument was deprecated in scikit-learn 1.0 and removed in 1.2, where
# passing it raises a TypeError.
pipeline = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('clf', ElasticNet(max_iter=100, l1_ratio=0)),
])

# Search grid, addressed to the 'clf' pipeline step: l1_ratio from 0 to 1 in
# steps of 0.1, crossed with four regularisation strengths.
# NOTE(review): the scikit-learn docs advise against alpha=0 with ElasticNet
# (it is plain least squares) -- confirm it is wanted in the grid.
params = {
    'clf__l1_ratio': np.arange(0, 1.1, 0.1),
    'clf__alpha': [0, 0.5, 1, 2],
}

# 3-fold cross-validated search scored by negated MSE (higher is better);
# n_jobs=-1 asks joblib to use every available core.
grid_search = GridSearchCV(
    pipeline,
    params,
    cv=3,
    n_jobs=-1,
    verbose=1,
    scoring='neg_mean_squared_error',
)

```python
_ = grid_search.fit(taxi_feat[features], taxi_feat[label])
```

In [6]:
from dask_saturn import SaturnCluster
from dask.distributed import Client

# Create the Saturn cluster handle, connect a Dask distributed client to it,
# and block until three workers are available.
cluster = SaturnCluster()
client = Client(cluster)
client.wait_for_workers(3)

[2020-11-09 00:41:30] INFO - dask-saturn | Cluster is ready


In [None]:
%%time
import joblib

# Run the grid search with joblib's 'dask' backend selected, so the work is
# dispatched through Dask rather than joblib's default backend.
# Fit on the prepared pandas frame: the notebook defines no X_train / y_train,
# and these are the same feature/label columns used for the dask-ml fit.
with joblib.parallel_backend('dask'):
    _ = grid_search.fit(taxi_feat[features], taxi_feat[label])

In [7]:
import dask.dataframe as dd
import numpy as np

In [9]:
# Convert the pandas feature frame into a Dask DataFrame split into 20 partitions.
taxi_feat_dd = dd.from_pandas(taxi_feat, npartitions=20)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet

from dask_ml.preprocessing import StandardScaler
from dask_ml.metrics import mean_squared_error
from dask_ml.model_selection import GridSearchCV

# Same pipeline shape as the scikit-learn version, but StandardScaler and
# GridSearchCV in this cell are the dask_ml implementations.
# `normalize` is intentionally not passed to ElasticNet: False was its
# default, and the argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, where passing it raises a TypeError.
pipeline = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('clf', ElasticNet(max_iter=100, l1_ratio=0)),
])

# Search grid, addressed to the 'clf' pipeline step: l1_ratio from 0 to 1 in
# steps of 0.1, crossed with four regularisation strengths.
# NOTE(review): the scikit-learn docs advise against alpha=0 with ElasticNet
# (it is plain least squares) -- confirm it is wanted in the grid.
params = {
    'clf__l1_ratio': np.arange(0, 1.1, 0.1),
    'clf__alpha': [0, 0.5, 1, 2],
}

# 3-fold cross-validated search scored by negated MSE (higher is better).
grid_search = GridSearchCV(
    pipeline,
    params,
    cv=3,
    scoring='neg_mean_squared_error',
)

In [11]:
%%time
# Fit the grid search on the Dask DataFrame: feature columns as X, the label
# column as y. The result is bound to `_` to suppress the cell's output repr.
_ = grid_search.fit(
    taxi_feat_dd[features], 
    taxi_feat_dd[label],
)

CPU times: user 1.06 s, sys: 618 ms, total: 1.68 s
Wall time: 2min 24s
