This notebook finds hyperparameters for the different optimizers. It does the following:

* Finds high-performing hyperparameters for `PadaDamp` and `GeoDamp`, two methods for increasing the batch size.
    * It does this by performing a train/test split, then a train/validation split on the train set.
* It writes the parameters to stdout for GeoDamp and PadaDamp.

Then, in `Train.ipynb`, these parameters are used to create the GeoDamp LR and PadaDamp LR optimizers, which are evaluated on the same test set as above.

In [1]:
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split

In [2]:
from tune import Damper
from pprint import pprint
import pandas as pd

In [3]:
from distributed import Client
# Asynchronous variant, left disabled.
# NOTE(review): the port here (6786) differs from the one used below (8786) — confirm
# which is intended before re-enabling.
# client = await Client("localhost:6786", asynchronous=True)
# Connect to the Dask scheduler listening on localhost:8786.
client = Client("localhost:8786")
# Display the client summary as the cell output.
client

0,1
Client  Scheduler: tcp://localhost:8786  Dashboard: http://localhost:8787/status,Cluster  Workers: 20  Cores: 40  Memory: 135.13 GB


In [4]:
# Load the covertype data as a (features, labels) pair, shuffled with a fixed seed.
X, y = fetch_covtype(return_X_y=True, random_state=2, shuffle=True)

In [5]:
# NOTE(review): `data` is not referenced by any other cell visible in this notebook,
# and the dataset is already available as `X, y` — this second load looks redundant.
data = fetch_covtype()

In [6]:
# Hold out a test set: 25,000 rows go to training, all remaining rows to test.
# The fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, train_size=25_000
)


In [7]:
# Sanity check: size of the training split right after splitting.
X_train.shape

(25000, 54)

In [8]:
from sklearn.preprocessing import StandardScaler
import numpy as np

def normalize(X, scale, cnts=10):
    """Rescale the leading ``cnts`` columns of ``X``; pass the rest through.

    Before transforming, every column is checked: each of the first ``cnts``
    columns must contain more than two distinct values, and each remaining
    column must contain one or two distinct values.

    Parameters
    ----------
    X : 2-D array
        Input rows; indexed as ``X[:, column]``.
    scale : object with a ``transform`` method
        Applied to ``X[:, :cnts]``.
    cnts : int, default 10
        Number of leading columns handed to ``scale.transform``.

    Returns
    -------
    array
        The transformed leading columns stacked horizontally with the
        untouched trailing columns.

    Raises
    ------
    AssertionError
        If a column violates the distinct-value expectations above, or if the
        last transformed column ends up with two or fewer distinct values.
    """
    n_columns = X.shape[1]
    for col in range(n_columns):
        n_distinct = len(np.unique(X[:, col]))
        if col >= cnts:
            # Trailing columns: one or two distinct values only.
            assert n_distinct in (1, 2)
        else:
            # Leading columns: more than two distinct values.
            assert n_distinct > 2

    scaled = scale.transform(X[:, :cnts])
    # The last transformed column must still carry more than two distinct values.
    assert len(np.unique(scaled[:, -1])) > 2

    untouched = X[:, cnts:]
    return np.hstack((scaled, untouched))

# Number of leading columns passed through the scaler; `normalize` asserts that the
# remaining columns hold at most two distinct values.
cnts = 10
# Fit the scaler on the training rows only.
scale = StandardScaler().fit(X_train[:, :cnts])

# Apply the training-fitted scaling to both splits.
# NOTE: X_train / X_test are rebound in place, so re-running this cell re-fits the
# scaler on (and rescales) data that has already been scaled.
X_train = normalize(X_train, scale, cnts=cnts)
X_test = normalize(X_test, scale, cnts=cnts)

In [9]:
from tune import PadaDamp, GeoDamp, Damper

In [10]:
# Sanity check: normalization must not change the training split's shape.
X_train.shape

(25000, 54)

In [11]:
# Sanity check: size of the held-out test split.
X_test.shape

(556012, 54)

In [12]:
from scipy.stats import loguniform, uniform
from copy import copy

# Candidate batch sizes are powers of two: 2**5 through 2**11.
ints = np.arange(5, 12)
batch_pow = ints.copy()
ibs = np.power(2, batch_pow).astype(int)
# Maximum batch size draws from the same candidates (independent copy of the array).
# NOTE(review): initial and maximum batch size are sampled independently, so a draw
# can have max < initial — confirm the estimators handle that.
mbs = np.array(ibs, copy=True)

# Continuous hyperparameters: log-uniform for scale-like quantities,
# uniform on [0, 1] for momentum.
lrs = loguniform(0.5e-4, 1e-1)
weight_decays = loguniform(1e-8, 1e-2)
momentums = uniform(0, 1)

In [13]:
# Hyperparameters searched for both estimators.
base = dict(
    initial_batch_size=ibs,
    max_batch_size=mbs,
    weight_decay=weight_decays,
    lr=lrs,
    momentum=momentums,
)

# PadaDamp search space: its own two entries first, then the shared ones.
padadamp_params = dict(
    batch_growth_rate=loguniform(1e-4, 1e-1),
    dwell=[1, 2, 5, 10, 20, 50, 100, 200, 500, 1000],
)
padadamp_params.update(base)

# GeoDamp search space: its own two entries first, then the shared ones.
geodamp_params = dict(
    dampingfactor=loguniform(1, 20),
    dampingdelay=loguniform(1, 60),
)
geodamp_params.update(base)

In [15]:
import numpy as np

# `f` sets both the number of passes' worth of examples and `max_iter`.
f = 300
n_params = f
max_iter = n_params

# Total examples, expressed as `f` times the training-set size.
n_examples = len(X_train) * f

# Examples per chunk; np.round returns a float here.
chunk_size = np.round(n_examples / n_params)
chunk_size

25000.0

In [16]:
# max_iter = 9
# chunk_size = 10_000

In [17]:
import dask.array as da

# `chunk_size` comes out of np.round as a float; a chunk size is a row count, so
# convert it explicitly instead of relying on dask to coerce it.
rows_per_chunk = int(chunk_size)

# Chunk along rows only: -1 keeps all feature columns in a single chunk.
X_train2 = da.from_array(X_train, chunks=(rows_per_chunk, -1))
y_train2 = da.from_array(y_train, chunks=rows_per_chunk)

# Show the resulting chunk layout.
X_train2.chunks

((25000,), (54,))

In [18]:
# Persist both arrays so repeated fits reuse them instead of rebuilding them —
# presumably held in the memory of the cluster workers; confirm.
X_train2 = X_train2.persist()
y_train2 = y_train2.persist()

In [19]:
# One estimator per batch-size method, both constructed with the same seed.
padadamp = PadaDamp(seed=33)
geodamp = GeoDamp(seed=33)

In [20]:
from sklearn.base import clone

# Count the model's parameters on a clone: `initialize()` is called on the copy,
# not on `padadamp` itself.
p2 = clone(padadamp)
p2.initialize()

# Stored under its own name: `n_params` is already used earlier as the integer that
# sets `max_iter`, and rebinding it to a list here would silently change its meaning
# for any cell that runs afterwards.
param_counts = [v.nelement() for v in p2.model.module_.parameters()]
sum(param_counts)

34406

In [21]:
from dask_ml.model_selection import HyperbandSearchCV

# Settings common to both searches (same iteration budget and seed).
kwargs = {"max_iter": max_iter, "verbose": True, "random_state": 6}

# One Hyperband search per estimator; `prefix` tags each search's log lines.
psearch = HyperbandSearchCV(
    padadamp,
    padadamp_params,
    prefix="-pada",
    **kwargs,
)
gsearch = HyperbandSearchCV(
    geodamp,
    geodamp_params,
    prefix="-geo",
    **kwargs,
)

In [22]:
# Summarize the search's metadata before fitting, dropping the list-valued entries
# so only the scalar totals are displayed.
{k: v for k, v in psearch.metadata.items() if not isinstance(v, list)}

{'partial_fit_calls': 7928, 'n_models': 415}

In [23]:
# Sanity check: the dask array has the same shape as the in-memory training split.
X_train2.shape

(25000, 54)

In [24]:
import pandas as pd
from pathlib import Path
from pprint import pprint

# Create the output directory *before* fitting: the searches are expensive, and a
# missing directory would otherwise only surface as a failed `to_csv` after the fit.
# Named `out_dir` rather than `dir` so the builtin `dir()` is not shadowed.
out_dir = Path("tuning-hyperband-out")
out_dir.mkdir(parents=True, exist_ok=True)

for name, search in {"geo": gsearch, "pada": psearch}.items():
    search.fit(X_train2, y_train2)

    # Report the winning configuration for this search.
    print("\nbest score =", search.best_score_)
    pprint(search.best_params_)
    print("best index =", search.best_index_)
    print("best model_id =", search.cv_results_["model_id"][search.best_index_], "\n")

    # Save the per-model results, the search history, and the best estimator's
    # own training history, one CSV each.
    pd.DataFrame(search.cv_results_).to_csv(out_dir / f"{name}-cv-results.csv", index=False)
    pd.DataFrame(search.history_).to_csv(out_dir / f"{name}-hist.csv", index=False)
    pd.DataFrame(search.best_estimator_.history_).to_csv(out_dir / f"{name}-est-hist.csv", index=False)


[CV-geo, bracket=5] creating 243 models
[CV-geo, bracket=4] creating 98 models
[CV-geo, bracket=3] creating 41 models
[CV-geo, bracket=2] creating 18 models
[CV-geo, bracket=1] creating 9 models
[CV-geo, bracket=0] creating 6 models


KeyboardInterrupt: 