TODO:

* [ ] Create GD/SGD
* [ ] Score with loss

In [1]:
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import numpy as np

In [2]:
from tune import Damper
from pprint import pprint
import pandas as pd

In [3]:
from distributed import Client, LocalCluster
# Connect to the Dask scheduler at localhost:8786 in asynchronous mode, which
# is why client calls elsewhere in this notebook are awaited. The commented-out
# lines are alternative connection setups.
# client = await Client("localhost:6786", asynchronous=True)
# cluster = await LocalCluster(n_workers=3, threads_per_worker=2, asynchronous=True)
# client = await Client(cluster)
# client = await Client("localhost:8786", asynchronous=True)
client = await Client("localhost:8786", asynchronous=True)
# client = Client()
client

0,1
Client  Scheduler: tcp://localhost:8786  Dashboard: http://localhost:56517/status,Cluster  Workers: 1  Cores: 2  Memory: 17.18 GB


In [4]:
# Send the local helper modules to the cluster so that tasks running there can
# import them (`tune` is imported inside a submitted task in the next cell).
await client.upload_file("train.py")
await client.upload_file("tune.py")

{'tcp://127.0.0.1:56611': {'status': 'OK'}}

In [5]:
def test():
    from tune import Damper
    d = Damper()
    return True

# Run the smoke test through the cluster and fetch its return value.
f = client.submit(test)
res = await client.gather(f)
res

True

In [6]:
# Load the Covertype dataset as a (features, labels) pair.
# NOTE(review): random_state presumably has no effect while shuffle=False — confirm.
X, y = fetch_covtype(return_X_y=True, shuffle=False, random_state=0)

In [7]:
# Dimensions of the raw feature matrix.
X.shape

(581012, 54)

In [8]:
# Expand the features with degree-2 interaction terms: products of distinct
# features only (interaction_only=True) and no bias column.
kernel = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = kernel.fit_transform(X)

In [9]:
# Dimensions after the interaction-feature expansion.
X_poly.shape

(581012, 1485)

In [10]:
# Use 200,000 rows for training; the remaining rows form the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X_poly, y, random_state=42, train_size=200_000
)

In [11]:
# In-memory size of the training matrix, in GiB.
X_train.nbytes / 1024**3

2.212822437286377

In [12]:
# Training matrix dimensions before column filtering/normalization.
X_train.shape

(200000, 1485)

In [13]:
def _get_cnts_cols(X):
    cols = range(X.shape[1])
    uniqs = [np.unique(X[:, c]) for c in cols]
    cnts = [c for c, _u in zip(cols, uniqs) if len(_u) > 2]
    discrete_cols = [c for c, _u in zip(cols, uniqs) if len(_u) == 2]
    return cnts, discrete_cols

def normalize(X, scale, cnts, discrete):
    """Build the model input from the selected columns of ``X``.

    The ``discrete`` columns are mapped to 0/1 (any non-zero value becomes 1)
    and placed first; the ``cnts`` columns are passed through
    ``scale.transform`` and appended after them.
    """
    # One element is 30 (not 0/1), hence the bool round-trip.
    binary_part = X[:, discrete].astype(bool).astype(int)
    scaled_part = scale.transform(X[:, cnts])
    return np.hstack((binary_part, scaled_part))

# Classify columns on the training split only, and fit the scaler on the
# columns with more than two distinct values.
cnts, discrete = _get_cnts_cols(X_train)
scale = StandardScaler().fit(X_train[:, cnts])

# Expected shape after normalization. Derived from the index lists instead of
# fancy-indexing X_train, which would allocate a temporary copy of every
# selected column just to read its shape.
print((X_train.shape[0], len(cnts) + len(discrete)))
# NOTE: X_train / X_test are rebound here, so this cell is not idempotent;
# re-running it would transform the already-normalized data again.
X_train = normalize(X_train, scale, cnts, discrete)
X_test = normalize(X_test, scale, cnts, discrete)
print(X_train.shape)

# normalize() puts the two-valued columns first; they must hold only 0 and 1.
uniqs = np.unique(X_train[:, :len(discrete)])
assert len(uniqs) == 2 and 0 <= uniqs.min() <= uniqs.max() <= 1

(200000, 613)
(200000, 613)


In [14]:
# Distinct values found in the leading (two-valued) columns after normalization.
uniqs

array([0., 1.])

In [15]:
# Training matrix dimensions after normalization.
X_train.shape

(200000, 613)

In [16]:
# Test matrix dimensions after normalization.
X_test.shape

(381012, 613)

In [17]:
from scipy.stats import loguniform, uniform
from copy import copy

# Batch sizes are powers of two; exponents 5..9 give 32..512.
batch_pow = np.arange(5, 9 + 1).astype(int)
static_batch = [2 ** int(p) for p in batch_pow]

# Initial batch sizes mirror the static ones; maximum sizes are one power higher.
ibs = [2 ** int(p) for p in batch_pow]
mbs = [2 ** (int(p) + 1) for p in batch_pow]

# Sampling distributions for the continuous hyperparameters.
lrs = loguniform(0.5e-4, 1e-1)
weight_decays = loguniform(1e-8, 1e-2)
momentums = uniform(loc=0, scale=1)

In [18]:
# Search-space entries shared by every optimizer.
base = {
    "max_batch_size": static_batch,  # 5 choices
    "weight_decay": weight_decays,  # continuous
    "momentum": momentums,  # continuous
    "lr": [0.9e-3],
    "scoring": ["loss"],
}

dwell = [1, 2, 5, 10, 20, 50, 100, 200]

# Each space below is a fresh dict seeded from `base`; the keyword entries
# add new keys or override the shared ones.
padadamp_params = dict(
    base,
    batch_growth_rate=loguniform(1e-3, 1e-1),
    dwell=dwell,
    initial_batch_size=ibs,
    max_batch_size=mbs,
)

hsgd_params = dict(
    base,
    initial_batch_size=[256],
    max_batch_size=[2048],
    lr=[0.9e-3],
    batch_growth_rate=loguniform(1e-3, 1e-1),
)

geo_params = dict(
    base,
    dampingdelay=[50, 100, 200, 500, 2000],
    dampingfactor=loguniform(1, 10),
    max_batch_size=[1024, 2048, 2048 * 2, 2048 * 4],
    initial_batch_size=[64, 128, 256],
)

In [19]:
# Full-batch gradient descent: the only batch-size choice is the whole training set.
gd_params = dict(base, max_batch_size=[len(X_train)])

In [20]:
# Inspect the resulting gradient-descent search space.
gd_params

{'max_batch_size': [200000],
 'weight_decay': <scipy.stats._distn_infrastructure.rv_frozen at 0x12101eed0>,
 'momentum': <scipy.stats._distn_infrastructure.rv_frozen at 0x120e93150>,
 'lr': [0.0009],
 'scoring': ['loss']}

In [22]:
import numpy as np
# Training budget: total number of examples a model may see, and the number
# of partial_fit calls that budget is spread over.
n_examples = 75 * len(X_train)
n_params = 75

max_iter = n_params
# chunk_size is later used as a Dask chunk length (a row count), so keep it an
# integer rather than the float that np.round returns.
chunk_size = int(np.round(n_examples / n_params))
# Fraction of the training set covered by one chunk.
chunk_size / len(X_train)

1.0

In [23]:
import dask.array as da
# Wrap the training data in Dask arrays, chunked along rows in blocks of
# `chunk_size`; the -1 leaves the feature axis as a single chunk.
X_train2 = da.from_array(X_train, chunks=(chunk_size, -1))
y_train2 = da.from_array(y_train, chunks=chunk_size)

# Resulting chunk layout (rows, columns).
X_train2.chunks

((200000,), (613,))

In [24]:
# Persist the Dask arrays so the searches reuse the same chunks
# (presumably held in worker memory — confirm).
X_train2 = X_train2.persist()
y_train2 = y_train2.persist()

In [25]:
# Drop the references to the test split so its memory can be reclaimed.
del X_test, y_test

In [26]:
from tune import PadaDamp, Damper, GD, HSGD, GeoDamp

# One estimator per batch-size strategy, all constructed with the same seed.
seed = 33

padadamp = PadaDamp(seed=seed)
hsgd = HSGD(seed=seed, name="hsgd")
gd = GD(seed=seed, name="gd")
sgd = Damper(seed=seed, name="sgd")
# Same Damper class as `sgd`, but with opt="asgd".
asgd = Damper(seed=seed, name="asgd", opt="asgd")
geo = GeoDamp(seed=seed, name="geo")

In [27]:
from sklearn.base import clone
# Count the model's parameters on a freshly initialized clone, leaving
# `padadamp` itself untouched.
p2 = clone(padadamp)
p2.initialize()
# NOTE(review): this rebinds `n_params`, which an earlier cell set to the int
# 75, to a list of per-tensor element counts. `max_iter` was already derived
# from the int, but re-running cells out of order would pick up the list.
n_params = [v.nelement() for v in p2.model.module_.parameters()]
sum(n_params)

4912

In [28]:
from dask_ml.model_selection import HyperbandSearchCV

# Settings shared by every Hyperband search.
kwargs = {
    "max_iter": max_iter,
    "verbose": True,
    "random_state": 6,
    "patience": 2,
    "tol": None,
}

# (estimator, search space) pairs to run; commented entries are disabled.
models = {
#     "pada": (padadamp, padadamp_params),
#     "gd": (gd, gd_params),
#     "asgd": (sgd, base),
#     "hsgd": (hsgd, hsgd_params),
    "geo": (geo, geo_params),
}

# One search per enabled model; the prefix tags its log lines and output files.
searches = {
    label: HyperbandSearchCV(estimator, space, prefix=f"-{label}", **kwargs)
    for label, (estimator, space) in models.items()
}

In [29]:
# Show the scalar entries of one search's metadata before fitting;
# list-valued entries are left out.
# _search = searches["pada"]
_search = searches["geo"]
{k: v for k, v in _search.metadata.items() if not isinstance(v, list)}

{'partial_fit_calls': 920, 'n_models': 49}

In [30]:
# Dimensions of the chunked training array handed to the searches.
X_train2.shape

(200000, 613)

In [31]:
import asyncio

# Build one fit job per search. The "gd" search is given the in-memory arrays;
# every other search gets the chunked Dask arrays.
jobs = [
    search.fit(X_train, y_train) if name == "gd" else search.fit(X_train2, y_train2)
    for name, search in searches.items()
]

In [32]:
# Schedule all fit jobs concurrently. This returns a future immediately, so
# the searches keep running in the background while other cells execute.
running_searches = asyncio.gather(*jobs)

[CV-geo, bracket=3] creating 27 models
[CV-geo, bracket=2] creating 12 models
[CV-geo, bracket=1] creating 6 models
[CV-geo, bracket=0] creating 4 models


In [34]:
# Check whether the background searches are still pending.
running_searches

<_GatheringFuture pending>

[CV-geo, bracket=3] validation score of -0.6616 received after 14 partial_fit calls
[CV-geo, bracket=0] validation score of -0.6228 received after 17 partial_fit calls
[CV-geo, bracket=1] validation score of -0.6592 received after 15 partial_fit calls
[CV-geo, bracket=2] validation score of -0.6573 received after 14 partial_fit calls
[CV-geo, bracket=2] validation score of -0.6442 received after 16 partial_fit calls
[CV-geo, bracket=1] validation score of -0.6543 received after 17 partial_fit calls
[CV-geo, bracket=3] validation score of -0.6493 received after 16 partial_fit calls
[CV-geo, bracket=0] validation score of -0.6376 received after 19 partial_fit calls
[CV-geo, bracket=3] validation score of -0.6236 received after 18 partial_fit calls
[CV-geo, bracket=0] validation score of -0.6300 received after 21 partial_fit calls
[CV-geo, bracket=1] validation score of -0.6482 received after 19 partial_fit calls
[CV-geo, bracket=2] validation score of -0.6297 received after 18 partial_fi

In [35]:
finished_searches = running_searches.result()

In [37]:
import json
import os

ident = "geo"
# to_csv() and open() do not create missing directories, so make sure the
# output directory exists before writing into it.
os.makedirs("data", exist_ok=True)
for finished_search in finished_searches:
    # The prefix was set to "-<name>" when the search was built; drop the dash.
    name = "data/" + finished_search.prefix[1:]
    pd.DataFrame(finished_search.cv_results_).to_csv(f"{name}-cv-results.csv", index=False)
    pd.DataFrame(finished_search.history_).to_csv(f"{name}-hist.csv", index=False)
    pd.DataFrame(finished_search.best_estimator_.history_).to_csv(f"{name}-best-est-hist.csv", index=False)
    with open(f"{name}-best-params.json", "w") as f:
        json.dump(finished_search.best_params_, f)
