This notebook runs simulations with the following parameters:

* use [2, 4, 8, 16, 20, 24] workers
* replay history from each of the 6 runs.
* Use `tol in [True, max_iter // 6]`

For the model,

* have one `partial_fit` call take 1 second (20781 examples, about 1/3 of a dataset)
    * sec / dataset = 3
* have one score call take 1.5 seconds (66500 examples, about 1 dataset)
    * sec / dataset = 1.5

It will write its output files into `sim/`, each prefixed with `2019-06-28--workers=...` (see `process` below).

Another notebook will pull this data in and visualize.

Plan: rerun this notebook once for each unique worker count.

In [1]:
import dask_ml
dask_ml.__file__

'/mnt/ws/home/ssievert/anaconda3/lib/python3.7/site-packages/dask_ml/__init__.py'

In [2]:
from distributed import Client, LocalCluster

In [3]:
# Alternative ways of connecting to a cluster, kept for reference:
# cluster = LocalCluster(n_workers=2)
# client = Client(cluster)
# client = Client("localhost:8786")
# Start a local cluster with single-threaded workers. Edit `n_workers` and
# rerun the notebook for each worker count that should be simulated.
client = Client(n_workers=4, threads_per_worker=1)
client

0,1
Client  Scheduler: tcp://127.0.0.1:19663  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 270.25 GB


In [4]:
from sklearn.base import BaseEstimator
import numpy as np
import toolz

def _same_params(p1, p2):
    p1keys = set(p1.keys())
    p2keys = set(p2.keys())
    assert p1keys == p2keys
    for k in p1.keys():
        if isinstance(p1[k], float) and not np.allclose(p1[k], p2[k]):
            return False
        elif p1[k] != p2[k]:
            return False
    return True
    
def _get_model_history(history, params):
    """
    Return the history records of the one model whose params equal `params`.

    Records are grouped by their "model_id"; the "params" entry of the first
    record in each group is compared against `params` with `_same_params`.
    Exactly one group must match (enforced with an assert).
    """
    by_model = toolz.groupby("model_id", history)
    matches = {}
    for model_id, records in by_model.items():
        matches[model_id] = _same_params(records[0]["params"], params)
    assert sum(matches.values()) == 1
    chosen = [model_id for model_id, is_match in matches.items() if is_match][0]
    return by_model[chosen]
    
# Hyper-parameter search space handed to the search in `process`.
# Key names line up with the keyword arguments of `ReplayModel.__init__`.
params = dict(
    module_init=[
        "xavier_uniform_",
        "xavier_normal_",
        "kaiming_uniform_",
        "kaiming_normal_",
    ],
    module_activation=["ReLU", "LeakyReLU", "ELU", "PReLU"],
    # Repeated entries: presumably to weight the draw towards SGD (5 of 7) -- confirm.
    optimizer=["SGD"] * 5 + ["Adam"] * 2,
    batch_size=[32, 64, 128, 256, 512],
    optimizer_lr=np.logspace(1, -1.5, num=1000),
    # 200 exact zeros followed by 1000 log-spaced values between 1e-5 and 1e-3.
    optimizer_weight_decay=[0] * 200 + np.logspace(-5, -3, num=1000).tolist(),
    optimizer_nesterov=[True],
    optimizer_momentum=np.linspace(0, 1, num=1000),
    train_split=[None],
)
   

In [5]:
from distributed.utils import sleep
from copy import deepcopy
from sklearn.utils import check_random_state
 
class ReplayModel(BaseEstimator):
    """
    Stand-in estimator that replays scores recorded in an earlier run.

    No training happens: `partial_fit` only counts its calls and sleeps, and
    `score` sleeps and then returns the recorded score of the model whose
    parameters match this instance, at the current number of `partial_fit`
    calls.

    Parameters
    ----------
    history : list of dict
        Recorded history; each record is looked up by "model_id",
        "partial_fit_calls" and "score", and the first record of each model
        carries that model's "params".
    module_init, module_activation, optimizer, optimizer_lr, batch_size,
    optimizer_weight_decay, optimizer_nesterov, optimizer_momentum, train_split
        Hyper-parameters; used only to identify which recorded model to replay.
    """

    def __init__(
        self,
        history,
        module_init=None,
        module_activation=None,
        optimizer=None,
        optimizer_lr=None,
        batch_size=None,
        optimizer_weight_decay=None,
        optimizer_nesterov=None,
        optimizer_momentum=None,
        train_split=None,
    ):
        # Hyper-parameters, stored verbatim.
        self.module_init = module_init
        self.module_activation = module_activation
        self.optimizer = optimizer
        self.optimizer_lr = optimizer_lr
        self.optimizer_weight_decay = optimizer_weight_decay
        self.optimizer_nesterov = optimizer_nesterov
        self.optimizer_momentum = optimizer_momentum
        self.batch_size = batch_size
        self.train_split = train_split

        # Replay state: the recorded history and how often partial_fit ran.
        self.history = history
        self._pf_calls = 0

    def _get_formatted_keys(self):
        """
        Return this model's params (without "history") under double-underscore
        names, e.g. "module_init" -> "module__init" and
        "optimizer_lr" -> "optimizer__lr", so they can be compared against the
        params stored in the history records.
        """
        raw = self.get_params()
        del raw["history"]
        formatted = {}
        for name, value in raw.items():
            if "module" in name:
                # Everything after the first "_" becomes the suffix.
                name = "module__" + name.partition("_")[2]
            if "optimizer" in name and name != "optimizer":
                name = "optimizer__" + name.partition("_")[2]
            formatted[name] = value
        return formatted

    def partial_fit(self, X, y):
        """Simulate one training step: count the call and sleep for 1 second."""
        self._pf_calls = self._pf_calls + 1
        sleep(1)
        return self

    def fit(self, X, y):
        """No-op; returns self."""
        return self

    def score(self, X, y):
        """
        Sleep for 1.5 seconds and return the recorded score for this model at
        the current `partial_fit` call count. `X` and `y` are ignored.
        """
        records = _get_model_history(self.history, self._get_formatted_keys())
        matching = []
        for record in records:
            if record["partial_fit_calls"] == self._pf_calls:
                matching.append(record)
        sleep(1.5)
        return matching[0]["score"]

In [6]:
# Number of workers the scheduler currently reports; `process` embeds this
# value in the names of the files it writes.
num_workers = len(client.scheduler_info()["workers"])
num_workers

4

In [7]:
import json
from sys import getsizeof

def _get_history(today, random_state):
    with open(f"{today}/-hyperband-{random_state}-history.json", "r") as f:
        history = json.load(f)

    from copy import deepcopy

    params_recorded = set()
    out = []
    for h in history:
        out.append({
                k: h[k] for k in ["bracket", "model_id", "partial_fit_calls", "score"]
        })
        if h["model_id"] not in params_recorded:
            out[-1]["params"] = h["params"]
            params_recorded.update({h["model_id"]})
    return deepcopy(out)

In [8]:
from dask_ml.model_selection import HyperbandSearchCV
import scipy.stats
from sklearn.datasets import make_classification
import pandas as pd
from distributed import get_task_stream
from time import time
import warnings

def process(random_state, patience):
    """
    Replay one recorded run on the current cluster and save the results.

    Loads the recorded history for `random_state`, wraps it in a
    `ReplayModel`, runs `HyperbandSearchCV` over the module-level `params`
    while capturing the task stream, and writes two files whose names start
    with `sim/2019-06-28--workers={num_workers}-rs={random_state}-patience={patience}`:
    `-times.msgpack` (task stream) and `-history.json` (search history).

    Parameters
    ----------
    random_state : int
        Seed of the recorded run to replay; also passed to the search.
    patience : bool
        Selects the tolerance only: `tol=0.001` when truthy, `tol=nan`
        otherwise. The search itself always receives `patience=True`.

    Notes
    -----
    Reads the module-level names `num_workers` and `params`.
    """
    import os

    out_prefix = "sim/2019-06-28-"
    pre = out_prefix + f"-workers={num_workers}-rs={random_state}-patience={patience}"
    # Create the output directory up front: the search below runs for a long
    # time, and a missing directory would otherwise only fail when the results
    # are written at the very end.
    os.makedirs(os.path.dirname(pre), exist_ok=True)

    history = _get_history(random_state=random_state, today="2019-06-28")
    # Last recorded score, printed as a quick fingerprint of the loaded history.
    _hash = history[-1]["score"]
    print(f"Starting rs={random_state} patience={patience} ({_hash:0.4f})... ")
    model = ReplayModel(history)

    if patience:
        tol = 0.001
    else:
        tol = np.nan
    print(patience, tol)

    # NOTE(review): `patience=True` is passed unconditionally and only `tol`
    # varies. Presumably `tol=nan` disables early stopping while keeping the
    # same scoring schedule -- confirm against the dask-ml implementation.
    search = HyperbandSearchCV(
        model,
        params,
        max_iter=243,
        random_state=random_state,
        patience=True,
        tol=tol,
    )
    # Placeholder data: ReplayModel never looks at X or y.
    X, y = make_classification()

    start = time()
    with get_task_stream() as ts:
        # Gives lots of warnings about garbage collection when starting jobs; none while jobs running
        search.fit(X, y)
    print(f"...done in {time() - start:0.2f}s")

    # NOTE(review): `DataFrame.to_msgpack` was removed in pandas 1.0, so this
    # cell needs an older pandas; the format is kept for the reader notebook.
    pd.DataFrame(ts.data).to_msgpack(pre + "-times.msgpack")
    with open(pre + "-history.json", 'w') as f:
        json.dump(search.history_, f)

In [None]:
# Replay every recorded run (seeds 400 through 405), first with the
# `patience` flag on and then with it off.
for random_state in range(400, 406):
    for patience in (True, False):
        process(random_state, patience)

Starting rs=400 patience=True (-0.0947)... 
True 0.001




...done in 1407.27s
Starting rs=400 patience=False (-0.0947)... 
False nan




...done in 1568.53s
Starting rs=401 patience=True (-0.0809)... 
True 0.001




...done in 1483.06s
Starting rs=401 patience=False (-0.0809)... 
False nan




...done in 1563.87s
Starting rs=402 patience=True (-0.0729)... 
True 0.001




...done in 1396.98s
Starting rs=402 patience=False (-0.0729)... 
False nan




...done in 1557.79s
Starting rs=403 patience=True (-0.0645)... 
True 0.001




### 32 workers
* 400
    * p=T: 387.79
    * p=F: 390.59
* 401:
    * p=T: 386.41
    * p=F: 399.03
* 402:
    * p=T: 386.23
    * p=F: 394.98
* 403:
    * p=T: 387.60
    * p=F: 403.52
* 404:
    * p=T: 394.93s
    * p=F: 395.26
* 405:
    * p=T: 402.44s
    * p=F: 398.64

### 25 workers
4 workers.

10 workers are serial towards the end.

* 400
    * p=T: 400.85s
    * p=F: 423.90

### 8 workers
* 400
    * p=T: 740.51
    * p=F: 880.76
* 401
    * p=T: 806.99
    * p=F: 881.97
* 402:
    * p=T: 758.08s
    * p=F: 879.38s
* 403
    * p=T: 733.64s
    * p=F: 830.70s
* 404
    * p=T: 782.63s
    * p=F: 832.84s
* 405
    * p=T: 861.43s
    * p=F: 862.22s

### 16 workers
* 400:
    * p=T: 468.32, 484.23, 485.07
    * p=F: 513.42, 513.42
* 401:
    * p=T: 487.75, 505.91
    * p=F: 510.94
* 402
    * p=T: 466.80
    * p=F: 531.35
* 403
    * p=T: 459.30
    * p=F: 530.60
* 404
    * p=T: 509.88
    * p=F: 524.06
* 405:
    * p=T: 531.11
    * p=T: 526.37

Model creation lasts less than a second (with `_create_model`).