This notebook trains one model per optimizer. Every model has the same initialization and differs only in the optimizer used (GeoDamp, GeoDamp LR, PadaDamp, PadaDamp LR, or Adagrad).

* Take in hyperparameters from `Tune.ipynb`.
* Initialize models with those hyperparameters;
    * (note: the same initialization is used for all optimizers).
* Train those models, record the test statistics, and save the models.

In [None]:
from copy import copy
from pprint import pprint

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split

In [None]:
from tune import PadaDamp, GeoDamp, GeoDampLR, Damper

## Data preprocessing

Standardize the continuous features to zero mean and unit variance. Leave the binary features unchanged.

In [None]:

def normalize(X, scale, cnts=10):
    """Standardize the leading ``cnts`` columns of ``X`` and keep the rest as-is.

    Parameters
    ----------
    X : 2-D array
        The first ``cnts`` columns must each hold more than two distinct
        values; every later column must hold one or two distinct values.
    scale : object with a ``transform`` method
        Applied to ``X[:, :cnts]``.
    cnts : int
        Number of leading columns passed through ``scale``.

    Returns
    -------
    2-D array
        The transformed leading columns stacked horizontally with the
        untouched trailing columns.

    Raises
    ------
    AssertionError
        If a column violates the distinct-value expectations above, or if
        the last transformed column has two or fewer distinct values.
    """
    for col in range(X.shape[1]):
        n_distinct = len(np.unique(X[:, col]))
        if col < cnts:
            assert n_distinct > 2
        else:
            assert 1 <= n_distinct <= 2

    scaled = scale.transform(X[:, :cnts])
    # Sanity check: scaling must not have collapsed the last scaled column.
    assert len(np.unique(scaled[:, -1])) > 2

    return np.hstack((scaled, X[:, cnts:]))

In [None]:
def get_train_test_data(damper):
    """Fetch the covertype dataset and return a standardized train/test split.

    ``damper`` is only interpolated into the ``data_home`` path handed to
    ``fetch_covtype``, so each distinct value gets its own dataset directory.

    The first 10 columns are scaled by a ``StandardScaler`` fit on the
    training rows only; ``normalize`` passes the remaining columns through
    unchanged. The training set has 25,000 rows.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``
    """
    n_continuous = 10

    features, labels = fetch_covtype(
        return_X_y=True,
        random_state=2,
        shuffle=True,
        data_home=f"~/scikit_learn_data/{damper}/",
    )
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, random_state=42, train_size=25_000
    )

    # Fit on the training rows only so the test rows do not influence the scaling.
    scaler = StandardScaler()
    scaler.fit(X_train[:, :n_continuous])

    X_train = normalize(X_train, scaler, cnts=n_continuous)
    X_test = normalize(X_test, scaler, cnts=n_continuous)
    return X_train, y_train, X_test, y_test
    
    

## Testing

Train on the entire training set, and evaluate on the test set.

In [None]:
def train(damper, max_iter=200):
    """Fit ``damper`` and record train/test statistics after every update.

    Parameters
    ----------
    damper : object
        Must provide ``initialize()``, ``partial_fit(X, y)``,
        ``score(X, y, return_dict=True, prefix=...)``, ``get_params()`` and
        a ``meta_`` mapping. After each ``partial_fit`` call ``meta_`` must
        contain ``"num_examples"``.
    max_iter : int
        Number of ``partial_fit`` calls to make on the full training set.

    Returns
    -------
    list of dict
        ``max_iter + 1`` records: one taken before any training
        (``partial_fit_calls == 0``) and one after each ``partial_fit`` call.
        Each record merges the test scores, train scores, run metadata and a
        shallow copy of ``damper.meta_``.
    """
    damper_cls = type(damper).__name__
    X_train, y_train, X_test, y_test = get_train_test_data(damper_cls)
    damper.initialize()

    def evaluate():
        # Score the test set first, then the training set.
        held_out = damper.score(X_test, y_test, return_dict=True, prefix="test_")
        fitted = damper.score(X_train, y_train, return_dict=True, prefix="train_")
        return held_out, fitted

    test_score, train_score = evaluate()
    meta = {
        "train_eg": len(y_train),
        "test_eg": len(y_test),
        "max_iter": max_iter,
        "damper_name": damper_cls.lower(),
        **damper.get_params(),
    }

    # Baseline record: performance of the freshly initialized model.
    baseline = {
        "partial_fit_calls": 0,
        **test_score,
        **train_score,
        **meta,
        **copy(damper.meta_),
    }
    data = [baseline]
    pprint(
        {
            key: baseline[key]
            for key in ["test_score", "train_score", "test_loss", "train_loss"]
        }
    )

    progress_cols = [
        "damper_name",
        "partial_fit_calls",
        "test_score",
        "train_score",
        "batch_size",
        "lr_",
    ]
    for n_calls in range(1, max_iter + 1):
        damper.partial_fit(X_train, y_train)
        test_score, train_score = evaluate()
        datum = {
            "partial_fit_calls": n_calls,
            **meta,
            **test_score,
            **train_score,
            **copy(damper.meta_),
        }

        # Print a short progress summary; columns missing from the record are skipped.
        show = {col: datum[col] for col in progress_cols if col in datum}
        show["epochs"] = datum["num_examples"] / meta["train_eg"]
        pprint(show)

        data.append(datum)
    return data

### Optimization hyperparameters

These parameters were found from Tune.ipynb.

In [None]:
# Tuned hyperparameters for PadaDamp.
padadamp_params = {
    "batch_growth_rate": 0.0020999350113148425,
    "dwell": 500,
    "initial_batch_size": 64,
    "lr": 0.0044576595887146865,
    "max_batch_size": 2048,
    "momentum": 0.8924479261291679,
    "weight_decay": 0.00012906483459130378,
}

# Tuned hyperparameters for GeoDamp.
geodamp_params = {
    "dampingdelay": 48.66414550165989,
    "dampingfactor": 4.283407572527471,
    "initial_batch_size": 32,
    "lr": 0.004618645820872018,
    "max_batch_size": 256,
    "momentum": 0.7320904056485099,
    "weight_decay": 0.001122178654655193,
}

# Batch size used by the variants that do not grow the batch size.
static_bs = 64

# GeoDampLR: GeoDamp's settings without the batch-size bounds, plus a single
# static batch size.
geodamplr_params = {
    name: value
    for name, value in geodamp_params.items()
    if name not in ("initial_batch_size", "max_batch_size")
}
geodamplr_params["static_batch_size"] = static_bs

# PadaDamp LR: PadaDamp's settings with both batch-size bounds pinned to the
# static batch size. Built as a new dict so `padadamp_params` is not modified.
padadamplr_params = {
    **padadamp_params,
    "initial_batch_size": static_bs,
    "max_batch_size": static_bs,
}

Every optimizer uses the same initialization, fixed by `seed=33`.

In [None]:
# Keyword arguments shared by every optimizer: the same seed for all of them.
common = dict(seed=33)
# Optimizers built from the tuned PadaDamp / GeoDamp hyperparameters.
padadamp = PadaDamp(**common, **padadamp_params)
geodamp = GeoDamp(**common, **geodamp_params)
# Variants configured with the static batch size (`static_bs`).
geodamplr = GeoDampLR(**common, **geodamplr_params)
padadamplr = PadaDamp(**common, **padadamplr_params)
# Damper with opt="adagrad"; no tuned hyperparameters are passed to this one.
adagrad = Damper(**common, opt="adagrad", max_batch_size=static_bs)


Let's train in parallel:

In [None]:
from distributed import Client, as_completed
# With no arguments, Client() starts and connects to a local Dask cluster.
client = Client()

# Alternatively, connect to an already-running Dask scheduler at
# localhost:8786. Start one by running these commands:
#
#     $ dask-scheduler
#     $ dask-worker --nprocs 20 --nthreads 2 localhost:8786
#
# and then use this line instead of the one above:
#
# client = Client("localhost:8786")

# Display the client as this cell's output.
client

In [None]:
# One training task per optimizer; each entry is passed to `train` as `damper`.
dampers = [padadamp, padadamplr, geodamp, geodamplr, adagrad]
# max_iter=300 is forwarded to every `train` call, overriding its default of 200.
futures = client.map(train, dampers, max_iter=300)

In [None]:
# Number each CSV by the future's position in `futures` (submission order)
# rather than by the order in which the tasks happen to finish. Completion
# order varies from run to run, which would otherwise make the mapping from
# file name to optimizer non-reproducible. The file names themselves
# (`0-test-data.csv`, `1-test-data.csv`, ...) are unchanged.
submit_index = {fut.key: idx for idx, fut in enumerate(futures)}

for future in as_completed(futures):
    data = future.result()  # list of per-iteration records returned by `train`
    k = submit_index[future.key]
    print(k)
    pprint(data[-1])  # final record of this run
    pd.DataFrame(data).to_csv(f"{k}-test-data.csv", index=False)