In [1]:
from dask.distributed import Client, LocalCluster
import logging

# Spin up a local Dask cluster with 28 workers of 8 threads each.
# NOTE(review): silence_logs=logging.DEBUG looks like the reason the
# INFO-level scheduler/nanny messages appear in this cell's output --
# confirm that this verbosity is wanted.
cluster = LocalCluster(n_workers=28, threads_per_worker=8, silence_logs=logging.DEBUG)

# Attach a client to the cluster; later cells route joblib work through it.
client = Client(cluster, heartbeat_interval=10000)

# Print the URL of the Dask dashboard for this session.
print(client.dashboard_link)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 35713 instead
  http_address["port"], self.http_server.port
distributed.scheduler - INFO - Clear task state
distributed.scheduler - INFO -   Scheduler at:     tcp://127.0.0.1:38277
distributed.scheduler - INFO -   dashboard at:           127.0.0.1:35713
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:41639'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:33573'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:40721'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:43521'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:44927'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:42013'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:42775'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:33475'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:36

http://127.0.0.1:35713/status


In [2]:
import afqinsight as afqi
import joblib
import matplotlib.pyplot as plt
import numpy as np
import os.path as op
import pandas as pd
import pickle
import seaborn as sns

from datetime import datetime

from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import median_absolute_error, r2_score
from sklearn.metrics import explained_variance_score, mean_squared_error
from sklearn.linear_model import LassoCV, ElasticNetCV

from skopt import BayesSearchCV
from skopt.plots import plot_convergence, plot_objective, plot_evaluations

# Record the afqinsight version this notebook was run with.
print(afqi.__version__)

0.2.9.dev460469908


In [3]:
# Load the AFQ dataset from disk, using the "Age" column as the target.
# The call returns six values, unpacked here in order.
X, y, groups, columns, subjects, classes = afqi.load_afq_data(
    "../data/raw/age_data", target_cols=["Age"]
)

In [4]:
# Turn the feature column tuples into a three-level MultiIndex
# (metric, tractID, nodeID) and convert it with afqi.multicol2sets.
label_sets = afqi.multicol2sets(
    pd.MultiIndex.from_tuples(
        columns,
        names=["metric", "tractID", "nodeID"],
    )
)

In [5]:
# Restrict the feature matrix to the "fa" and "md" groups
# (presumably the FA and MD diffusion metrics -- confirm against label_sets).
X_md_fa = afqi.select_groups(
    X,
    [["fa"], ["md"]],
    label_sets,
)

In [6]:
# Keep the first 40 entries of `groups`.
# NOTE(review): 40 is hard-coded -- presumably the number of feature groups
# that cover the fa and md metrics selected above; confirm against `groups`.
groups_md_fa = groups[:40]

In [13]:
def get_cv_results(n_repeats=5, n_splits=10,
                   power_transformer=False,
                   shuffle=False,
                   ensembler=None,
                   target_transform_func=None,
                   target_transform_inverse_func=None,
                   n_estimators=10,
                   shuffle_seed=None):
    """Run repeated K-fold cross-validation of a LassoCV AFQ pipeline.

    Reads the notebook-level ``X_md_fa`` (features) and ``y`` (targets).

    Parameters
    ----------
    n_repeats : int
        Number of repetitions passed to ``RepeatedKFold``.
    n_splits : int
        Number of folds per repetition passed to ``RepeatedKFold``.
    power_transformer : bool
        Forwarded to ``afqi.pipeline.make_base_afq_pipeline``.
    shuffle : bool
        If True, fit and score against a random permutation of ``y``
        instead of ``y`` itself.
    ensembler : str or None
        Forwarded as ``ensemble_meta_estimator``.
    target_transform_func, target_transform_inverse_func : callable or None
        Forwarded to the pipeline factory under the same names.
    n_estimators : int
        Forwarded inside ``ensemble_meta_estimator_kwargs``.
    shuffle_seed : int or None
        Seed for the permutation used when ``shuffle`` is True. The default
        ``None`` keeps the previous behavior (a fresh, unseeded generator).

    Returns
    -------
    cv_results : dict
        Maps the fold counter (0 .. n_splits * n_repeats - 1) to a dict with
        keys ``"pipeline"``, ``"train_idx"``, ``"test_idx"``, ``"y_pred"``,
        ``"y_true"``, ``"test_mae"`` and ``"train_mae"``. The two ``*_mae``
        entries are computed with ``median_absolute_error``.
    y_fit : ndarray
        The target vector actually used (a copy of ``y``, or its permutation).
    """
    # Imported locally so this cell does not depend on an edit to the
    # notebook's import cell.
    from sklearn.base import clone

    if shuffle:
        rng = np.random.default_rng(shuffle_seed)
        y_fit = rng.permutation(y)
    else:
        y_fit = np.copy(y)

    cv = RepeatedKFold(
        n_splits=n_splits,
        n_repeats=n_repeats,
        random_state=1729
    )

    cv_results = {}

    # Unfitted template; each fold fits its own clone (see below).
    base_pipe = afqi.pipeline.make_base_afq_pipeline(
        imputer_kwargs={"strategy": "median"},
        power_transformer=power_transformer,
        scaler="standard",
        estimator=LassoCV,
        estimator_kwargs={
            "verbose": 0,
            "n_alphas": 200,
            "cv": 3,
            "n_jobs": 28,
            "max_iter": 5000,
        },
        ensemble_meta_estimator=ensembler,
        ensemble_meta_estimator_kwargs={
            "n_estimators": n_estimators,
            "n_jobs": 1,
            "oob_score": True,
            "random_state": 1729,
        },
        target_transform_func=target_transform_func,
        target_transform_inverse_func=target_transform_inverse_func,
    )

    for cv_idx, (train_idx, test_idx) in enumerate(cv.split(X_md_fa, y_fit)):
        start = datetime.now()

        X_train, X_test = X_md_fa[train_idx], X_md_fa[test_idx]
        y_train, y_test = y_fit[train_idx], y_fit[test_idx]

        # Fit a fresh clone per fold. Re-fitting one shared object would make
        # every stored "pipeline" entry point at the same estimator, which
        # ends up holding only the last fold's fit.
        pipe = clone(base_pipe)
        with joblib.parallel_backend("dask"):
            pipe.fit(X_train, y_train)

        # Predict once per split and reuse for both the stored predictions
        # and the error metrics.
        y_pred_test = pipe.predict(X_test)
        y_pred_train = pipe.predict(X_train)

        cv_results[cv_idx] = {
            "pipeline": pipe,
            "train_idx": train_idx,
            "test_idx": test_idx,
            "y_pred": y_pred_test,
            "y_true": y_test,
            "test_mae": median_absolute_error(y_test, y_pred_test),
            "train_mae": median_absolute_error(y_train, y_pred_train)
        }

        print(f"CV index [{cv_idx:3d}], Elapsed time: ", datetime.now() - start)

    return cv_results, y_fit

In [14]:
# Collect every experiment under a descriptive key. Each value is the
# (cv_results, y_fit) tuple returned by get_cv_results.
results = {}

# Baseline: true targets.
results["no_power_transform"] = get_cv_results(
    n_repeats=5,
    n_splits=10,
    shuffle=False,
    power_transformer=False,
)

# Same pipeline fit against permuted targets.
results["shuffle_no_transform"] = get_cv_results(
    n_repeats=3,
    n_splits=10,
    shuffle=True,
    power_transformer=False,
)

# Alternative to re-running the fits: reload previously pickled results.
# with open("age_regression_lasso.pkl", "rb") as fp:
#     results = pickle.load(fp)

CV index [  0], Elapsed time:  0:00:10.168060
CV index [  1], Elapsed time:  0:00:06.457292
CV index [  2], Elapsed time:  0:00:05.296155
CV index [  3], Elapsed time:  0:00:05.386212
CV index [  4], Elapsed time:  0:00:10.290863
CV index [  5], Elapsed time:  0:00:07.727080
CV index [  6], Elapsed time:  0:00:09.348220
CV index [  7], Elapsed time:  0:00:04.322436
CV index [  8], Elapsed time:  0:00:07.867198
CV index [  9], Elapsed time:  0:00:05.842299
CV index [ 10], Elapsed time:  0:00:05.344998
CV index [ 11], Elapsed time:  0:00:07.970502
CV index [ 12], Elapsed time:  0:00:11.064658
CV index [ 13], Elapsed time:  0:00:06.692730
CV index [ 14], Elapsed time:  0:00:06.100153
CV index [ 15], Elapsed time:  0:00:04.179891
CV index [ 16], Elapsed time:  0:00:06.950540
CV index [ 17], Elapsed time:  0:00:05.878240
CV index [ 18], Elapsed time:  0:00:06.580265
CV index [ 19], Elapsed time:  0:00:06.618419
CV index [ 20], Elapsed time:  0:00:09.707801
CV index [ 21], Elapsed time:  0:0

In [15]:
# Repeat both experiments with the target passed through np.log for fitting
# and np.exp as the inverse transform.
results["target_log_transform"] = get_cv_results(
    n_repeats=5,
    n_splits=10,
    shuffle=False,
    power_transformer=False,
    target_transform_func=np.log,
    target_transform_inverse_func=np.exp,
)

results["shuffle_target_log_transform"] = get_cv_results(
    n_repeats=3,
    n_splits=10,
    shuffle=True,
    power_transformer=False,
    target_transform_func=np.log,
    target_transform_inverse_func=np.exp,
)

CV index [  0], Elapsed time:  0:00:11.569760
CV index [  1], Elapsed time:  0:00:06.735639
CV index [  2], Elapsed time:  0:00:05.005268
CV index [  3], Elapsed time:  0:00:04.278545
CV index [  4], Elapsed time:  0:00:11.160258
CV index [  5], Elapsed time:  0:00:08.647319
CV index [  6], Elapsed time:  0:00:06.863602
CV index [  7], Elapsed time:  0:00:06.043430
CV index [  8], Elapsed time:  0:00:09.846008
CV index [  9], Elapsed time:  0:00:05.781704
CV index [ 10], Elapsed time:  0:00:05.195436
CV index [ 11], Elapsed time:  0:00:05.273486
CV index [ 12], Elapsed time:  0:00:08.276016
CV index [ 13], Elapsed time:  0:00:05.775949
CV index [ 14], Elapsed time:  0:00:06.168996
CV index [ 15], Elapsed time:  0:00:05.549727
CV index [ 16], Elapsed time:  0:00:09.081567
CV index [ 17], Elapsed time:  0:00:12.931280
CV index [ 18], Elapsed time:  0:00:08.848602
CV index [ 19], Elapsed time:  0:00:05.061774
CV index [ 20], Elapsed time:  0:00:08.563393
CV index [ 21], Elapsed time:  0:0

In [16]:
# Persist the results gathered so far to disk as a pickle.
with open("age_regression_lasso.pkl", "wb") as pkl_file:
    pickle.dump(results, pkl_file)

In [17]:
# Display the names of the experiments stored so far.
results.keys()

dict_keys(['no_power_transform', 'shuffle_no_transform', 'target_log_transform', 'shuffle_target_log_transform'])

In [20]:
# Per experiment, print the mean and standard deviation across CV splits of
# the test-set error. res[0] is the per-split results dict; "test_mae" holds
# a median_absolute_error value, so lower is better despite the variable name.
for key, res in results.items():
    test_accuracies = [cvr["test_mae"] for cvr in res[0].values()]
    print(key, "mean", np.mean(test_accuracies))
    print(key, "std", np.std(test_accuracies))

no_power_transform mean 4.911651713980643
no_power_transform std 2.1794284381788485
shuffle_no_transform mean 9.831946509985348
shuffle_no_transform std 2.107599601843023
target_log_transform mean 3.7080493910409933
target_log_transform std 2.109180013045811
shuffle_target_log_transform mean 7.67703843451133
shuffle_target_log_transform std 1.93403821029267


In [21]:
import itertools

def mean_over_combinations(results):
    """Average the ``"yhat"`` column over every subset of ``results``.

    Parameters
    ----------
    results : sequence
        Items supporting ``item["yhat"].values`` (e.g. DataFrames with a
        ``"yhat"`` column), all with the same number of rows.

    Returns
    -------
    dict
        Maps each subset size ``r`` (1 .. len(results)) to a list holding,
        for every size-``r`` combination in ``itertools.combinations`` order,
        the element-wise mean of the members' ``"yhat"`` values.
    """
    averaged = {}
    for size in range(1, len(results) + 1):
        per_size = []
        for subset in itertools.combinations(results, r=size):
            stacked = [member["yhat"].values for member in subset]
            per_size.append(np.mean(stacked, axis=0))
        averaged[size] = per_size

    return averaged

def mae_over_combinations(results):
    """Score every averaged prediction from ``mean_over_combinations``.

    Each averaged ``"yhat"`` vector is compared with the ``"y_true"`` column
    of the first item in ``results`` using ``median_absolute_error``.

    Returns
    -------
    pandas.DataFrame
        One row per combination, with columns ``"n_repeats"`` (how many
        items were averaged) and ``"mae"`` (the resulting error).
    """
    records = [
        {
            "n_repeats": size,
            # The reference targets are always read from the first item.
            "mae": median_absolute_error(results[0]["y_true"].values, averaged),
        }
        for size, averaged_preds in mean_over_combinations(results).items()
        for averaged in averaged_preds
    ]

    return pd.DataFrame(records)

def get_mae_ensemble_dataframe(cv_results, y_true, n_splits=10):
    """Build the error-vs-number-of-averaged-repeats table for one run.

    Parameters
    ----------
    cv_results : dict
        Maps a split counter (0, 1, 2, ...) to a dict with at least
        ``"y_pred"`` (test predictions) and ``"test_idx"`` (the row positions
        those predictions belong to).
    y_true : array-like
        Full target vector; joined to the predictions on ``test_idx``.
    n_splits : int, default 10
        Number of consecutive splits that make up one repeat. Splits
        ``k * n_splits .. (k + 1) * n_splits - 1`` are concatenated into
        repeat ``k``. Trailing splits that do not fill a whole repeat are
        ignored.

    Returns
    -------
    pandas.DataFrame
        The output of ``mae_over_combinations`` applied to the per-repeat
        frames (columns ``"n_repeats"`` and ``"mae"``).

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError(f"n_splits must be >= 1, got {n_splits}")

    # Built once and reused for every split; merge does not modify it.
    y_true_df = pd.DataFrame(y_true, columns=["y_true"])

    # One frame per split: predictions indexed by test row, with the matching
    # true target joined on that index.
    df_ytest = {
        idx: pd.DataFrame(
            pd.Series(data=cvr["y_pred"], index=cvr["test_idx"], name="yhat")
        ).merge(
            y_true_df,
            left_index=True,
            right_index=True,
            how="left"
        )
        for idx, cvr in cv_results.items()
    }

    # Stitch the splits of each repeat back together, ordered by row index so
    # that rows line up across repeats.
    repeats = [
        pd.concat(
            [df_ytest[i] for i in range(rep * n_splits, (rep + 1) * n_splits)]
        ).sort_index()
        for rep in range(len(cv_results) // n_splits)
    ]

    return mae_over_combinations(repeats)

In [None]:
# Each results entry is the (cv_results, y_fit) tuple returned by
# get_cv_results. Score every run against its own y_fit: for the shuffled
# runs y_fit is a permutation of y, so comparing their predictions with the
# unshuffled y would not match the targets those runs were fit and scored on.
df_mae = {
    key: get_mae_ensemble_dataframe(cv_results, y_fit)
    for key, (cv_results, y_fit) in results.items()
}

In [None]:
# Plot error ("mae") against the number of averaged repeats for the
# untransformed-target run.
sns.catplot(x="n_repeats", y="mae", data=df_mae["no_power_transform"])

In [None]:
# Same plot for the run that used the log-transformed target.
sns.catplot(x="n_repeats", y="mae", data=df_mae["target_log_transform"])

In [None]:
# results["bagging"] = get_cv_results(
#     n_splits=10, n_repeats=1, power_transformer=False,
#     ensembler="serial-bagging", shuffle=False, n_estimators=20,
# )
# results["bagging_shuffle"] = get_cv_results(
#     n_splits=10, n_repeats=1, power_transformer=False,
#     ensembler="serial-bagging", shuffle=True, n_estimators=10,
# )

In [None]:
# Bagged variants ("serial-bagging" meta-estimator) with the log/exp target
# transform: one run on the true targets, one on permuted targets.
results["bagging_target_transform"] = get_cv_results(
    n_repeats=1,
    n_splits=10,
    shuffle=False,
    power_transformer=False,
    ensembler="serial-bagging",
    n_estimators=20,
    target_transform_func=np.log,
    target_transform_inverse_func=np.exp,
)

results["bagging_shuffle_target_transform"] = get_cv_results(
    n_repeats=1,
    n_splits=10,
    shuffle=True,
    power_transformer=False,
    ensembler="serial-bagging",
    n_estimators=10,
    target_transform_func=np.log,
    target_transform_inverse_func=np.exp,
)

In [None]:
# Per experiment, print the mean test and mean train error across CV splits.
# res[0] is the per-split results dict; the "*_mae" entries were computed
# with median_absolute_error, so lower is better.
for key, res in results.items():
    test_accuracies = [split["test_mae"] for split in res[0].values()]
    train_accuracies = [split["train_mae"] for split in res[0].values()]

    print(key, "test", np.mean(test_accuracies))
    print(key, "train", np.mean(train_accuracies))

In [None]:
# Persist the full results dictionary to disk as a pickle.
with open("age_regression.pkl", "wb") as pkl_file:
    pickle.dump(results, pkl_file)