In [1]:
from dask.distributed import Client, LocalCluster
import logging

# Local Dask cluster used as the joblib backend for the model fits below.
# NOTE(review): `silence_logs` is given logging.DEBUG here; confirm against the
# dask.distributed docs that this is the intended verbosity.
cluster_options = {
    "n_workers": 28,
    "threads_per_worker": 8,
    "silence_logs": logging.DEBUG,
}
cluster = LocalCluster(**cluster_options)

# Attach a client to the cluster created above.
client = Client(cluster, heartbeat_interval=10000)

# Show where the diagnostic dashboard is being served.
print(client.dashboard_link)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 42841 instead
  http_address["port"], self.http_server.port
distributed.scheduler - INFO - Clear task state
distributed.scheduler - INFO -   Scheduler at:     tcp://127.0.0.1:46817
distributed.scheduler - INFO -   dashboard at:           127.0.0.1:42841
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:33355'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:37043'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:43735'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:44543'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:38867'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:40687'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:36923'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:34025'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:41

http://127.0.0.1:42841/status


In [2]:
import afqinsight as afqi
import joblib
import matplotlib.pyplot as plt
import numpy as np
import os.path as op
import pandas as pd
import pickle
import seaborn as sns

from datetime import datetime

from sklearn.base import clone
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import median_absolute_error, r2_score
from sklearn.metrics import explained_variance_score, mean_squared_error
from sklearn.linear_model import LassoCV

from skopt import BayesSearchCV
from skopt.plots import plot_convergence, plot_objective, plot_evaluations

# Record the afqinsight version this notebook was executed with.
print(afqi.__version__)

0.2.9.dev460469908


In [3]:
# Previously saved results, if any. The pickle is only written by the final
# cell of this notebook, so on a fresh checkout it does not exist yet; fall
# back to an empty dict instead of raising FileNotFoundError.
# SECURITY: pickle.load can execute arbitrary code -- only load files that
# this notebook (or another trusted source) produced.
results_path = "hbn_regression_pure_lasso.pkl"

if op.exists(results_path):
    with open(results_path, "rb") as fp:
        results = pickle.load(fp)
else:
    results = {}

In [4]:
# List the experiment names present in the loaded results.
results.keys()

dict_keys(['bagging_pure_lasso_trim0', 'bagging_target_transform_pure_lasso_trim0'])

In [5]:
# Load the AFQ data from the raw HBN directory, requesting "Age" as the target
# column and "EID" as the index column. Note that `y` (and the rows of `X`)
# are rebuilt from subjects.csv in a later cell.
X, y, groups, columns, subjects, classes = afqi.load_afq_data(
    "../data/raw/hbn_data",
    target_cols=["Age"],
    index_col="EID"
)

In [6]:
# Dimensions of the feature matrix as loaded (before any row filtering).
X.shape

(1597, 3600)

In [7]:
# Age per subject, read directly from subjects.csv (the row labelled 0 is
# discarded) and indexed by EID.
df_y = (
    pd.read_csv("../data/raw/hbn_data/subjects.csv")
    .drop(index=0)
    .loc[:, ["EID", "Age"]]
    .set_index("EID", drop=True)
)

# Align ages to the order of `subjects`; subjects without a match get NaN.
df_subs = pd.DataFrame(index=subjects).merge(
    df_y, how="left", left_index=True, right_index=True
)
y = df_subs["Age"].astype(np.float64).to_numpy()

# Despite its name, `nan_mask` is True where the age is *present*.
# NOTE: this cell overwrites `X` and `y`, so it is not safe to re-run without
# re-running the data-loading cell first.
nan_mask = ~np.isnan(y)
y = y[nan_mask]
X = X[nan_mask]

In [8]:
# Shapes of features and target after dropping subjects with a missing age.
print(X.shape, y.shape, sep="\n")

(978, 3600)
(978,)


In [9]:
# Number of entries in `groups` returned by the data loader.
len(groups)

36

In [10]:
def _extract_lasso_params(fitted_pipe, has_target_transform, is_ensemble):
    """Return ``(coefs, alpha)`` read from the fitted "estimate" pipeline step.

    Parameters
    ----------
    fitted_pipe : fitted pipeline exposing ``named_steps["estimate"]``
    has_target_transform : bool
        If True, the fitted model is read from the step's ``regressor_``
        attribute instead of from the step itself.
    is_ensemble : bool
        If True, ``coef_`` / ``alpha_`` are collected from every member of
        ``estimators_`` and returned as two lists; otherwise the step's own
        ``coef_`` / ``alpha_`` are returned.
    """
    estimator = fitted_pipe.named_steps["estimate"]
    if has_target_transform:
        estimator = estimator.regressor_

    if is_ensemble:
        members = estimator.estimators_
        return [m.coef_ for m in members], [m.alpha_ for m in members]

    return estimator.coef_, estimator.alpha_


def get_cv_results(n_repeats=5, n_splits=10,
                   power_transformer=False,
                   shuffle=False,
                   ensembler=None,
                   target_transform_func=None,
                   target_transform_inverse_func=None,
                   n_estimators=10,
                   trim_nodes=0,
                   square_features=False):
    """Fit a LassoCV pipeline under repeated K-fold CV and collect per-fold results.

    Reads the notebook-level globals ``X``, ``y`` and ``groups``.

    Parameters
    ----------
    n_repeats, n_splits : int
        Forwarded to ``RepeatedKFold`` (which uses a fixed ``random_state``).
    power_transformer : bool
        Forwarded to ``afqi.pipeline.make_base_afq_pipeline``.
    shuffle : bool
        If True, fit against a random permutation of ``y`` instead of ``y``
        itself. The permutation uses an unseeded generator, so it differs
        between calls.
    ensembler : optional
        Forwarded as ``ensemble_meta_estimator``. When not None, coefficients
        and alphas are collected per ensemble member.
    target_transform_func, target_transform_inverse_func : callable, optional
        Forwarded to the pipeline factory. If either is given, the fitted
        model is read from the estimate step's ``regressor_`` attribute.
    n_estimators : int
        Passed inside ``ensemble_meta_estimator_kwargs``.
    trim_nodes : int
        Number of leading and trailing columns removed from every group.
        Must be non-negative.
    square_features : bool
        If True, append the element-wise square of every feature column.

    Returns
    -------
    cv_results : dict
        Maps the split index to a dict with keys ``pipeline``, ``train_idx``,
        ``test_idx``, ``y_pred``, ``y_true``, ``test_mae``, ``train_mae``,
        ``test_r2``, ``train_r2``, ``coefs`` and ``alpha``. The ``*_mae``
        entries are *median* absolute errors.
    y_fit : numpy.ndarray
        The target vector actually used for fitting (permuted if ``shuffle``).

    Raises
    ------
    ValueError
        If ``trim_nodes`` is negative.
    """
    if trim_nodes < 0:
        raise ValueError("trim_nodes must be non-negative.")

    if shuffle:
        rng = np.random.default_rng()
        y_fit = rng.permutation(y)
    else:
        y_fit = np.copy(y)

    # NOTE: `groups_trim` is kept in sync with the columns of `X_trim`, but the
    # pure-lasso pipeline built below does not consume it.
    if trim_nodes > 0:
        # The column mask is built from groups[0] and tiled, i.e. this assumes
        # every group has the same length as the first one.
        grp_mask = np.zeros_like(groups[0], dtype=bool)
        grp_mask[trim_nodes:-trim_nodes] = True
        X_mask = np.concatenate([grp_mask] * len(groups))

        groups_trim = []
        start_idx = 0
        for grp in groups:
            trimmed_len = len(grp) - 2 * trim_nodes
            groups_trim.append(np.arange(start_idx, start_idx + trimmed_len))
            start_idx += trimmed_len

        X_trim = X[:, X_mask]
    else:
        groups_trim = list(groups)
        X_trim = np.copy(X)

    if square_features:
        _n_samples, _n_features = X_trim.shape
        X_trim = np.hstack([X_trim, np.square(X_trim)])
        # Squared copies live `_n_features` columns to the right of the originals.
        groups_trim = [np.concatenate([g, g + _n_features]) for g in groups_trim]

    cv = RepeatedKFold(
        n_splits=n_splits,
        n_repeats=n_repeats,
        random_state=1729
    )

    # Unfitted template; every split fits its own clone (see loop below).
    pipe_template = afqi.pipeline.make_base_afq_pipeline(
        imputer_kwargs={"strategy": "median"},
        power_transformer=power_transformer,
        scaler="standard",
        estimator=LassoCV,
        estimator_kwargs={
            "verbose": 0,
            "n_alphas": 50,
            "cv": 3,
            "n_jobs": 28,
            "max_iter": 500,
        },
        verbose=0,
        ensemble_meta_estimator=ensembler,
        ensemble_meta_estimator_kwargs={
            "n_estimators": n_estimators,
            "n_jobs": 1,
            "oob_score": True,
            "random_state": 1729,
        },
        target_transform_func=target_transform_func,
        target_transform_inverse_func=target_transform_inverse_func,
    )

    has_target_transform = (
        target_transform_func is not None
        or target_transform_inverse_func is not None
    )
    is_ensemble = ensembler is not None

    cv_results = {}

    for cv_idx, (train_idx, test_idx) in enumerate(cv.split(X_trim, y_fit)):
        start = datetime.now()

        X_train, X_test = X_trim[train_idx], X_trim[test_idx]
        y_train, y_test = y_fit[train_idx], y_fit[test_idx]

        # Clone so that the object stored under "pipeline" stays the model
        # fitted on *this* split. Re-fitting one shared pipeline would leave
        # every split pointing at the model from the last split.
        pipe_skopt = clone(pipe_template)

        with joblib.parallel_backend("dask"):
            pipe_skopt.fit(X_train, y_train)

        # Predict once per partition and reuse for all metrics.
        y_pred_test = pipe_skopt.predict(X_test)
        y_pred_train = pipe_skopt.predict(X_train)

        coefs, alpha = _extract_lasso_params(
            pipe_skopt, has_target_transform, is_ensemble
        )

        cv_results[cv_idx] = {
            "pipeline": pipe_skopt,
            "train_idx": train_idx,
            "test_idx": test_idx,
            "y_pred": y_pred_test,
            "y_true": y_test,
            "test_mae": median_absolute_error(y_test, y_pred_test),
            "train_mae": median_absolute_error(y_train, y_pred_train),
            "test_r2": r2_score(y_test, y_pred_test),
            "train_r2": r2_score(y_train, y_pred_train),
            "coefs": coefs,
            "alpha": alpha,
        }

        print(f"CV index [{cv_idx:3d}], Elapsed time: ", datetime.now() - start)

    return cv_results, y_fit

In [11]:
# Start from an empty container; whatever `results` held before is discarded.
results = {}

trim_nodes = 0

# Settings common to both runs below.
shared_cv_kwargs = dict(
    n_splits=5,
    n_repeats=1,
    power_transformer=False,
    shuffle=False,
    trim_nodes=trim_nodes,
    square_features=False,
)

# Run 1: no target transform.
results[f"bagging_pure_lasso_trim{trim_nodes}"] = get_cv_results(
    **shared_cv_kwargs
)

# Run 2: target transformed with np.log, inverted with np.exp.
results[f"bagging_target_transform_pure_lasso_trim{trim_nodes}"] = get_cv_results(
    target_transform_func=np.log,
    target_transform_inverse_func=np.exp,
    **shared_cv_kwargs,
)

distributed.scheduler - INFO - Receive client connection: Client-worker-cf824aa2-64cc-11eb-8434-d3eefea15ecd
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Receive client connection: Client-worker-cf82289e-64cc-11eb-8434-42010a8a0002
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Receive client connection: Client-worker-cf829986-64cc-11eb-8434-d3eefea15ecd
distributed.core - INFO - Starting established connection


CV index [  0], Elapsed time:  0:00:38.451114


distributed.scheduler - INFO - Receive client connection: Client-worker-e67b91f8-64cc-11eb-839a-15a36b1bfbe2
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Receive client connection: Client-worker-e67ad9c8-64cc-11eb-839a-15a36b1bfbe2
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Receive client connection: Client-worker-e67f284a-64cc-11eb-8460-3902087a82b7
distributed.core - INFO - Starting established connection


CV index [  1], Elapsed time:  0:00:36.155566


distributed.scheduler - INFO - Receive client connection: Client-worker-fc064d2c-64cc-11eb-838f-25b39f005658
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Receive client connection: Client-worker-fc0634ec-64cc-11eb-8385-5fb860b2b25b
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Receive client connection: Client-worker-fc069a82-64cc-11eb-8385-5fb860b2b25b
distributed.core - INFO - Starting established connection
  positive)


CV index [  2], Elapsed time:  0:00:39.661680


distributed.scheduler - INFO - Receive client connection: Client-worker-13bb7f26-64cd-11eb-83ee-13686b423ee3
distributed.core - INFO - Starting established connection
  positive)


CV index [  3], Elapsed time:  0:00:34.063066


distributed.scheduler - INFO - Receive client connection: Client-worker-27f5722c-64cd-11eb-8463-95db7c851e5b
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Receive client connection: Client-worker-27f646f8-64cd-11eb-8463-95db7c851e5b
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Receive client connection: Client-worker-27fadd08-64cd-11eb-83a1-c5a7c7946784
distributed.core - INFO - Starting established connection


CV index [  4], Elapsed time:  0:00:37.359213


distributed.scheduler - INFO - Receive client connection: Client-worker-3e497392-64cd-11eb-83cb-4b30a3f64a12
distributed.core - INFO - Starting established connection


CV index [  0], Elapsed time:  0:00:35.410531


distributed.scheduler - INFO - Receive client connection: Client-worker-535bd2e8-64cd-11eb-8412-55fd59a9d7bf
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Receive client connection: Client-worker-535965de-64cd-11eb-8412-42010a8a0002
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Receive client connection: Client-worker-535f89e6-64cd-11eb-84e7-9d78c883ede1
distributed.core - INFO - Starting established connection


CV index [  1], Elapsed time:  0:00:34.770156


distributed.scheduler - INFO - Receive client connection: Client-worker-6831e106-64cd-11eb-8381-3984f0a51ba3
distributed.core - INFO - Starting established connection


CV index [  2], Elapsed time:  0:00:36.343892


distributed.scheduler - INFO - Receive client connection: Client-worker-7dc03b76-64cd-11eb-8486-657134553591
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Receive client connection: Client-worker-7dc0cf2e-64cd-11eb-8486-657134553591
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Receive client connection: Client-worker-7dc56400-64cd-11eb-845d-b7d70bd397d3
distributed.core - INFO - Starting established connection
  positive)


CV index [  3], Elapsed time:  0:00:35.264151


distributed.scheduler - INFO - Receive client connection: Client-worker-92de415e-64cd-11eb-83f1-7913ba57d9b9
distributed.core - INFO - Starting established connection


CV index [  4], Elapsed time:  0:00:44.550368


In [12]:
# Confirm which experiments were just computed.
results.keys()

dict_keys(['bagging_pure_lasso_trim0', 'bagging_target_transform_pure_lasso_trim0'])

In [13]:
# For every metric, print its mean over the CV splits of each experiment.
# Each value in `results` is indexable; element 0 is the per-split dict.
for metric in ("test_mae", "test_r2", "train_mae", "train_r2"):
    for key, run in results.items():
        per_split = run[0]
        split_values = [split[metric] for split in per_split.values()]
        print(metric, key, np.mean(split_values))

    # Blank line between metrics.
    print()

test_mae bagging_pure_lasso_trim0 1.572916597914582
test_mae bagging_target_transform_pure_lasso_trim0 1.492790013136703

test_r2 bagging_pure_lasso_trim0 0.5442971561710012
test_r2 bagging_target_transform_pure_lasso_trim0 0.5711707567559646

train_mae bagging_pure_lasso_trim0 1.2856122810257329
train_mae bagging_target_transform_pure_lasso_trim0 1.1632404015259072

train_r2 bagging_pure_lasso_trim0 0.715749817450492
train_r2 bagging_target_transform_pure_lasso_trim0 0.7148864643896216



In [14]:
# Persist the results dict. Opening with "wb" overwrites the same file that is
# read near the top of this notebook.
with open("hbn_regression_pure_lasso.pkl", "wb") as fp:
    pickle.dump(results, fp)