In [1]:
# Optional Dask setup, currently disabled: it would start a LocalCluster,
# attach a Client to it, and print the dashboard link. Uncomment to enable.
# from dask.distributed import Client, LocalCluster
# import logging

# cluster = LocalCluster(
#     n_workers=28,
#     threads_per_worker=8,
#     silence_logs=logging.DEBUG
# )

# client = Client(cluster, heartbeat_interval=10000)
# print(client.dashboard_link)

In [2]:
import afqinsight as afqi
import groupyr as gpr
import joblib
import matplotlib.pyplot as plt
import numpy as np
import os.path as op
import pandas as pd
import pickle
import seaborn as sns

from groupyr.transform import GroupRemover, GroupExtractor
from groupyr.decomposition import GroupPCA

from datetime import datetime

from sklearn.base import clone
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import median_absolute_error, r2_score
from sklearn.metrics import explained_variance_score, mean_squared_error

from skopt import BayesSearchCV
from skopt.plots import plot_convergence, plot_objective, plot_evaluations

# Record the afqinsight version this notebook was executed with.
print(afqi.__version__)

0.2.9.dev2565515056


In [3]:
# Load the AFQ data from the raw data directory, using "Age" as the target.
# NOTE(review): the meaning of each returned item is inferred from its name
# and later use in this notebook -- confirm against the afqinsight docs.
(
    X,
    y,
    groups,
    columns,
    group_names,
    subjects,
    classes,
) = afqi.load_afq_data("../data/raw/age_data", target_cols=["Age"])

In [4]:
# Turn the (metric, tractID, nodeID) column tuples into a MultiIndex and
# convert it with afqinsight's ``multicol2sets``.
label_sets = afqi.multicol2sets(
    pd.MultiIndex.from_tuples(
        columns,
        names=["metric", "tractID", "nodeID"],
    )
)

In [5]:
# Unique tract names (second element of each column tuple), excluding the two
# Cingulum Hippocampus bundles, each wrapped in its own single-element list.
pyafq_bundles = [
    [bundle]
    for bundle in np.unique([
        col[1]
        for col in columns
        if col[1] not in ["Right Cingulum Hippocampus", "Left Cingulum Hippocampus"]
    ])
]

In [6]:
# Remove the feature groups belonging to both Cingulum Hippocampus bundles
# from the feature matrix.
gr = GroupRemover(
    groups=groups,
    group_names=group_names,
    select=["Right Cingulum Hippocampus", "Left Cingulum Hippocampus"],
)
X_pyafq_bundles = gr.fit_transform(X)

In [7]:
# Sanity check: feature-matrix shapes before/after bundle removal and the
# number of label sets, one value per line.
print(X.shape, X_pyafq_bundles.shape, len(label_sets), sep="\n")

(77, 10000)
(77, 9000)
10000


In [8]:
# Drop every group name whose second element mentions "Cingulum Hippocampus".
# Note: this rebinds ``group_names``; cells that ran earlier saw the full list.
group_names = [
    name for name in group_names if "Cingulum Hippocampus" not in name[1]
]

In [9]:
# Extract the "fa" and "md" feature groups from the bundle-filtered matrix.
# NOTE(review): ``groups[:90]`` hard-codes the group count; presumably 90 is
# the number of groups left after the Cingulum Hippocampus bundles were
# removed -- TODO confirm, and consider deriving it from ``group_names``.
ge = GroupExtractor(
    groups=groups[:90],
    group_names=group_names,
    select=["fa", "md"],
)
X_md_fa = ge.fit_transform(X_pyafq_bundles)

In [10]:
# Sanity check: shapes of the raw, bundle-filtered, and md/fa-only matrices,
# one per line.
print(X.shape, X_pyafq_bundles.shape, X_md_fa.shape, sep="\n")

(77, 10000)
(77, 9000)
(77, 3600)


In [11]:
# Group index sets used for the md/fa-only feature matrix.
# NOTE(review): 36 is hard-coded; presumably it is the number of groups in
# ``X_md_fa`` (3600 columns per the printout above) -- TODO confirm.
groups_md_fa = groups[:36]

In [12]:
def _unwrap_estimate_step(estimate_step, target_transformed):
    """Return the fitted final estimator and the list of its base estimators.

    Parameters
    ----------
    estimate_step : estimator
        The fitted "estimate" step of the pipeline.
    target_transformed : bool
        True when a target transform was requested; the fitted estimator is
        then reached through the step's ``regressor_`` attribute.

    Returns
    -------
    final_estimator : estimator
        ``estimate_step.regressor_`` if ``target_transformed`` else
        ``estimate_step``.
    base_estimators : list
        ``final_estimator.estimators_`` when that attribute exists (ensemble
        case); otherwise ``[final_estimator]`` so callers can always iterate.
    """
    final_estimator = (
        estimate_step.regressor_ if target_transformed else estimate_step
    )
    members = getattr(final_estimator, "estimators_", None)
    if members is None:
        # No ensemble wrapper: treat the single estimator as an ensemble of one.
        members = [final_estimator]
    return final_estimator, list(members)


def get_cv_results(n_repeats=5, n_splits=10,
                   power_transformer=False,
                   shuffle=False,
                   ensembler=None,
                   target_transform_func=None,
                   target_transform_inverse_func=None,
                   n_estimators=10):
    """Fit and score an AFQ regression pipeline with repeated K-fold CV.

    Reads the notebook-level globals ``X_md_fa`` (features), ``y`` (target)
    and ``groups_md_fa`` (feature groups); none of them is modified.

    Parameters
    ----------
    n_repeats, n_splits : int
        Passed to ``RepeatedKFold`` (which uses a fixed ``random_state``).
    power_transformer : bool or transformer class
        Forwarded to ``afqi.make_afq_regressor_pipeline``.
    shuffle : bool
        If True, fit against a random permutation of ``y`` (unseeded)
        instead of ``y`` itself.
    ensembler : str or None
        Forwarded as ``ensemble_meta_estimator``.
    target_transform_func, target_transform_inverse_func : callable or None
        Forwarded to the pipeline factory. If either is given, the fitted
        estimator is read from the "estimate" step's ``regressor_``.
    n_estimators : int
        Forwarded inside ``ensemble_meta_estimator_kwargs``.

    Returns
    -------
    cv_results : dict
        Maps the fold index to a dict with the keys "pipeline", "train_idx",
        "test_idx", "y_pred", "y_true", "test_mae", "train_mae", "test_r2",
        "train_r2", "pca_components", "coefs", "alpha", "l1_ratio" and, when
        ``ensembler is None``, "optimizer". The "*_mae" entries hold the
        *median* absolute error. "pca_components" is None when the
        "power_transform" step has no ``components_`` attribute.
    y_fit : ndarray
        The (possibly permuted) target the models were fit against.
    """
    if shuffle:
        rng = np.random.default_rng()
        y_fit = rng.permutation(y)
    else:
        y_fit = np.copy(y)

    X_trim = np.copy(X_md_fa)

    cv = RepeatedKFold(
        n_splits=n_splits,
        n_repeats=n_repeats,
        random_state=1729
    )

    imputer = SimpleImputer(strategy="median")
    gpca = GroupPCA(groups=groups_md_fa)

    # Loop-invariant: decides where the fitted estimator lives in the pipeline.
    target_transformed = (
        target_transform_func is not None
        or target_transform_inverse_func is not None
    )

    cv_results = {}

    for cv_idx, (train_idx, test_idx) in enumerate(cv.split(X_trim, y_fit)):
        start = datetime.now()

        X_train, X_test = X_trim[train_idx], X_trim[test_idx]
        y_train, y_test = y_fit[train_idx], y_fit[test_idx]

        # Fit a throwaway GroupPCA on the imputed training fold only to learn
        # the group structure of its output, which the estimator needs.
        groups_pca = gpca.fit(imputer.fit_transform(X_train)).groups_out_

        pipe_skopt = afqi.make_afq_regressor_pipeline(
            imputer_kwargs={"strategy": "median"},
            use_cv_estimator=True,
            power_transformer=power_transformer,
            scaler="standard",
            groups=groups_pca,
            verbose=0,
            pipeline_verbosity=False,
            tuning_strategy="bayes",
            cv=3,
            n_bayes_points=9,
            n_jobs=28,
            l1_ratio=[0.0, 1.0],
            eps=5e-2,
            n_alphas=100,
            power_transformer_kwargs={
                "groups": groups_md_fa
            },
            ensemble_meta_estimator=ensembler,
            ensemble_meta_estimator_kwargs={
                "n_estimators": n_estimators,
                "n_jobs": 1,
                "oob_score": True,
                "random_state": 1729,
            },
            target_transform_func=target_transform_func,
            target_transform_inverse_func=target_transform_inverse_func,
        )

        pipe_skopt.fit(X_train, y_train)

        # Predict once per split and reuse the result for every metric.
        y_pred_test = pipe_skopt.predict(X_test)
        y_pred_train = pipe_skopt.predict(X_train)

        # Not every "power_transform" step exposes ``components_`` (e.g. the
        # default ``power_transformer=False``); store None rather than raising
        # after an expensive fit.
        power_step = pipe_skopt.named_steps.get("power_transform")
        pca_components = getattr(power_step, "components_", None)

        final_estimator, base_estimators = _unwrap_estimate_step(
            pipe_skopt.named_steps["estimate"], target_transformed
        )

        cv_results[cv_idx] = {
            "pipeline": pipe_skopt,
            "train_idx": train_idx,
            "test_idx": test_idx,
            "y_pred": y_pred_test,
            "y_true": y_test,
            "test_mae": median_absolute_error(y_test, y_pred_test),
            "train_mae": median_absolute_error(y_train, y_pred_train),
            "test_r2": r2_score(y_test, y_pred_test),
            "train_r2": r2_score(y_train, y_pred_train),
            "pca_components": pca_components,
            "coefs": [est.coef_ for est in base_estimators],
            "alpha": [est.alpha_ for est in base_estimators],
            "l1_ratio": [est.l1_ratio_ for est in base_estimators],
        }

        if ensembler is None:
            cv_results[cv_idx]["optimizer"] = final_estimator.bayes_optimizer_

        print(f"CV index [{cv_idx:3d}], Elapsed time: ", datetime.now() - start)

    return cv_results, y_fit

In [13]:
# Run a single 10-fold pass of the serial-bagging pipeline with GroupPCA and a
# log/exp target transform; store the (cv_results, y_fit) tuple under its
# experiment name. This cell is slow (see the per-fold timings it prints).
results = {}

results["bagging_target_transform_group_pca"] = get_cv_results(
    n_repeats=1,
    n_splits=10,
    shuffle=False,
    power_transformer=GroupPCA,
    ensembler="serial-bagging",
    n_estimators=20,
    target_transform_func=np.log,
    target_transform_inverse_func=np.exp,
)

CV index [  0], Elapsed time:  0:13:58.697415
CV index [  1], Elapsed time:  0:14:25.753417
CV index [  2], Elapsed time:  0:14:39.390576
CV index [  3], Elapsed time:  0:15:02.288144
CV index [  4], Elapsed time:  0:14:44.287568
CV index [  5], Elapsed time:  0:14:54.976586
CV index [  6], Elapsed time:  0:14:50.852941
CV index [  7], Elapsed time:  0:14:11.045175
CV index [  8], Elapsed time:  0:10:35.455490
CV index [  9], Elapsed time:  0:10:29.386885


In [14]:
# Show which experiments have been stored in ``results`` so far.
results.keys()

dict_keys(['bagging_target_transform_group_pca'])

In [17]:
# Summarize cross-validated performance for every experiment in ``results``.
# Each value is the ``(cv_results, y_fit)`` tuple returned by
# ``get_cv_results``; element 0 holds the per-fold result dictionaries.
for key, res in results.items():
    test_mae = [cvr["test_mae"] for cvr in res[0].values()]
    train_mae = [cvr["train_mae"] for cvr in res[0].values()]
    test_r2 = [cvr["test_r2"] for cvr in res[0].values()]
    train_r2 = [cvr["train_r2"] for cvr in res[0].values()]
    # The "*_mae" entries are computed with ``median_absolute_error``, so the
    # labels say so explicitly. Each metric gets a distinct label (previously
    # both lines were tagged just "test") and the train scores are reported too.
    print(key, "test median absolute error", np.mean(test_mae))
    print(key, "test R^2", np.mean(test_r2))
    print(key, "train median absolute error", np.mean(train_mae))
    print(key, "train R^2", np.mean(train_r2))

bagging_target_transform_group_pca test 4.203163993866996
bagging_target_transform_group_pca test 0.4590716909053481


In [18]:
# Persist all experiment results (including the fitted pipeline objects stored
# under each fold's "pipeline" key) to a pickle file in the working directory.
# Caution: only unpickle this file from a trusted source -- loading a pickle
# can execute arbitrary code.
with open("age_regression_group_pca.pkl", "wb") as fp:
    pickle.dump(results, fp)