In [1]:
from dask.distributed import Client, LocalCluster
import logging

cluster = LocalCluster(
    n_workers=10,
    threads_per_worker=8,
    silence_logs=logging.DEBUG
)

client = Client(cluster, heartbeat_interval=10000)
print(client.dashboard_link)

distributed.scheduler - INFO - Clear task state
distributed.scheduler - INFO -   Scheduler at:     tcp://127.0.0.1:34895
distributed.scheduler - INFO -   dashboard at:            127.0.0.1:8787
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:33265'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:35111'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:41469'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:36331'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:33751'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:45999'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:36413'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:36277'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:42049'
distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:36035'
distributed.scheduler - INFO - Register worker <Worker '

http://127.0.0.1:8787/status


In [2]:
cluster.scheduler_address

'tcp://127.0.0.1:34895'

In [3]:
import afqinsight as afqi
import joblib
import matplotlib.pyplot as plt
import numpy as np
import os.path as op
import pandas as pd
import pickle
import seaborn as sns

from datetime import datetime

from sklearn.base import clone
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import accuracy_score, roc_auc_score

from skopt import BayesSearchCV
from skopt.plots import plot_convergence, plot_objective, plot_evaluations

print(afqi.__version__)

0.2.9.dev460469908


In [4]:
X, y, groups, columns, subjects, classes = afqi.load_afq_data(
    "../data/raw/als_data",
    target_cols=["class"],
    label_encode_cols=["class"],
)

In [5]:
label_sets = afqi.multicol2sets(pd.MultiIndex.from_tuples(columns, names=["metric", "tractID", "nodeID"]))

In [6]:
pyafq_bundles = [
    c for c in columns
    if c[1] not in ["Right Cingulum Hippocampus", "Left Cingulum Hippocampus"]
]
pyafq_bundles = [
    [c] for c in np.unique([col[1] for col in pyafq_bundles])
]

In [7]:
X_pyafq_bundles = afqi.select_groups(
    X,
    pyafq_bundles,
    label_sets
)

In [8]:
print(X.shape)
print(X_pyafq_bundles.shape)
print(len(label_sets))

(48, 16000)
(48, 14400)
16000


In [9]:
columns = [
    c for c in columns
    if c[1] not in ["Right Cingulum Hippocampus", "Left Cingulum Hippocampus"]
]
label_sets = afqi.multicol2sets(pd.MultiIndex.from_tuples(columns, names=["metric", "tractID", "nodeID"]))

X_md_fa = afqi.select_groups(
    X_pyafq_bundles,
    [["fa"], ["md"]],
    label_sets
)

In [10]:
print(X.shape)
print(X_pyafq_bundles.shape)
print(X_md_fa.shape)

(48, 16000)
(48, 14400)
(48, 3600)


In [11]:
groups_md_fa = groups[:36]

In [16]:
def get_cv_results(n_repeats=5, n_splits=10,
                   power_transformer=False, 
                   shuffle=False,
                   ensembler=None,
                   n_estimators=10,
                   trim_nodes=0,
                   square_features=False):
    if shuffle:
        rng = np.random.default_rng()
        y_fit = rng.permutation(y)
    else:
        y_fit = np.copy(y)

    if trim_nodes > 0:
        grp_mask = np.zeros_like(groups_md_fa[0], dtype=bool)
        grp_mask[trim_nodes:-trim_nodes] = True
        X_mask = np.concatenate([grp_mask] * len(groups_md_fa))

        groups_trim = []
        start_idx = 0
        
        for grp in groups_md_fa:
            stop_idx = start_idx + len(grp) - 2 * trim_nodes
            groups_trim.append(np.arange(start_idx, stop_idx))
            start_idx += len(grp) - 2 * trim_nodes
            
        X_trim = X_md_fa[:, X_mask]
    elif trim_nodes == 0:
        groups_trim = [grp for grp in groups_md_fa]
        X_trim = np.copy(X_md_fa)
    else:
        raise ValueError("trim_nodes must be non-negative.")
        
    if square_features:
        _n_samples, _n_features = X_trim.shape
        X_trim = np.hstack([X_trim, np.square(X_trim)])
        groups_trim = [np.concatenate([g, g + _n_features]) for g in groups_trim]

    cv = RepeatedStratifiedKFold(
        n_splits=n_splits,
        n_repeats=n_repeats,
        random_state=1729
    )

    cv_results = {}
    pipe_skopt = afqi.pipeline.make_base_afq_pipeline(
        imputer_kwargs={"strategy": "median"},
        power_transformer=power_transformer,
        scaler="standard",
        estimator=LogisticRegressionCV,
        estimator_kwargs={
            "verbose": 0,
            "Cs": 50,
            "penalty": "l1",
            "cv": 3,
            "n_jobs": 10,
            "solver": "saga",
            "max_iter": 500,
        },
        verbose=0,
        ensemble_meta_estimator=ensembler,
        ensemble_meta_estimator_kwargs={
            "n_estimators": n_estimators,
            "n_jobs": 1,
            "oob_score": True,
            "random_state": 1729,
        },
    )
    
    for cv_idx, (train_idx, test_idx) in enumerate(cv.split(X_trim, y_fit)):
        start = datetime.now()

        X_train, X_test = X_trim[train_idx], X_trim[test_idx]
        y_train, y_test = y_fit[train_idx], y_fit[test_idx]

        with joblib.parallel_backend("dask"):
            pipe_skopt.fit(X_train, y_train)

        cv_results[cv_idx] = {
            "pipeline": pipe_skopt,
            "train_idx": train_idx,
            "test_idx": test_idx,
            "y_prob": pipe_skopt.predict_proba(X_test)[:, 1],
            "y_pred": pipe_skopt.predict(X_test),
            "y_true": y_test,
            "test_accuracy": accuracy_score(y_test, pipe_skopt.predict(X_test)),
            "train_accuracy": accuracy_score(y_train, pipe_skopt.predict(X_train)),
            "test_roc_auc": roc_auc_score(y_test, pipe_skopt.predict(X_test)),
            "train_roc_auc": roc_auc_score(y_train, pipe_skopt.predict(X_train)),
        }
        if ensembler is not None:
            cv_results[cv_idx]["coefs"] = [
                est.coef_ for est
                in pipe_skopt.named_steps["estimate"].estimators_
            ]
            cv_results[cv_idx]["C"] = [
                est.C_ for est
                in pipe_skopt.named_steps["estimate"].estimators_
            ]
        else:
            cv_results[cv_idx]["coefs"] = pipe_skopt.named_steps["estimate"].coef_
            cv_results[cv_idx]["C"] = pipe_skopt.named_steps["estimate"].C_
        
        print(f"CV index [{cv_idx:3d}], Elapsed time: ", datetime.now() - start)
        
    return cv_results, y_fit

In [17]:
results = {}

results[f"pure_lasso_trim0"] = get_cv_results(
    n_splits=10, n_repeats=1, power_transformer=False, shuffle=False,
    trim_nodes=0, square_features=False
)

distributed.scheduler - INFO - Receive client connection: Client-worker-6120f84a-64cb-11eb-b835-73a50fbfcb64
distributed.core - INFO - Starting established connection


CV index [  0], Elapsed time:  0:00:07.851499




CV index [  1], Elapsed time:  0:00:07.810273




CV index [  2], Elapsed time:  0:00:08.824207


distributed.scheduler - INFO - Receive client connection: Client-worker-6fc0b664-64cb-11eb-b83d-42010a8a0002
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Receive client connection: Client-worker-6fbe95d8-64cb-11eb-b83d-42010a8a0002
distributed.core - INFO - Starting established connection


CV index [  3], Elapsed time:  0:00:08.840997




CV index [  4], Elapsed time:  0:00:08.298693
CV index [  5], Elapsed time:  0:00:07.926956


distributed.scheduler - INFO - Receive client connection: Client-worker-7eb04528-64cb-11eb-b838-42010a8a0002
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Receive client connection: Client-worker-7eb01d9e-64cb-11eb-b838-42010a8a0002
distributed.core - INFO - Starting established connection
distributed.scheduler - INFO - Receive client connection: Client-worker-7eb3742c-64cb-11eb-b838-42010a8a0002
distributed.core - INFO - Starting established connection


CV index [  6], Elapsed time:  0:00:09.734925




CV index [  7], Elapsed time:  0:00:07.832069


distributed.scheduler - INFO - Receive client connection: Client-worker-892b3012-64cb-11eb-b839-1b3e903af6f1
distributed.core - INFO - Starting established connection


CV index [  8], Elapsed time:  0:00:08.424053
CV index [  9], Elapsed time:  0:00:08.147017




In [18]:
results.keys()

dict_keys(['pure_lasso_trim0'])

In [19]:
for key, res in results.items():
    test_accuracies = [cvr["test_accuracy"] for cvr in res[0].values()]
    train_accuracies = [cvr["train_accuracy"] for cvr in res[0].values()]
    test_auc = [cvr["test_roc_auc"] for cvr in res[0].values()]
    train_auc = [cvr["train_roc_auc"] for cvr in res[0].values()]
    
    print(key, "test  acc", np.mean(test_accuracies))
    print(key, "train acc", np.mean(train_accuracies))
    print(key, "test  auc", np.mean(test_auc))
    print(key, "train auc", np.mean(train_auc))
    print()

pure_lasso_trim0 test  acc 0.76
pure_lasso_trim0 train acc 0.9767441860465116
pure_lasso_trim0 test  auc 0.7583333333333333
pure_lasso_trim0 train auc 0.9768398268398268



In [20]:
with open("als_classify_pure_lasso.pkl", "wb") as fp:
    pickle.dump(results, fp)