In [1]:
# from dask.distributed import Client, LocalCluster
# import logging

# cluster = LocalCluster(
#     n_workers=28,
#     threads_per_worker=8,
#     silence_logs=logging.DEBUG
# )

# client = Client(cluster, heartbeat_interval=10000)
# print(client.dashboard_link)

In [2]:
# cluster.scheduler_address

In [3]:
import afqinsight as afqi
import groupyr as gpr
import joblib
import matplotlib.pyplot as plt
import numpy as np
import os.path as op
import pandas as pd
import pickle
import seaborn as sns

from groupyr.transform import GroupRemover, GroupExtractor
from groupyr.decomposition import GroupPCA

from datetime import datetime

from sklearn.base import clone
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import accuracy_score, roc_auc_score

from skopt import BayesSearchCV
from skopt.plots import plot_convergence, plot_objective, plot_evaluations

print(afqi.__version__)
print(gpr.__version__)

0.2.9.dev2565515056
0.2.4.dev561125696


In [4]:
X, y, groups, columns, group_names, subjects, classes = afqi.load_afq_data(
    "../data/raw/als_data",
    target_cols=["class"],
    label_encode_cols=["class"],
)

In [5]:
label_sets = afqi.multicol2sets(pd.MultiIndex.from_tuples(columns, names=["metric", "tractID", "nodeID"]))

In [6]:
pyafq_bundles = [
    c for c in columns
    if c[1] not in ["Right Cingulum Hippocampus", "Left Cingulum Hippocampus"]
]
pyafq_bundles = [
    [c] for c in np.unique([col[1] for col in pyafq_bundles])
]

In [7]:
gr = GroupRemover(
    select=["Right Cingulum Hippocampus", "Left Cingulum Hippocampus"],
    groups=groups,
    group_names=group_names,
)
X_pyafq_bundles = gr.fit_transform(X)

In [8]:
print(X.shape)
print(X_pyafq_bundles.shape)
print(len(label_sets))

(48, 16000)
(48, 14400)
16000


In [9]:
group_names = [
    grp for grp in group_names
    if "Cingulum Hippocampus" not in grp[1]
]

In [10]:
ge = GroupExtractor(
    select=["fa", "md"],
    groups=groups[:144],
    group_names=group_names
)
X_md_fa = ge.fit_transform(X_pyafq_bundles)

In [11]:
print(X.shape)
print(X_pyafq_bundles.shape)
print(X_md_fa.shape)

(48, 16000)
(48, 14400)
(48, 3600)


In [12]:
groups_md_fa = groups[:36]

In [13]:
def get_cv_results(n_repeats=5, n_splits=10,
                   power_transformer=False, 
                   shuffle=False,
                   ensembler=None,
                   n_estimators=10):
    if shuffle:
        rng = np.random.default_rng()
        y_fit = rng.permutation(y)
    else:
        y_fit = np.copy(y)

    X_trim = np.copy(X_md_fa)
        
    cv = RepeatedStratifiedKFold(
        n_splits=n_splits,
        n_repeats=n_repeats,
        random_state=1729
    )

    imputer = SimpleImputer(strategy="median")
    gpca = GroupPCA(groups=groups_md_fa)
    
    cv_results = {}

    for cv_idx, (train_idx, test_idx) in enumerate(cv.split(X_trim, y_fit)):
        start = datetime.now()

        X_train, X_test = X_trim[train_idx], X_trim[test_idx]
        y_train, y_test = y_fit[train_idx], y_fit[test_idx]

        groups_pca = gpca.fit(imputer.fit_transform(X_train)).groups_out_
        
        pipe_skopt = afqi.make_afq_classifier_pipeline(
            imputer_kwargs={"strategy": "median"},
            use_cv_estimator=True,
            power_transformer=power_transformer,
            scaler="standard",
            groups=groups_pca,
            verbose=0,
            pipeline_verbosity=False,
            tuning_strategy="bayes",
            cv=3,
            n_bayes_points=9,
            n_jobs=28,
            l1_ratio=[0.0, 1.0],
            eps=5e-2,
            n_alphas=100,
            power_transformer_kwargs={
                "groups": groups_md_fa,
            },
            ensemble_meta_estimator=ensembler,
            ensemble_meta_estimator_kwargs={
                "n_estimators": n_estimators,
                "n_jobs": 1,
                "oob_score": True,
                "random_state": 1729,
            },
        )
        
        pipe_skopt.fit(X_train, y_train)

        cv_results[cv_idx] = {
            "pipeline": pipe_skopt,
            "train_idx": train_idx,
            "test_idx": test_idx,
            "y_prob": pipe_skopt.predict_proba(X_test)[:, 1],
            "y_pred": pipe_skopt.predict(X_test),
            "y_true": y_test,
            "test_accuracy": accuracy_score(y_test, pipe_skopt.predict(X_test)),
            "train_accuracy": accuracy_score(y_train, pipe_skopt.predict(X_train)),
            "test_roc_auc": roc_auc_score(y_test, pipe_skopt.predict(X_test)),
            "train_roc_auc": roc_auc_score(y_train, pipe_skopt.predict(X_train)),
            "coefs": [
                est.coef_ for est
                in pipe_skopt.named_steps["estimate"].estimators_
            ],
            "pca_components": pipe_skopt.named_steps["power_transform"].components_,
            "alpha": [
                est.alpha_ for est
                in pipe_skopt.named_steps["estimate"].estimators_
            ],
            "l1_ratio": [
                est.l1_ratio_ for est
                in pipe_skopt.named_steps["estimate"].estimators_
            ],
        }
        
        if ensembler is None:
            cv_results[cv_idx]["optimizer"] = pipe_skopt.named_steps["estimate"].bayes_optimizer_

        print(f"CV index [{cv_idx:3d}], Elapsed time: ", datetime.now() - start)
        
    return cv_results, y_fit

In [14]:
results = {}

results[f"bagging_stratify_group_pca"] = get_cv_results(
    n_splits=10, n_repeats=1, power_transformer=GroupPCA,
    ensembler="serial-bagging", shuffle=False,
    n_estimators=20,
)

CV index [  0], Elapsed time:  0:11:02.836631
CV index [  1], Elapsed time:  0:12:20.902078
CV index [  2], Elapsed time:  0:12:06.044455
CV index [  3], Elapsed time:  0:12:14.146831
CV index [  4], Elapsed time:  0:11:35.694972
CV index [  5], Elapsed time:  0:11:41.146706
CV index [  6], Elapsed time:  0:11:31.815093
CV index [  7], Elapsed time:  0:11:26.111120
CV index [  8], Elapsed time:  0:11:36.637716
CV index [  9], Elapsed time:  0:11:37.123915


In [15]:
results.keys()

dict_keys(['bagging_stratify_group_pca'])

In [16]:
for key, res in results.items():
    test_accuracies = [cvr["test_accuracy"] for cvr in res[0].values()]
    train_accuracies = [cvr["train_accuracy"] for cvr in res[0].values()]
    test_auc = [cvr["test_roc_auc"] for cvr in res[0].values()]
    train_auc = [cvr["train_roc_auc"] for cvr in res[0].values()]
    
    print(key, "test  acc", np.mean(test_accuracies))
    print(key, "train acc", np.mean(train_accuracies))
    print(key, "test  auc", np.mean(test_auc))
    print(key, "train auc", np.mean(train_auc))
    print()

bagging_stratify_group_pca test  acc 0.8800000000000001
bagging_stratify_group_pca train acc 0.9884249471458773
bagging_stratify_group_pca test  auc 0.9
bagging_stratify_group_pca train auc 0.9885281385281385



In [17]:
with open("als_classify_group_pca.pkl", "wb") as fp:
    pickle.dump(results, fp)

In [None]:
import itertools

def mean_over_combinations(results):
    length = len(results)
    mean_results = {}
    for r in range(1, length + 1):
        mean_results[r] = [
            np.mean([res["yhat"].values for res in comb], axis=0)
            for comb in itertools.combinations(results, r=r)
        ]
        
    return mean_results

def accuracy_over_combinations(results):
    mean_results = mean_over_combinations(results)
    mean_accuracies = []
    for r in mean_results.keys():
        mean_accuracies += [
            {
                "n_repeats": r,
                "accuracy": accuracy_score(results[0]["class"].values, res > 0.5)
            } for res in mean_results[r]
        ]
        
    return pd.DataFrame(mean_accuracies)

def auc_over_combinations(results):
    mean_results = mean_over_combinations(results)
    mean_auc = []
    for r in mean_results.keys():
        mean_auc += [
            {
                "n_repeats": r,
                "auc": roc_auc_score(results[0]["class"].values, res)
            } for res in mean_results[r]
        ]
        
    return pd.DataFrame(mean_auc)

def get_accuracy_ensemble_dataframe(cv_results, y_true):    
    test_probs = {
        idx: pd.Series(
            data=cvr["y_prob"],
            index=cvr["test_idx"],
            name="yhat"
        )
        for idx, cvr in cv_results.items()
    }
    
    df_ytest = {
        idx: pd.DataFrame(test_probs[idx]).merge(
            pd.DataFrame(y, columns=["class"]),
            left_index=True,
            right_index=True,
            how="left"
        ) for idx in test_probs.keys()
    }
    
    acc_scores = [
        accuracy_score(_df["class"].values, _df["yhat"].values > 0.5)
        for _df in df_ytest.values()
    ]
    
    repeats = [
        pd.concat([df_ytest[i] for i in range(x * 10, (x + 1) * 10)]).sort_index()
        for x in range(len(cv_results) // 10)
    ]
    
    return accuracy_over_combinations(repeats)

In [None]:
df_accuracies = {
    key: get_accuracy_ensemble_dataframe(res[0], y)
    for key, res in results.items()
}

In [None]:
sns.catplot(x="n_repeats", y="accuracy", data=df_accuracies["no_power_transform"])

In [None]:
sns.catplot(x="n_repeats", y="accuracy", data=df_accuracies["shuffle_no_transform"])

In [None]:
results["bagging_stratify_clone_estimators"] = get_cv_results(
    n_splits=10, n_repeats=1, power_transformer=False,
    ensembler="serial-bagging", shuffle=False,
    n_estimators=20
)
# results["bagging_shuffle_stratify"] = get_cv_results(
#     n_splits=10, n_repeats=1, power_transformer=False,
#     ensembler="serial-bagging", shuffle=True
# )

In [None]:
for key, res in results.items():
    test_accuracies = [cvr["test_accuracy"] for cvr in res[0].values()]
    train_accuracies = [cvr["train_accuracy"] for cvr in res[0].values()]
    print(key, "test", np.mean(test_accuracies))
    print(key, "train", np.mean(train_accuracies))

In [None]:
with open("als_classify.pkl", "wb") as fp:
    pickle.dump(results, fp)

In [None]:
bagging_results = results["bagging_stratify"][0]

In [None]:
bagging_results[0].keys()

In [None]:
nested_lists = [[
    est.coef_ for est in
    bag_res["pipeline"].named_steps["estimate"].estimators_
] for bag_res in bagging_results.values()]

coefs = [y for x in nested_lists for y in x]

In [None]:
np.array(coefs).shape

In [None]:
columns_md_fa = [
    c for c in columns
    if "fa" in c or "md" in c
]

In [None]:
from dipy.viz import window, actor, panel
from dipy.data import fetch_bundles_2_subjects, read_bundles_2_subjects
from dipy.tracking.streamline import transform_streamlines
from dipy.viz import colormap
from dipy.viz import ui

import json

In [None]:
df_bag = pd.concat([
    pd.DataFrame.from_dict(
        {
            k: v for k, v in bag_res.items()
            if k in ["test_idx", "y_prob", "y_true"]
        }
    ).set_index(keys="test_idx", drop=True)
    for bag_res in bagging_results.values()
]).sort_index()
df_bag.head()

In [None]:
df_bag["Ground truth"] = df_bag["y_true"].map({0: "Control", 1: "ALS"})
df_bag["y_pred"] = (df_bag["y_prob"] > 0.5).astype(int)
df_bag["Prediction"] = df_bag["y_pred"].map(
    {0: "Predicted control", 1: "Predicted ALS"}
)
df_bag.head()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
sns.swarmplot(
    y="y_prob",
    x="Ground truth",
    hue="Prediction",
    data=df_bag,
    ax=ax,
    s=18
)

ax.set_ylabel("Classification probabilities", fontsize=18)
ax.set_xlabel("Ground truth", fontsize=18)
ax.legend(fontsize=18, markerscale=2)
ax.tick_params(axis = 'both', which = 'major', labelsize = 16)
ax.tick_params(axis = 'both', which = 'minor', labelsize = 12)
ax.axhline(0.5, ls="--", color="black")
print(accuracy_score(df_bag["y_true"], df_bag["y_pred"]))