### This notebook is designed to explore some interesting features available through ABCD_ML, specifically loading Data_Files and using custom loaders.

This example will require some extra optional ABCD_ML libraries, including nibabel and nilearn!

We will also use fake data for this example

In [1]:
from ABCD_ML import *
import nibabel as nib
import numpy as np
import pandas as pd
import os

## Let's start by saving some fake surface time-series data, and also some fake static (non-time-series) surface data.

In [2]:
# Fake time-series data: 20 arrays, each of shape (100, 10242).
X = np.random.random(size=(20, 100, 10242))
os.makedirs('fake_time_data', exist_ok=True)

# The same array is written under both hemisphere suffixes (np.save adds '.npy').
for hemi in ('_lh', '_rh'):
    for idx in range(len(X)):
        np.save('fake_time_data/' + str(idx) + hemi, X[idx])

# Fake static surface data: 20 vectors of length 10242.
X = np.random.random(size=(20, 10242))
os.makedirs('fake_surf_data', exist_ok=True)

for hemi in ('_lh', '_rh'):
    for idx in range(len(X)):
        np.save('fake_surf_data/' + str(idx) + hemi, X[idx])

### In this experiment we will load both the time-series data and the static surface data

In [3]:
ML = ABCD_ML(log_dr=None, verbose=True)

# Subject ids are the integer prefixes used when the fake files were saved.
subjects = [str(i) for i in range(20)]

timeseries_dr = 'fake_time_data/'
surf_dr = 'fake_surf_data/'

# Build every path from the subject id rather than from os.listdir():
# listdir returns names in arbitrary order, so pairing its output positionally
# with `subjects` could attach a file to the wrong subject (and the lh / rh
# columns of one row could point at different subjects).
# np.save appended the '.npy' extension when the files were written.
lh_timeseries = [timeseries_dr + s + '_lh.npy' for s in subjects]
rh_timeseries = [timeseries_dr + s + '_rh.npy' for s in subjects]
lh_surf = [surf_dr + s + '_lh.npy' for s in subjects]
rh_surf = [surf_dr + s + '_rh.npy' for s in subjects]

# Fail early, with a readable message, if the fake data was not generated.
missing = [p for p in lh_timeseries + rh_timeseries + lh_surf + rh_surf
           if not os.path.exists(p)]
assert not missing, 'Missing fake data files, e.g.: ' + str(missing[:3])

# One row per subject; each data-file column holds the path to that subject's file.
df = pd.DataFrame({'lh_timeseries': lh_timeseries,
                   'rh_timeseries': rh_timeseries,
                   'lh_surf': lh_surf,
                   'rh_surf': rh_surf,
                   'src_subject_id': subjects})

# Random binary target, one value per subject.
df['target'] = np.random.randint(2, size=len(subjects))


# 'target' is dropped here so it is not treated as a data-file column;
# it is loaded separately below as the target and as a stratification value.
ML.Load_Data_Files(df = df,
                   load_func = np.load,
                   drop_keys = ['target'])

ML.Load_Targets(df = df, col_name='target', data_type='b')
ML.Load_Strat(df = df, col_name='target')

ML.Train_Test_Split(test_size=0)

exp_name = My_ML_Exp
log_dr = None
existing_log = append
verbose = True
exp log dr setup at: None
log file at: None
Default params set:
notebook = True
use_abcd_subject_ids = False
low memory mode = False
strat_u_name = _Strat
random state = 534
n_jobs = 1
dpi = 100
mp_context = spawn
ABCD_ML object initialized
Setting default load params, as they have not been set!

Default load params set within self.default_load_params.
----------------------
dataset_type: basic
subject_id: src_subject_id
eventname: None
eventname_col: eventname
overlap_subjects: False
merge: inner
na_values: ['777', '999']
drop_na: True
drop_or_na: drop

To change the default load params, call self.Set_Default_Load_Params()

Loading from df or files

Dropped 1 columns per passed drop_keys argument
Loading from df or files
Dropped 0 cols for all missing values
Dropped 0 rows for missing values, based on the provided drop_na param: True with actual na_thresh: 0
Loaded rows with NaN remaining: 0

loading: target

Load

### This assumes you have saved parcellations in the relevant space; since we saved fake fsaverage5 surface data, we will load the Desikan parcellations

In [3]:
from ABCD_ML.extensions import SurfLabels, Connectivity, Networks

# NOTE(review): hardcoded absolute local directory holding the annotation files.
base = '/home/sage/work/Parcel_Search/Existing_Parcels/'
desikan_lh = f'{base}lh.aparc.annot'
desikan_rh = f'{base}rh.aparc.annot'

# Label loaders for the time-series data: vectorize is switched off so the
# output can be passed on to `connectivity` in a Pipe.
t_surf_rois_lh = SurfLabels(labels=desikan_lh, vectorize=False)
t_surf_rois_rh = SurfLabels(labels=desikan_rh, vectorize=False)

connectivity = Connectivity(vectorize=True, discard_diagonal=False)

# Label loaders for the static surface data (default vectorize setting).
surf_rois_lh = SurfLabels(labels=desikan_lh)
surf_rois_rh = SurfLabels(labels=desikan_rh)

### Try just loading the left hemisphere surface data

In [None]:
# Binary problem restricted to the 'lh_surf' scope.
problem_spec = Problem_Spec(problem_type = 'binary',
                            scope = 'lh_surf')

# Apply the left-hemisphere Desikan label loader to the 'lh_surf' data files.
loaders = Loader(surf_rois_lh, scope='lh_surf')

model_pipeline = Model_Pipeline(loaders=loaders,
                                feat_selectors=Feat_Selector('univariate selection'),
                                feat_importances=Feat_Importance('shap'))

results = ML.Evaluate(model_pipeline, problem_spec)

# Print every step of the pipeline object that ML holds after the Evaluate call.
for step in ML.Model_Pipeline.Model.steps:
    print(step)

In [None]:
results['FIs'][0]

In [None]:
ML.Plot_Global_Feat_Importances()

In [None]:
# scope '_surf' presumably matches both the 'lh_surf' and 'rh_surf' columns — TODO confirm
problem_spec = Problem_Spec(problem_type = 'binary',
                            scope = '_surf')

# One loader per hemisphere, each limited to its own data-file column.
loaders = [Loader(surf_rois_lh, scope='lh_surf'),
           Loader(surf_rois_rh, scope='rh_surf')]

model_pipeline = Model_Pipeline(loaders=loaders,
                                feat_importances=Feat_Importance('shap'))

results = ML.Evaluate(model_pipeline, problem_spec)

# Print every step of the pipeline object that ML holds after the Evaluate call.
for step in ML.Model_Pipeline.Model.steps:
    print(step)

In [None]:
results['FIs'][0].inverse_global_fis

### Load everything

In [None]:
# NOTE(review): mutates the problem_spec created in an earlier cell in place,
# so this cell depends on that cell having been run first.
problem_spec.scope = 'all'

# Static surface files go straight through the label loaders; the time-series
# files go through a label loader (vectorize=False) followed by `connectivity`.
loaders = [Loader(surf_rois_lh, scope='lh_surf'),
           Loader(surf_rois_rh, scope='rh_surf'),
           Loader(Pipe([t_surf_rois_lh, connectivity]), scope='lh_timeseries'),
           Loader(Pipe([t_surf_rois_rh, connectivity]), scope='rh_timeseries')]

model_pipeline = Model_Pipeline(loaders=loaders)
results = ML.Evaluate(model_pipeline, problem_spec)

# Print every step of the pipeline object that ML holds after the Evaluate call.
for step in ML.Model_Pipeline.Model.steps:
    print(step)

In [None]:
# NOTE(review): mutates the existing problem_spec in place (depends on earlier cells).
problem_spec.scope = 'timeseries'

# NOTE(review): these loaders apply surf_rois_lh / surf_rois_rh (created without
# vectorize=False and without a connectivity step) to the time-series files,
# unlike the "Load everything" cell, which uses t_surf_rois_* + connectivity —
# confirm this is intended.
loaders = [Loader(surf_rois_lh, scope='lh_timeseries'),
           Loader(surf_rois_rh, scope='rh_timeseries')]

model_pipeline = Model_Pipeline(loaders=loaders)
results = ML.Evaluate(model_pipeline, problem_spec)

# Print every step of the pipeline object that ML holds after the Evaluate call.
for step in ML.Model_Pipeline.Model.steps:
    print(step)

### Networks class

In [5]:
from ABCD_ML.extensions import SurfLabels, Connectivity, Networks
def my_load_func(loc):
    """Read a headerless tab-separated file and return it without its first column.

    Parameters
    ----------
    loc : str or file-like
        Anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    numpy.ndarray
        The file's contents as an array, with column 0 removed.
    """
    table = pd.read_csv(loc, sep='\t', header=None)
    first_col = table.columns[0]
    return np.array(table.drop(columns=[first_col]))

# NOTE(review): hardcoded absolute local path — point this at the directory
# holding the tab-separated time-series text files to load.
data_loc = '/home/sage/Downloads/TS/'

def file_to_subject_func(file):
    """Derive a subject id from a file path.

    The id is the part of the file name (the text after the last '/') that
    precedes the first underscore; with no underscore the whole file name is
    returned.
    """
    file_name = file.rsplit('/', 1)[-1]
    return file_name.partition('_')[0]

# Fresh ABCD_ML object; this rebinds the name ML used by the earlier cells.
ML = ABCD_ML(log_dr=None, verbose=False)

# A single data-file column, 'run1', listing every file in data_loc whose name
# contains '_01.txt'.
files = {'run1': [os.path.join(data_loc, f) for f in os.listdir(data_loc) if '_01.txt' in f]}

file_to_subject = file_to_subject_func

ML.Load_Data_Files(files = files,
                   file_to_subject = file_to_subject,
                   clear_existing=True,
                   load_func=my_load_func)

# Fabricate a random float target with one value per row of the loaded data.
copy = ML.data.copy()
copy['target'] = np.random.random(len(ML.data))

ML.Load_Targets(df=copy,
                col_name=['target'],
                data_type='f',
                clear_existing=True)

ML.Train_Test_Split(test_size=0.1)

In [23]:




problem_spec = Problem_Spec(problem_type = 'regression',
                            n_jobs=16, random_state=2)

# vectorize=False here, so the Connectivity output is handed to Networks
# un-flattened (presumably as a matrix — TODO confirm).
connectivity = Connectivity(vectorize=False, kind='correlation', discard_diagonal=True)

nets = Networks(threshold=.1,
                threshold_method='density',
                to_compute=['avg_degree','avg_cluster'])


#loaders = Select(loader_list)

# NOTE(review): `ng` has no import in the visible notebook (it may come from the
# star import — confirm), and nets_params is not passed to anything below.
nets_params= {'threshold':ng.p.Choice([.2,.3])}

# Chain connectivity -> nets inside one Loader; one params entry per Pipe step.
loader4 = Loader(obj = Pipe([connectivity, nets]),
                 params = [0, 0])

#loaders = [Loader(Pipe([connectivity,reshape,nets]))]

dt= Model('dt regressor')

model_pipeline = Model_Pipeline(loaders=loader4,
                                scalers=None,
                                model=dt)
                               

results = ML.Evaluate(model_pipeline, problem_spec, n_repeats=1, splits=2)

HBox(children=(FloatProgress(value=0.0, description='Repeats', max=1.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Folds', max=2.0, style=ProgressStyle(description_width='i…





In [None]:
# NOTE(review): repr() called with no argument raises TypeError; this looks like leftover scratch.
repr()

Weird test stuff

In [None]:
from sklearn.base import clone
# Take an sklearn clone of the first step (index 0) of the pipeline held by ML.
cloned_transformer = clone(ML.Model_Pipeline.Model[0])

In [None]:
q = cloned_transformer.file_mapping.copy()

In [None]:
def test(cloned_transformer, X, y):
    
    qq = cloned_transformer.fit_transform(X, y)
    
    return qq, cloned_transformer

# NOTE(review): `memory` is only assigned in a later cell (memory = check_memory(...)),
# so on a fresh top-to-bottom run this line fails unless the name comes from elsewhere.
test_cached = memory.cache(test)

In [None]:
test_cached._get_argument_hash(cloned_transformer.file_mapping)

In [None]:
from joblib.hashing import hash

In [None]:
r = [q[0], q[1]]

In [None]:
z = [q[1], q[0]]

In [None]:
hash(r), hash(z)

In [None]:
test_cached(cloned_transformer, X, y)

In [None]:
# Build the cache object from a hardcoded local directory.
# NOTE(review): check_memory is defined in a later cell of this notebook, and
# '/home/sage/temp' is a machine-specific absolute path.
memory = check_memory('/home/sage/temp')
#fit_transform_one_cached = memory.cache(_fit_transform_one)

In [None]:
# Pull the 'run1' data-file column and the target column out as plain arrays.
X = np.array(ML.all_data['run1'])
y = np.array(ML.all_data['target'])

# Make X a single-column 2-D array. -1 lets numpy infer the row count, so this
# no longer breaks when the number of loaded rows is not exactly 52.
X = X.reshape((-1, 1))

In [None]:
# NOTE(review): fit_transform_one_cached is never defined in the visible notebook —
# its only assignment (memory.cache(_fit_transform_one)) is commented out.
# Also note that this rebinds X to the transformed output.
X, fitted_transformer = fit_transform_one_cached(
                cloned_transformer, X, y, None)

In [None]:
from sklearn.utils import _print_elapsed_time
from sklearn.base import clone
from distutils.version import LooseVersion
import joblib

def _fit_transform_one(transformer,
                       X,
                       y,
                       weight,
                       message_clsname='',
                       message=None,
                       **fit_params):
    """
    Fit ``transformer`` on ``X`` / ``y`` and transform ``X``, inside sklearn's
    ``_print_elapsed_time`` context.

    ``fit_transform`` is used when the transformer provides it; otherwise
    ``fit`` followed by ``transform``. Extra keyword arguments are forwarded
    to the fit call.

    Returns
    -------
    tuple
        ``(result, transformer)``; ``result`` is multiplied by ``weight``
        unless ``weight`` is ``None``.
    """
    # Imported locally so the function carries its own dependency.
    from sklearn.utils import _print_elapsed_time

    with _print_elapsed_time(message_clsname, message):
        if not hasattr(transformer, 'fit_transform'):
            fitted = transformer.fit(X, y, **fit_params)
            res = fitted.transform(X)
        else:
            res = transformer.fit_transform(X, y, **fit_params)

    scaled = res if weight is None else res * weight
    return scaled, transformer

def _fit_transform_one2(transformer,
                       X,
                       y,
                       weight,
                       message_clsname='',
                       message=None,
                       **fit_params):
    """
    Fit ``transformer`` on ``X`` / ``y`` and transform ``X`` (no timing context).

    ``fit_transform`` is used when the transformer provides it; otherwise
    ``fit`` followed by ``transform``. ``message_clsname`` and ``message`` are
    accepted but not used.

    Returns
    -------
    tuple
        ``(result, transformer)``; ``result`` is multiplied by ``weight``
        unless ``weight`` is ``None``.
    """
    if not hasattr(transformer, 'fit_transform'):
        fitted = transformer.fit(X, y, **fit_params)
        res = fitted.transform(X)
    else:
        res = transformer.fit_transform(X, y, **fit_params)

    scaled = res if weight is None else res * weight
    return scaled, transformer


def check_memory(memory):
    """Return a joblib.Memory-like object for ``memory``.

    ``None`` or a string is turned into a new ``joblib.Memory`` (the string is
    used as its location); any object exposing a ``cache`` attribute is
    returned unchanged.

    Parameters
    ----------
    memory : None, str or object with the joblib.Memory interface

    Returns
    -------
    memory : object with the joblib.Memory interface

    Raises
    ------
    ValueError
        If ``memory`` is not joblib.Memory-like.
    """
    needs_new_store = memory is None or isinstance(memory, str)

    if not needs_new_store:
        if hasattr(memory, 'cache'):
            return memory
        raise ValueError("'memory' should be None, a string or have the same"
                         " interface as joblib.Memory."
                         " Got memory='{}' instead.".format(memory))

    # joblib versions before 0.12 take the directory as `cachedir`; later ones as `location`.
    if LooseVersion(joblib.__version__) < '0.12':
        return joblib.Memory(cachedir=memory, verbose=10)
    return joblib.Memory(location=memory, verbose=10)