## Predict BMI

In this example, we will constrain the infinite set of possible analyses to investigating how a few different analytic choices affect performance when predicting subjects' BMI.

1. Using vertex data directly as input to a ridge regression
2. Apply both a pre-existing parcellation and a randomly generated parcellation to the data, then try two ML models, an elastic-net regression and an SVM.
3. Test ensembles over randomly generated parcellations.

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import BPt as bp
import os
from os.path import dirname, abspath
import numpy as np
from neurotools.transform.rois import SurfLabels

In [2]:
# Project root is taken as the parent of the current working
# directory, with the data folder directly beneath it
main_dr = dirname(abspath(os.getcwd()))
data_dr = os.path.join(main_dr, 'data')

# Optional cache directory, which speeds up some
# operations; set to None to ignore
cache_loc = os.path.join(data_dr, 'cache', 'fs_surf')

# Read the pre-saved dataset from disk, then display it
dataset_loc = os.path.join(data_dr, 'datasets', 'fs_surf.dataset')
data = bp.read_pickle(dataset_loc)
data

Unnamed: 0_level_0,area,thickness
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1
sub-0001,Loc(1484),Loc(556)
sub-0002,Loc(1736),Loc(808)
sub-0003,Loc(1211),Loc(283)
sub-0004,Loc(1748),Loc(820)
sub-0005,Loc(1240),Loc(312)
...,...,...
sub-0924,Loc(1743),Loc(815)
sub-0925,Loc(1372),Loc(444)
sub-0926,Loc(1483),Loc(555)
sub-0927,Loc(1391),Loc(463)

Unnamed: 0_level_0,BIS,BMI,IST_intelligence_total,NEO_N,STAI_T,age,education_level,sex
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
sub-0001,16,23.0,159.0,28,44.0,22.00,2,0
sub-0002,20,20.0,199.0,30,31.0,21.75,2,0
sub-0003,20,31.0,227.0,34,40.0,25.25,0,0
sub-0004,22,20.0,270.0,29,32.0,22.50,0,0
sub-0005,20,23.0,212.0,27,23.0,22.25,0,1
...,...,...,...,...,...,...,...,...
sub-0924,22,21.0,246.0,40,56.0,22.25,2,1
sub-0925,13,30.0,150.0,28,44.0,25.25,2,1
sub-0926,20,22.0,161.0,27,30.0,20.75,0,1
sub-0927,23,35.0,190.0,35,38.0,24.25,2,0


Below are some utility functions that will make the rest of the code simpler.

In [3]:
def get_standard_search_model(model_str, params=1):
    '''Return the base model used throughout this example: a bp.Model
    wrapped in a nested random hyper-parameter search (60 iterations).

    params=1 means the default hyper-parameter distribution
    (distribution 1) is used for the requested model, see:
    https://sahahn.github.io/BPt/options/pipeline_options/models.html#ridge-regressor
    '''

    return bp.Model(model_str,
                    params=params,
                    param_search=bp.ParamSearch('RandomSearch', n_iter=60))


def load_parcel(parcel_name):
    '''Load a pre-generated parcellation from a different project,
    downloading it first if it is not already saved locally.

    Parameters
    ----------
    parcel_name : str
        Name of the parcellation without the '.npy' extension,
        e.g., 'hcp_mmp' or 'random_500_0'.

    Returns
    -------
    parcel : numpy array
        The contents of the saved '.npy' file.

    Raises
    ------
    FileNotFoundError
        If the parcellation is not saved locally and
        could not be downloaded.
    '''
    # Local import, as it is only needed by this helper
    from urllib.request import urlretrieve

    # Make sure parcel directory init'ed
    # (this path is relative to the current working directory)
    parcel_dr = os.path.join('data', 'parcels', 'fs_LR32k_concat')
    os.makedirs(parcel_dr, exist_ok=True)

    # Get location
    parcel_loc = os.path.join(parcel_dr, f'{parcel_name}.npy')

    # If doesn't exist, try to download
    if not os.path.exists(parcel_loc):
        parcel_url = ('https://raw.githubusercontent.com/sahahn/'
                      f'parc_scaling/main/parcels/{parcel_name}.npy')

        # Download under a temporary name first, so an interrupted
        # transfer never leaves a truncated file at parcel_loc
        partial_loc = parcel_loc + '.part'
        try:
            urlretrieve(parcel_url, partial_loc)
            os.replace(partial_loc, parcel_loc)
        except OSError as err:
            # URL / HTTP errors are OSError subclasses, clean up and
            # report the failure here instead of at np.load below
            if os.path.exists(partial_loc):
                os.remove(partial_loc)
            raise FileNotFoundError(
                f'Parcellation "{parcel_name}" was not found at '
                f'{parcel_loc} and could not be downloaded from '
                f'{parcel_url}') from err

    # Load parcel
    return np.load(parcel_loc)

In [4]:
# Common evaluation parameters can be stored once in a special
# object called ProblemSpec, then re-used across evaluate calls.
# NOTE(review): the notebook title and introduction refer to predicting
# BMI, but the target selected here is 'BIS' - confirm which is intended.
ps = bp.ProblemSpec(
    target='BIS',     # The target variable we want to predict
    scope='all',      # Set by default, says to use all available features
    n_jobs=8,         # The number of multi-proc jobs to use
    random_state=1,   # A re-usable random state for this experiment
)

## Predict Vertex Level

We will construct a fairly simple machine learning pipeline to try first. The basic idea is that a ridge regression will be fit and evaluated on the vertex values directly.

In [None]:
# The loader is responsible for converting from saved
# Data Files to a flattened numpy array, i.e., an
# acceptable input to an ML model
loader = bp.Loader('identity')

# Next, values will be scaled according to robust scaling,
# i.e., according to the IQR instead of the min and max
scaler = bp.Scaler('robust', quantile_range=(25, 75))

# Ridge regression with hyper-parameter search dist.
model = get_standard_search_model('ridge')

# Lastly, chain the steps in order within a Pipeline
# object, which we can pass to evaluate functions
vertex_steps = [loader, scaler, model]
pipe = bp.Pipeline(steps=vertex_steps)

# Evaluate with default 5 fold CV, then display the results
vertex_results = bp.evaluate(pipe, data, problem_spec=ps)
vertex_results

Predicting target = BIS
Using problem_type = regression
Using scope = all (defining a total of 2 features).
Evaluating 928 total data points.


Folds:   0%|          | 0/5 [00:00<?, ?it/s]


Training Set: (742, 2)
Validation Set: (186, 2)


## Predict ROI level

Next, we will consider adding a step where values are converted into mean values per ROI. We will prepare special new Loader objects, and make use of another useful feature of BPt: the bp.Compare and bp.Option objects. These will allow us to test all of our combinations of interest at once.

In [None]:
from neurotools.transform.rois import SurfLabels

# Load the two parcellations to compare: hcp_mmp and a random parcel
hcp_mmp = load_parcel(parcel_name='hcp_mmp')
random_500 = load_parcel(parcel_name='random_500_0')

# Wrap each parcellation as a named option,
# then put them together in a special compare wrapper
parcel_options = [
    bp.Option(hcp_mmp, name='hcp_mmp'),
    bp.Option(random_500, name='random_500_0'),
]
options = bp.Compare(parcel_options)

# Alternate loader from before, this one applies
# SurfLabels with the parcellation options compared above
sl = SurfLabels(options, vectorize=True)
compare_loaders = bp.Loader(sl, cache_loc=cache_loc)

# Same idea for the model, compare elastic-net and SVM
model_options = bp.Compare(['elastic', 'svm'])
compare_models = get_standard_search_model(model_options)

# NOTE(review): `model` is re-bound here as well, replacing the
# model defined in the vertex level cell - confirm this is intended.
model = compare_models

# Put together in pipeline
roi_steps = [compare_loaders, scaler, compare_models]
roi_pipe = bp.Pipeline(steps=roi_steps)

# Evaluate quietly
compare_results = bp.evaluate(
    roi_pipe,
    data,
    problem_spec=ps,
    eval_verbose=0,
    mute_warnings=True,
)

# Show a summary
compare_results.summary()

## Ensemble of random parcellations

Last, but not least, we will show an example of how to generate and test an ensemble over different parcellations. For this option, let's use elastic-net models as the sub-models and a ridge regression as the model responsible for averaging predictions.

In [None]:
def get_sub_pipe(parcel_name):
    '''Return a bp.Pipeline for a single parcellation, made up of
    a SurfLabels loader built from the named parcel, a robust
    scaler and the standard search model with 'elastic'.'''

    # Load the parcel and wrap it as the ROI transformer
    surf_labels = SurfLabels(load_parcel(parcel_name=parcel_name),
                             vectorize=True)

    # Loader, scaler and model, in the order they are applied
    steps = [
        bp.Loader(surf_labels, cache_loc=cache_loc),
        bp.Scaler('robust', quantile_range=(25, 75)),
        get_standard_search_model('elastic'),
    ]

    return bp.Pipeline(steps)

### Version 1

In [None]:
# Each sub pipeline of the ensemble is built from a
# different random parcellation with 500 parcels
parcel_names = [f'random_500_{i}' for i in range(5)]
sub_pipes = [get_sub_pipe(parcel_name=name) for name in parcel_names]

# Put together the stacking ensemble.
# NOTE(review): the accompanying text mentions a ridge regression as the
# model combining predictions, but 'elastic' is used here - confirm.
stacker = get_standard_search_model('elastic')
ensemble = bp.Ensemble('stacking',
                       models=sub_pipes,
                       base_model=stacker)

# Evaluate, then display the results
ensemble_results1 = bp.evaluate(ensemble,
                                data,
                                problem_spec=ps,
                                mute_warnings=True)
ensemble_results1

### Version 2

In [None]:
# Alternatively, we can ensemble over even more parcellations, from even
# more sizes, at the expense of additional processing time.
# Three random parcellations are used for each size, in order of size.
sub_pipes = [get_sub_pipe(parcel_name=f'random_{size}_{i}')
             for size in (500, 1000, 2000, 3000)
             for i in range(3)]

# Put together the stacking ensemble.
# NOTE(review): the accompanying text mentions a ridge regression as the
# model combining predictions, but 'elastic' is used here - confirm.
stacker = get_standard_search_model('elastic')
ensemble = bp.Ensemble('stacking',
                       models=sub_pipes,
                       base_model=stacker)

# Evaluate, then display the results
ensemble_results2 = bp.evaluate(ensemble,
                                data,
                                problem_spec=ps,
                                mute_warnings=True)
ensemble_results2