### Regularized (banded) CV regression workflow for Neuroscout
- Multivariate workflows in `Himalaya`  (& nilearn?)
- Input needed from the user
    - Define datasets (independent model fitting for all datasets)
    - Define cross-validation strategy
    - Define estimator
    - Define preprocessing steps
    - Pass parameters
    - Output: scores, parameters, predicted time series
- Implementing CV at the subject level for now
- Define outputs

In [1]:
!pip install himalaya

You should consider upgrading via the '/Users/rr48396/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
import pyns
import pandas as pd
import nibabel as nib
import numpy as np
import glob
from copy import deepcopy

In [4]:
api = pyns.Neuroscout()

### Get fMRI data from datalad
Let's retrieve data for a couple of subjects from Budapest.
Neuroscout dataset can be found under the `neuroscout-datasets` organization: https://github.com/neuroscout-datasets

In [5]:
!datalad install https://github.com/neuroscout-datasets/ds003017.git

[[1;31mERROR  [0m] target path already exists and not empty, refuse to clone into target path [install(/Users/rr48396/scratch/ds003017)] 
[1;1minstall[0m([1;31merror[0m): /Users/rr48396/scratch/ds003017 ([1;35mdataset[0m) [target path already exists and not empty, refuse to clone into target path]
[0m

In [6]:
%%capture
!datalad get ds003017/fmriprep/sub-sid000005/func/*-preproc_bold.nii.gz
!datalad get ds003017/fmriprep/sub-sid000007/func/*-preproc_bold.nii.gz
!datalad get ds003017/fmriprep/sub-sid000009/func/*-preproc_bold.nii.gz

In [7]:
TR = 1

In [12]:
def _make_outputs(sub_id='sid000005', TR=1):
    arrays = []
    fs = glob.glob(f'ds003017/fmriprep/sub-{sub_id}/func/*-preproc_bold.nii.gz') # ds003017, remove subsetting
    runs = sorted(set([f.split('/')[-1].split('_')[2].split('-')[1] for f in fs]))
    resampling_ts = []
    for r_ix, r in enumerate(runs):
        img_file = [f for f in fs if f'run-{r}' in f][0]
        data = np.asanyarray(nib.load(img_file).dataobj)
        run_y = data.reshape([data.shape[0] * data.shape[1] * data.shape[2], data.shape[3]]).T
        arrays.append(run_y)
        resampling_ts.append(np.arange(0, TR*run_y.shape[0], step=TR))
    return np.vstack(arrays), resampling_ts

In [13]:
Y, resampling_ts = _make_outputs(TR=TR)

### Build input matrix
Let's retrieve predictor events for multiple sets of predictors. \
For now, let's pick three sets: <b>Audioset</b> + <b>MFCC</b> + <b>mel</b> features (plus some confounds).

In [19]:
audioset = ['as-Music','as-Animal',
            'as-Whistling',
            'as-Vehicle',
            'as-Wild animals',
            'as-Thunderstorm',
            'as-Noise',
            'as-Fire',
            'as-Water',
            'as-Wind',
            'as-Glass',
            'as-Wood',
            'as-Silence',
            'as-Mechanisms',
            'as-Alarm',
            'as-Hands',
            'as-Tools',
            'as-Speech',
            'as-Explosion',
            'as-Engine',
            'as-Liquid',
            'as-Musical instrument']
mfccs = [f'mfcc_{i}' for i in range(20)]
mel = [f'mel_{i}' for i in range(64)]
confounds = ['rot_x', 'rot_y', 'rot_z', 'trans_x', 'trans_y', 'trans_z',
             'a_comp_cor_00', 'a_comp_cor_01', 'a_comp_cor_02',
             'a_comp_cor_03','a_comp_cor_04','a_comp_cor_05']

In [173]:
def _resample(df, timestamps, run_id):
    ''' Naive resampling (averaging between TRs) '''
    out_lst = []
    for i in range(len(timestamps)):
        sub_df = df[(df['onset']>=timestamps[i])]
        if timestamps[i] != timestamps[-1]:
            sub_df = sub_df[sub_df['onset']<timestamps[i+1]]
            sub_df = sub_df.drop(['duration', 'onset'], axis=1).groupby('run_id').agg('mean').reset_index()
        if sub_df.shape[0] == 0:
            sub_df = deepcopy(df)
            for c in sub_df.columns:
                sub_df[c] = np.nan
            sub_df = pd.DataFrame(sub_df.iloc[0,:]).T
            sub_df = sub_df.drop(['duration', 'onset'], axis=1)
        sub_df['TR'] = i
        sub_df['run_id'] = run_id
        out_lst.append(sub_df)
    resampled = pd.concat(out_lst, ignore_index=True)
    return resampled

In [188]:
def _make_input_matrices(predictor_sets, resampling_ts, outs, sub_id='sid000005'):
    # Get run_ids
    ds = api.datasets.get()
    dataset_id = [d['id'] for d in ds if d['name']=='Budapest'][0]
    all_runs = api.runs.get()
    subj_runs = [r for r in all_runs 
                 if (r['dataset_id']==dataset_id) and (r['subject']==sub_id)]
    subj_runs = sorted(subj_runs, key=lambda d: d['number'])
    run_ids = [r['id'] for r in subj_runs]
    
    # Retrieve data
    output_matrices = []
    
    # Return one matrix per predictor set
    for p_ix, ps in enumerate(predictor_sets):
        print(f'Extracting predictor set {p_ix}')
        # For each predictor
        for i, p_name in enumerate(ps):
            event_list = [] 
            # ... and each run
            for r_ix, r in enumerate(run_ids):
                pid = [p['id'] for p in api.predictors.get(run_id=r) 
                       if p['name']==p_name][0]
                run_events = api.predictor_events.get(run_id=r, predictor_id=pid)
                run_events_df = pd.DataFrame(run_events)
                for c in run_events_df.columns:
                    run_events_df[c] = run_events_df[c].astype(float)
                # Sort by onset, remove n/a and rename columns
                run_events_df.sort_values(by='onset', inplace=True)
                run_events_df.replace({'n/a': np.nan}, inplace=True)
                run_events_df.rename({'value': p_name}, axis=1, inplace=True)
                run_events_df.drop('predictor_id', axis=1, inplace=True)
                run_events_df = _resample(run_events_df, resampling_ts[r_ix], r)
                event_list.append(run_events_df)
            # Stack data from multiple runs
            event_df = pd.concat(event_list)
            if i == 0:
                set_df = event_df.copy()
            else:
                # merge data from multiple predictors
                set_df = set_df.merge(event_df, on=['TR', 'run_id'], how='outer')
        # assert set_df.shape[0] == outs.shape[0]
        run_index = set_df['run_id'].values.tolist()
        set_df.drop(['TR', 'run_id'], axis=1, inplace=True)
        output_matrices.append(set_df)
    return output_matrices, run_index

In [None]:
mats, idx = _make_input_matrices(predictor_sets=[mfccs[:2], confounds[:2]], # audioset, mel
                                 resampling_ts=resampling_ts, 
                                 outs=Y)

`mats` is a list of input matrices (one per band); `idx` is the run_index

In [191]:
print('input matrices (per predictor band)')
print(mats)
print('run indices')
print(idx)

input matrices (per predictor band)
[      mfcc_0  mfcc_1
0        NaN     NaN
1        NaN     NaN
2        NaN     NaN
3        NaN     NaN
4        NaN     NaN
...      ...     ...
3047     NaN     NaN
3048     NaN     NaN
3049     NaN     NaN
3050     NaN     NaN
3051     NaN     NaN

[3052 rows x 2 columns],       stimulus_id_x     rot_x  duration_x  onset_x  stimulus_id_y     rot_y  \
0               NaN  0.000118         NaN      NaN            NaN  0.000053   
1               NaN  0.000000         NaN      NaN            NaN  0.000000   
2               NaN -0.000190         NaN      NaN            NaN  0.000000   
3               NaN -0.000425         NaN      NaN            NaN  0.000000   
4               NaN  0.000213         NaN      NaN            NaN -0.000225   
...             ...       ...         ...      ...            ...       ...   
3047            NaN  0.008027         NaN      NaN            NaN -0.003552   
3048            NaN  0.008298         NaN      NaN   

### Preprocessing and model fitting

In [None]:
###

### Handling outputs

In [139]:
### 

### Validate against other workflows

In [None]:
### 