## Data preparation for encoding/decoding models

For this example, we will use the data from Haxby et al. (2001), which are shared via OpenNeuro:

https://openneuro.org/datasets/ds000105/versions/3.0.0

The data are formatted according to the BIDS standard: https://bids-specification.readthedocs.io/en/stable/index.html

First, import required dependencies. You can install these using `pip install -r requirements.txt` from the main repo directory.

In [1]:
%load_ext autoreload
%autoreload 2

import os
import nilearn
from nilearn import datasets, plotting
from nilearn.image import load_img, mean_img, resample_img
from nilearn.maskers import NiftiMasker
from nilearn.glm.first_level import FirstLevelModel
from nilearn.glm.second_level import SecondLevelModel
from nilearn.plotting import plot_stat_map, plot_design_matrix
import h5py
import numpy as np
import nibabel as nib
import datalad.api as dl
from bids import BIDSLayout
from nilearn.glm.first_level import make_first_level_design_matrix
import pandas as pd
import matplotlib.pyplot as plt
from templateflow import api as tflow
import templateflow
from utils import (get_difumo_mask, 
                   get_subject_common_brain_mask,
                   get_group_common_mask,
                   get_subject_runs,
                   get_layouts)

#### Get the data using datalad

We will use a tool called [Datalad](https://www.datalad.org/) to obtain the data from openneuro. 

We will download the raw data, as well as the processed data (generated using [fMRIPrep](https://fmriprep.org/en/stable/)). Note that downloading these derivative data can take quite a while depending on the speed of one's connection.

In [3]:
# Local location of the raw dataset (machine-specific path; edit for your system).
data_dir = "/Users/poldrack/data_unsynced/ds000105"
# NOTE(review): this check runs before dl.clone below, so the directory must
# already exist when the cell is executed - confirm that is intended for a first run.
assert os.path.exists(data_dir), "Data directory not found: %s" % data_dir
# The fMRIPrep derivatives live inside the raw dataset directory.
fmriprep_dir = os.path.join(data_dir, 'derivatives', 'fmriprep')

# get the raw data
ds = dl.clone(
    path=data_dir,
    source="https://github.com/OpenNeuroDatasets/ds000105.git",
)
# recursive=True extends the request to nested datasets as well
dl.get(dataset=data_dir, recursive=True)

get_fmriprep = False  # set to True for the first download of the derivatives, then back to False

# get the preprocessed derivatives - this takes some time!
if get_fmriprep:
    dl.clone(
        path=fmriprep_dir,
        source='https://github.com/OpenNeuroDerivatives/ds000105-fmriprep.git')
    dl.get(dataset=fmriprep_dir, recursive=True)

[INFO] Ensuring presence of Dataset(/Users/poldrack/data_unsynced/ds000105) to get /Users/poldrack/data_unsynced/ds000105 


### Query the dataset using PyBIDS

Because the dataset is organized using the BIDS standard, we can use the [PyBIDS](https://bids-standard.github.io/pybids/) tool to query the dataset and obtain useful metadata.


In [4]:
# Build one layout for the raw dataset (data_dir) and one for the fMRIPrep
# derivatives (fmriprep_dir). get_layouts is a helper imported from utils;
# presumably both return values are pybids BIDSLayout objects - confirm in utils.


layout, deriv_layout = get_layouts(data_dir, fmriprep_dir)


Example contents of 'dataset_description.json':
{"Name": "Example dataset", "BIDSVersion": "1.0.2", "GeneratedBy": [{"Name": "Example pipeline"}]}


### Create common mask for each subject

Each run will have slightly different voxels included in its brain mask, but we want to have a common mask across all runs, so we will generate a mask that includes the intersection of masks across all of the individual subs/runs.

In [5]:
# Common mask computed by the utils helper from the raw-data layout.
# NOTE(review): group_mask is not referenced again in the cells visible here;
# the later steps use get_subject_common_brain_mask instead.
group_mask = get_group_common_mask(layout)

### Confound regression

Use the outputs from fMRIPrep to generate a denoised version of the data.



In [23]:
def run_confound_regression(layout, deriv_layout, data_dir, overwrite=False):
    """Regress fMRIPrep confounds out of every preprocessed BOLD run.

    For each subject/run, the ``desc-preproc`` BOLD image in
    ``MNI152NLin2009cAsym`` space is cleaned with ``nilearn.image.clean_img``
    using the ``trans_*``, ``rot_*``, ``a_comp_cor_*`` and ``cosine_*`` columns
    of the matching fMRIPrep confounds table, restricted to the subject's
    common brain mask. The cleaned image is written to the path of the input
    with ``'preproc'`` replaced by ``'cleaned'``.

    Parameters
    ----------
    layout
        Layout of the raw dataset; only ``get_subjects()`` is called on it.
    deriv_layout
        Layout of the fMRIPrep derivatives; used for the file and metadata
        queries.
    data_dir : str
        Dataset directory, passed through to ``get_subject_runs`` and
        ``get_subject_common_brain_mask``.
    overwrite : bool, default False
        If False, a cleaned file that already exists on disk is loaded
        instead of being recomputed.

    Returns
    -------
    cleaned_images : dict
        ``cleaned_images[subject][run] = (cleaned_img_file, cleaned_img)``.
    t_r : float or None
        The ``RepetitionTime`` shared by all runs (None if no run was found).

    Raises
    ------
    AssertionError
        If a query does not return exactly one file, or if runs disagree on
        ``RepetitionTime``.
    """
    # the cosine_ regressors must accompany the aCompCor components
    confound_prefixes = ('trans_', 'rot_', 'a_comp_cor_', 'cosine_')
    cleaned_images = {}
    # Tracked across the whole dataset because a single value is returned.
    t_r = None

    subjects = [int(sub) for sub in layout.get_subjects()]
    for subject in subjects:
        cleaned_images[subject] = {}
        runs = get_subject_runs(subject, data_dir)
        print(f'Subject {subject} has {len(runs)} runs')
        mask_img = get_subject_common_brain_mask(subject, data_dir)
        for run in runs:
            preproc_file = deriv_layout.get(subject=subject, run=run,
                                            desc='preproc', space='MNI152NLin2009cAsym',
                                            suffix='bold', extension='nii.gz',
                                            return_type='file')
            # Validate the query before indexing so a failed lookup reports
            # subject and run instead of a bare IndexError.
            assert len(preproc_file) == 1, f"Found {len(preproc_file)} preproc files for subject {subject} run {run}"
            preproc_path = preproc_file[0]
            cleaned_img_file = preproc_path.replace('preproc', 'cleaned')
            # Guard against ever overwriting (or re-reading) the input file.
            assert cleaned_img_file != preproc_path

            run_t_r = deriv_layout.get_metadata(preproc_path)['RepetitionTime']
            assert run_t_r is not None
            if t_r is None:
                t_r = run_t_r
            assert t_r == run_t_r, (
                f"RepetitionTime {run_t_r} for subject {subject} run {run} "
                f"differs from {t_r}")

            if os.path.exists(cleaned_img_file) and not overwrite:
                # Reuse the cleaned file from a previous execution.
                cleaned_img = nib.load(cleaned_img_file)
                cleaned_images[subject][run] = (cleaned_img_file, cleaned_img)
                continue

            preproc_img = nib.load(preproc_path)
            confound_file = deriv_layout.get(subject=subject, run=run,
                                             desc='confounds',
                                             suffix='timeseries', extension='tsv',
                                             return_type='file')
            assert len(confound_file) == 1, f"Found {len(confound_file)} confound files for subject {subject} run {run}"
            # bfill replaces missing values with the next valid value in the column
            confounds = pd.read_csv(confound_file[0], sep='\t').bfill()
            confound_cols = [c for c in confounds.columns
                             if c.startswith(confound_prefixes)]
            confounds_selected = confounds[confound_cols]
            # All other clean_img options are left at nilearn's defaults.
            cleaned_img = nilearn.image.clean_img(preproc_img,
                                                  confounds=confounds_selected,
                                                  t_r=t_r, mask_img=mask_img)
            cleaned_img.to_filename(cleaned_img_file)
            cleaned_images[subject][run] = (cleaned_img_file, cleaned_img)
    return cleaned_images, t_r

# Clean every subject/run; with overwrite=False, cleaned files already on disk are reused.
cleaned_images, t_r = run_confound_regression(
    layout, deriv_layout, data_dir, overwrite=False)

                              

Subject 1 has 12 runs
Subject 2 has 12 runs
Subject 3 has 12 runs
Subject 4 has 12 runs
Subject 5 has 11 runs
Subject 6 has 12 runs


### select task block timepoints

Drop the first two trials (4 seconds) from each task block, and generate a condition label for each remaining timepoint.

In [98]:
# TO DELETE?
# NOTE(review): scratch version of the labelling logic that get_cond_info
# implements per subject/run. It reads `preproc_img` and `events`, which no
# cell visible here defines at module scope, so it presumably only ran against
# leftover kernel state - confirm before keeping it.

# One timestamp (in seconds) per volume.
n_timepoints = preproc_img.shape[-1]
timepoints = np.arange(0, n_timepoints * t_r, t_r)

# find task onsets in the events file
n_trials_to_skip = 2 # skip 2 trials i.e. 4 seconds
blocklen = 20 # block length in seconds after removing first 4 seconds

if 1:
    conditions = events.trial_type.unique().tolist()
    conditions.sort()
    onsets = {}
    for condition in conditions:
        match_df = events[events.trial_type == condition]
        # onset of the first trial kept for this condition (index n_trials_to_skip)
        onsets[condition] = match_df.onset.tolist()[n_trials_to_skip]     
    cond_df = pd.DataFrame({'timepoint': timepoints, 'condition': None})
    # label each timepoint that falls inside [onset, onset + blocklen)
    for idx in cond_df.index:
        for condition in conditions:
            if cond_df.loc[idx, 'timepoint'] >= onsets[condition] and cond_df.loc[idx, 'timepoint'] < (onsets[condition] + blocklen):
                cond_df.loc[idx, 'condition'] = condition
    # every labelled condition must cover exactly 8 timepoints
    for cond in cond_df.condition.unique():
        if cond is None:
            continue
        assert len(cond_df[cond_df.condition == cond]) == 8


Found 121 timepoints, TR=2.5 seconds


In [25]:
# find the condition label for timepoints that are beyond the initial 4 seconds
def get_cond_info(layout, deriv_layout, t_r, cleaned_images,
                  blocklen=20, n_trials_to_skip=2):
    """Label every timepoint of every run with its task condition.

    For each condition in a run's events file, the onset of trial number
    ``n_trials_to_skip`` (zero-based) marks the start of a window of
    ``blocklen`` seconds. Timepoints inside ``[onset, onset + blocklen)`` get
    that condition as label; all other timepoints keep ``None``.

    Parameters
    ----------
    layout
        Layout of the raw dataset; used to list subjects and to locate the
        events file of each run.
    deriv_layout
        Not used by this function; kept so existing calls keep working.
    t_r : float
        Repetition time in seconds, used to timestamp the volumes.
    cleaned_images : dict
        ``cleaned_images[subject][run][1]`` must expose ``.shape``; its last
        axis gives the number of timepoints.
    blocklen : float, default 20
        Length in seconds of the window retained for each condition.
    n_trials_to_skip : int, default 2
        Number of leading trials dropped from each condition.

    Returns
    -------
    dict
        ``cond_info[subject][run]`` is a DataFrame with the columns
        ``timepoint`` (seconds) and ``condition``.

    Raises
    ------
    AssertionError
        If no events file is found for a run, or if a labelled condition does
        not cover the number of timepoints implied by ``blocklen`` and ``t_r``.

    Notes
    -----
    Relies on the module-level ``data_dir`` for ``get_subject_runs``.
    """
    cond_info = {}
    # A half-open window of `blocklen` seconds on a grid with spacing `t_r`
    # holds either floor or ceil of blocklen / t_r samples; with the defaults
    # and t_r = 2.5 that is exactly 8.
    samples_per_block = blocklen / t_r
    expected_counts = {int(np.floor(samples_per_block)),
                       int(np.ceil(samples_per_block))}

    subjects = [int(sub) for sub in layout.get_subjects()]
    for subject in subjects:
        runs = get_subject_runs(subject, data_dir)
        cond_info[subject] = {}
        for run in runs:
            events_files = layout.get(subject=subject, run=run, datatype='func',
                                      extension='tsv', return_type='file')
            assert events_files, f"No events file found for subject {subject} run {run}"
            events = pd.read_csv(events_files[0], sep='\t')

            n_timepoints = cleaned_images[subject][run][1].shape[-1]
            # Multiplying an integer range guarantees exactly n_timepoints
            # entries; arange with a float step can be off by one.
            timepoints = np.arange(n_timepoints) * t_r

            cond_df = pd.DataFrame({'timepoint': timepoints, 'condition': None})
            # Sorted order is kept so that, should windows overlap, the later
            # condition wins as before.
            for condition in sorted(events.trial_type.unique().tolist()):
                condition_onsets = events.loc[events.trial_type == condition, 'onset'].tolist()
                onset = condition_onsets[n_trials_to_skip]
                in_block = ((cond_df['timepoint'] >= onset)
                            & (cond_df['timepoint'] < onset + blocklen))
                cond_df.loc[in_block, 'condition'] = condition

            # value_counts skips the unlabelled (None) rows.
            for cond, count in cond_df['condition'].value_counts().items():
                assert count in expected_counts, (
                    f"Found {count} timepoints for condition {cond} "
                    f"in subject {subject} run {run}")
            cond_info[subject][run] = cond_df
    return cond_info
        
# Build the timepoint/condition table for every subject/run (default blocklen and n_trials_to_skip).
cond_info = get_cond_info(layout, deriv_layout, t_r, cleaned_images)

In [47]:

def get_task_images(cond_info, cleaned_images):
    """Keep only the labelled volumes of each cleaned run.

    For every subject/run in ``cond_info``, rows with a missing value are
    dropped and the remaining table (which must have 64 rows) is written as
    ``*_events.tsv``. The matching 4D image, holding just those volumes, is
    written under the cleaned file's path with ``'cleaned'`` replaced by
    ``'task'``; if that file already exists it is loaded instead.

    Parameters
    ----------
    cond_info : dict
        ``cond_info[subject][run]`` is a per-timepoint DataFrame.
    cleaned_images : dict
        ``cleaned_images[subject][run] = (cleaned_img_file, cleaned_img)``.

    Returns
    -------
    task_images : dict
        ``task_images[subject][run]`` is the task-only image.
    task_info : dict
        ``task_info[subject][run]`` is the table of retained rows.
    """
    task_images, task_info = {}, {}
    for subject, runs in cond_info.items():
        subject_images = {}
        subject_info = {}
        task_images[subject] = subject_images
        task_info[subject] = subject_info
        for run, cond_df in runs.items():
            # rows that still contain a missing value are discarded
            good_trials = cond_df.dropna()
            assert len(good_trials) == 64, f"Found {len(good_trials)} good trials for subject {subject} run {run}"

            cleaned_entry = cleaned_images[subject][run]
            cleaned_path = cleaned_entry[0]
            task_img_file = cleaned_path.replace('cleaned', 'task')
            # the task image must never be written over the cleaned image
            assert task_img_file != cleaned_path

            events_path = task_img_file.replace('_bold.nii.gz', '_events.tsv')
            good_trials.to_csv(events_path, sep='\t', index=False)
            subject_info[run] = good_trials

            if os.path.exists(task_img_file):
                task_img = nib.load(task_img_file)
            else:
                cleaned_img = cleaned_entry[1]
                kept_volumes = list(good_trials.index)
                task_data = cleaned_img.get_fdata()[:, :, :, kept_volumes]
                task_img = nib.Nifti1Image(task_data, cleaned_img.affine)
                task_img.to_filename(task_img_file)
            subject_images[run] = task_img
    return task_images, task_info

# Write (or load, if already on disk) the task-only image and events table for every subject/run.
task_images, task_info = get_task_images(cond_info, cleaned_images)


### save to HDF5 file

In [52]:
# Export the task-only data to a single HDF5 file laid out as
#   sub-<subject>/run-<run>/{vtmaskdata, braindata, conditions}
subjects = [int(sub) for sub in layout.get_subjects()]
use_difumo = False  # NOTE(review): not referenced anywhere in this cell

output_dir = os.path.join(data_dir, 'derivatives', 'cleaned')
os.makedirs(output_dir, exist_ok=True)

with h5py.File(os.path.join(output_dir, 'haxby_data_cleaned.h5'), 'w') as hf:

    for subject in subjects:
        g1 = hf.create_group(f'sub-{subject}')

        # Both masks depend only on the subject, so build them once per
        # subject rather than once per run (resampling is the costly step).
        sub_mask = get_subject_common_brain_mask(subject, data_dir)

        # DiFuMo mask resampled onto the subject mask grid, then intersected
        # with the subject's brain mask
        difumo_mask = resample_img(get_difumo_mask(), sub_mask.affine, sub_mask.shape,
                                   interpolation='nearest')
        mask = nib.Nifti1Image(np.logical_and(sub_mask.get_fdata().astype('int32'),
                                              difumo_mask.get_fdata().astype('int32')).astype('int32'),
                               sub_mask.affine)

        runs = get_subject_runs(subject, data_dir)
        for run in runs:
            g2 = g1.create_group(f'run-{run}')

            # data restricted to the DiFuMo mask: (timepoints, voxels in mask)
            masker = NiftiMasker(mask_img=mask)
            difumo_data = masker.fit_transform(task_images[subject][run])
            assert difumo_data.shape[0] == task_images[subject][run].shape[-1]
            assert difumo_data.shape[1] == np.sum(mask.get_fdata())
            g2.create_dataset('vtmaskdata', data=difumo_data)

            # get the whole brain data
            masker = NiftiMasker(mask_img=sub_mask)
            braindata = masker.fit_transform(task_images[subject][run])
            assert braindata.shape[0] == task_images[subject][run].shape[-1]
            assert braindata.shape[1] == np.sum(sub_mask.get_fdata())
            g2.create_dataset('braindata', data=braindata)

            # condition labels are stored as UTF-8 byte strings, one per timepoint
            g2.create_dataset('conditions', data=[c.encode('utf-8') for c in task_info[subject][run].condition.tolist()])



  force_resample = _check_force_resample(force_resample)
  force_resample = _check_force_resample(force_resample)
  force_resample = _check_force_resample(force_resample)
  force_resample = _check_force_resample(force_resample)
  force_resample = _check_force_resample(force_resample)
  force_resample = _check_force_resample(force_resample)
  force_resample = _check_force_resample(force_resample)
  force_resample = _check_force_resample(force_resample)
  force_resample = _check_force_resample(force_resample)
  force_resample = _check_force_resample(force_resample)
  force_resample = _check_force_resample(force_resample)
  force_resample = _check_force_resample(force_resample)
  force_resample = _check_force_resample(force_resample)
  force_resample = _check_force_resample(force_resample)
  force_resample = _check_force_resample(force_resample)
  force_resample = _check_force_resample(force_resample)
  force_resample = _check_force_resample(force_resample)
  force_resample = _check_force

In [63]:
import utils

# Sanity check: reopen the exported file read-only and print the shape of
# each dataset stored for subject 1, run 1.
h5_path = os.path.join(output_dir, 'haxby_data_cleaned.h5')
with h5py.File(h5_path, 'r') as hf:
    # utils.list_all_datasets(hf)
    for dataset_name in ('braindata', 'vtmaskdata', 'conditions'):
        print(hf['sub-1/run-1'][dataset_name].shape)


(64, 217452)
(64, 18609)
(64,)
