## Data preparation for encoding/decoding models

For this example, we will use the data from Haxby et al. (2001), which are shared via OpenNeuro:

https://openneuro.org/datasets/ds000105/versions/3.0.0

The data are formatted according to the BIDS standard: https://bids-specification.readthedocs.io/en/stable/index.html

First, import required dependencies. You can install these using `pip install -r requirements.txt` from the main repo directory.

In [2]:
import os
from nilearn import datasets, plotting
from nilearn.image import load_img, mean_img, resample_img
from nilearn.maskers import NiftiMasker
from nilearn.glm.first_level import FirstLevelModel
from nilearn.glm.second_level import SecondLevelModel
from nilearn.plotting import plot_stat_map, plot_design_matrix
import h5py
import numpy as np
import nibabel as nib
import datalad.api as dl
from bids import BIDSLayout
from nilearn.glm.first_level import make_first_level_design_matrix
import pandas as pd
import matplotlib.pyplot as plt
from templateflow import api as tflow
import templateflow
from utils import get_difumo_mask

#### Get the data using datalad

We will use a tool called [DataLad](https://www.datalad.org/) to obtain the data from OpenNeuro.

We will download the raw data, as well as the preprocessed derivatives generated with [fMRIPrep](https://fmriprep.org/en/stable/). Note that downloading these derivative data can take quite a while depending on the speed of one's connection.

In [4]:
# Local root of the dataset; the directory itself must already exist.
data_dir = "/Users/poldrack/data_unsynced/ds000105"
if not os.path.exists(data_dir):
    raise FileNotFoundError(f"Data directory not found: {data_dir}")

# get the raw data
ds = dl.clone(
    path=data_dir,
    source="https://github.com/OpenNeuroDatasets/ds000105.git",
)
dl.get(dataset=data_dir, recursive=True)

# Create the GLM output directory only after cloning, so that the clone
# target does not already contain our own output folder on a first run.
output_dir = os.path.join(data_dir, 'derivatives', 'glm')
os.makedirs(output_dir, exist_ok=True)

get_fmriprep = False  # set to True for the first download, then back to False
fmriprep_dir = os.path.join(data_dir, 'derivatives', 'fmriprep')

# get the preprocessed derivatives - this takes some time!
if get_fmriprep:
    dl.clone(
        path=fmriprep_dir,
        source='https://github.com/OpenNeuroDerivatives/ds000105-fmriprep.git')
    dl.get(dataset=fmriprep_dir, recursive=True)

[INFO] Ensuring presence of Dataset(/Users/poldrack/data_unsynced/ds000105) to get /Users/poldrack/data_unsynced/ds000105 


### Query the dataset using PyBIDS

Because the dataset is organized using the BIDS standard, we can use the [PyBIDS](https://bids-standard.github.io/pybids/) tool to query the dataset and obtain useful metadata.


In [20]:

# Display the current contents of `bold_files` (the list of preprocessed BOLD
# file paths assigned in the PyBIDS query cell).
bold_files

['/Users/poldrack/data_unsynced/ds000105/derivatives/fmriprep/sub-1/func/sub-1_task-objectviewing_run-1_space-MNI152NLin2009cAsym_res-2_desc-preproc_bold.nii.gz',
 '/Users/poldrack/data_unsynced/ds000105/derivatives/fmriprep/sub-1/func/sub-1_task-objectviewing_run-2_space-MNI152NLin2009cAsym_res-2_desc-preproc_bold.nii.gz',
 '/Users/poldrack/data_unsynced/ds000105/derivatives/fmriprep/sub-1/func/sub-1_task-objectviewing_run-3_space-MNI152NLin2009cAsym_res-2_desc-preproc_bold.nii.gz',
 '/Users/poldrack/data_unsynced/ds000105/derivatives/fmriprep/sub-1/func/sub-1_task-objectviewing_run-4_space-MNI152NLin2009cAsym_res-2_desc-preproc_bold.nii.gz',
 '/Users/poldrack/data_unsynced/ds000105/derivatives/fmriprep/sub-1/func/sub-1_task-objectviewing_run-5_space-MNI152NLin2009cAsym_res-2_desc-preproc_bold.nii.gz',
 '/Users/poldrack/data_unsynced/ds000105/derivatives/fmriprep/sub-1/func/sub-1_task-objectviewing_run-6_space-MNI152NLin2009cAsym_res-2_desc-preproc_bold.nii.gz',
 '/Users/poldrack/data

In [21]:
# load the dataset using pybids and get runs for each subject

# Layout of the raw dataset (passed to get_subject_data to find events files).
layout = BIDSLayout(data_dir, config=['bids', 'derivatives'])
# Layout of the fMRIPrep outputs, indexed without BIDS validation.
deriv_layout = BIDSLayout(fmriprep_dir, derivatives=True, validate=False)

# Filters shared by the dataset-wide and the per-subject queries.
bold_query = dict(datatype='func', desc='preproc', suffix='bold',
                  space='MNI152NLin2009cAsym',
                  extension='nii.gz', return_type='file')

bold_files = deriv_layout.get(**bold_query)

print(f'found {len(bold_files)} preprocessed files')

subjects = deriv_layout.get_subjects()
sub_runs = {}
for subject in subjects:
    # Use a separate name here so the dataset-wide `bold_files` list above is
    # not overwritten by the last subject's files.
    subject_bold_files = deriv_layout.get(subject=subject, **bold_query)
    sub_runs[subject] = deriv_layout.get_runs(subject=subject)
    print(f"Subject {subject} has {len(subject_bold_files)} runs")


Example contents of 'dataset_description.json':
{"Name": "Example dataset", "BIDSVersion": "1.0.2", "GeneratedBy": [{"Name": "Example pipeline"}]}


found 71 preprocessed files
Subject 1 has 12 runs
Subject 2 has 12 runs
Subject 3 has 12 runs
Subject 4 has 12 runs
Subject 5 has 11 runs
Subject 6 has 12 runs


### Fit linear models for each subject/run


In [7]:


def get_subject_data(subject, run, layout, deriv_layout, save_maps=True):
    """Fit a first-level GLM for one run and return masked per-condition z-maps.

    Parameters
    ----------
    subject : str
        Subject label without the ``sub-`` prefix.
    run : int or str
        Run label. It is converted with ``int()``, so zero-padded labels such
        as ``'01'`` are accepted.
    layout : BIDSLayout
        Layout of the raw dataset; used to locate the events file.
    deriv_layout : object
        Currently unused. Kept in the signature so existing calls keep
        working; the BOLD path is built from the module-level
        ``fmriprep_dir`` instead.
    save_maps : bool, default True
        If true, write the z-score and effect-size map of every condition to
        the module-level ``output_dir``.

    Returns
    -------
    data : numpy.ndarray
        Masked z-maps with one row per condition and one column per voxel of
        the resampled DiFuMo mask.
    conditions : numpy.ndarray
        Sorted unique ``trial_type`` values from the events file, in the same
        order as the rows of ``data``.

    Raises
    ------
    FileNotFoundError
        If no events file is found for this subject/run in ``layout``.
    """
    run = int(run)
    bold_file = os.path.join(
        fmriprep_dir,
        f'sub-{subject}/func/sub-{subject}_task-objectviewing_run-{run}_space-MNI152NLin2009cAsym_res-2_desc-preproc_bold.nii.gz')
    bold_img = nib.load(bold_file)

    # Restrict the query to events files so that any other .tsv in the func
    # directory can never be picked up by accident.
    events_files = layout.get(subject=subject, run=run, datatype='func',
                              suffix='events', extension='tsv',
                              return_type='file')
    if not events_files:
        raise FileNotFoundError(
            f'No events file found for sub-{subject} run-{run}')
    events = pd.read_csv(events_files[0], sep='\t')

    n_scans = bold_img.shape[-1]
    # Define the sampling times for the design matrix
    t_r = 2.5
    frame_times = np.arange(n_scans) * t_r
    # Sample at the beginning of each acquisition.
    slice_time_ref = 0.0
    # We use a discrete cosine transform to model signal drifts.
    drift_model = "Cosine"
    # The cutoff for the drift model is 0.01 Hz.
    high_pass = 0.01
    # The hemodynamic response function
    hrf_model = "spm + derivative"

    design_matrix = make_first_level_design_matrix(
        frame_times,
        events,
        hrf_model=hrf_model,
        drift_model=drift_model,
        high_pass=high_pass,
    )

    # The design matrix is supplied explicitly; slice_time_ref is passed so the
    # model settings agree with the frame times used to build it.
    fmri_glm = FirstLevelModel(t_r, slice_time_ref=slice_time_ref,
                               noise_model='ar1', standardize=False,
                               hrf_model=hrf_model)
    fmri_glm = fmri_glm.fit(bold_img, design_matrices=design_matrix)

    # np.sort returns a sorted copy and does not depend on unique() returning
    # an object that has an in-place .sort() method.
    conditions = np.sort(events['trial_type'].unique())

    z_map = {}
    for condition in conditions:
        contrast = fmri_glm.compute_contrast(condition, output_type='all')
        z_map[condition] = contrast['z_score']
        if save_maps:
            contrast['z_score'].to_filename(os.path.join(output_dir, f'sub-{subject}_run-{run}_zmap_{condition}.nii.gz'))
            contrast['effect_size'].to_filename(os.path.join(output_dir, f'sub-{subject}_run-{run}_beta_{condition}.nii.gz'))

    # Stack the per-condition z-maps into one 4D image (one volume per condition).
    z_maps = nib.concat_images([z_map[condition] for condition in conditions])

    # Bring the DiFuMo mask onto the grid of the z-maps before masking.
    difumo_mask = get_difumo_mask()
    difumo_mask = resample_img(difumo_mask, target_affine=z_maps.affine,
                               target_shape=z_maps.shape[:3],
                               interpolation='nearest')
    masker = NiftiMasker(mask_img=difumo_mask, standardize=True, target_affine=z_maps.affine)
    data = masker.fit_transform(z_maps)
    # Sanity check: one row per condition, one column per in-mask voxel.
    assert data.shape == (len(conditions), difumo_mask.get_fdata().sum())
    return data, conditions




# Fit the GLM for every subject/run and store the results in a single HDF5
# file laid out as /sub-<subject>/run-<run>/{voxdata, conditions}.
# NOTE: `conditions` stays defined after this loop and is reused by the
# later cells that combine maps across runs and subjects.
with h5py.File(os.path.join(output_dir, 'visctx_data.h5'), 'w') as hf:
    for subject in subjects:
        subject_group = hf.create_group(f'sub-{subject}')
        for run in sub_runs[subject]:
            run_group = subject_group.create_group(f'run-{run}')
            print(f"Processing subject {subject} run {run}")
            # The fourth argument is the derivatives layout, not the fMRIPrep path.
            data, conditions = get_subject_data(subject, run, layout, deriv_layout)
            run_group.create_dataset('voxdata', data=data)
            # Condition labels are stored as UTF-8 byte strings.
            run_group.create_dataset('conditions', data=[c.encode('utf-8') for c in conditions])

Processing subject 1 run 01
Processing subject 1 run 02
Processing subject 1 run 03
Processing subject 1 run 04
Processing subject 1 run 05
Processing subject 1 run 06
Processing subject 1 run 07
Processing subject 1 run 08
Processing subject 1 run 09
Processing subject 1 run 10
Processing subject 1 run 11
Processing subject 1 run 12
Processing subject 2 run 01
Processing subject 2 run 02
Processing subject 2 run 03
Processing subject 2 run 04
Processing subject 2 run 05
Processing subject 2 run 06
Processing subject 2 run 07
Processing subject 2 run 08
Processing subject 2 run 09
Processing subject 2 run 10
Processing subject 2 run 11
Processing subject 2 run 12
Processing subject 3 run 01
Processing subject 3 run 02
Processing subject 3 run 03
Processing subject 3 run 04
Processing subject 3 run 05
Processing subject 3 run 06
Processing subject 3 run 07
Processing subject 3 run 08
Processing subject 3 run 09
Processing subject 3 run 10
Processing subject 3 run 11
Processing subject 3

  Y, _ = mean_scaling(Y, self.signal_scaling)


Processing subject 6 run 03
Processing subject 6 run 04
Processing subject 6 run 05
Processing subject 6 run 06
Processing subject 6 run 07
Processing subject 6 run 08
Processing subject 6 run 09
Processing subject 6 run 10
Processing subject 6 run 11
Processing subject 6 run 12


In [9]:
# combine across runs within each subject
# For every subject/condition pair, the per-run beta maps are fed into an
# intercept-only second-level model and its effect-size map is written out
# as that subject's mean beta map.

for subject in subjects:
    for condition in conditions:
        run_beta_paths = [
            os.path.join(output_dir, f'sub-{subject}_run-{int(run)}_beta_{condition}.nii.gz')
            for run in sub_runs[subject]
        ]
        intercept_only = pd.DataFrame([1] * len(run_beta_paths), columns=['intercept'])
        subject_model = SecondLevelModel()
        subject_model.fit(run_beta_paths, design_matrix=intercept_only)
        mean_beta_path = os.path.join(output_dir, f'sub-{subject}_meanbeta_{condition}.nii.gz')
        subject_model.compute_contrast(output_type='effect_size').to_filename(mean_beta_path)

In [10]:
# run group model
# For every condition, the subjects' mean beta maps are fed into an
# intercept-only second-level model and its z-score map is written out.

for condition in conditions:
    subject_beta_paths = [
        os.path.join(output_dir, f'sub-{subject}_meanbeta_{condition}.nii.gz')
        for subject in subjects
    ]
    intercept_only = pd.DataFrame([1] * len(subject_beta_paths), columns=['intercept'])
    group_model = SecondLevelModel()
    group_model.fit(subject_beta_paths, design_matrix=intercept_only)
    group_zmap_path = os.path.join(output_dir, f'group_zmap_{condition}.nii.gz')
    group_model.compute_contrast(output_type='z_score').to_filename(group_zmap_path)
