## Data preparation for decoding models

We will use the fMRI data from Haxby et al. (2001), obtained from OpenNeuro as dataset ds000105.


In [8]:
import os
from nilearn import datasets, plotting
from nilearn.image import load_img, mean_img
from nilearn.maskers import NiftiMasker
from nilearn.glm.first_level import FirstLevelModel
import h5py
import numpy as np
import nibabel as nib
import datalad.api as dl
from bids import BIDSLayout
from nilearn.glm.first_level import make_first_level_design_matrix
import pandas as pd
import matplotlib.pyplot as plt



#### Get the data using datalad

In [9]:
# Local directory that will hold the ds000105 dataset. Set the DS000105_DIR
# environment variable to use a different location; when it is unset the
# original path is used.
data_dir = os.environ.get("DS000105_DIR", "/Users/poldrack/data_unsynced/ds000105")

# Clone the dataset with datalad and then fetch the file contents.
ds = dl.clone(
    path=data_dir,
    source="https://github.com/OpenNeuroDatasets/ds000105.git",
)
dl.get(dataset=data_dir, recursive=True)

# Optionally fetch the fMRIPrep derivatives as well (disabled by default).
get_fmriprep = False
if get_fmriprep:
    fmriprep_dir = os.path.join(data_dir, 'derivatives', 'fmriprep')
    dl.clone(
        path=fmriprep_dir,
        source='https://github.com/OpenNeuroDerivatives/ds000105-fmriprep.git')
    dl.get(dataset=fmriprep_dir, recursive=True)

[INFO] Ensuring presence of Dataset(/Users/poldrack/data_unsynced/ds000105) to get /Users/poldrack/data_unsynced/ds000105 


In [10]:
# The nilearn copy of the Haxby data (which includes the VT masks) is fetched per subject inside get_subject_data() below.



In [11]:
# Index the dataset with pybids and record the run labels found for each subject.

layout = BIDSLayout(data_dir)

# Filters shared by the dataset-wide and the per-subject functional-image queries.
bold_query = dict(datatype='func', extension='nii.gz', return_type='file')

bold_files = layout.get(**bold_query)
subjects = layout.get_subjects()
sub_runs = {}
for subject in subjects:
    # bold_files is rebound on every pass, so after the loop it holds only
    # the last subject's functional images.
    bold_files = layout.get(subject=subject, **bold_query)
    sub_runs[subject] = layout.get_runs(subject=subject)
    n_bold = len(bold_files)
    print(f"Subject {subject} has {n_bold} runs")


Subject 1 has 12 runs
Subject 2 has 12 runs
Subject 3 has 12 runs
Subject 4 has 12 runs
Subject 5 has 11 runs
Subject 6 has 12 runs


### Fit linear models for each subject/run


In [20]:


def get_subject_data(subject, run, layout):
    """Fit a first-level GLM to one run and extract per-condition z-scores in the VT mask.

    Parameters
    ----------
    subject : str or int
        BIDS subject label. It is converted with ``int()`` to select the
        matching subject in nilearn's copy of the Haxby dataset.
    run : str or int
        BIDS run identifier, passed through to ``layout.get``.
    layout : BIDSLayout
        pybids layout used to locate the BOLD, events and anatomical files.

    Returns
    -------
    data : numpy.ndarray
        Array of shape ``(n_conditions, n_mask_voxels)`` holding the masked,
        standardized z-maps, one row per condition.
    conditions : numpy.ndarray
        Sorted unique ``trial_type`` values, in the same order as the rows of ``data``.

    Notes
    -----
    As a side effect, the affine-corrected mask is written to
    ``mask_fixed.nii.gz`` in the current working directory on every call.
    """
    bold_file = layout.get(subject=subject, run=run, datatype='func', extension='nii.gz', return_type='file')[0]
    events_file = layout.get(subject=subject, run=run, datatype='func', extension='tsv', return_type='file')[0]

    anat_file = layout.get(subject=subject, datatype='anat', extension='nii.gz', return_type='file')[0]
    anat_img = load_img(anat_file)

    # Pass a list: a bare int makes fetch_haxby load subjects 1..N, in which
    # case mask_vt[0] would always be subject 1's mask.
    haxby_dataset = datasets.fetch_haxby(subjects=[int(subject)])
    mask_img = nib.load(haxby_dataset.mask_vt[0])
    mask_data = mask_img.get_fdata()

    bold_img = nib.load(bold_file)

    # the mask images from nilearn have a different orientation than the functional images
    # so we need to fix that by re-wrapping the mask voxels with the BOLD affine
    # NOTE(review): this assumes the mask and BOLD voxel grids correspond one-to-one - confirm.
    mask_img_fixed = nib.Nifti1Image(mask_data.astype(np.int16), bold_img.affine)
    mask_img_fixed.to_filename('mask_fixed.nii.gz')

    events = pd.read_csv(events_file, sep='\t')
    n_scans = bold_img.shape[-1]
    # Define the sampling times for the design matrix
    t_r = 2.5
    frame_times = np.arange(n_scans) * t_r
    # Sample at the beginning of each acquisition.
    slice_time_ref = 0.0
    # We use a discrete cosine transform to model signal drifts.
    drift_model = "Cosine"
    # The cutoff for the drift model is 0.01 Hz.
    high_pass = 0.01
    # The hemodynamic response function
    hrf_model = "spm + derivative"

    design_matrix = make_first_level_design_matrix(
        frame_times,
        events,
        hrf_model=hrf_model,
        drift_model=drift_model,
        high_pass=high_pass,
    )

    fmri_glm = FirstLevelModel(t_r, noise_model='ar1', standardize=False, hrf_model=hrf_model)
    fmri_glm = fmri_glm.fit(bold_img, design_matrices=design_matrix)

    # Flip to True to display a stat map for every condition while debugging.
    plot_slices = False
    z_map = {}
    conditions = events.trial_type.unique()
    conditions.sort()
    for condition in conditions:
        z_map[condition] = fmri_glm.compute_contrast(condition, output_type='z_score')
        if plot_slices:
            plotting.plot_stat_map(z_map[condition], threshold=3.0, display_mode='z', 
                                cut_coords=3, black_bg=True, title=condition,
                                bg_img=anat_img)
            plt.show()
    z_maps = nib.concat_images([z_map[condition] for condition in conditions])
    # Use the affine-corrected mask so that it lines up with the z-maps, which
    # are computed from the BOLD image.
    masker = NiftiMasker(mask_img=mask_img_fixed, standardize=True)
    data = masker.fit_transform(z_maps)
    # One row per condition, one column per voxel inside the mask.
    assert data.shape == (len(conditions), mask_data.sum())
    return data, conditions

# Store the GLM outputs under the dataset's derivatives folder.
output_dir = os.path.join(data_dir, 'derivatives', 'glm')
# exist_ok avoids the separate check-then-create step.
os.makedirs(output_dir, exist_ok=True)

# One HDF5 group per subject, with one sub-group per run holding the masked
# z-score matrix and the condition labels for its rows.
with h5py.File(os.path.join(output_dir, 'vtmask_data.h5'), 'w') as hf:
    for subject in subjects:
        g1 = hf.create_group(f'sub-{subject}')
        for run in sub_runs[subject]:
            g2 = g1.create_group(f'run-{run}')
            print(f"Processing subject {subject} run {run}")
            data, conditions = get_subject_data(subject, run, layout)
            g2.create_dataset('vtmaskdata', data=data)
            # Condition labels are stored as UTF-8 byte strings.
            g2.create_dataset('conditions', data=[c.encode('utf-8') for c in conditions])

Processing subject 1 run 01




Processing subject 1 run 02




Processing subject 1 run 03




Processing subject 1 run 04




Processing subject 1 run 05




Processing subject 1 run 06




Processing subject 1 run 07




Processing subject 1 run 08




Processing subject 1 run 09




Processing subject 1 run 10




Processing subject 1 run 11




Processing subject 1 run 12




Processing subject 2 run 01




Processing subject 2 run 02




Processing subject 2 run 03




Processing subject 2 run 04




Processing subject 2 run 05




Processing subject 2 run 06




Processing subject 2 run 07




Processing subject 2 run 08




Processing subject 2 run 09




Processing subject 2 run 10




Processing subject 2 run 11




Processing subject 2 run 12




Processing subject 3 run 01




Processing subject 3 run 02




Processing subject 3 run 03




Processing subject 3 run 04




Processing subject 3 run 05




Processing subject 3 run 06




Processing subject 3 run 07




Processing subject 3 run 08




Processing subject 3 run 09




Processing subject 3 run 10




Processing subject 3 run 11




Processing subject 3 run 12




Processing subject 4 run 01




Processing subject 4 run 02




Processing subject 4 run 03




Processing subject 4 run 04




Processing subject 4 run 05




Processing subject 4 run 06




Processing subject 4 run 07




Processing subject 4 run 08




Processing subject 4 run 09




Processing subject 4 run 10




Processing subject 4 run 11




Processing subject 4 run 12




Processing subject 5 run 01




Processing subject 5 run 02




Processing subject 5 run 03




Processing subject 5 run 04




Processing subject 5 run 05




Processing subject 5 run 06




Processing subject 5 run 07




Processing subject 5 run 08




Processing subject 5 run 09




Processing subject 5 run 10




Processing subject 5 run 11




Processing subject 6 run 01




Processing subject 6 run 02




Processing subject 6 run 03




Processing subject 6 run 04




Processing subject 6 run 05




Processing subject 6 run 06




Processing subject 6 run 07




Processing subject 6 run 08




Processing subject 6 run 09




Processing subject 6 run 10




Processing subject 6 run 11




Processing subject 6 run 12




In [23]:
# Sanity check: reopen the HDF5 file read-only and inspect the first run of subject 1.
h5_path = os.path.join(output_dir, 'vtmask_data.h5')
with h5py.File(h5_path, 'r') as hf:
    run_group = hf['sub-1']['run-01']
    print(run_group)
    print(run_group['conditions'][:])

<HDF5 group "/sub-1/run-01" (2 members)>
[b'bottle' b'cat' b'chair' b'face' b'house' b'scissors' b'scrambledpix'
 b'shoe']


(8, 577)