### Imports

In [1]:
import os
import pandas as pd
from glob import glob
from os.path import join
from tqdm import tqdm_notebook as tqdm

### Helpers

In [2]:
def _files_exist(folder, filenames):
    for filename in filenames:
        filepath = join(folder, filename)
        if not os.path.isfile(filepath):
            return False
    return True

def _has_structural(subject_path):
    """Check that the subject's "structural" folder holds both required images.

    The required files are "gm_probseg.nii.gz" and "subject_to_MNI_warp.nii.gz".
    """
    structural_dir = join(subject_path, "structural")
    return _files_exist(
        structural_dir,
        ["gm_probseg.nii.gz", "subject_to_MNI_warp.nii.gz"],
    )

def _get_tasks(subject_path):
    tasks = glob(join(subject_path, "*"))
    tasks = [os.path.basename(task_path) for task_path in tasks]
    return [task for task in tasks if task != "structural"]

def _validate_subject_task(subject_path, task):
    """Return True when a subject/task pair has every file it needs.

    That means the subject passes `_has_structural` and the task folder
    contains both "onsets.csv" and "normalized.nii.gz". The task folder is
    only inspected when the structural check succeeds.
    """
    task_files = ["onsets.csv", "normalized.nii.gz"]
    return _has_structural(subject_path) and _files_exist(
        join(subject_path, task), task_files
    )

def _map_time_session(time_session):
    _map = {"000": "s1", "2MO": "s2", "6MO": "s3", "12MO": "s4", "24MO": "s5"}
    return _map[time_session]

### Check if files exist

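A subject/task pair is kept only if all required files exist: the normalized fMRI image (`normalized.nii.gz`), the onsets file (`onsets.csv`), and the subject's structural files (`gm_probseg.nii.gz`, `subject_to_MNI_warp.nii.gz`).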

In [3]:
# Crawl <root>/<project>/<session>/<subject>/<task> and record, for every
# subject/task pair, whether all the files it needs are present.
root = "/Volumes/hd_4tb/raw"
rows = list()

for project_path in glob(join(root, "*")):
    project = os.path.basename(project_path)
    for time_path in glob(join(project_path, "*")):
        time_session = os.path.basename(time_path)
        for subject_path in glob(join(time_path, "*")):
            subject = os.path.basename(subject_path)
            tasks = _get_tasks(subject_path)
            for task in tasks:
                valid = _validate_subject_task(subject_path, task)
                row = {
                    "project": project,
                    # Session folders use raw labels; store the s1..s5 code.
                    "time_session": _map_time_session(time_session),
                    "subject": subject,
                    "task": task,
                    "data_exists": valid
                }
                rows.append(row)

# glob() returns an empty list (no error) when `root` is missing, e.g. when
# the external volume is not mounted. An empty frame has no `data_exists`
# column, so fail here with a clear message instead of an AttributeError below.
if not rows:
    raise IOError(
        "No project/session/subject/task folders found under {!r}; "
        "is the volume mounted?".format(root)
    )

files = pd.DataFrame(rows)
print(files.shape)  # every subject/task pair found on disk
# Keep only the pairs whose required files are all present.
files = files[files.data_exists]
files.shape

(3372, 5)


(2921, 5)

### Add movement

In [4]:
# Attach the movement table to the validated runs, then drop runs with any
# missing value and runs with 38 or more discarded volumes.
movement_csv = "/Volumes/hd_4tb/project/movement/all.csv"
movement = pd.read_csv(movement_csv).dropna()
merge_keys = ["project", "subject", "time_session", "task"]
df = files.merge(movement, on=merge_keys, how="left").dropna()
df = df[df.discarded_vols < 38]
df.shape

(2421, 6)

### DICOM info

In [5]:
# Join the per-run info table onto the runs that passed the movement filter,
# then keep only runs with both age and gender recorded.
info_csv = "/Volumes/hd_4tb/project/info.csv"
# Read the session column as text. With dtype inference, a column (or parser
# chunk) that holds only "000" is read as integer 0, which the string-keyed
# map below would silently turn into NaN.
info = pd.read_csv(info_csv, dtype={"time_session": str})
# Convert raw session labels to the s1..s5 codes used by `df`.
info.time_session = info.time_session.map({"000": "s1", "2MO": "s2", "6MO": "s3", "12MO": "s4", "24MO": "s5"})
full = pd.merge(df, info, on=["project", "subject", "time_session", "task"], how="left")
full = full[((full.age.notnull()) & (full.gender.notnull()))]
full.shape

(2270, 34)

### Data available

In [6]:
# Number of usable runs per task after all filters.
full["task"].value_counts()

gonogo          704
nonconscious    675
conscious       660
workingmemSB    131
workingmemMB    100
Name: task, dtype: int64

In [7]:
# Persist the filtered table (without the index column) for later analysis.
clean_csv = "/Volumes/hd_4tb/project/clean.csv"
full.to_csv(clean_csv, index=False)