### Imports

In [1]:
import os
import numpy as np
import pandas as pd
from glob import glob
from os.path import join
from tqdm import tqdm_notebook as tqdm

import os
import pandas as pd
from glob import glob
from os.path import join
from tqdm import tqdm_notebook as tqdm

### Helpers

In [None]:
def _files_exist(folder, filenames):
    for filename in filenames:
        filepath = join(folder, filename)
        if not os.path.isfile(filepath):
            return False
    return True

def _has_structural(subject_path):
    needed = ["gm_probseg.nii.gz", "subject_to_MNI_warp.nii.gz"]
    return _files_exist(join(subject_path, "structural"), needed)

def _get_tasks(subject_path):
    tasks = glob(join(subject_path, "*"))
    tasks = [os.path.basename(task_path) for task_path in tasks]
    return [task for task in tasks if task != "structural"]

def _validate_subject_task(subject_path, task):
    needed = ["onsets.csv", "normalized.nii.gz"]
    if not _has_structural(subject_path):
        return False
    return _files_exist(join(subject_path, task), needed)

def _map_time_session(time_session):
    _map = {"000": "s1", "2MO": "s2", "6MO": "s3", "12MO": "s4", "24MO": "s5"}
    return _map[time_session]

### Check if files exist

Need fMRI data, onsets, and structurals

In [None]:
root = "/Volumes/hd_4tb/raw"
rows = list()

for project_path in glob(join(root, "*")):
    project = os.path.basename(project_path)
    for time_path in glob(join(project_path, "*")):
        time_session = os.path.basename(time_path)
        for subject_path in glob(join(time_path, "*")):
            subject = os.path.basename(subject_path)
            tasks = _get_tasks(subject_path)
            for task in tasks:
                valid = _validate_subject_task(subject_path, task)
                row = {
                    "project": project,
                    "time_session": _map_time_session(time_session),
                    "subject": subject,
                    "task": task,
                    "data_exists": valid
                }
                rows.append(row)

files = pd.DataFrame(rows)
print(files.shape)
files = files[files.data_exists]
files.shape

### Add movement

In [None]:
movement_csv = "/Volumes/hd_4tb/project/movement/all.csv"
movement = pd.read_csv(movement_csv)
movement.dropna(inplace=True)
df = pd.merge(files, movement, on=["project", "subject", "time_session", "task"], how="left")
df.dropna(inplace=True)
df = df[df.discarded_vols < 38]
df.shape

### Dicom info

In [None]:
info_csv = "/Volumes/hd_4tb/project/info.csv"
info = pd.read_csv(info_csv)
info.time_session = info.time_session.map({"000": "s1", "2MO": "s2", "6MO": "s3", "12MO": "s4", "24MO": "s5"})
full = pd.merge(df, info, on=["project", "subject", "time_session", "task"], how="left")
full = full[((full.age.notnull()) & (full.gender.notnull()))]
full.shape

### Data available

In [None]:
full.to_csv("/Volumes/hd_4tb/project/clean.csv", index=False)
full.task.value_counts()

### Task

In [2]:
def _is_task(task, name):
    if task == "gonogo" and task == name:
        return True
    elif task == "conscious" and task == name:
        return True
    elif task == "nonconscious" and task == name:
        return True
    elif task in {"workingmemSB", "workingmemMB"} and name == "workingmem":
        return True
    return False

### Operators

In [3]:
operators = {
    "adam": {"^apines", "apines", "Pines^Adam", "^apine"},
    "katherine": {"kgrisanz", "^kgrisanz"},
    "monica": {"^mkullar", "mkullar", "^mkullat"},
    "norween": {"^nowreen", "nowreen"},
    "sarah": {"sarchang"},
    "melissa": {"mshiner"},
    "david": {"choidd", "chiodd"},
    "bailey": {"bholtgos"},
    "megan": {"mchesnut"},
}

def _is_operator(operator, name):
    names = operators[name]
    return operator in names

### fMRI

In [4]:
valid = pd.read_csv("/Volumes/hd_4tb/project/clean.csv")

def _get(row, item):
    return row[item]

### Slice timing

In [5]:
slice_df = pd.read_csv("/Volumes/hd_4tb/project/slice_timing/all.csv")

def _slice_timing(subject, session, task, check_ascending):
    if task.endswith("MB"):
        return False
    tasknum = {"gonogo": 1, "conscious": 3, "workingmemSB": 4, "nonconscious": 5}[task]
    selected = slice_df[(
        (slice_df.task == tasknum) &
        (slice_df.subject.str.lower() == subject) &
        (slice_df.session == int(session.replace("s", "")))
    )]
    if len(selected) == 0:
        return np.nan
    order = selected.order.values[0]
    assert order in {"ascending", "descending"}, "%s %s %s" % (subject, session, task)
    if check_ascending:
        return order == "ascending"
    else:
        return order == "descending"

### Gather info

In [6]:
def _info(row):
    ### Setup
    info = dict()
    subject      = _get(row, "subject")
    time_session = _get(row, "time_session")
    task         = _get(row, "task")
    
    ### fMRI
    is_mb = task in {"workingmemMB"}
    info["is_mb"] = is_mb
    info["is_ascending"] = _slice_timing(subject, time_session, task, True)
    info["is_descending"] = _slice_timing(subject, time_session, task, False)
    
    ### Task
    for task_name in ["gonogo", "conscious", "nonconscious", "workingmem"]:
        info["is_%s" % task_name] = _is_task(task, task_name)
        
    ### Operator
    misc_operator = True
    for operator in operators.keys():
        is_operator = _is_operator(_get(row, "operator"), operator)
        info["is_%s" % operator] = is_operator
        if is_operator:
            misc_operator = False
    info["is_misc_operator"] = misc_operator
    
    ### Project
    for project in ["connhc", "connmdd", "engage", "rad"]:
        info["is_%s" % project] = _get(row, "project") == project
        
    ### Time session
    for session in ["s1", "s2", "s3", "s4", "s5"]:
        info["is_%s" % session] = time_session == session
        
    ### Age, Gender
    info["age"] = _get(row, "age")
    info["is_male"] = _get(row, "gender") == "male"
    info["is_female"] = _get(row, "gender") == "female"
    return info

### Main

In [7]:
rows = list()
for i, row in tqdm(valid.iterrows()):
    rows.append(_info(row))
    
df = pd.DataFrame(rows)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




### Check spread of inputs

In [8]:
df.sum()

age                 91560.988547
is_adam               628.000000
is_ascending         1287.000000
is_bailey              81.000000
is_connhc             175.000000
is_connmdd            285.000000
is_conscious          660.000000
is_david              102.000000
is_descending          75.000000
is_engage            1002.000000
is_female            1425.000000
is_gonogo             704.000000
is_katherine          435.000000
is_male               818.000000
is_mb                 100.000000
is_megan               53.000000
is_melissa            133.000000
is_misc_operator      125.000000
is_monica             289.000000
is_nonconscious       675.000000
is_norween            252.000000
is_rad                808.000000
is_s1                1534.000000
is_s2                 220.000000
is_s3                 219.000000
is_s4                 198.000000
is_s5                  99.000000
is_sarah              172.000000
is_workingmem         231.000000
dtype: float64

### Check no null inputs

In [9]:
df.shape, df[df.notnull()].shape

((2270, 29), (2270, 29))

### Save df

In [10]:
df.to_csv("/Volumes/hd_4tb/project/model_input.csv", index=False)