In this notebook, the underlying data, subject splits, etc. are prepared for each task.

In [1]:
import nibabel as nib
import nibabel.freesurfer.io as io

import numpy as np
import pandas as pd
import pickle

from ROIs import get_X
from Individual import Existing_Surf_Individual

from sklearn.model_selection import train_test_split

from nilearn import datasets, surface
import networkx as nx

The raw data for each task is saved in its own folder (sst_raw_data, nback_raw_data, etc.), with each contrast saved as an individual file, stacked by subject. Each folder also contains a subjects.txt file listing, in order, the subject that corresponds to each index of the stacked data.

In [2]:
# Contrast names per task. The position of a name in its list is the contrast
# index: make_destr_rois_csv looks names up as contrast_inds[task][index], and
# load_hemis uses the same index in the cortical_{lh,rh}_<index>.mgz file name.
# Do not reorder these lists.
contrast_inds = {
'sst': [
'correct_go',     # 0
'incorrect_go',  # 1
'correctlate_go',  # 2
'noresp_go',  # 3
'incorrectlate_go',  # 4
'correct_stop',  # 5
'incorrect_stop',  # 6
'ssd_stop',  # 7
'correct_go_vs_fixation',  # 8
'correct_stop_vs_correct_go',  # 9
'incorrect_stop_vs_correct_go', # 10
'any_stop_vs_correct_go', # 11
'correct_stop_vs_incorrect_stop', # 12
'incorrect_go_vs_correct_go', # 13
'incorrect_go_vs_incorrect_stop'], # 14

'nback': [
'2_back_posface',  # 0
'2_back_neutface',  # 1
'2_back_negface',  # 2
'2_back_place',  # 3
'0_back_posface',  # 4
'0_back_neutface',  # 5
'0_back_negface',  # 6
'0_back_place',  # 7
'cue',  # 8
'0_back', # 9
'2_back', # 10
'place',  # 11
'emotion',  # 12
'2_back_vs_0_back', # 13
'face_vs_place',  # 14
'emotion_vs_neutface',  # 15
'negface_vs_neutface',  # 16
'posface_vs_neutface']  # 17
}

In [3]:
def load_hemis(task, contrasts):
    '''Load, stack and cache the per-contrast surface data of one task.

    For every index in `contrasts` the files
    `<task>_raw_data/cortical_lh_<index>.mgz` and
    `<task>_raw_data/cortical_rh_<index>.mgz` are read with nibabel. The
    arrays of each hemisphere are stacked along a new leading (contrast)
    axis, the resulting shapes are printed, and both stacks are written to
    `data/lh_<task>.npy` and `data/rh_<task>.npy`.

    Parameters
    ----------
    task : str
        Task name, used as the prefix of the raw data folder and in the
        names of the saved files (e.g. 'sst', 'nback').
    contrasts : sequence of int
        Contrast indices to load; each is formatted with `str()` into the
        file name.

    Returns
    -------
    lh, rh : np.ndarray
        Stacked left / right hemisphere data, one entry per contrast along
        the first axis.
    '''

    dr = task + '_raw_data/'

    lh = []
    rh = []

    for i in contrasts:
        # Drop singleton axes per file rather than after stacking: squeezing
        # the stacked array would also remove the contrast axis when only a
        # single contrast is requested, and save_data_by_split indexes the
        # result as [contrast, :, subject]. For more than one contrast the
        # result is identical to squeezing after the stack.
        lh.append(np.squeeze(nib.load(dr + 'cortical_lh_' + str(i) + '.mgz').get_fdata()))
        rh.append(np.squeeze(nib.load(dr + 'cortical_rh_' + str(i) + '.mgz').get_fdata()))

    lh = np.stack(lh)
    rh = np.stack(rh)

    print(task, lh.shape, rh.shape)

    # NOTE(review): assumes the 'data/' directory already exists.
    np.save('data/lh_' + task + '.npy', lh)
    np.save('data/rh_' + task + '.npy', rh)

    return lh, rh

def load_data_subjects(task):
    '''Return the subject ids listed in `<task>_raw_data/subjects.txt`.

    The file is read line by line and surrounding whitespace (including the
    trailing newline) is stripped from every line. Lines are returned in
    file order, one list entry per line.
    '''

    subjects_path = task + '_raw_data/subjects.txt'

    subjects = []
    with open(subjects_path, 'r') as f:
        for raw_line in f:
            subjects.append(raw_line.strip())

    return subjects
    

def make_destr_rois_csv(lh, rh, task, contrasts, contrast_inds):
    '''Compute per-ROI values for a task and save them as a labelled csv.

    ROI values are obtained from `get_X` using the parcellation loaded from
    the fsaverage5 `aparc.a2009s` annotation files, then written to
    `data/Destr_<task>.csv` with one row per subject and one column per
    (contrast, region, hemisphere), named `<contrast>.<region>.<hemi>`.

    Parameters
    ----------
    lh, rh : array
        Left / right hemisphere data as returned by `load_hemis`.
    task : str
        Task name; selects the subjects file, the entry of `contrast_inds`
        and the output file name.
    contrasts : sequence of int
        Contrast indices, in the same order used to build `lh` / `rh`.
    contrast_inds : dict
        Maps task name to the list of contrast names; indexed as
        `contrast_inds[task][contrast_index]`.
    '''

    # Load subjects
    subjects = load_data_subjects(task)

    # Load in the destr atlas
    # NOTE(review): hard-coded local FreeSurfer install path.
    fs5_dr = '/usr/local/freesurfer/subjects/fsaverage5/label/'
    destr = Existing_Surf_Individual(lh_loc = fs5_dr + 'lh.aparc.a2009s.annot',
                                     rh_loc = fs5_dr + 'rh.aparc.a2009s.annot')

    # Region names are stored as bytes in the annotation file; the first
    # entry is skipped. The left hemisphere names are used for both
    # hemispheres - presumably the two annotation files share region names.
    raw_names = io.read_annot(fs5_dr + 'lh.aparc.a2009s.annot')[2]
    labels = [name.decode("utf-8") for name in raw_names][1:]

    # Get ROI values
    X = get_X(lh, rh, destr.parcels_lh, destr.parcels_rh)

    # Number of ROI columns per (contrast, hemisphere) pair.
    # NOTE(review): assumes the columns of X are ordered as all left
    # hemisphere contrasts followed by all right hemisphere contrasts,
    # each with `per_contrast` ROI columns - confirm against get_X.
    per_contrast = np.shape(X)[1] // len(contrasts) // 2

    # Collect every column first and build the frame in a single step;
    # inserting columns one at a time fragments the DataFrame and triggers
    # pandas' PerformanceWarning. The dict keeps insertion order, so the
    # column order matches the order the columns are generated in.
    columns = {'src_subject_id': subjects}
    cnt = 0

    for hemi in ('lh', 'rh'):
        for contrast in contrasts:

            contrast_name = contrast_inds[task][contrast]
            for j in range(per_contrast):
                columns[contrast_name + '.' + labels[j] + '.' + hemi] = X[:,cnt]
                cnt += 1

    data = pd.DataFrame(columns)

    # Save csv for this task
    loc = 'data/Destr_' + task + '.csv'
    data.to_csv(loc, index=False)
    print('saved destr rois at:', loc)

Load the data by task and contrast and save destr ROIs for each.

In [4]:
# Contrast indices are positions in contrast_inds[<task>]; load_hemis also uses
# them to pick the cortical_{lh,rh}_<index>.mgz files inside <task>_raw_data/.
sst_contrasts = [0, 9, 10, 11, 12, 13, 14]
# sst_lh / sst_rh are kept as globals: save_data_by_split uses them in a later cell.
sst_lh, sst_rh = load_hemis('sst', sst_contrasts)

# Writes data/Destr_sst.csv
make_destr_rois_csv(sst_lh, sst_rh, 'sst', sst_contrasts, contrast_inds)

nback_contrasts = [9, 10, 13]
# nback_lh / nback_rh are likewise reused by save_data_by_split later on.
nback_lh, nback_rh = load_hemis('nback', nback_contrasts)

# Writes data/Destr_nback.csv
make_destr_rois_csv(nback_lh, nback_rh, 'nback', nback_contrasts, contrast_inds)

sst (7, 10242, 6204) (7, 10242, 6204)
saved destr rois at: data/Destr_sst.csv
nback (3, 10242, 6006) (3, 10242, 6006)
saved destr rois at: data/Destr_nback.csv


The next step is to investigate the different tasks in their own respective notebooks (nback_ml_exploration.ipynb, sst_ml_exploration.ipynb): removing severe outliers, establishing train/test splits, and then determining the two ML pipelines (a fast one for use within the search, and a best one).

In [5]:
def _read_subject_list(path):
    '''Return the whitespace-stripped lines of a one-subject-per-line file.'''
    with open(path, 'r') as f:
        return [line.strip() for line in f]


def _write_subject_list(path, subject_ids):
    '''Write the given subject ids to `path`, one per line.'''
    with open(path, 'w') as f:
        for subject in subject_ids:
            f.write(subject + '\n')


def splits(task):
    '''Build the train / val / test / full-train subject splits for a task.

    The train and test subject lists are read from
    `<task>_ml/train_subjects.txt` and `<task>_ml/test_subjects.txt`. A
    validation set (10% of the train subjects, `random_state=1`) is then
    carved out of the train set. All four subject lists are written to
    `data/<split>_subjects_<task>.txt`, and each subject is mapped to its
    position in `<task>_raw_data/subjects.txt`.

    Returns
    -------
    tuple
        `(train_subjects, val_subjects, test_subjects, full_train_subjects,
        train_inds, val_inds, test_inds, full_train_inds)`, where the
        `*_inds` lists hold, for every subject of the matching list, its
        index in the task's data subject list. `full_train_subjects` is the
        train list before the validation subjects were removed.

    Raises
    ------
    ValueError
        If a split subject does not appear in the task's data subject list.
    '''

    # Load the train test splits as established by the relevant ml_exploration scripts
    loc = task + '_ml/'

    train_subjects = _read_subject_list(loc + 'train_subjects.txt')
    test_subjects = _read_subject_list(loc + 'test_subjects.txt')

    full_train_subjects = train_subjects.copy()

    # Generate a further validation set from the train set
    train_subjects, val_subjects = train_test_split(train_subjects,
                                                    test_size=.1,
                                                    random_state=1)

    # Save subject splits
    _write_subject_list('data/train_subjects_' + task + '.txt', train_subjects)
    _write_subject_list('data/val_subjects_' + task + '.txt', val_subjects)
    _write_subject_list('data/test_subjects_' + task + '.txt', test_subjects)
    _write_subject_list('data/full_train_subjects_' + task + '.txt', full_train_subjects)

    # Generate split inds from the corresponding data subject list.
    # A dict lookup replaces a linear list.index() scan per subject; the
    # first occurrence is kept so duplicates resolve as list.index() would.
    subjects = load_data_subjects(task)

    position = {}
    for idx, subject in enumerate(subjects):
        position.setdefault(subject, idx)

    def _to_inds(subject_ids):
        inds = []
        for subject in subject_ids:
            if subject not in position:
                # Same exception type list.index() raised, with more context.
                raise ValueError(repr(subject) + ' is not in the ' + task +
                                 ' data subject list')
            inds.append(position[subject])
        return inds

    train_inds = _to_inds(train_subjects)
    val_inds = _to_inds(val_subjects)
    test_inds = _to_inds(test_subjects)
    full_train_inds = _to_inds(full_train_subjects)

    print(task + ' train size:', len(train_inds))
    print(task + ' val size:', len(val_inds))
    print(task + ' test size:', len(test_inds))
    print(task + ' full train size:', len(full_train_inds))

    return (train_subjects, val_subjects, test_subjects,
            full_train_subjects, train_inds, val_inds,
            test_inds, full_train_inds)

def save_data_by_split(task, lh, rh, splits):
    '''Pickle each hemisphere's data separately for every subject split.

    `splits[4]`, `splits[5]`, `splits[6]` and `splits[7]` are the train,
    val, test and full_train index lists. For each hemisphere and each
    split, the third axis of the hemisphere array is indexed with that
    split's indices and the result is pickled to
    `data/<split>_<hemi>_<task>.pkl`.
    '''

    split_names = ('train', 'val', 'test', 'full_train')

    for hemi_name, hemi_array in zip(('lh', 'rh'), (lh, rh)):
        # Index lists start at position 4 of the splits tuple.
        for position, split_name in enumerate(split_names, start=4):

            subset = hemi_array[:, :, splits[position]]

            save_loc = 'data/' + split_name + '_' + hemi_name + '_' + task + '.pkl'
            with open(save_loc, 'wb') as f:
                pickle.dump(subset, f)
            print('saved:', save_loc)

In [6]:
# Build the subject splits for each task. Besides returning the split tuple,
# splits() writes the data/*_subjects_<task>.txt files and prints the split sizes.
sst_splits = splits('sst')
nback_splits = splits('nback')

# Slice the hemisphere arrays by split and pickle each piece under data/.
# Requires sst_lh / sst_rh and nback_lh / nback_rh from the earlier load_hemis calls.
save_data_by_split('sst', sst_lh, sst_rh, sst_splits)
save_data_by_split('nback', nback_lh, nback_rh, nback_splits)

sst train size: 4635
sst val size: 515
sst test size: 1000
sst full train size: 5150
nback train size: 4472
nback val size: 497
nback test size: 1000
nback full train size: 4969
saved: data/train_lh_sst.pkl
saved: data/val_lh_sst.pkl
saved: data/test_lh_sst.pkl
saved: data/full_train_lh_sst.pkl
saved: data/train_rh_sst.pkl
saved: data/val_rh_sst.pkl
saved: data/test_rh_sst.pkl
saved: data/full_train_rh_sst.pkl
saved: data/train_lh_nback.pkl
saved: data/val_lh_nback.pkl
saved: data/test_lh_nback.pkl
saved: data/full_train_lh_nback.pkl
saved: data/train_rh_nback.pkl
saved: data/val_rh_nback.pkl
saved: data/test_rh_nback.pkl
saved: data/full_train_rh_nback.pkl


sst train size: 4635
sst val size: 515
sst test size: 1000
nback train size: 4472
nback val size: 497
nback test size: 1000

In [7]:
def save_targets_by_split(task, targets_df, target_cols, splits):
    '''Save the target columns for every subject split, as csv and as pickle.

    `splits[0]`, `splits[1]`, `splits[2]` and `splits[3]` are the train,
    val, test and full_train subject lists; they are used as row labels
    into `targets_df`. `target_cols` should be a list of column names.

    For each split two files are written:
    `data/<split>_targets_<task>.csv` holds the selected rows and columns,
    and `data/<split>_targets_<task>.pkl` holds a list with one numpy array
    per target column (the format used by the search).
    '''

    split_names = ('train', 'val', 'test', 'full_train')
    selected = targets_df[target_cols]

    # Resolve all four splits before writing anything, so a failing lookup
    # surfaces before any file has been touched.
    frames = [selected.loc[splits[k]] for k in range(len(split_names))]

    # Save targets csvs by split
    for split_name, frame in zip(split_names, frames):
        save_loc = 'data/' + split_name + '_targets_' + task + '.csv'
        frame.to_csv(save_loc)
        print('saved:', save_loc)

    print()

    # Save in correct formatting for usage in search
    for split_name, frame in zip(split_names, frames):
        per_target = [np.array(frame[col]) for col in target_cols]

        save_loc = 'data/' + split_name + '_targets_' + task + '.pkl'
        with open(save_loc, 'wb') as f:
            pickle.dump(per_target, f)
        print('saved:', save_loc)

    print()

In [8]:
# NOTE(review): absolute, machine-specific path - this cell will not run elsewhere
# without pointing it at a local copy of the file.
sst_targets_loc = '/mnt/Storage/To_Get/ABCD2p0NDA/abcd_sst02.txt'
# Tab-separated file; skiprows=[1] drops the second line of the file
# (presumably a column-description row - confirm). Rows are indexed by subject id
# so save_targets_by_split can select them with the subject lists.
sst_targets_df = pd.read_csv(sst_targets_loc, sep='\t', skiprows=[1], index_col='src_subject_id')
sst_target_cols = ['tfmri_sst_all_beh_total_meanrt']

# Writes data/<split>_targets_sst.csv and data/<split>_targets_sst.pkl
save_targets_by_split('sst', sst_targets_df, sst_target_cols, sst_splits)

# NOTE(review): absolute, machine-specific path (see above).
nback_targets_loc = '/home/sage/Parcel_Search/data/nBack_target_vals.csv'
nback_targets_df = pd.read_csv(nback_targets_loc, index_col='src_subject_id')
nback_target_cols = ['dprime_0back', 'dprime_2back']

# Writes data/<split>_targets_nback.csv and data/<split>_targets_nback.pkl
save_targets_by_split('nback', nback_targets_df, nback_target_cols, nback_splits)

saved: data/train_targets_sst.csv
saved: data/val_targets_sst.csv
saved: data/test_targets_sst.csv
saved: data/full_train_targets_sst.csv

saved: data/train_targets_sst.pkl
saved: data/val_targets_sst.pkl
saved: data/test_targets_sst.pkl
saved: data/full_train_targets_sst.pkl

saved: data/train_targets_nback.csv
saved: data/val_targets_nback.csv
saved: data/test_targets_nback.csv
saved: data/full_train_targets_nback.csv

saved: data/train_targets_nback.pkl
saved: data/val_targets_nback.pkl
saved: data/test_targets_nback.pkl
saved: data/full_train_targets_nback.pkl



Create the geo file

In [9]:
# Build the vertex adjacency list of the fsaverage5 left pial surface and
# pickle it to data/geo.pkl: geo[i] is the list of vertices that share a
# triangle edge with vertex i.
fs5 = datasets.fetch_surf_fsaverage(mesh='fsaverage5')

# Read the mesh file once and reuse it for both the size and the faces.
# NOTE(review): relies on load_surf_data returning the vertex coordinates at
# index 0 and the triangles at index 1 for this mesh file - confirm for the
# installed nilearn version.
pial_left = surface.load_surf_data(fs5['pial_left'])
FS_AVG5_SZ = len(pial_left[0])
faces = pial_left[1]

# Every triangle contributes its three edges to the graph.
G = nx.Graph()
for tri in faces:
    G.add_edge(tri[0], tri[1])
    G.add_edge(tri[0], tri[2])
    G.add_edge(tri[1], tri[2])

# Assumes the graph nodes are exactly 0 .. len(G) - 1, i.e. every vertex
# takes part in at least one triangle.
geo = [list(G.neighbors(i)) for i in range(len(G))]

with open('data/geo.pkl', 'wb') as f:
    pickle.dump(geo, f)

This portion of the setup is now complete.