# HCP Young Adult Data

---

### Package imports and basic functions

---

In [1]:
import os
import gc
import sys
import glob
import json
import random
import datetime
import importlib
import itertools
import numpy as np
from scipy import spatial
import scipy.sparse as sparse
import scipy.stats as stats
import pandas as pd
import nibabel as nib
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import boto3


In [2]:
class MyNumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to native Python types."""

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``.

        NumPy integers, floats and booleans become ``int``, ``float`` and
        ``bool``; arrays become (nested) lists. Anything else is handed to
        ``json.JSONEncoder.default``, which raises ``TypeError``.
        """
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # Zero-argument super(): the previous explicit form named a class that
        # does not exist, so unsupported types raised NameError, not TypeError.
        return super().default(obj)


def ensure_dir(file_name):
    """Create the parent directory of ``file_name`` if needed and return ``file_name``.

    Parameters
    ----------
    file_name : str
        Path of a file that is about to be written.

    Returns
    -------
    str
        ``file_name`` unchanged, so the call can wrap a path argument inline.
    """
    parent_dir = os.path.dirname(file_name)
    # A bare file name has no directory component; os.makedirs('') would raise
    # FileNotFoundError, so there is nothing to create in that case.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    return file_name


def list_dirs(path=None):
    """Return the paths of the immediate sub-directories of ``path``.

    Parameters
    ----------
    path : str, optional
        Directory to scan. Defaults to the current working directory at the
        time of the call.

    Returns
    -------
    list of str
        One entry per sub-directory, each prefixed with ``path``, in the order
        ``glob.glob`` yields them. Regular files are excluded.
    """
    if path is None:
        # Resolve the default here: os.getcwd() written in the signature is
        # evaluated once at definition time and ignores later directory changes.
        path = os.getcwd()
    return [
        entry
        for entry in glob.glob(os.path.join(path, '*'))
        if os.path.isdir(entry)
    ]


def file_exists(file_name, path_name=None):
    """Report whether ``file_name`` inside ``path_name`` is an existing regular file.

    Parameters
    ----------
    file_name : str
        File name, or a path that is joined onto ``path_name``.
    path_name : str, optional
        Directory to look in. Defaults to the current working directory at the
        time of the call. An empty string leaves ``file_name`` as given.

    Returns
    -------
    bool
        ``True`` only for an existing regular file; directories give ``False``.
    """
    if path_name is None:
        # Resolve the default here: os.getcwd() written in the signature is
        # evaluated once at definition time and ignores later directory changes.
        path_name = os.getcwd()
    return os.path.isfile(os.path.join(path_name, file_name))


def write_json(json_obj, file_path):
    """Write ``json_obj`` to ``file_path`` as key-sorted JSON indented by four spaces.

    Values the standard encoder rejects are passed through ``MyNumpyEncoder``.
    The same ``json_obj`` is returned to the caller.
    """
    with open(file_path, 'w') as handle:
        dump_options = dict(sort_keys=True, indent=4, cls=MyNumpyEncoder)
        json.dump(json_obj, handle, **dump_options)
    return json_obj


def write_np(np_obj, file_path):
    """Save ``np_obj`` to ``file_path`` with ``np.save``.

    The file is opened here in binary write mode and the open handle is passed
    to ``np.save``, so ``file_path`` is used exactly as given. Returns ``None``.
    """
    with open(file_path, mode='wb') as handle:
        np.save(handle, np_obj)


In [3]:
# path setting
# Project root, resolved as three directory levels above the current working
# directory; the export cells write under f'{main_dir}/data/...'.
main_dir = os.path.abspath('../../..')


## Downloading data from HCP bucket storage

---

In [None]:
# Left commented out; uncomment to (re)download the data.
# The session is created from the AWS credentials profile named 'hcp'.
# session = boto3.session.Session(profile_name='hcp')
# s3 = session.client('s3')


In [None]:
# paginator = s3.get_paginator('list_objects_v2')

# # one subject ID per top-level "directory" prefix under HCP_1200/ in the bucket
# subjects = [cur["Prefix"].split('/')[1] for page in paginator.paginate(Bucket='hcp-openaccess', Prefix="HCP_1200/", Delimiter="/") for cur in page.get("CommonPrefixes", [])]


In [None]:
# len(subjects)


In [None]:
# # download all MSMAll thickness values
# # NOTE(review): the bare `except:` below also catches KeyboardInterrupt and
# # SystemExit, so the loop cannot be interrupted and every failure is reported
# # the same way; prefer `except Exception as err:` if this cell is re-enabled.
# for subject in subjects:
#     try:
#         s3.download_file(
#             'hcp-openaccess',
#             f'HCP_1200/{subject}/MNINonLinear/fsaverage_LR32k/{subject}.thickness_MSMAll.32k_fs_LR.dscalar.nii',
#             ensure_dir(f'/mountpoint/data/HCP_1200/{subject}/MNINonLinear/fsaverage_LR32k/{subject}.thickness_MSMAll.32k_fs_LR.dscalar.nii'),
#         )
#     except:
#         print(f'NOTE: {subject} download failed')


## Extracting data

---

In [4]:
# Root of the local HCP-YA copy; every sub-directory directly under it is
# treated as one subject, named after the directory.
hcpya_dir = '/mountpoint/data/HCP_1200/'
# os.path.basename instead of splitting on '/': the directory name is taken
# with whatever path separator the platform's glob returns.
hcpya_subjects = [os.path.basename(subject_dir) for subject_dir in list_dirs(hcpya_dir)]
len(hcpya_subjects)


1096

In [5]:
# Keep only the subjects whose MSMAll thickness dscalar file is present on disk.
hcpya_valid_subjects = [
    subject
    for subject in hcpya_subjects
    if os.path.isfile(
        f'{hcpya_dir}/{subject}/MNINonLinear/fsaverage_LR32k/{subject}.thickness_MSMAll.32k_fs_LR.dscalar.nii'
    )
]
len(hcpya_valid_subjects)


1096

In [6]:
# ignore warning
# 40 is the numeric value of logging.ERROR: nibabel's logger then drops
# messages below that level, such as the repeated
# "pixdim[1,2,3] should be non-zero" notice emitted while loading the files.
nib.imageglobals.logger.setLevel(40)


In [6]:
%%time
# Pre-allocate a (subjects x values) matrix, then fill one row per valid
# subject with the flattened contents of that subject's thickness file.
# NOTE(review): 59412 is presumably the number of cortical grayordinates in a
# 32k fs_LR dscalar file — confirm against the data.
hcpya_data = np.zeros(shape=(len(hcpya_valid_subjects), 59412))

for idx, subject in enumerate(hcpya_valid_subjects):
    thickness_img = nib.load(f'{hcpya_dir}/{subject}/MNINonLinear/fsaverage_LR32k/{subject}.thickness_MSMAll.32k_fs_LR.dscalar.nii')
    hcpya_data[idx, :] = thickness_img.get_fdata().ravel()


pixdim[1,2,3] should be non-zero; setting 0 dims to 1
pixdim[1,2,3] should be non-zero; setting 0 dims to 1
pixdim[1,2,3] should be non-zero; setting 0 dims to 1
pixdim[1,2,3] should be non-zero; setting 0 dims to 1
pixdim[1,2,3] should be non-zero; setting 0 dims to 1
pixdim[1,2,3] should be non-zero; setting 0 dims to 1
pixdim[1,2,3] should be non-zero; setting 0 dims to 1
pixdim[1,2,3] should be non-zero; setting 0 dims to 1
pixdim[1,2,3] should be non-zero; setting 0 dims to 1
pixdim[1,2,3] should be non-zero; setting 0 dims to 1
pixdim[1,2,3] should be non-zero; setting 0 dims to 1
pixdim[1,2,3] should be non-zero; setting 0 dims to 1
pixdim[1,2,3] should be non-zero; setting 0 dims to 1
pixdim[1,2,3] should be non-zero; setting 0 dims to 1
pixdim[1,2,3] should be non-zero; setting 0 dims to 1
pixdim[1,2,3] should be non-zero; setting 0 dims to 1
pixdim[1,2,3] should be non-zero; setting 0 dims to 1
pixdim[1,2,3] should be non-zero; setting 0 dims to 1
pixdim[1,2,3] should be non-

CPU times: user 2min 53s, sys: 5.48 s, total: 2min 59s
Wall time: 3min 44s


In [8]:
# Subject-level tables stored next to the imaging data. Both are matched to
# subjects through their 'Subject' column; the following cells read
# 'Age_in_Yrs' from `restricted` and 'Gender' from `unrestricted`.
unrestricted = pd.read_csv(f'{hcpya_dir}/unrestricted_sination_5_23_2019_21_35_35.csv')
restricted = pd.read_csv(f'{hcpya_dir}/0.RESTRICTED_sination_5_23_2019_21_32_45.csv')


In [9]:
# Age in years for every valid subject, in the same order as
# hcpya_valid_subjects.
# The table is indexed by 'Subject' once and looked up by label, instead of
# scanning the whole frame per subject and calling float() on a one-element
# Series (deprecated in recent pandas releases).
# verify_integrity=True raises ValueError on duplicated subject IDs, and the
# label lookup raises KeyError for a subject missing from the table.
hcpya_ages = (
    restricted.set_index('Subject', verify_integrity=True)
    .loc[[int(subject) for subject in hcpya_valid_subjects], 'Age_in_Yrs']
    .to_numpy(dtype=float)
)


In [10]:
# Sex encoded numerically (M -> 0, F -> 1) for every valid subject, in the same
# order as hcpya_valid_subjects.
# The table is indexed by 'Subject' once and looked up by label, instead of
# scanning the whole frame per subject and calling float() on a one-element
# Series (deprecated in recent pandas releases). An explicit map replaces
# `replace`, whose implicit downcasting is also deprecated.
# verify_integrity=True raises ValueError on duplicated subject IDs, and the
# label lookup raises KeyError for a subject missing from the table.
sex_codes = {'M': 0.0, 'F': 1.0}
gender_labels = unrestricted.set_index('Subject', verify_integrity=True).loc[
    [int(subject) for subject in hcpya_valid_subjects], 'Gender'
]
# `map` would silently turn an unknown label into NaN, so reject it explicitly;
# genuinely missing values still come through as NaN.
unexpected_labels = set(gender_labels.dropna().unique()) - set(sex_codes)
if unexpected_labels:
    raise ValueError(f'unexpected Gender labels: {sorted(map(str, unexpected_labels))}')
hcpya_genders = gender_labels.map(sex_codes).to_numpy(dtype=float)


In [11]:
# Subject identifiers as strings, read back from the unrestricted table; a
# subject with no matching row raises IndexError on the [0] lookup.
hcpya_ids = np.array(
    [
        str(unrestricted.loc[unrestricted['Subject'] == int(subject), 'Subject'].to_numpy()[0])
        for subject in hcpya_valid_subjects
    ]
)


## Storing cleaned data

---

In [12]:
# Per-subject summary table: age, thickness averaged over all values of the
# subject's row, sex code, subject ID and dataset label — written out as CSV.
dataset_name = 'HCP-YA'
hcpya_mean_thickness = hcpya_data.mean(axis=1)
hcpya_df = pd.DataFrame(
    {
        'age': hcpya_ages,
        'thickness': hcpya_mean_thickness,
        'sex': hcpya_genders,
        'subject_ID': hcpya_ids,
    }
)
hcpya_df['dataset'] = dataset_name
hcpya_df.to_csv(ensure_dir(f'{main_dir}/data/csv/demography_{dataset_name}.csv'))


In [13]:
# Store the list of valid subject names as JSON. The trailing ';' stops the
# notebook from echoing the list that write_json returns.
valid_subjects_json = ensure_dir(f'{main_dir}/data/json/valid_subjects_{dataset_name}.json')
write_json(hcpya_valid_subjects, valid_subjects_json);


In [14]:
# Store the full subjects-by-values thickness matrix in .npy format.
thickness_npy = ensure_dir(f'{main_dir}/data/npy/thickness_{dataset_name}.npy')
write_np(hcpya_data, thickness_npy);


In [15]:
# Preview the first rows of the summary table as a sanity check.
hcpya_df.head()


Unnamed: 0,age,thickness,sex,subject_ID,dataset
0,32.0,2.636395,1.0,130518,HCP-YA
1,35.0,2.63385,1.0,192237,HCP-YA
2,31.0,2.704905,1.0,859671,HCP-YA
3,33.0,2.629196,0.0,158136,HCP-YA
4,22.0,2.760889,0.0,192136,HCP-YA
