# HCP Development Data

---

### package imports and basic functions

---

In [1]:
import os
import gc
import sys
import glob
import json
import random
import datetime
import importlib
import itertools
import numpy as np
from scipy import spatial
import scipy.sparse as sparse
import scipy.stats as stats
import pandas as pd
import nibabel as nib
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import boto3


In [2]:
class MyNumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types.

    Pass it as ``cls=MyNumpyEncoder`` to ``json.dump`` / ``json.dumps``.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        Args:
            obj: Object the standard encoder could not serialize.

        Returns:
            ``int`` for NumPy integers, ``float`` for NumPy floats, ``bool``
            for NumPy booleans and a (nested) ``list`` for NumPy arrays.

        Raises:
            TypeError: Raised by the base class for any other type.
        """
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # Defer to the base class so unsupported types raise the standard
        # TypeError; zero-argument super() cannot name the wrong class.
        return super().default(obj)


def ensure_dir(file_name):
    """Create the parent directory of ``file_name`` if needed.

    Args:
        file_name: Path of a file that is about to be written.

    Returns:
        ``file_name`` unchanged, so the call can wrap a path inline.
    """
    parent_dir = os.path.dirname(file_name)
    # A bare file name has no directory part, and os.makedirs('') would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    return file_name


def list_dirs(path=None):
    """Return the sub-directories found directly inside ``path``.

    Entries are matched with the ``*`` glob pattern, so names starting with
    a dot are not included.

    Args:
        path: Directory to scan. Defaults to the working directory at the
            time of the call.

    Returns:
        Sorted list of directory paths, each prefixed with ``path``.
    """
    if path is None:
        # Resolved per call: a default of os.getcwd() in the signature would
        # be frozen at definition time.
        path = os.getcwd()
    # Escape the directory part so characters such as '[' in it are literal.
    pattern = os.path.join(glob.escape(path), '*')
    # Sorted so the listing is the same on every run and file system.
    return sorted(entry for entry in glob.glob(pattern) if os.path.isdir(entry))


def file_exists(file_name, path_name=None):
    """Tell whether ``file_name`` inside ``path_name`` is an existing file.

    Args:
        file_name: File name, or a path relative to ``path_name``.
        path_name: Directory the name is joined to. Defaults to the working
            directory at the time of the call; pass ``''`` to test
            ``file_name`` exactly as given.

    Returns:
        ``True`` if the joined path is a regular file, ``False`` otherwise.
    """
    if path_name is None:
        # Resolved per call: a default of os.getcwd() in the signature would
        # be frozen at definition time.
        path_name = os.getcwd()
    return os.path.isfile(os.path.join(path_name, file_name))


def write_json(json_obj, file_path):
    """Write ``json_obj`` to ``file_path`` as indented, key-sorted JSON.

    NumPy values are converted through ``MyNumpyEncoder``.

    Args:
        json_obj: Object to serialize.
        file_path: Destination path, opened in text write mode.

    Returns:
        ``json_obj`` unchanged.
    """
    with open(file_path, 'w') as handle:
        json.dump(
            obj=json_obj,
            fp=handle,
            cls=MyNumpyEncoder,
            indent=4,
            sort_keys=True,
        )
    return json_obj


def write_np(np_obj, file_path):
    """Save ``np_obj`` to ``file_path`` in NumPy's ``.npy`` format.

    The array is written through an open binary file handle, so the data
    goes to ``file_path`` exactly as given.

    Args:
        np_obj: Array (or array-like) to store.
        file_path: Destination path, opened in binary write mode.
    """
    handle = open(file_path, 'wb')
    try:
        np.save(handle, np_obj)
    finally:
        handle.close()


In [3]:
# Base folder for all outputs: three levels above the current working
# directory (presumably the project root — confirm against the repo layout).
main_dir = os.path.abspath(os.path.join(os.pardir, os.pardir, os.pardir))


## Downloading data from NDA s3 bucket storage

---

In [None]:
# downloaded by:
# downloadcmd -dp 1205192 -u 'ndehestanikolag' -d AllHCPdataAgingDevelop -wt 8


In [None]:
%%bash

# Build the list of s3 links to download.
# Steps: keep manifest lines mentioning PreprocStrucRecommended, take the 6th
# tab-separated field, strip double quotes, then keep only the HCD entries for
# the MSMAll thickness dscalar file under MNINonLinear/fsaverage_LR32k.
# grep -F matches each pattern as a literal string, so the dots in the file
# name are not treated as regex wildcards.
grep -F PreprocStrucRecommended /mountpoint/data/NDA/AllHCPdataAgingDevelop/datastructure_manifest.txt \
| cut -f6 \
| tr -d '"' \
| grep -F MNINonLinear/fsaverage_LR32k \
| grep -F thickness_MSMAll.32k_fs_LR.dscalar.nii \
| grep -F HCD \
> /mountpoint/data/NDA/HCPD_thickness_s3links.txt


In [None]:
%%bash

# Make the folder the data is stored in.
# -p: succeed when the folder already exists, so the cell can be re-run.
mkdir -p /mountpoint/data/HCP_Development


In [None]:
%%bash

# Download the related files:
# cd /mountpoint/data/
# downloadcmd -dp 1205202 -t /home/sina/Documents/Research/Codes/NDA/HCPD_thickness_s3links.txt  -u 'ndehestanikolag' -d HCP_Development -wt 8

# other details in: /mountpoint/code/environments/venv_3.8.10/lib/python3.8/site-packages/NDATools/clientscripts/config/settings.cfg


## Extracting data

---

In [4]:
# Folder holding the downloaded HCP-D data; each sub-directory name is used
# as a subject label.
hcpd_dir = '/mountpoint/data/HCP_Development/fmriresults01'
# os.path.basename applies the platform's separator rules, matching the
# os.path.join call that list_dirs uses to build these paths.
hcpd_subjects = [os.path.basename(subject_dir) for subject_dir in list_dirs(hcpd_dir)]
len(hcpd_subjects)


652

In [5]:
# Keep only the subjects whose MSMAll thickness dscalar file exists on disk.
hcpd_valid_subjects = []
for subject in hcpd_subjects:
    thickness_file = f'{hcpd_dir}/{subject}/MNINonLinear/fsaverage_LR32k/{subject}.thickness_MSMAll.32k_fs_LR.dscalar.nii'
    # The empty second argument makes file_exists test the path as given.
    if file_exists(thickness_file, ''):
        hcpd_valid_subjects.append(subject)
len(hcpd_valid_subjects)


652

In [6]:
# Silence nibabel messages below ERROR severity while loading the images.
# 40 is the numeric value of logging.ERROR in the standard library
# (assumes nib.imageglobals.logger is a standard logging.Logger — TODO confirm).
nib.imageglobals.logger.setLevel(40)


In [7]:
%%time
# Pre-allocate one row per valid subject. 59412 is the hard-coded row width
# and must equal the number of values in each flattened dscalar file.
n_subjects = len(hcpd_valid_subjects)
hcpd_data = np.zeros((n_subjects, 59412))

# Load each subject's thickness file and copy its flattened values into the
# subject's row.
for idx, subject in enumerate(hcpd_valid_subjects):
    dscalar_path = f'{hcpd_dir}/{subject}/MNINonLinear/fsaverage_LR32k/{subject}.thickness_MSMAll.32k_fs_LR.dscalar.nii'
    hcpd_data[idx] = np.ravel(nib.load(dscalar_path).get_fdata())


CPU times: user 2min 3s, sys: 7.41 s, total: 2min 10s
Wall time: 2min 10s


In [8]:
# Load the tab-separated fmriresults01 table. Line 0 supplies the column
# names; line 1 is skipped (presumably a second, descriptive header row —
# TODO confirm against the file).
hcpd_demography = pd.read_csv(
    '/mountpoint/data/NDA/AllHCPdataAgingDevelop/fmriresults01.txt',
    sep='\t',
    header=0,
    skiprows=[1],
)


In [9]:
# Subject folder names are matched to src_subject_id after dropping their
# last 6 characters (assumes every folder name carries such a suffix —
# TODO confirm).
hcpd_subject_ids = [subject[:-6] for subject in hcpd_valid_subjects]

# Index the table once, keeping the first row of each src_subject_id, instead
# of scanning the whole table for every subject.
hcpd_age_by_subject = (
    hcpd_demography
    .drop_duplicates(subset='src_subject_id', keep='first')
    .set_index('src_subject_id')['interview_age']
)

# Report subjects without a demography row explicitly.
hcpd_missing_ids = [
    subject_id for subject_id in hcpd_subject_ids
    if subject_id not in hcpd_age_by_subject.index
]
if hcpd_missing_ids:
    raise KeyError(f'subjects missing from the demography table: {hcpd_missing_ids}')

# interview_age is divided by 12 (months -> years, assuming the table reports
# age in months — TODO confirm).
hcpd_ages = hcpd_age_by_subject.loc[hcpd_subject_ids].to_numpy(dtype=float) / 12


In [10]:
# Numeric coding of the 'sex' column.
gender_dict = {'M': 0, 'F': 1}

# Subject folder names are matched to src_subject_id after dropping their
# last 6 characters (assumes every folder name carries such a suffix —
# TODO confirm).
hcpd_subject_ids = [subject[:-6] for subject in hcpd_valid_subjects]

# Index the table once, keeping the first row of each src_subject_id, instead
# of scanning the whole table for every subject.
hcpd_sex_by_subject = (
    hcpd_demography
    .drop_duplicates(subset='src_subject_id', keep='first')
    .set_index('src_subject_id')['sex']
)

# Report subjects without a demography row explicitly.
hcpd_missing_ids = [
    subject_id for subject_id in hcpd_subject_ids
    if subject_id not in hcpd_sex_by_subject.index
]
if hcpd_missing_ids:
    raise KeyError(f'subjects missing from the demography table: {hcpd_missing_ids}')

hcpd_sex_labels = hcpd_sex_by_subject.loc[hcpd_subject_ids]

# Fail on labels that have no code rather than silently mapping them to NaN.
hcpd_unknown_labels = [
    label for label in hcpd_sex_labels.unique() if label not in gender_dict
]
if hcpd_unknown_labels:
    raise KeyError(f'unexpected values in the sex column: {hcpd_unknown_labels}')

hcpd_genders = hcpd_sex_labels.map(gender_dict).to_numpy(dtype=float)


## Storing cleaned data

---

In [11]:
# Label attached to every row and used in the output file names.
dataset_name = 'HCP-D'

# One row per subject: age, mean of that subject's row of hcpd_data, sex code
# and the dataset label; the table is written out as csv.
hcpd_mean_thickness = hcpd_data.mean(axis=1)
hcpd_df = pd.DataFrame(
    {'age': hcpd_ages, 'thickness': hcpd_mean_thickness, 'sex': hcpd_genders}
).assign(dataset=dataset_name)

demography_csv_path = ensure_dir(f'{main_dir}/data/csv/demography_{dataset_name}.csv')
hcpd_df.to_csv(demography_csv_path)


In [12]:
# Store the list of valid subject names as json; the trailing ';' hides the
# value returned by write_json.
valid_subjects_json_path = ensure_dir(f'{main_dir}/data/json/valid_subjects_{dataset_name}.json')
write_json(hcpd_valid_subjects, valid_subjects_json_path);


In [13]:
# Store the full subjects-by-values thickness array as npy.
thickness_npy_path = ensure_dir(f'{main_dir}/data/npy/thickness_{dataset_name}.npy')
write_np(hcpd_data, thickness_npy_path);
