In [1]:
import cloudknot as ck
import itertools
import numpy as np
import pandas as pd

In [2]:
# Dataset prefixes to convert. Each entry is passed to create_tfrecs as
# `s3_input_dir`, where it is read from the hbn-pod2-deep-learning bucket.
input_dirs = ["b0-tensorfa-dwiqc"]

In [3]:
# Subjects with expert ratings. Their IDs get the "sub-" prefix so they can be
# matched against the subject index built inside create_tfrecs, where they are
# held out as the "report" set.
gold_standard = pd.read_csv("expert-ratings/expert_ratings.csv")
report_subs = [
    "sub-" + subject_id
    for subject_id in gold_standard["subject"]
]
len(report_subs)

200

In [14]:
def create_tfrecs(s3_input_dir, seed, report_subs):
    """Convert one S3 dataset of NIfTI volumes into TFRecord shards and upload them.

    The volumes under ``hbn-pod2-deep-learning/{s3_input_dir}`` are downloaded,
    joined with the QC ratings from the HBN ``participants.tsv`` and divided
    into four sets: the subjects listed in ``report_subs`` form the "report"
    set; the remaining subjects are shuffled with ``seed`` and split into
    train (8/10), validate (1/10) and test (the remainder). Each set is
    written as TFRecord shards of 20 examples under ``tfrecs/seed_{seed}`` and
    the local ``tfrecs`` directory is then uploaded to S3.

    Parameters
    ----------
    s3_input_dir : str
        Dataset prefix inside the ``hbn-pod2-deep-learning`` bucket. Must be
        one of ``"b0-colorfa-rgb"``, ``"combined"`` or ``"b0-tensorfa-dwiqc"``.
    seed : int
        Seed for the shuffle of the train/validate/test subjects; it is also
        part of the local and remote output directory names.
    report_subs : list of str
        Subject IDs (index labels such as ``"sub-..."``) to hold out of the
        train/validate/test split and write as the "report" set.

    Raises
    ------
    KeyError
        If ``s3_input_dir`` is not one of the known datasets. This is raised
        before anything is imported, downloaded or written.
    AssertionError
        If ``nobrainer.io.verify_features_labels`` reports invalid entries.
    """
    # Resolve the per-dataset settings first: an unknown ``s3_input_dir`` then
    # fails immediately instead of after the S3 download (channel count) or
    # after all shards have been written (output prefix).
    n_channels = {
        "b0-colorfa-rgb": 3,
        "combined": 4,
        "b0-tensorfa-dwiqc": 5,
    }[s3_input_dir]
    output_s3_dir = {
        "b0-colorfa-rgb": "tfrecs/b0-colorfa-rgb",
        "combined": "tfrecs/b0-colorfa-4channel",
        "b0-tensorfa-dwiqc": "tfrecs/b0-tensorfa-dwiqc",
    }[s3_input_dir]

    # The imports stay inside the function on purpose: it is handed to
    # ck.DockerImage(func=...) and executed remotely, so it has to be
    # self-contained.
    import nobrainer
    import numpy as np
    import os
    import os.path as op
    import pandas as pd
    import re

    from glob import glob
    from s3fs import S3FileSystem

    bucket = "hbn-pod2-deep-learning"

    # Download the QC scores from S3 FCP-INDI
    df_qc = pd.read_csv(
        "s3://fcp-indi/data/Projects/HBN/BIDS_curated/derivatives/qsiprep/participants.tsv",
        sep="\t",
        index_col="subject_id"
    )

    # Download nifti files from S3 to local
    local_nifti_dir = "niftis"
    local_tfrec_dir = "tfrecs"
    os.makedirs(local_nifti_dir, exist_ok=True)
    os.makedirs(local_tfrec_dir, exist_ok=True)

    fs = S3FileSystem()
    fs.get(f"{bucket}/{s3_input_dir}", local_nifti_dir, recursive=True)

    # Filter and parse on the file name only, so that nothing in the absolute
    # path of the working directory (e.g. a parent folder containing "sub-")
    # can be mistaken for part of the file name.
    nifti_files = [
        op.abspath(filename)
        for filename in glob(f"{local_nifti_dir}/*.nii.gz")
        if "irregularsize" not in op.basename(filename)
    ]
    sub_id_pattern = re.compile("sub-[a-zA-Z0-9]*")
    subjects = [
        sub_id_pattern.search(op.basename(filename)).group(0)
        for filename in nifti_files
    ]

    # One row per volume: the file path ("features") joined with its QC rating
    # ("labels"). Rows with any missing value are dropped.
    df_nifti = (
        pd.DataFrame(data=nifti_files, index=subjects, columns=["features"])
        .merge(df_qc, left_index=True, right_index=True, how="left")
        .drop(columns="scan_site_id")
        .rename(columns={"fibr + qsiprep rating": "labels"})
        .dropna()
    )

    df_report = df_nifti.filter(report_subs, axis="index")
    df_train_validate_test = df_nifti.drop(report_subs, axis="index", errors="ignore")

    filepaths = list(df_nifti.itertuples(index=False, name=None))
    report_filepaths = list(df_report.itertuples(index=False, name=None))
    train_validate_test_filepaths = list(
        df_train_validate_test.itertuples(index=False, name=None)
    )

    # Verify that all volumes have the same shape
    invalid = nobrainer.io.verify_features_labels(
        filepaths, volume_shape=(128, 128, 128, n_channels),
        check_labels_int=False
    )
    print("Invalid:", invalid)
    assert not invalid

    # Split boundaries: 8/10 train, 1/10 validate, whatever is left is test.
    n_files = len(train_validate_test_filepaths)
    n_train_stop = n_files * 8 // 10
    n_val_stop = n_train_stop + n_files // 10

    # Shuffle with the given seed. The call is kept exactly as-is so that a
    # given seed keeps producing the same split.
    rng = np.random.default_rng(seed=seed)
    shuffled_paths = list(rng.permutation(train_validate_test_filepaths))

    splits = {
        "train": shuffled_paths[:n_train_stop],
        "validate": shuffled_paths[n_train_stop:n_val_stop],
        "test": shuffled_paths[n_val_stop:],
        "report": report_filepaths,
    }

    seed_dir = f"{local_tfrec_dir}/seed_{seed}"
    os.makedirs(seed_dir, exist_ok=True)

    for split_name, split_paths in splits.items():
        nobrainer.tfrecord.write(
            features_labels=split_paths,
            # "{shard:03d}" is left unformatted here; it is part of the
            # template that nobrainer receives.
            filename_template=f"{seed_dir}/data-{split_name}_shard-" + "{shard:03d}.tfrec",
            examples_per_shard=20
        )

    # NOTE(review): the whole local "tfrecs" directory (which already contains
    # a "seed_{seed}" sub-directory) is uploaded to a remote prefix that also
    # ends in "seed_{seed}", so the remote layout presumably nests the seed
    # directory twice. Left unchanged because downstream readers may rely on
    # the existing layout; confirm before changing.
    fs.put(
        local_tfrec_dir,
        f"{bucket}/{output_s3_dir}/seed_{seed}",
        recursive=True
    )

In [15]:
# Wrap create_tfrecs in a Docker image based on python:3.8. nobrainer is
# installed from the "enh/four-d" branch of the richford fork on GitHub.
# NOTE(review): this is a branch, not a pinned commit, so rebuilding later may
# pick up different code.
di = ck.DockerImage(
    func=create_tfrecs,
    base_image="python:3.8",
    github_installs="https://github.com/richford/nobrainer.git@enh/four-d",
    overwrite=True
)



In [16]:
# Build the Docker image and give it a dated tag.
di.build(tags=["hbn-pod2-tfrecs-20210824"])

In [17]:
# Repository named by cloudknot's ECR repo setting; the image is pushed to it.
repo = ck.aws.DockerRepo(name=ck.get_ecr_repo())

In [18]:
# Push the built image to the repository so that the jobs can pull it.
# The very first time you run this, this command could take a few minutes
di.push(repo=repo)

In [10]:
# Specify bid_percentage to use Spot instances.
# Make sure the volume size is large enough: 55-60 GB seemed about right for
# HBN preprocessing, and volume_size=100 is requested here. YMMV.
# pars_policies grants the jobs S3 access, which create_tfrecs needs for its
# download and upload.
knot = ck.Knot(
    name="hbn-pod2-tfrecs-20210824-0",
    docker_image=di,
    pars_policies=("AmazonS3FullAccess",),
    bid_percentage=100,
    memory=8000,
    job_def_vcpus=8,
    volume_size=100,
    max_vcpus=64,
    retries=1,
    aws_resource_tags={"Project": "HBN-FCP-INDI"},
)

In [11]:
# One argument tuple per (input directory, seed) pair, in the positional order
# of create_tfrecs: (s3_input_dir, seed, report_subs). Every tuple carries the
# same list of held-out report subjects.
seeds = np.arange(10)
args = [
    (input_dir, seed, report_subs)
    for input_dir, seed in itertools.product(input_dirs, seeds)
]
args

[('b0-tensorfa-dwiqc',
  0,
  ['sub-NDARAC857HDB',
   'sub-NDARAH304ED7',
   'sub-NDARAJ689BVN',
   'sub-NDARAP785CTE',
   'sub-NDARAU530GLJ',
   'sub-NDARAV187GJ5',
   'sub-NDARAX272ZJL',
   'sub-NDARAX277ATU',
   'sub-NDARBE719PMB',
   'sub-NDARBG574KF4',
   'sub-NDARBN620TT7',
   'sub-NDARBU183TDJ',
   'sub-NDARBU928LV0',
   'sub-NDARBV577EE0',
   'sub-NDARBW026UGE',
   'sub-NDARBW268XPY',
   'sub-NDARCD182XT1',
   'sub-NDARCG785NND',
   'sub-NDARCJ330NBP',
   'sub-NDARCK815WZR',
   'sub-NDARCU736GZ1',
   'sub-NDARCU865PBV',
   'sub-NDARCW946WNE',
   'sub-NDARDA472JE3',
   'sub-NDARDE319VD1',
   'sub-NDARDE877RFH',
   'sub-NDARDJ204EPU',
   'sub-NDARDK794WV3',
   'sub-NDARDL291DN4',
   'sub-NDARDR591AUC',
   'sub-NDARDR950CGG',
   'sub-NDARDT499DWP',
   'sub-NDARDU853XZ6',
   'sub-NDARDW178AC6',
   'sub-NDARDW416KWZ',
   'sub-NDARDX857DLB',
   'sub-NDARDY741VEB',
   'sub-NDARDZ425JVB',
   'sub-NDAREC277JCP',
   'sub-NDAREC648WEL',
   'sub-NDAREE015WGU',
   'sub-NDAREF624KJN',
   'su

In [12]:
# Number of argument tuples: one per (input directory, seed) pair.
len(args)

10

In [19]:
# Submit one job per entry of `args`; starmap=True unpacks each tuple into the
# positional arguments of create_tfrecs.
results = knot.map(args, starmap=True)

In [20]:
# List the submitted jobs with their current status.
knot.view_jobs()

Job ID              Name                        Status   
---------------------------------------------------------
dc5c47af-7c28-47e4-baf1-233ce2116a9f        hbn-pod2-tfrecs-20210824-0-0        FAILED   
d8dbee14-1455-48f7-9cfd-42a23ba043b1        hbn-pod2-tfrecs-20210824-0-1        SUCCEEDED


In [21]:
# Tear down the knot's AWS resources; clobber_pars=True also removes its PARS.
# NOTE(review): check job statuses (and logs of any FAILED job) before running
# this — presumably they are harder to inspect once the resources are gone.
knot.clobber(clobber_pars=True)