In [1]:
import cloudknot as ck
import itertools
import numpy as np
import pandas as pd

In [2]:
# Directories inside the hbn-pod2-deep-learning bucket to convert.
# This list is what `knot.map` is called with further down.
input_dirs = [
    "b0-tensorfa-dwiqc",
]

In [3]:
def create_tfrecs(s3_input_dir):
    """Convert one S3 directory of NIfTI volumes into TFRecord shards on S3.

    The function reads the QC ratings table, downloads the NIfTI files found
    under ``hbn-pod2-deep-learning/{s3_input_dir}``, pairs every file with the
    rating of its subject, verifies the volumes, writes TFRecord shards plus a
    ``filepaths.csv`` manifest into a local directory, and uploads that
    directory back to the bucket.

    Parameters
    ----------
    s3_input_dir : str
        Name of the input directory inside the ``hbn-pod2-deep-learning``
        bucket. Must be one of ``"b0-colorfa-rgb"``, ``"combined"`` or
        ``"b0-tensorfa-dwiqc"``.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``s3_input_dir`` is not one of the known directories. This is
        raised before anything is downloaded.
    FileNotFoundError
        If no ``*.nii.gz`` file is found in the local download directory.
    ValueError
        If a NIfTI file name contains no ``sub-...`` identifier.
    AssertionError
        If ``nobrainer.io.verify_features_labels`` reports invalid entries.
    """
    # All imports stay inside the function because the function object itself
    # is handed to ``ck.DockerImage(func=create_tfrecs)``.
    # NOTE(review): numpy is not used below; it is kept in case the image's
    # requirements are derived from these import lines -- confirm.
    import nobrainer
    import numpy as np
    import os
    import os.path as op
    import pandas as pd
    import re

    from glob import glob
    from s3fs import S3FileSystem

    bucket = "hbn-pod2-deep-learning"

    # Number of channels (last volume axis) expected for each input directory.
    n_channels = {
        "b0-colorfa-rgb": 3,
        "combined": 4,
        "b0-tensorfa-dwiqc": 5,
    }

    # Where the shards for each input directory are uploaded.
    output_s3_dirs = {
        "b0-colorfa-rgb": "tfrecs/b0-colorfa-rgb-nosplit",
        "combined": "tfrecs/b0-colorfa-4channel-nosplit",
        "b0-tensorfa-dwiqc": "tfrecs/b0-tensorfa-dwiqc-nosplit"
    }

    # Resolve both per-directory settings up front so that an unknown
    # directory name fails with KeyError before any download or conversion.
    volume_shape = (128, 128, 128, n_channels[s3_input_dir])
    s3_output_dir = f"{bucket}/{output_s3_dirs[s3_input_dir]}"

    # Download the QC scores from S3 FCP-INDI
    df_qc = pd.read_csv(
        "s3://fcp-indi/data/Projects/HBN/BIDS_curated/derivatives/qsiprep/participants.tsv",
        sep="\t",
        index_col="subject_id"
    )

    # Download nifti files from S3 to local
    local_nifti_dir = "niftis"
    local_tfrec_dir = "tfrecs"
    os.makedirs(local_nifti_dir, exist_ok=True)
    os.makedirs(local_tfrec_dir, exist_ok=True)

    fs = S3FileSystem()
    # NOTE(review): depending on the s3fs/fsspec version, a recursive get into
    # an already existing directory may place the files under
    # niftis/<s3_input_dir>/ instead of niftis/. The glob below only looks at
    # the top level -- confirm against the pinned s3fs version.
    fs.get(f"{bucket}/{s3_input_dir}", local_nifti_dir, recursive=True)

    # Sort for a reproducible shard layout (glob order is filesystem
    # dependent) and filter on the file name only, so that the working
    # directory's own path can never influence which files are kept.
    nifti_files = sorted(
        op.abspath(filename)
        for filename in glob(f"{local_nifti_dir}/*.nii.gz")
        if "irregularsize" not in op.basename(filename)
    )
    if not nifti_files:
        raise FileNotFoundError(
            f"No .nii.gz files found in {op.abspath(local_nifti_dir)} after "
            f"downloading {bucket}/{s3_input_dir}"
        )

    # Extract the subject ID from the file name (not the absolute path, which
    # could contain an unrelated "sub-" component).
    sub_id_pattern = re.compile("sub-[a-zA-Z0-9]*")
    subjects = []
    for nifti_file in nifti_files:
        match = sub_id_pattern.search(op.basename(nifti_file))
        if match is None:
            raise ValueError(f"No subject ID found in file name: {nifti_file}")
        subjects.append(match.group(0))

    # One row per NIfTI file, indexed by subject ID, joined to its QC rating.
    # NOTE(review): the left join keeps files whose subject has no row in the
    # QC table, which leaves a missing label for them -- confirm this is
    # intended.
    df_nifti = (
        pd.DataFrame(data=nifti_files, index=subjects, columns=["features"])
        .merge(df_qc, left_index=True, right_index=True, how="left")
        .drop("scan_site_id", axis="columns")
        .rename(columns={"fibr + qsiprep rating": "labels"})
    )

    # Select the two columns explicitly so every entry is a (feature, label)
    # pair regardless of which other columns the QC table carries.
    filepaths = list(
        df_nifti[["features", "labels"]].itertuples(index=False, name=None)
    )

    # Verify that all volumes have the same shape
    invalid = nobrainer.io.verify_features_labels(
        filepaths,
        volume_shape=volume_shape,
        check_labels_int=False,
        check_labels_gte_zero=False,
    )
    print("Invalid:", invalid)
    assert not invalid, "verify_features_labels reported invalid entries"

    nobrainer.tfrecord.write(
        features_labels=filepaths,
        filename_template=local_tfrec_dir + "/data-all_shard-{shard:03d}.tfrec",
        examples_per_shard=20
    )

    # Manifest of which file/label went into the shards; uploaded with them.
    df_nifti.to_csv(op.join(local_tfrec_dir, "filepaths.csv"))

    fs.put(
        local_tfrec_dir,
        s3_output_dir,
        recursive=True
    )

In [4]:
# Docker image definition wrapping `create_tfrecs`.
# nobrainer is installed from the `enh/four-d` branch of the richford fork.
di = ck.DockerImage(
    base_image="python:3.8",
    func=create_tfrecs,
    github_installs="https://github.com/richford/nobrainer.git@enh/four-d",
    overwrite=True,
)



In [5]:
# Build the Docker image under this tag.
di.build(tags=["hbn-pod2-tfrecs-20210908"])

In [6]:
# Repository object named by `ck.get_ecr_repo()`; the image is pushed to it below.
repo = ck.aws.DockerRepo(name=ck.get_ecr_repo())

In [7]:
# Push the built image to `repo`.
# The very first time you run this, the command can take a few minutes.
di.push(repo=repo)

In [9]:
# Knot that runs the image built above.
# - bid_percentage is specified so that Spot instances are used.
# - volume_size has to be large enough for the downloaded NIfTI files plus the
#   TFRecord shards written next to them. 55-60 GB seemed about right for HBN
#   preprocessing; 100 is requested here. YMMV.
knot = ck.Knot(
    name="hbn-pod2-tfrecs-20210917-0",
    docker_image=di,
    pars_policies=("AmazonS3FullAccess",),
    aws_resource_tags={"Project": "HBN-FCP-INDI"},
    bid_percentage=100,
    job_def_vcpus=8,
    max_vcpus=64,
    memory=8000,
    volume_size=100,
    retries=1,
)

In [10]:
# Map the knot (and with it `create_tfrecs`) over `input_dirs`, submitting the job(s).
results = knot.map(input_dirs)

In [11]:
# List the jobs submitted through this knot together with their status.
knot.view_jobs()

Job ID              Name                        Status   
---------------------------------------------------------
d95f98ad-22a8-4d7e-8a7d-62003b860776        hbn-pod2-tfrecs-20210917-0-0        SUBMITTED


In [12]:
# Clean up the knot; `clobber_pars=True` asks for its PARS to be removed as well.
# NOTE(review): the job listing above still shows the job as SUBMITTED, so running
# this cell right away (e.g. via "Run All") presumably tears the knot down before
# the job has finished -- confirm the job is done before executing.
knot.clobber(clobber_pars=True)