In [5]:
import AFQ.data as afqd
import cloudknot as ck
import importlib
import s3fs
import json
import os.path as op
import numpy as np
import pandas as pd

In [37]:
study = afqd.S3BIDSStudy(
    "hbn_curated-0",
    bucket="fcp-indi",
    s3_prefix="data/Projects/HBN/BIDS_curated",
    subjects=1,
)

qsiprep_study = afqd.S3BIDSStudy(
    "hbn_curated_qsiprep-0",
    bucket="fcp-indi",
    s3_prefix="data/Projects/HBN/BIDS_curated/derivatives/qsiprep",
    subjects=1,
)

Retrieving subject S3 keys
[########################################] | 100% Completed |  2.7s
Retrieving subject S3 keys
[########################################] | 100% Completed |  0.6s


In [38]:
print(len(study._all_subjects))
print(len(qsiprep_study._all_subjects))

2615
1653


In [4]:
remaining_subs = list(set(study._all_subjects) - set(qsiprep_study._all_subjects))
print(len(remaining_subs))

962


In [6]:
mismatch_df = pd.read_csv("/Users/richford/Desktop/curation_dwi_mismatch.csv")
mismatch_df.head()

Unnamed: 0,participant_id,site,raw_nifti,raw_json,raw_bval,raw_bvec
0,sub-NDARHU910KZC,Site-CBIC,s3://fcp-indi/data/Projects/HBN/MRI/Site-CBIC/...,s3://fcp-indi/data/Projects/HBN/MRI/Site-CBIC/...,s3://fcp-indi/data/Projects/HBN/MRI/Site-CBIC/...,s3://fcp-indi/data/Projects/HBN/MRI/Site-CBIC/...
1,sub-NDARGM610LF0,Site-SI,s3://fcp-indi/data/Projects/HBN/MRI/Site-SI/su...,s3://fcp-indi/data/Projects/HBN/MRI/Site-SI/su...,s3://fcp-indi/data/Projects/HBN/MRI/Site-SI/su...,s3://fcp-indi/data/Projects/HBN/MRI/Site-SI/su...
2,sub-NDARPL501ZUU,Site-CBIC,s3://fcp-indi/data/Projects/HBN/MRI/Site-CBIC/...,s3://fcp-indi/data/Projects/HBN/MRI/Site-CBIC/...,s3://fcp-indi/data/Projects/HBN/MRI/Site-CBIC/...,s3://fcp-indi/data/Projects/HBN/MRI/Site-CBIC/...
3,sub-NDAREC078VFT,Site-RU,s3://fcp-indi/data/Projects/HBN/MRI/Site-RU/su...,s3://fcp-indi/data/Projects/HBN/MRI/Site-RU/su...,s3://fcp-indi/data/Projects/HBN/MRI/Site-RU/su...,s3://fcp-indi/data/Projects/HBN/MRI/Site-RU/su...
4,sub-NDARLH263KCL,Site-CBIC,s3://fcp-indi/data/Projects/HBN/MRI/Site-CBIC/...,s3://fcp-indi/data/Projects/HBN/MRI/Site-CBIC/...,s3://fcp-indi/data/Projects/HBN/MRI/Site-CBIC/...,s3://fcp-indi/data/Projects/HBN/MRI/Site-CBIC/...


In [30]:
print("union", len(set(remaining_subs) & set(mismatch_df["participant_id"])))
print("remaining, not mismatched", len(set(remaining_subs) - set(mismatch_df["participant_id"])))

union 512
remaining, not mismatched 450


## Define the preprocessing function

In [12]:
def preprocess_hbn(subject_id):
    import AFQ.data as afqd
    import os
    import subprocess
    from s3fs import S3FileSystem
    
    # The following code snippet retrieves access credentials for the fcp-indi bucket that are stored in an AWS secret.
    import boto3
    import json

    secret_name = "hbn/fcp-indi/access"
    region_name = "us-west-2"

    # Create a Secrets Manager client
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name
    )

    get_secret_value_response = client.get_secret_value(
        SecretId=secret_name
    )
    # Decrypts secret using the associated KMS CMK.
    secret = json.loads(get_secret_value_response['SecretString'])
    
    local_dir = "./hbn"
    local_output_dir = "./hbn-preproc"
    bucket = "fcp-indi"
    s3_prefix = "data/Projects/HBN/BIDS_curated"
    
    study = afqd.S3BIDSStudy(
        "hbn_curated",
        bucket=bucket,
        s3_prefix=s3_prefix,
        subjects=[subject_id],
        anon=True,
    )
    
    study.download(local_dir)
    fs = S3FileSystem(
        key=secret["fcp_indi_aws_access_key_id"],
        secret=secret["fcp_indi_aws_secret_access_key"]
    )
    
    # HBN has other files that we don't need for dMRI preproc and whose presence will confuse qsiprep
    # Get rid of them
    s0 = study.subjects[0]
    dwi_files = [file for key, file in s0.files["raw"].items() if "/dwi/" in key]
    fmri_files = [file for key, file in s0.files["raw"].items() if "/func/" in key]
    fmri_files += [file for key, file in s0.files["raw"].items() if "/fmap/" in key and "acq-fMRI" in key]
    
    for fname in fmri_files:
        os.remove(fname)
            
    command = [
        "qsiprep",
        "--output-resolution",
        "1.8",
        "--participant-label",
        subject_id,
        "-w",
        "./hbn-wrk",
        "--nthreads",
        "8",
        "--omp-nthreads",
        "8",
        "--dwi-denoise-window",
        "5",
        "--unringing-method",
        "mrdegibbs",
        local_dir,
        local_output_dir,
        "participant",
    ]
    
    if dwi_files:
        response = subprocess.run(command, check=True)
        output_dir = "/".join([bucket, s3_prefix, "derivatives", "qsiprep"])
        fs.put(f"{local_output_dir}/qsiprep/{subject_id}",
               "/".join([output_dir, subject_id]), recursive=True)
        fs.put(f"{local_output_dir}/qsiprep/{subject_id}.html",
               "/".join([output_dir, subject_id + ".html"]))

        return {subject_id: True}
    else:
        return {subject_id: False}

# Create a cloudknot DockerImage instance

This Docker image was previously created for the initial production runs. Instead of creating a new one, retrieve the information from the cloudknot config file. If you haven't done this previously on your local machine, you'll have to set `recover_from_config = False`.

In [19]:
recover_from_config = True

In [20]:
if not recover_from_config:
    di = ck.DockerImage(
        name="preprocess-hbn-curated",
        func=preprocess_hbn,
        base_image="qsiprep:direct-0.12.1",
        github_installs=["https://github.com/yeatmanlab/pyAFQ.git@master",
                         "https://github.com/matplotlib/matplotlib.git@v2.2.3",
                         "https://github.com/bids-standard/pybids.git@0.9.3"],
        overwrite=True,
    )
else:
    di = ck.DockerImage(name="preprocess-hbn-curated")



## Build, tag, and push the Docker image

In [21]:
di.repo_uri

'454929164628.dkr.ecr.us-west-2.amazonaws.com/cloudknot:hbn-preproc-curated'

In [22]:
if not recover_from_config:
    di.build(tags=["hbn-preproc-curated"])

In [23]:
if not recover_from_config:
    repo = ck.aws.DockerRepo(name=ck.get_ecr_repo())

In [24]:
if not recover_from_config:
    print(repo.repo_uri)

In [25]:
if not recover_from_config:
    # The very first time you run this, this command could take
    # a few hours because the docker image is large
    di.push(repo=repo)

## Create the Knots

In [26]:
# Specify bid_percentage to use Spot instances
# And make sure the volume size is large enough. 50-55 GB seems about right for HBN preprocessing. YMMV.
# Also be sure to set the Project tag in ``aws_resource_tags`` for billing transparency
knot = ck.Knot(
    name=f"qsiprep-hbn-curated-3",
    docker_image=di,
    pars_policies=('AmazonS3FullAccess','AllowFcpIndiKeyAccess'),
    bid_percentage=100,
    memory=64000,
    job_def_vcpus=8,
    volume_size=90,
    max_vcpus=8192,
    retries=3,
    aws_resource_tags={"Project": "HBN-FCP-INDI"},
)

## Submit the jobs and check on results

In [31]:
first_50_futures = knot.map(remaining_subs[50:])

Argh, that was silly. That's all the subjects except the first 50. Oh well, let's see how it's running

In [33]:
knot.view_jobs()

Job ID              Name                        Status   
---------------------------------------------------------
6a1a3303-07a0-4bee-b4eb-a053854255ce        qsiprep-hbn-curated-3-0        PENDING  


In [34]:
print(remaining_subs[:50])

['sub-NDAREW430AYU', 'sub-NDARHN078CDT', 'sub-NDAREM141CKP', 'sub-NDARRE333EKT', 'sub-NDARYZ770NA1', 'sub-NDARLH043YDK', 'sub-NDARKA946MJ1', 'sub-NDARFB500HHN', 'sub-NDARKN175HWB', 'sub-NDARCA153NKE', 'sub-NDARNN218UGY', 'sub-NDARNE800DCT', 'sub-NDARAK019ZR6', 'sub-NDAREC647MKW', 'sub-NDARNL599TMZ', 'sub-NDARKC978MR4', 'sub-NDARRX084UML', 'sub-NDARBM173BJG', 'sub-NDARXU679ZE8', 'sub-NDARXU018RGY', 'sub-NDARWV155PRG', 'sub-NDARUT651WFC', 'sub-NDARJF755DT9', 'sub-NDARHR763RB4', 'sub-NDARML148UCE', 'sub-NDARYL771XDP', 'sub-NDARUF935UL3', 'sub-NDARGJ627BL2', 'sub-NDARAM277WZT', 'sub-NDARFV780ABD', 'sub-NDARXK893KLX', 'sub-NDARMW178UDD', 'sub-NDARRZ940HX6', 'sub-NDARLA226ADX', 'sub-NDARRK163VY8', 'sub-NDARCK661RZ6', 'sub-NDARNE511XHU', 'sub-NDARCJ475WJP', 'sub-NDARGU100JH4', 'sub-NDARMM905VYR', 'sub-NDARRU820CXW', 'sub-NDARWR732NZE', 'sub-NDARAA075AMK', 'sub-NDARYH996DA9', 'sub-NDARGH775KF5', 'sub-NDARMM431GVE', 'sub-NDARRP592GHK', 'sub-NDARXT792GY8', 'sub-NDARFL506HVX', 'sub-NDARFV725DEQ']

In [35]:
remaining_futures = knot.map(remaining_subs[:50])

In [36]:
knot.view_jobs()

Job ID              Name                        Status   
---------------------------------------------------------
521c737d-4c32-4d80-b9dd-f52861f63fba        qsiprep-hbn-curated-3-1        PENDING  
6a1a3303-07a0-4bee-b4eb-a053854255ce        qsiprep-hbn-curated-3-0        PENDING  


## Results

The results are dicts where the keys are the subject IDs and the values report sucess or failure

In [None]:
result_futures.result()

## Figure out how many subjects we have in the entire study to support some cost estimates

In [None]:
all_sites = {}

for site in ["Site-SI", "Site-CBIC", "Site-RU", "Site-CUNY"]:
    all_sites[site] = afqd.HBNSite(site=site)
    print(f"{site}: {len(all_sites[site]._all_subjects)}")

## When you're done, clobber the knot

In [None]:
knot.clobber(clobber_pars=True)