Import the required libraries

In [1]:
import AFQ.data as afqd
import os.path as op
import numpy as np
import pandas as pd
import seaborn as sns

  import pandas.util.testing as tm


Load information on the subjects and their AWS S3 keys in the "raw" HBN datasets.
This will take something like 16 minutes.

In [2]:
sites = ["Site-SI", "Site-RU", "Site-CBIC", "Site-CUNY"]

# One HBNSite study per acquisition site, keyed by the site name.
# subjects="all" requests every subject in the site.
hbn_orig = {site: afqd.HBNSite(site, subjects="all") for site in sites}



Retrieving subject S3 keys
[########################################] | 100% Completed |  2min 55.8s
Retrieving subject S3 keys
[########################################] | 100% Completed |  8min 49.1s
Retrieving subject S3 keys
[########################################] | 100% Completed | 10min 27.8s
Retrieving subject S3 keys
[########################################] | 100% Completed | 33.2s


Now load the curated dataset and the qsiprep output as two separate datasets. This part will take something like 20 minutes.

In [3]:
# The BIDS-curated HBN study on the fcp-indi bucket. subjects="all" is
# requested here because the S3 keys of every subject are needed later
# to build the curated dataframe.
curated = afqd.S3BIDSStudy(
    "hbn_curated-0",
    bucket="fcp-indi",
    s3_prefix="data/Projects/HBN/BIDS_curated",
    subjects="all",
)

# The qsiprep derivatives of the curated study. Only the list of subject
# IDs is used from this study in the rest of the notebook.
qsiprepped = afqd.S3BIDSStudy(
    "hbn_curated_qsiprep-0",
    bucket="fcp-indi",
    s3_prefix="data/Projects/HBN/BIDS_curated/derivatives/qsiprep",
    # no need to use subjects="all" here since we won't need the S3 keys
)

Retrieving subject S3 keys
[########################################] | 100% Completed | 22min 36.6s
Retrieving subject S3 keys
[########################################] | 100% Completed |  0.5s


Print the total number of subjects in each "raw" site and also in the curated and qsiprep datasets

In [4]:
# Gather every subject list under a display label, raw sites first,
# then report them all with the same 10-character label column.
subject_lists = {name: hbn_orig[name]._all_subjects for name in sites}
subject_lists["curated"] = curated._all_subjects
subject_lists["qsiprepped"] = qsiprepped._all_subjects

for label, subject_ids in subject_lists.items():
    print(f"{label:10s}: {len(subject_ids)}")

Site-SI   : 343
Site-RU   : 1227
Site-CBIC : 1081
Site-CUNY : 96
curated   : 2615
qsiprepped: 2136


So it looks like we have 2,747 subjects in the "raw" sites, 2,615 subjects in the `BIDS_curated` dataset, and 2,136 subjects that have successfully gone through qsiprep.

## Build pandas dataframes

Now we're going to build pandas DataFrames that contain the AWS S3 keys for each subject's imaging files. We will require only the following files:

- anat
  - T1w.nii.gz
  - T1w.json
- dwi
  - dwi.nii.gz
  - dwi.json
  - dwi.bval
  - dwi.bvec

And we will exclude any files with "TRACE" in the name (this covers the "TRACEW" files).

In [5]:
def get_key(list_of_keys, modality, suffix):
    """Get the AWS S3 key for a specific imaging file
    
    A key matches when it contains the "/<modality>/" directory, ends
    with `suffix`, and does not contain "TRACE" anywhere in its name.
    
    Parameters
    ----------
    list_of_keys : list
        List of AWS S3 keys
        
    modality : "anat" or "dwi"
        The imaging modality
        
    suffix : str
        The image file suffix, e.g. dwi.nii.gz, dwi.bvec, or T1w.nii.gz
    
    Returns
    -------
    The first matching AWS S3 key (type str) or np.nan if the key wasn't found
    
    Raises
    ------
    ValueError
        If `modality` is neither "anat" nor "dwi"
    """
    if modality not in ("anat", "dwi"):
        raise ValueError(
            f'modality must be "anat" or "dwi"; got {modality!r}'
        )
    
    modality_dir = "/" + modality + "/"
    
    # Lazily scan the keys and stop at the first match instead of
    # materializing every match only to keep the first one.
    matches = (
        k for k in list_of_keys
        if modality_dir in k and k.endswith(suffix) and "TRACE" not in k
    )
    
    return next(matches, np.nan)

def get_raw_df(site):
    """Build a pandas dataframe with AWS S3 keys for the raw dataset
    
    The dataframe will have 'participant_id' as the index
    and columns for raw dwi and anat files.
    
    Parameters
    ----------
    site : object with a `subjects` attribute
        Each subject must provide `subject_id`, `site`, and
        `s3_keys["raw"]` (the list of raw S3 keys for that subject).
    
    Returns
    -------
    pd.DataFrame
        One row per subject. A file column holds np.nan when no
        matching key was found for that subject.
    """
    dicts = []
    for sub in site.subjects:
        raw_keys = sub.s3_keys["raw"]
        dicts.append({
            # The "sub-" prefix is stripped so IDs match across datasets
            "participant_id": sub.subject_id.replace("sub-", ""),
            "raw_site": sub.site,
            "raw_dwi_nifti": get_key(raw_keys, "dwi", "dwi.nii.gz"),
            "raw_dwi_json": get_key(raw_keys, "dwi", "dwi.json"),
            # Each column is looked up with its own suffix: bval -> dwi.bval
            # and bvec -> dwi.bvec (these two were previously crossed).
            "raw_dwi_bval": get_key(raw_keys, "dwi", "dwi.bval"),
            "raw_dwi_bvec": get_key(raw_keys, "dwi", "dwi.bvec"),
            "raw_t1w_nifti": get_key(raw_keys, "anat", "T1w.nii.gz"),
            "raw_t1w_json": get_key(raw_keys, "anat", "T1w.json"),
        })
    
    return pd.DataFrame(dicts).set_index("participant_id", drop=True)

def get_curated_df(study):
    """Build a pandas dataframe with AWS S3 keys for the curated dataset
    
    The dataframe will have 'participant_id' as the index
    and columns for curated dwi and anat files.
    
    Parameters
    ----------
    study : object with a `subjects` attribute
        Each subject must provide `subject_id` and `s3_keys["raw"]`
        (the list of S3 keys for that subject in the curated study).
    
    Returns
    -------
    pd.DataFrame
        One row per subject. A file column holds np.nan when no
        matching key was found for that subject. The 'curated_site'
        column holds a label truncated to seven characters
        (e.g. "Site-CB", "Site-RU").
    """
    dicts = []
    for sub in study.subjects:
        curated_keys = sub.s3_keys["raw"]
        
        # The site is recovered from the session directory that follows
        # the subject ID in the first key: "/ses-HBNsite<NAME>/..." is
        # rewritten to "Site-<NAME>/..." and cut to seven characters.
        # NOTE(review): this raises IndexError for a subject with no keys
        # or a key that does not contain the subject ID.
        path_after_subject = curated_keys[0].split(sub.subject_id)[1]
        site_label = path_after_subject.replace("/ses-HBNsite", "Site-")[:7]
        
        dicts.append({
            # The "sub-" prefix is stripped so IDs match across datasets
            "participant_id": sub.subject_id.replace("sub-", ""),
            "curated_site": site_label,
            "curated_dwi_nifti": get_key(curated_keys, "dwi", "dwi.nii.gz"),
            "curated_dwi_json": get_key(curated_keys, "dwi", "dwi.json"),
            # Each column is looked up with its own suffix: bval -> dwi.bval
            # and bvec -> dwi.bvec (these two were previously crossed).
            "curated_dwi_bval": get_key(curated_keys, "dwi", "dwi.bval"),
            "curated_dwi_bvec": get_key(curated_keys, "dwi", "dwi.bvec"),
            "curated_t1w_nifti": get_key(curated_keys, "anat", "T1w.nii.gz"),
            "curated_t1w_json": get_key(curated_keys, "anat", "T1w.json"),
        })
    
    return pd.DataFrame(dicts).set_index("participant_id", drop=True)

Create the "raw" dataframe

In [6]:
# Build one dataframe per raw site (in the order the sites were loaded)
# and stack them into a single frame indexed by participant ID.
site_frames = [get_raw_df(site_study) for site_study in hbn_orig.values()]
df_raw = pd.concat(site_frames)

# Upper-cased copy of the ID for case-insensitive comparisons
df_raw["participant_id_upper"] = list(map(str.upper, df_raw.index))
df_raw.head()

Unnamed: 0_level_0,raw_site,raw_dwi_nifti,raw_dwi_json,raw_dwi_bval,raw_dwi_bvec,raw_t1w_nifti,raw_t1w_json,participant_id_upper
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
NDARJR525HTX,Site-SI,data/Projects/HBN/MRI/Site-SI/sub-NDARJR525HTX...,data/Projects/HBN/MRI/Site-SI/sub-NDARJR525HTX...,data/Projects/HBN/MRI/Site-SI/sub-NDARJR525HTX...,data/Projects/HBN/MRI/Site-SI/sub-NDARJR525HTX...,data/Projects/HBN/MRI/Site-SI/sub-NDARJR525HTX...,data/Projects/HBN/MRI/Site-SI/sub-NDARJR525HTX...,NDARJR525HTX
NDARXB704HFD,Site-SI,data/Projects/HBN/MRI/Site-SI/sub-NDARXB704HFD...,data/Projects/HBN/MRI/Site-SI/sub-NDARXB704HFD...,data/Projects/HBN/MRI/Site-SI/sub-NDARXB704HFD...,data/Projects/HBN/MRI/Site-SI/sub-NDARXB704HFD...,data/Projects/HBN/MRI/Site-SI/sub-NDARXB704HFD...,data/Projects/HBN/MRI/Site-SI/sub-NDARXB704HFD...,NDARXB704HFD
NDARMM905VYR,Site-SI,data/Projects/HBN/MRI/Site-SI/sub-NDARMM905VYR...,data/Projects/HBN/MRI/Site-SI/sub-NDARMM905VYR...,data/Projects/HBN/MRI/Site-SI/sub-NDARMM905VYR...,data/Projects/HBN/MRI/Site-SI/sub-NDARMM905VYR...,data/Projects/HBN/MRI/Site-SI/sub-NDARMM905VYR...,data/Projects/HBN/MRI/Site-SI/sub-NDARMM905VYR...,NDARMM905VYR
NDARML148UCE,Site-SI,data/Projects/HBN/MRI/Site-SI/sub-NDARML148UCE...,data/Projects/HBN/MRI/Site-SI/sub-NDARML148UCE...,data/Projects/HBN/MRI/Site-SI/sub-NDARML148UCE...,data/Projects/HBN/MRI/Site-SI/sub-NDARML148UCE...,data/Projects/HBN/MRI/Site-SI/sub-NDARML148UCE...,data/Projects/HBN/MRI/Site-SI/sub-NDARML148UCE...,NDARML148UCE
NDARED047DTH,Site-SI,data/Projects/HBN/MRI/Site-SI/sub-NDARED047DTH...,data/Projects/HBN/MRI/Site-SI/sub-NDARED047DTH...,data/Projects/HBN/MRI/Site-SI/sub-NDARED047DTH...,data/Projects/HBN/MRI/Site-SI/sub-NDARED047DTH...,data/Projects/HBN/MRI/Site-SI/sub-NDARED047DTH...,data/Projects/HBN/MRI/Site-SI/sub-NDARED047DTH...,NDARED047DTH


Create the "curated" dataframe.

In [7]:
df_curated = get_curated_df(curated)

# Upper-cased copy of the ID for case-insensitive comparisons
df_curated["participant_id_upper"] = list(map(str.upper, df_curated.index))
df_curated.head()

Unnamed: 0_level_0,curated_site,curated_dwi_nifti,curated_dwi_json,curated_dwi_bval,curated_dwi_bvec,curated_t1w_nifti,curated_t1w_json,participant_id_upper
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
NDARHZ413DZL,Site-CB,data/Projects/HBN/BIDS_curated/sub-NDARHZ413DZ...,data/Projects/HBN/BIDS_curated/sub-NDARHZ413DZ...,data/Projects/HBN/BIDS_curated/sub-NDARHZ413DZ...,data/Projects/HBN/BIDS_curated/sub-NDARHZ413DZ...,data/Projects/HBN/BIDS_curated/sub-NDARHZ413DZ...,data/Projects/HBN/BIDS_curated/sub-NDARHZ413DZ...,NDARHZ413DZL
NDARMN695VFH,Site-RU,data/Projects/HBN/BIDS_curated/sub-NDARMN695VF...,data/Projects/HBN/BIDS_curated/sub-NDARMN695VF...,data/Projects/HBN/BIDS_curated/sub-NDARMN695VF...,data/Projects/HBN/BIDS_curated/sub-NDARMN695VF...,data/Projects/HBN/BIDS_curated/sub-NDARMN695VF...,,NDARMN695VFH
NDARYU898ETS,Site-RU,data/Projects/HBN/BIDS_curated/sub-NDARYU898ET...,data/Projects/HBN/BIDS_curated/sub-NDARYU898ET...,data/Projects/HBN/BIDS_curated/sub-NDARYU898ET...,data/Projects/HBN/BIDS_curated/sub-NDARYU898ET...,data/Projects/HBN/BIDS_curated/sub-NDARYU898ET...,data/Projects/HBN/BIDS_curated/sub-NDARYU898ET...,NDARYU898ETS
NDARZJ016GL0,Site-RU,data/Projects/HBN/BIDS_curated/sub-NDARZJ016GL...,data/Projects/HBN/BIDS_curated/sub-NDARZJ016GL...,data/Projects/HBN/BIDS_curated/sub-NDARZJ016GL...,data/Projects/HBN/BIDS_curated/sub-NDARZJ016GL...,data/Projects/HBN/BIDS_curated/sub-NDARZJ016GL...,data/Projects/HBN/BIDS_curated/sub-NDARZJ016GL...,NDARZJ016GL0
NDARHN067BTE,Site-RU,data/Projects/HBN/BIDS_curated/sub-NDARHN067BT...,data/Projects/HBN/BIDS_curated/sub-NDARHN067BT...,data/Projects/HBN/BIDS_curated/sub-NDARHN067BT...,data/Projects/HBN/BIDS_curated/sub-NDARHN067BT...,data/Projects/HBN/BIDS_curated/sub-NDARHN067BT...,,NDARHN067BTE


## Missing "raw" subjects

Let's see if there are any subjects in the `BIDS_curated` dataset that are not in the "raw" site data

In [8]:
# Case-sensitive comparison of participant IDs: curated minus raw
raw_ids = set(pd.unique(df_raw.index))
curated_ids = set(df_curated.index)
print(curated_ids - raw_ids)

{'NDARPU329MDJ', 'NDARUH712NRU', 'NDARDX857DLB', 'NDARHD952XHH', 'NDARCR594JN0', 'ndarvn280jtn'}


Oddly, there are six subjects in the curated dataset that aren't in the raw data. But one of these has a participant ID that is all lowercase. Let's check again, this time ignoring case.

In [9]:
# Case-insensitive comparison: curated IDs with no counterpart in the raw sites.
# (set() already de-duplicates, so no separate unique() pass is needed.)
missing_raw_subs = set(df_curated["participant_id_upper"]) - set(df_raw["participant_id_upper"])
print(missing_raw_subs)

# Sets have no stable order, so sort before writing to make the saved
# file identical from one run to the next.
np.savetxt("missing_raw_subs.txt", np.array(sorted(missing_raw_subs)), fmt="%s")

{'NDARPU329MDJ', 'NDARUH712NRU', 'NDARDX857DLB', 'NDARHD952XHH', 'NDARCR594JN0'}


Okay, so we actually have two different problems here.

1. There are five subjects in the `BIDS_curated` dataset that are not in the "raw" sites. We should figure out where the raw data is and put it up on FCP-INDI.
1. There is one subject who is represented twice in the `BIDS_curated` dataset, once with all caps and once with lowercase letters. We should figure out which one of these two is the correct one and delete the other.

## Missing curated subjects

Now let's see which subjects are represented in the "raw" dataset but missing in the `BIDS_curated` dataset

In [10]:
# Raw participant IDs with no counterpart in the curated dataset.
# NOTE(review): this comparison is case-sensitive, unlike the
# participant_id_upper comparison used for the missing raw subjects.
# The lowercase ID found in the curated dataset may affect this
# count - confirm.
missing_curated_subs = set(df_raw.index) - set(df_curated.index)
print(len(missing_curated_subs))

# Sets have no stable order, so sort before writing to make the saved
# file identical from one run to the next.
np.savetxt("missing_curated_subs.txt", np.array(sorted(missing_curated_subs)), fmt="%s")

138


### Missing curated subjects with DWI

We're particularly interested in subjects that are missing from the curated datasets but who have the minimal set of `dwi` and `anat` files. So let's merge the two dataframes to compare subjects that are in the "raw" data but not in the curated data.

In [11]:
# Sets give constant-time membership tests; checking each row against
# the subject lists directly would rescan the whole list per row.
qsiprep_subjects = set(qsiprepped._all_subjects)
curated_subjects = set(curated._all_subjects)

# The curated site labels are truncated to seven characters when the
# curated dataframe is built, so map them back to the full site names.
curated_site_names = {
    "Site-SI": "Site-SI",
    "Site-RU": "Site-RU",
    "Site-CB": "Site-CBIC",
    "Site-CU": "Site-CUNY",
}

required_raw_columns = [
    "raw_dwi_nifti", "raw_dwi_json", "raw_dwi_bval",
    "raw_dwi_bvec", "raw_t1w_nifti", "raw_t1w_json"
]

df_hbn = (
    df_raw
    .merge(df_curated, how="outer", left_index=True, right_index=True, sort=True)
    # We want participants who at least have the raw dwi and anat files,
    # so drop subjects who are missing any of the required raw files
    .dropna(subset=required_raw_columns)
    # Drop the columns for upper case subject ID since we're done with those
    .drop(columns=["participant_id_upper_x", "participant_id_upper_y"])
)
df_hbn["curated_site"] = df_hbn["curated_site"].map(curated_site_names)

# Binary column for whether the subject has qsiprep output
df_hbn["qsiprep_success"] = ["sub-" + sub in qsiprep_subjects for sub in df_hbn.index]

# Binary column for whether or not the subject is in the curated dataset
df_hbn["in_bids_curated"] = ["sub-" + sub in curated_subjects for sub in df_hbn.index]

# Create a new dataframe for subjects that have the required raw
# files but are missing from the curated dataset
not_curated_with_raw_files = df_hbn[~df_hbn["in_bids_curated"]]

print(len(not_curated_with_raw_files))
np.savetxt(
    "missing_curated_subs_that_have_raw_dwi.txt",
    np.array(not_curated_with_raw_files.index),
    fmt="%s"
)

106


So there are 106 subjects that have the minimal dwi and anat files in the raw dataset but are missing from the curated dataset.

## Curated subjects that failed qsiprep

Lastly, let's get a list of subjects that are in the curated dataset but who did not successfully finish preprocessing with QSIprep.

In [12]:
# Both flag columns are boolean, so they can be used as masks directly.
in_curated = df_hbn.loc[df_hbn["in_bids_curated"]]
no_qsiprep = in_curated.loc[~in_curated["qsiprep_success"]]
print(len(no_qsiprep))

failed_ids = np.array(no_qsiprep.index)
np.savetxt("failed_qsiprep_subs.txt", failed_ids, fmt="%s")

55


So there are 55 subjects that failed QSIprep. We saved their subject IDs to a text file. Now let's actually look at the dataframe.

In [13]:
# Display the curated subjects that are not in the qsiprep subject list
no_qsiprep

Unnamed: 0_level_0,raw_site,raw_dwi_nifti,raw_dwi_json,raw_dwi_bval,raw_dwi_bvec,raw_t1w_nifti,raw_t1w_json,curated_site,curated_dwi_nifti,curated_dwi_json,curated_dwi_bval,curated_dwi_bvec,curated_t1w_nifti,curated_t1w_json,qsiprep_success,in_bids_curated
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
NDARAA504CRN,Site-CBIC,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAA504C...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAA504C...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAA504C...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAA504C...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAA504C...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAA504C...,Site-CBIC,data/Projects/HBN/BIDS_curated/sub-NDARAA504CR...,data/Projects/HBN/BIDS_curated/sub-NDARAA504CR...,data/Projects/HBN/BIDS_curated/sub-NDARAA504CR...,data/Projects/HBN/BIDS_curated/sub-NDARAA504CR...,data/Projects/HBN/BIDS_curated/sub-NDARAA504CR...,data/Projects/HBN/BIDS_curated/sub-NDARAA504CR...,False,True
NDARAB756JDJ,Site-CBIC,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAB756J...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAB756J...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAB756J...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAB756J...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAB756J...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAB756J...,Site-CBIC,data/Projects/HBN/BIDS_curated/sub-NDARAB756JD...,data/Projects/HBN/BIDS_curated/sub-NDARAB756JD...,data/Projects/HBN/BIDS_curated/sub-NDARAB756JD...,data/Projects/HBN/BIDS_curated/sub-NDARAB756JD...,data/Projects/HBN/BIDS_curated/sub-NDARAB756JD...,data/Projects/HBN/BIDS_curated/sub-NDARAB756JD...,False,True
NDARAH976BND,Site-CBIC,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAH976B...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAH976B...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAH976B...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAH976B...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAH976B...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAH976B...,Site-CBIC,data/Projects/HBN/BIDS_curated/sub-NDARAH976BN...,data/Projects/HBN/BIDS_curated/sub-NDARAH976BN...,data/Projects/HBN/BIDS_curated/sub-NDARAH976BN...,data/Projects/HBN/BIDS_curated/sub-NDARAH976BN...,data/Projects/HBN/BIDS_curated/sub-NDARAH976BN...,data/Projects/HBN/BIDS_curated/sub-NDARAH976BN...,False,True
NDARAJ977PRJ,Site-CBIC,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAJ977P...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAJ977P...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAJ977P...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAJ977P...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAJ977P...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARAJ977P...,Site-CBIC,data/Projects/HBN/BIDS_curated/sub-NDARAJ977PR...,data/Projects/HBN/BIDS_curated/sub-NDARAJ977PR...,data/Projects/HBN/BIDS_curated/sub-NDARAJ977PR...,data/Projects/HBN/BIDS_curated/sub-NDARAJ977PR...,data/Projects/HBN/BIDS_curated/sub-NDARAJ977PR...,data/Projects/HBN/BIDS_curated/sub-NDARAJ977PR...,False,True
NDARBM490LK7,Site-RU,data/Projects/HBN/MRI/Site-RU/sub-NDARBM490LK7...,data/Projects/HBN/MRI/Site-RU/sub-NDARBM490LK7...,data/Projects/HBN/MRI/Site-RU/sub-NDARBM490LK7...,data/Projects/HBN/MRI/Site-RU/sub-NDARBM490LK7...,data/Projects/HBN/MRI/Site-RU/sub-NDARBM490LK7...,data/Projects/HBN/MRI/Site-RU/sub-NDARBM490LK7...,Site-RU,data/Projects/HBN/BIDS_curated/sub-NDARBM490LK...,data/Projects/HBN/BIDS_curated/sub-NDARBM490LK...,data/Projects/HBN/BIDS_curated/sub-NDARBM490LK...,data/Projects/HBN/BIDS_curated/sub-NDARBM490LK...,data/Projects/HBN/BIDS_curated/sub-NDARBM490LK...,data/Projects/HBN/BIDS_curated/sub-NDARBM490LK...,False,True
NDARBN365EV3,Site-CBIC,data/Projects/HBN/MRI/Site-CBIC/sub-NDARBN365E...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARBN365E...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARBN365E...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARBN365E...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARBN365E...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARBN365E...,Site-CBIC,data/Projects/HBN/BIDS_curated/sub-NDARBN365EV...,data/Projects/HBN/BIDS_curated/sub-NDARBN365EV...,data/Projects/HBN/BIDS_curated/sub-NDARBN365EV...,data/Projects/HBN/BIDS_curated/sub-NDARBN365EV...,data/Projects/HBN/BIDS_curated/sub-NDARBN365EV...,,False,True
NDARBR128UFP,Site-CBIC,data/Projects/HBN/MRI/Site-CBIC/sub-NDARBR128U...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARBR128U...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARBR128U...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARBR128U...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARBR128U...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARBR128U...,Site-CBIC,data/Projects/HBN/BIDS_curated/sub-NDARBR128UF...,data/Projects/HBN/BIDS_curated/sub-NDARBR128UF...,data/Projects/HBN/BIDS_curated/sub-NDARBR128UF...,data/Projects/HBN/BIDS_curated/sub-NDARBR128UF...,data/Projects/HBN/BIDS_curated/sub-NDARBR128UF...,data/Projects/HBN/BIDS_curated/sub-NDARBR128UF...,False,True
NDAREC078VFT,Site-RU,data/Projects/HBN/MRI/Site-RU/sub-NDAREC078VFT...,data/Projects/HBN/MRI/Site-RU/sub-NDAREC078VFT...,data/Projects/HBN/MRI/Site-RU/sub-NDAREC078VFT...,data/Projects/HBN/MRI/Site-RU/sub-NDAREC078VFT...,data/Projects/HBN/MRI/Site-RU/sub-NDAREC078VFT...,data/Projects/HBN/MRI/Site-RU/sub-NDAREC078VFT...,Site-RU,data/Projects/HBN/BIDS_curated/sub-NDAREC078VF...,data/Projects/HBN/BIDS_curated/sub-NDAREC078VF...,data/Projects/HBN/BIDS_curated/sub-NDAREC078VF...,data/Projects/HBN/BIDS_curated/sub-NDAREC078VF...,data/Projects/HBN/BIDS_curated/sub-NDAREC078VF...,data/Projects/HBN/BIDS_curated/sub-NDAREC078VF...,False,True
NDAREJ327AH7,Site-CBIC,data/Projects/HBN/MRI/Site-CBIC/sub-NDAREJ327A...,data/Projects/HBN/MRI/Site-CBIC/sub-NDAREJ327A...,data/Projects/HBN/MRI/Site-CBIC/sub-NDAREJ327A...,data/Projects/HBN/MRI/Site-CBIC/sub-NDAREJ327A...,data/Projects/HBN/MRI/Site-CBIC/sub-NDAREJ327A...,data/Projects/HBN/MRI/Site-CBIC/sub-NDAREJ327A...,Site-CBIC,data/Projects/HBN/BIDS_curated/sub-NDAREJ327AH...,data/Projects/HBN/BIDS_curated/sub-NDAREJ327AH...,data/Projects/HBN/BIDS_curated/sub-NDAREJ327AH...,data/Projects/HBN/BIDS_curated/sub-NDAREJ327AH...,data/Projects/HBN/BIDS_curated/sub-NDAREJ327AH...,,False,True
NDARFE372VHL,Site-CBIC,data/Projects/HBN/MRI/Site-CBIC/sub-NDARFE372V...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARFE372V...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARFE372V...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARFE372V...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARFE372V...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARFE372V...,Site-CBIC,data/Projects/HBN/BIDS_curated/sub-NDARFE372VH...,data/Projects/HBN/BIDS_curated/sub-NDARFE372VH...,data/Projects/HBN/BIDS_curated/sub-NDARFE372VH...,data/Projects/HBN/BIDS_curated/sub-NDARFE372VH...,data/Projects/HBN/BIDS_curated/sub-NDARFE372VH...,data/Projects/HBN/BIDS_curated/sub-NDARFE372VH...,False,True


We see that some of these subjects are missing required curated files, even though those files are available in the raw dataset. Other failure modes are more mysterious and will require looking at the QSIprep logs. Let's find the ones with missing curated files.

In [14]:
# Count the failed subjects that have at least one missing (NaN) value
has_missing_value = no_qsiprep.isna().any(axis="columns")
int(has_missing_value.sum())

13

In [15]:
# Curated subjects (all of whom have the full set of raw files) that
# have at least one missing value in the remaining columns.
df_hbn_curated = df_hbn.loc[df_hbn["in_bids_curated"]]
row_has_missing = df_hbn_curated.isna().any(axis="columns")
df_curation_mismatch = df_hbn_curated.loc[row_has_missing]
print(len(df_curation_mismatch))


725


In [16]:
# Save the participant IDs of the curation-mismatch subjects, one per line
np.savetxt(
    "curation_file_mismatch_subs.txt",
    np.array(df_curation_mismatch.index),
    fmt="%s"
)

In [18]:
# Curation-mismatch subjects that are also not in the qsiprep subject list
df_curation_mismatch.loc[~df_curation_mismatch["qsiprep_success"]]

Unnamed: 0_level_0,raw_site,raw_dwi_nifti,raw_dwi_json,raw_dwi_bval,raw_dwi_bvec,raw_t1w_nifti,raw_t1w_json,curated_site,curated_dwi_nifti,curated_dwi_json,curated_dwi_bval,curated_dwi_bvec,curated_t1w_nifti,curated_t1w_json,qsiprep_success,in_bids_curated
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
NDARBN365EV3,Site-CBIC,data/Projects/HBN/MRI/Site-CBIC/sub-NDARBN365E...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARBN365E...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARBN365E...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARBN365E...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARBN365E...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARBN365E...,Site-CBIC,data/Projects/HBN/BIDS_curated/sub-NDARBN365EV...,data/Projects/HBN/BIDS_curated/sub-NDARBN365EV...,data/Projects/HBN/BIDS_curated/sub-NDARBN365EV...,data/Projects/HBN/BIDS_curated/sub-NDARBN365EV...,data/Projects/HBN/BIDS_curated/sub-NDARBN365EV...,,False,True
NDAREJ327AH7,Site-CBIC,data/Projects/HBN/MRI/Site-CBIC/sub-NDAREJ327A...,data/Projects/HBN/MRI/Site-CBIC/sub-NDAREJ327A...,data/Projects/HBN/MRI/Site-CBIC/sub-NDAREJ327A...,data/Projects/HBN/MRI/Site-CBIC/sub-NDAREJ327A...,data/Projects/HBN/MRI/Site-CBIC/sub-NDAREJ327A...,data/Projects/HBN/MRI/Site-CBIC/sub-NDAREJ327A...,Site-CBIC,data/Projects/HBN/BIDS_curated/sub-NDAREJ327AH...,data/Projects/HBN/BIDS_curated/sub-NDAREJ327AH...,data/Projects/HBN/BIDS_curated/sub-NDAREJ327AH...,data/Projects/HBN/BIDS_curated/sub-NDAREJ327AH...,data/Projects/HBN/BIDS_curated/sub-NDAREJ327AH...,,False,True
NDARFV780ABD,Site-RU,data/Projects/HBN/MRI/Site-RU/sub-NDARFV780ABD...,data/Projects/HBN/MRI/Site-RU/sub-NDARFV780ABD...,data/Projects/HBN/MRI/Site-RU/sub-NDARFV780ABD...,data/Projects/HBN/MRI/Site-RU/sub-NDARFV780ABD...,data/Projects/HBN/MRI/Site-RU/sub-NDARFV780ABD...,data/Projects/HBN/MRI/Site-RU/sub-NDARFV780ABD...,Site-RU,,,data/Projects/HBN/BIDS_curated/sub-NDARFV780AB...,data/Projects/HBN/BIDS_curated/sub-NDARFV780AB...,data/Projects/HBN/BIDS_curated/sub-NDARFV780AB...,,False,True
NDARGF367KVL,Site-RU,data/Projects/HBN/MRI/Site-RU/sub-NDARGF367KVL...,data/Projects/HBN/MRI/Site-RU/sub-NDARGF367KVL...,data/Projects/HBN/MRI/Site-RU/sub-NDARGF367KVL...,data/Projects/HBN/MRI/Site-RU/sub-NDARGF367KVL...,data/Projects/HBN/MRI/Site-RU/sub-NDARGF367KVL...,data/Projects/HBN/MRI/Site-RU/sub-NDARGF367KVL...,Site-RU,,,data/Projects/HBN/BIDS_curated/sub-NDARGF367KV...,data/Projects/HBN/BIDS_curated/sub-NDARGF367KV...,data/Projects/HBN/BIDS_curated/sub-NDARGF367KV...,data/Projects/HBN/BIDS_curated/sub-NDARGF367KV...,False,True
NDARGH790CEF,Site-RU,data/Projects/HBN/MRI/Site-RU/sub-NDARGH790CEF...,data/Projects/HBN/MRI/Site-RU/sub-NDARGH790CEF...,data/Projects/HBN/MRI/Site-RU/sub-NDARGH790CEF...,data/Projects/HBN/MRI/Site-RU/sub-NDARGH790CEF...,data/Projects/HBN/MRI/Site-RU/sub-NDARGH790CEF...,data/Projects/HBN/MRI/Site-RU/sub-NDARGH790CEF...,Site-RU,,,data/Projects/HBN/BIDS_curated/sub-NDARGH790CE...,data/Projects/HBN/BIDS_curated/sub-NDARGH790CE...,data/Projects/HBN/BIDS_curated/sub-NDARGH790CE...,,False,True
NDARGX443CEU,Site-RU,data/Projects/HBN/MRI/Site-RU/sub-NDARGX443CEU...,data/Projects/HBN/MRI/Site-RU/sub-NDARGX443CEU...,data/Projects/HBN/MRI/Site-RU/sub-NDARGX443CEU...,data/Projects/HBN/MRI/Site-RU/sub-NDARGX443CEU...,data/Projects/HBN/MRI/Site-RU/sub-NDARGX443CEU...,data/Projects/HBN/MRI/Site-RU/sub-NDARGX443CEU...,Site-RU,,,data/Projects/HBN/BIDS_curated/sub-NDARGX443CE...,data/Projects/HBN/BIDS_curated/sub-NDARGX443CE...,data/Projects/HBN/BIDS_curated/sub-NDARGX443CE...,,False,True
NDARKN509RP9,Site-RU,data/Projects/HBN/MRI/Site-RU/sub-NDARKN509RP9...,data/Projects/HBN/MRI/Site-RU/sub-NDARKN509RP9...,data/Projects/HBN/MRI/Site-RU/sub-NDARKN509RP9...,data/Projects/HBN/MRI/Site-RU/sub-NDARKN509RP9...,data/Projects/HBN/MRI/Site-RU/sub-NDARKN509RP9...,data/Projects/HBN/MRI/Site-RU/sub-NDARKN509RP9...,Site-RU,,,data/Projects/HBN/BIDS_curated/sub-NDARKN509RP...,data/Projects/HBN/BIDS_curated/sub-NDARKN509RP...,data/Projects/HBN/BIDS_curated/sub-NDARKN509RP...,,False,True
NDARMH249AWF,Site-RU,data/Projects/HBN/MRI/Site-RU/sub-NDARMH249AWF...,data/Projects/HBN/MRI/Site-RU/sub-NDARMH249AWF...,data/Projects/HBN/MRI/Site-RU/sub-NDARMH249AWF...,data/Projects/HBN/MRI/Site-RU/sub-NDARMH249AWF...,data/Projects/HBN/MRI/Site-RU/sub-NDARMH249AWF...,data/Projects/HBN/MRI/Site-RU/sub-NDARMH249AWF...,Site-RU,,,data/Projects/HBN/BIDS_curated/sub-NDARMH249AW...,data/Projects/HBN/BIDS_curated/sub-NDARMH249AW...,data/Projects/HBN/BIDS_curated/sub-NDARMH249AW...,,False,True
NDARMT461DNB,Site-CBIC,data/Projects/HBN/MRI/Site-CBIC/sub-NDARMT461D...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARMT461D...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARMT461D...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARMT461D...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARMT461D...,data/Projects/HBN/MRI/Site-CBIC/sub-NDARMT461D...,Site-CBIC,data/Projects/HBN/BIDS_curated/sub-NDARMT461DN...,data/Projects/HBN/BIDS_curated/sub-NDARMT461DN...,data/Projects/HBN/BIDS_curated/sub-NDARMT461DN...,data/Projects/HBN/BIDS_curated/sub-NDARMT461DN...,data/Projects/HBN/BIDS_curated/sub-NDARMT461DN...,,False,True
NDARMU589LP6,Site-RU,data/Projects/HBN/MRI/Site-RU/sub-NDARMU589LP6...,data/Projects/HBN/MRI/Site-RU/sub-NDARMU589LP6...,data/Projects/HBN/MRI/Site-RU/sub-NDARMU589LP6...,data/Projects/HBN/MRI/Site-RU/sub-NDARMU589LP6...,data/Projects/HBN/MRI/Site-RU/sub-NDARMU589LP6...,data/Projects/HBN/MRI/Site-RU/sub-NDARMU589LP6...,Site-RU,,,,,data/Projects/HBN/BIDS_curated/sub-NDARMU589LP...,data/Projects/HBN/BIDS_curated/sub-NDARMU589LP...,False,True
