In [2]:
import sys
import os
import os.path as op
import datetime
from glob import glob
import numpy as np
import pandas as pd

In [3]:
# Directory layout for the project. Every entry is derived from the
# project root so that only "proj" needs editing when the data moves.
PATHS = {"proj": "/mnt/coredata/processing/leads"}

# The data directory comes first because several entries hang off it
PATHS.update(data=op.join(PATHS["proj"], "data"))

# Remaining locations (key order: freesurfer, metadata, raw, processed)
PATHS.update(
    freesurfer=op.join(PATHS["data"], "freesurfer"),
    metadata=op.join(PATHS["proj"], "metadata"),
    raw=op.join(PATHS["data"], "raw"),
    processed=op.join(PATHS["data"], "processed"),
)

In [7]:
def scrape_raw(raw_dir, verbose=True):
    """Scrape raw directory for all nifti files and parse them.

    Returns a pandas DataFrame with columns:
    - subj: subject ID
    - scan_date: scan date (YYYY-MM-DD)
    - scan_type: scan type (MRI modality or PET tracer)
    - rawf: full path to the nifti file in raw
    """
    raw_scans = glob(op.join(raw_dir, "**", "*.nii"), recursive=True)

    output = []
    for scanf in raw_scans:
        subj = _get_subj(scanf)
        scan_date = _get_scan_date(scanf)
        scan_type = _get_scan_type(scanf)

        # Check if any are None
        output.append([subj, scan_date, scan_type, scanf])

    cols = ["subj", "scan_date", "scan_type", "rawf"]
    output = pd.DataFrame(output, columns=cols)

    if verbose:
        if len(output) == 0:
            print("No scans found in raw")
        else:
            print(
                f"Found {len(output)} scans from {output['subj'].nunique()} subjects in {raw_dir}"
            )
    return output


def _get_subj(filepath):
    """Return the subject ID from filepath to the recon'd nifti.

    Parameters
    ----------
    filepath : str
        The filepath to the reconstructed nifti.

    Returns
    -------
    subj : str
        The subject ID parsed from the input file basename.
    """
    try:
        subj = filepath.replace(PATHS["raw"] + "/", "").split("/")[1]
        if len(subj) > 0:
            return subj
        else:
            return None
    except IndexError:
        return None


def _get_scan_date(filepath):
    """Return the scan date from filepath to the recon'd nifti.

    Iterates over filepath directories from right to left until it finds
    a filename or directory whose first 10 characters matches the date
    format YYYY-MM-DD.

    Returns None if no scan date is found, otherwise a string like
    'YYYY-MM-DD'.
    """
    for d in filepath.split(op.sep)[::-1]:
        try:
            acqdate = datetime.datetime.strptime(d[:10], "%Y-%m-%d").strftime(
                "%Y-%m-%d"
            )
            return acqdate
        except ValueError:
            pass
    return None


def _get_scan_type(filepath, scan_type_map_file=None):
    """Parse the filepath and return the scan type."""
    if scan_type_map_file is None:
        scan_type_map_file = op.join(
            PATHS["metadata"], "ssheets", "scan_types_and_tracers.csv"
        )
    scan_type_map = (
        pd.read_csv(scan_type_map_file).set_index("name_in")["name_out"].to_dict()
    )
    basename = op.basename(filepath).lower()
    for k, v in scan_type_map.items():
        if k in basename:
            return v
    return None

In [12]:
# Parse every nifti file found under the raw directory
raw_scans = scrape_raw(PATHS["raw"])

# Template for where each raw nifti file is copied inside the processed
# directory: <processed>/<subj>/<scan_type>_<scan_date>/<subj>_<scan_type>_<scan_date>.nii
raw_cp_str = op.join(
    PATHS["processed"],
    "{subj}",
    "{scan_type}_{scan_date}",
    "{subj}_{scan_type}_{scan_date}.nii",
)

# Fill the template row by row. A list comprehension is used instead of
# DataFrame.apply(axis=1) because apply returns a DataFrame (not a Series)
# when the frame is empty, which breaks the column assignment when no scans
# are found. Fields that failed to parse are None and render as "None".
raw_scans["raw_cp"] = [
    raw_cp_str.format(subj=subj, scan_type=scan_type, scan_date=scan_date)
    for subj, scan_type, scan_date in zip(
        raw_scans["subj"], raw_scans["scan_type"], raw_scans["scan_date"]
    )
]

Found 12 scans from 1 subjects in /mnt/coredata/processing/leads/data/raw


In [16]:
# Show the processed-directory copy path computed for the last scan
raw_scans["raw_cp"].iloc[-1]

'/mnt/coredata/processing/leads/data/processed/LDS0370008/FBB_2021-11-16/LDS0370008_FBB_2021-11-16.nii'

In [None]:
# Take the first parsed scan as a sample row
row = raw_scans.iloc[0]
# Scratch (disabled): presumably meant to fill a path template from this row, e.g.
# .format(subj=row["subj"], scan_type=row["scan_type"], scan_date=row["scan_date"])

In [11]:
# Add processed file names to output
processed_filenames = {"MRI_T1": {"raw_cp": op.join(
    PATHS["processed"],
    "{subj}",
    "{scan_type}_{scan_date}",
    "{subj}_{scan_type}_{scan_date}.nii",
)},
"FBB": {"raw_cp": op.join(

'/mnt/coredata/processing/leads/data/processed/LDS0370008/MRI-T1_2020-10-29/LDS0370008_MRI-T1_2020-10-29.nii'

In [37]:
# Location of the spreadsheet that maps file-name substrings ("name_in")
# to canonical scan types ("name_out")
scan_type_map_file = op.join(
    PATHS["metadata"],
    "ssheets",
    "scan_types_and_tracers.csv",
)

# Load it as a plain dict keyed by name_in, then display it
scan_type_map = (
    pd.read_csv(scan_type_map_file)
    .set_index("name_in")
    .loc[:, "name_out"]
    .to_dict()
)
scan_type_map

{'fbb': 'FBB',
 'florbetaben': 'FBB',
 'neuraceq': 'FBB',
 'amyvid': 'FBP',
 'av-45': 'FBP',
 'av45': 'FBP',
 'fbp': 'FBP',
 'florbetapir': 'FBP',
 'fdg': 'FDG',
 'fludeoxyglucose': 'FDG',
 'fluorodeoxyglucose': 'FDG',
 'flutametamol': 'FLUTE',
 'flute': 'FLUTE',
 'av-1451': 'FTP',
 'av1451': 'FTP',
 'flortaucipir': 'FTP',
 'ftp': 'FTP',
 'tauvid': 'FTP',
 'mk-6240': 'MK6240',
 'mk6240': 'MK6240',
 'ir-fspgr': 'MRI_T1',
 'mprage': 'MRI_T1',
 't1': 'MRI_T1',
 'azd-4694': 'NAV',
 'azd4694': 'NAV',
 'flutafuranol': 'NAV',
 'nav-4694': 'NAV',
 'nav4694': 'NAV',
 'pi-2620': 'PI2620',
 'pi2620': 'PI2620',
 'pib': 'PIB',
 'pittsburgh compound b': 'PIB',
 'pittsburgh compound-b': 'PIB'}

0

In [11]:
# NOTE(review): `new_scans` is not assigned in any cell visible here; presumably
# it is left over from an earlier kernel session. Confirm this cell still works
# after Restart Kernel -> Run All.
new_scans

['/mnt/coredata/processing/leads/data/newdata/mri/LDS0370008/Accelerated_Sagittal_MPRAGE/2020-10-29_10_57_25.0/I1357578/I1357578_Accelerated_Sagittal_MPRAGE_20201029105725_2.nii',
 '/mnt/coredata/processing/leads/data/newdata/mri/LDS0370008/Accelerated_Sagittal_MPRAGE/2019-09-19_08_04_52.0/I1228542/I1228542_Accelerated_Sagittal_MPRAGE_20190919080453_2.nii',
 '/mnt/coredata/processing/leads/data/newdata/mri/LDS0370008/Accelerated_Sagittal_MPRAGE/2021-11-03_09_08_21.0/I1511888/I1511888_Accelerated_Sagittal_MPRAGE_20211103090821_2.nii',
 '/mnt/coredata/processing/leads/data/newdata/mri/LDS0370008/Accelerated_Sagittal_MPRAGE/2018-08-15_11_08_29.0/I1035748/I1035748_Accelerated_Sagittal_MPRAGE_20180815110830_2.nii',
 '/mnt/coredata/processing/leads/data/newdata/ftp/LDS0370008/AV1451_Coreg,_Avg,_Std_Img_and_Vox_Siz,_Uniform_6mm_Res/2020-10-29_16_34_02.0/I1778293/I1778293_AV1451_Coreg,_Avg,_Std_Img_and_Vox_Siz,_Uniform_6mm_Res_20201029163402_5.nii',
 '/mnt/coredata/processing/leads/data/newdat