In [1]:
import os
import os.path as op
import datetime
from glob import glob
import numpy as np
import pandas as pd
import shutil

In [2]:
# Define directory paths
# Every project location hangs off PATHS["proj"]; later cells look paths up by key.
PATHS = {
    "proj": "/mnt/coredata/processing/leads",
}
PATHS["data"] = op.join(PATHS["proj"], "data")
PATHS["metadata"] = op.join(PATHS["proj"], "metadata")
PATHS["newdata"] = op.join(PATHS["proj"], "newdata")
PATHS["raw"] = op.join(PATHS["data"], "raw")
PATHS["processed"] = op.join(PATHS["data"], "processed")
# The MRI-matching cell reads PATHS["freesurfer"], which was never defined in
# this config cell (KeyError on a fresh kernel).
# NOTE(review): location inferred from the fs_dir values displayed for
# raw_scans (".../leads/data/freesurfer..."); confirm the exact directory name.
PATHS["freesurfer"] = op.join(PATHS["data"], "freesurfer")

In [3]:
def scrape_raw(raw_dir, verbose=True):
    """Find every .nii file under raw_dir and parse its path.

    Parameters
    ----------
    raw_dir : str
        Top-level raw directory; searched recursively for "*.nii".
    verbose : bool
        If True, print how many scans and subjects were found.

    Returns
    -------
    output : pandas.DataFrame
        One row per nifti file, with columns:
        - subj: subject ID
        - scan_date: scan date (YYYY-MM-DD)
        - scan_type: scan type (MRI modality or PET tracer)
        - raw_petf: full path to the nifti file in raw
    """
    nii_pattern = op.join(raw_dir, "**", "*.nii")
    records = [
        [_get_subj(niif, raw_dir), _get_scan_date(niif), _get_scan_type(niif), niif]
        for niif in glob(nii_pattern, recursive=True)
    ]
    output = pd.DataFrame(
        records, columns=["subj", "scan_date", "scan_type", "raw_petf"]
    )

    # LONI files don't save "FDG" in the filename, so any scan whose type is
    # still unknown is labelled FDG when its path contains <raw_dir>/fdg
    fdg_root = op.join(raw_dir, "fdg")
    untyped = output["scan_type"].isnull()
    output.loc[untyped, "scan_type"] = output.loc[untyped, "raw_petf"].apply(
        lambda path: "FDG" if (fdg_root in path) else np.nan
    )

    if verbose:
        n_scans = len(output)
        if n_scans == 0:
            print("No scans found in raw")
        else:
            print(
                f"Found {n_scans} scans from {output['subj'].nunique()} subjects in {raw_dir}"
            )
    return output


def _get_subj(filepath, raw_dir):
    """Return the subject ID from filepath to the recon'd nifti.

    Parameters
    ----------
    filepath : str
        The filepath to the reconstructed nifti.

    Returns
    -------
    subj : str
        The subject ID parsed from the input file basename.
    """
    try:
        subj = filepath.replace(raw_dir + "/", "").split("/")[1]
        if len(subj) > 0:
            return subj
        else:
            return np.nan
    except IndexError:
        return np.nan


def _get_scan_date(filepath):
    """Return the scan date parsed from the path to the recon'd nifti.

    Path components are checked from the filename back toward the root;
    the first one whose leading 10 characters parse as YYYY-MM-DD wins.

    Parameters
    ----------
    filepath : str
        The filepath to the reconstructed nifti.

    Returns
    -------
    acqdate : str
        The date string 'YYYY-MM-DD', or np.nan if no component starts
        with a date.
    """
    for component in reversed(filepath.split(op.sep)):
        try:
            return check_dt_fmt(component[:10], raise_error=True)
        except ValueError:
            # Not a date; keep walking toward the root
            continue
    return np.nan


def check_dt_fmt(datestr, raise_error=False):
    """Return datestr if formatted like YYYY-MM-DD.

    Parameters
    ----------
    datestr : str
        The candidate date string. Non-string values (e.g. np.nan) are
        treated as badly formatted rather than raising TypeError.
    raise_error : bool
        If True, raise a ValueError if datestr is not formatted like
        YYYY-MM-DD. Otherwise return np.nan.

    Returns
    -------
    datestr : str
        The input string, unchanged, if it parses; np.nan otherwise
        (when raise_error is False).

    Raises
    ------
    ValueError
        If datestr does not parse and raise_error is True.
    """
    try:
        datestr_to_datetime(datestr)
    except (TypeError, ValueError):
        # strptime raises TypeError for non-strings and ValueError for
        # strings that don't match the format
        if raise_error:
            raise ValueError(f"{datestr} is not formatted like YYYY-MM-DD")
        return np.nan
    return datestr


def datestr_to_datetime(datestr):
    """Parse a 'YYYY-MM-DD' date string into a datetime.datetime.

    Raises whatever datetime.datetime.strptime raises when datestr does
    not match the format.
    """
    parsed = datetime.datetime.strptime(datestr, "%Y-%m-%d")
    return parsed


def datetime_to_datestr(dt):
    """Format a datetime object as a 'YYYY-MM-DD' date string."""
    datestr = dt.strftime("%Y-%m-%d")
    return datestr


def _get_scan_type(filepath, scan_type_map_file=None):
    """Parse the filepath and return the scan type."""
    if scan_type_map_file is None:
        scan_type_map_file = op.join(
            PATHS["metadata"], "ssheets", "scan_types_and_tracers.csv"
        )
    scan_type_map = (
        pd.read_csv(scan_type_map_file).set_index("name_in")["name_out"].to_dict()
    )
    basename = op.basename(filepath).lower()
    for k, v in scan_type_map.items():
        if k in basename:
            return v
    return np.nan


def date_diff(date1, date2, abs=False):
    """Return date2 - date1 in whole days.

    Parameters
    ----------
    date1, date2 : datetime-like
        Objects whose difference exposes a `.days` attribute.
    abs : bool
        If True, return the absolute number of days.

    Returns
    -------
    diff : int
        Signed (or absolute) day difference, or np.nan if the
        subtraction raises TypeError.
    """
    try:
        delta_days = (date2 - date1).days
        result = np.abs(delta_days) if abs else delta_days
    except TypeError:
        result = np.nan
    return result


def find_closest_mri(
    subj, scan_date, freesurfer_dir, limit_days=365, strict_limit=False
):
    """Return closest MRI date, days from PET, and Freesurfer path.

    Parameters
    ----------
    subj : str
        The subject ID.
    scan_date : str
        Scan date (YYYY-MM-DD) to match the closest MRI scan to.
    freesurfer_dir : str
        Path to the top-level freesurfer directory containing individual
        processed MRI directories like <subj>_<scan_date>.
    limit_days : int
        A warning is printed if the closest MRI scan is more than
        limit_days away from scan_date.
    strict_limit : bool
        If True, np.nan values are returned when the closest MRI scan is
        more than limit_days away. If False, the closest MRI is returned
        regardless (the warning is still printed).

    Returns
    -------
    closest_mri_date : str
        Date (YYYY-MM-DD) of the closest processed MRI.
    min_days : int
        Absolute number of days between scan_date and the closest MRI.
    closest_mri : str
        Path to the closest MRI's directory in freesurfer_dir.

    All three values are np.nan if the subject has no processed MRI,
    if scan_date or every MRI directory date fails to parse, or if
    strict_limit is True and the closest MRI is beyond limit_days.
    """
    no_match = (np.nan, np.nan, np.nan)

    proc_mris = glob(op.join(freesurfer_dir, f"{subj}_*"))
    if len(proc_mris) == 0:
        print(
            f"WARNING: {subj} scan on {scan_date} has no processed MRI scans in {freesurfer_dir}"
        )
        return no_match

    # A missing or malformed scan date can't be matched to anything; bail
    # out instead of letting strptime raise on it
    try:
        scan_dt = datestr_to_datetime(scan_date)
    except (TypeError, ValueError):
        print(f"WARNING: {subj} scan has an unparseable scan date: {scan_date}")
        return no_match

    # Collect (days_from_scan, mri_date, mri_dir) for every directory whose
    # name carries a parseable date after the first underscore
    candidates = []
    for mri_dir in proc_mris:
        try:
            mri_date = op.basename(mri_dir).split("_")[1]
            mri_dt = datestr_to_datetime(mri_date)
        except (IndexError, TypeError, ValueError):
            continue
        candidates.append((date_diff(mri_dt, scan_dt, abs=True), mri_date, mri_dir))

    if len(candidates) == 0:
        print(
            f"WARNING: {subj} scan on {scan_date} has no processed MRI scans with a parseable date in {freesurfer_dir}"
        )
        return no_match

    # min() keeps the first of any ties, matching np.argmin
    min_days, closest_mri_date, closest_mri = min(candidates, key=lambda c: c[0])

    if min_days > limit_days:
        print(
            f"WARNING: {subj} scan on {scan_date} has no matching MRI within {limit_days} days"
        )
        if strict_limit:
            return no_match

    return closest_mri_date, min_days, closest_mri


def now():
    """Return the current local date and time as 'YYYY-MM-DD-HH-MM-SS'."""
    timestamp = datetime.datetime.now()
    return timestamp.strftime("%Y-%m-%d-%H-%M-%S")


def glob_sort_mtime(pattern):
    """Return files matching pattern, most recently modified first.

    Parameters
    ----------
    pattern : str
        Glob pattern to match.

    Returns
    -------
    files : list of str
        Matching paths ordered by modification time, newest first
        (files[0] is the most recently modified). Empty if nothing
        matches.
    """
    files = glob(pattern)
    files.sort(key=op.getmtime, reverse=True)
    return files

In [4]:
# Load the most recently modified raw_pet_scans_*.csv log and keep only the
# rows that have no missing values
raw_scans = (
    pd.read_csv(
        glob_sort_mtime(op.join(PATHS["metadata"], "log", "raw_pet_scans_*.csv"))[0]
    )
    .dropna()
    .reset_index(drop=True)
)

print(f"raw_scans: {raw_scans.shape}")

raw_scans: (2067, 12)


In [91]:
# Every directory directly under raw except the "mri" directory
search_raw_dirs = list(
    filter(
        lambda path: op.isdir(path) and path.split(op.sep)[-1] != "mri",
        glob(op.join(PATHS["raw"], "*")),
    )
)
search_raw_dirs

['/mnt/coredata/processing/leads/data/raw/fbb',
 '/mnt/coredata/processing/leads/data/raw/fdg',
 '/mnt/coredata/processing/leads/data/raw/ftp']

In [173]:
# Scrape the raw directory for all PET niftis
raw_scans = scrape_raw(PATHS["raw"])

# Get paths for the raw nifti files copied into the processed directory
# raw_cp_str = op.join(
#     PATHS["processed"],
#     "{subj}",
#     "{scan_type}_{scan_date}",
#     "{subj}_{scan_type}_{scan_date}.nii",
# )

# Search processed Freesurfer dirs for closest MRI to each PET scan.
# find_closest_mri returns a (mri_date, days_to_mri, fs_dir) triple per scan;
# zip(*...) transposes the per-scan triples into three per-column sequences.
raw_scans["mri_date"], raw_scans["days_to_mri"], raw_scans["fs_dir"] = zip(
    *[
        find_closest_mri(subj, scan_date, PATHS["freesurfer"])
        for subj, scan_date in zip(raw_scans["subj"], raw_scans["scan_date"])
    ]
)

print(f"raw_scans: {raw_scans.shape}")

Found 2069 scans from 619 subjects in /mnt/coredata/processing/leads/data/raw
raw_scans: (2069, 7)


In [22]:
# Recover the old FreeSurfer scan directory from the nu.nii symlink in an old
# processed MRI directory (two levels above the link target)
fname = op.join(
    "/mnt/coredata/Projects/LEADS/data_f7p1/processed/LDS0370001/Timepoint2/MRI_T1_2020-10-14",
    "LDS0370001_MRI_T1_2020-10-14_nu.nii",
)
op.islink(fname)
link_target = os.readlink(fname)
# os.readlink may return a relative path, which is relative to the directory
# holding the link, not to the current working directory
if not op.isabs(link_target):
    link_target = op.join(op.dirname(fname), link_target)
freesurfer_scan_dir = op.dirname(op.dirname(op.normpath(link_target)))

'/mnt/coredata/Projects/LEADS/data_f7p1/freesurfer_processing/LDS0370001_MRI_T1_2020-10-14'

In [174]:
# # Add processed MRI directories to raw_scans
# proc_dir_old = "/mnt/coredata/Projects/LEADS/data_f7p1/processed"
# mri_dirs = []
# proc_mri_dirs_old = []
# freesurfer_dirs_old = []
# for idx, scan in raw_scans.iterrows():
#     if scan["fs_dir"] is np.nan:
#         mri_dirs.append(np.nan)
#         continue

#     subj = scan["subj"]
#     mri_date = scan["fs_dir"].split("_")[1]
#     proc_mri_dir_new = op.join(PATHS["processed"], subj, f"MRI-T1_{mri_date}")

#     # Find the old processed MRI and FreeSurfer directories
#     max_tp = 6
#     for tp in range(1, max_tp + 1):
#         proc_mri_dir_old = op.join(
#             proc_dir_old, subj, f"Timepoint{tp}", f"MRI_T1_{mri_date}"
#         )
#         if op.isdir(proc_mri_dir_old):
#             os.makedirs(op.dirname(proc_mri_dir_new), exist_ok=True)
#             # if not op.exists(proc_mri_dir_new):
#             #     os.symlink(proc_mri_dir_old, proc_mri_dir_new)
#             if op.islink(proc_mri_dir_new):
#                 os.unlink(proc_mri_dir_new)
#             os.makedirs(proc_mri_dir_new, exist_ok=True)
#             proc_mri_dirs_old.append(proc_mri_dir_old)

#             # Find the FreeSurfer directory from the nu.nii symlink
#             nu_old = op.join(proc_mri_dir_old, f"{subj}_MRI_T1_{mri_date}_nu.nii")
#             if op.islink(nu_old):
#                 freesurfer_dir_old = op.dirname(
#                     op.dirname(op.abspath(os.readlink(fname)))
#                 )
#                 freesurfer_dir_new = op.join(proc_mri_dir_new, "freesurfer_7p1")
#                 freesurfer_link_new = op.join(proc_mri_dir_new, "freesurfer")
#                 shutil.copytree(freesurfer_scan_dir, proc_mri_dir_new)
#             break
#         if tp == max_tp:
#             proc_mri_dir_old = np.nan

#     # Add the processed MRI directory to raw_scans["mri_dir"]
#     if op.exists(proc_mri_dir_new):
#         mri_dirs.append(proc_mri_dir_new)
#     else:
#         mri_dirs.append(np.nan)

# raw_scans["mri_dir"] = mri_dirs

In [176]:
# Drop raw_scans rows with missing data, then sort rows.
raw_scans = raw_scans.dropna()
raw_scans = raw_scans.sort_values(["subj", "scan_type", "scan_date"])
raw_scans = raw_scans.reset_index(drop=True)

# Convert days_to_mri to int
raw_scans["days_to_mri"] = raw_scans["days_to_mri"].astype(int)

# Number each subject's scans of a given type in date order: visit 1 is the
# earliest scan_date, visit 2 the next earliest, and so on. This relies on
# the rows having just been sorted by scan_date within subj and scan_type.
raw_scans["visit"] = raw_scans.groupby(["subj", "scan_type"]).cumcount() + 1

# Place the visit column immediately after scan_type
cols = [col for col in raw_scans.columns if col != "visit"]
cols.insert(cols.index("scan_type") + 1, "visit")
raw_scans = raw_scans[cols]

print(f"raw_scans: {raw_scans.shape}")

raw_scans: (2067, 9)


In [177]:
# Add a diagnosis column to raw_scans
dxf = op.join(PATHS["metadata"], "ssheets", "LEADS_Internal_PET-Screening.xlsx")
if op.isfile(dxf):
    dx = pd.read_excel(dxf)
    dx_map = {"ID": "subj", "Cohort": "dx"}
    dx = dx.rename(columns=dx_map)[["subj", "dx"]]
    # Right merge keeps every scan row, with dx missing for unlisted subjects
    raw_scans = dx.merge(raw_scans, on="subj", how="right")
if "dx" not in raw_scans.columns:
    # Screening spreadsheet not found: start from an all-missing dx column so
    # the control labelling below still works
    raw_scans["dx"] = pd.Series(np.nan, index=raw_scans.index, dtype=object)

# Add controls.
subj_regf = op.join(
    PATHS["metadata"], "ssheets", "Participant Registration_vertical.csv"
)
# Stays empty if the registration file is missing (previously a NameError)
cn_subjs = []
if op.isfile(subj_regf):
    subj_reg = pd.read_csv(subj_regf)
    subj_reg_map = {"subject.label": "subj", "dd_revision_field.translated_value": "dx"}
    subj_reg = subj_reg.rename(columns=subj_reg_map)[["subj", "dx"]]
    cn_subjs = subj_reg.query("(dx=='Cognitively Normal Participant')")["subj"].tolist()

# Subjects still lacking a diagnosis are labelled CN if they are registered as
# cognitively normal; otherwise dx stays missing
missing_dx = pd.isna(raw_scans["dx"])
raw_scans.loc[missing_dx, "dx"] = raw_scans.loc[missing_dx, "subj"].apply(
    lambda x: "CN" if np.isin(x, cn_subjs) else np.nan
)

In [178]:
# Find PET scans that are ready and needing to be processed
# NOTE(review): this cell requires a raw_scans["mri_dir"] column; the only
# code in this notebook that creates it is in a commented-out cell.
proc_pet_dirs = []
need_to_process = []
for idx, scan in raw_scans.iterrows():
    # pd.isna rather than `is np.nan`: missing values coming out of pandas
    # are not guaranteed to be the np.nan singleton
    if pd.isna(scan["mri_dir"]):
        proc_pet_dirs.append(np.nan)
        need_to_process.append(False)
        continue

    proc_pet_dir = op.join(
        PATHS["processed"], scan["subj"], f"{scan['scan_type']}_{scan['scan_date']}"
    )
    proc_pet_dirs.append(proc_pet_dir)
    # A scan needs processing if its processed PET directory doesn't exist yet
    need_to_process.append(not op.exists(proc_pet_dir))

raw_scans["proc_pet_dir"] = proc_pet_dirs
raw_scans["need_to_process"] = need_to_process

In [77]:
# Write raw_scans to a new timestamped CSV in the metadata log directory
outf = op.join(PATHS["metadata"], "log", "raw_pet_scans_" + now() + ".csv")
raw_scans.to_csv(outf, index=False)
print(f"Saved raw_scans to {outf}")

Saved raw_scans to /mnt/coredata/processing/leads/metadata/log/raw_pet_scans_2024-04-22-19-12-02.csv


# Copy MRI files

In [89]:
from concurrent.futures import ThreadPoolExecutor

In [109]:
def copy_freesurfer(
    scan,
    rm_existing=False,
    fs_dir_old="/mnt/coredata/Projects/LEADS/data_f7p1/freesurfer_processing",
):
    """Copy FreeSurfer directory to the processed MRI directory.

    Copies <fs_dir_old>/<subj>_MRI_T1_<mri_date> to
    <mri_dir>/freesurfer_7p1 and points the symlink <mri_dir>/freesurfer
    at the copy. Top-level symlinks in the source directory are skipped;
    symlinks inside copied subdirectories are preserved as symlinks.

    Parameters
    ----------
    scan : dict or pandas.Series
        Must provide 'subj', 'mri_date', and 'mri_dir'.
    rm_existing : bool
        If True, remove an existing <mri_dir>/freesurfer_7p1 and copy
        again. If False, an existing copy is left untouched.
    fs_dir_old : str
        Directory holding the source FreeSurfer scan directories.

    Returns
    -------
    None
    """
    mri_dir = scan["mri_dir"]

    # Create the processed MRI directory, if it doesn't already exist.
    # A symlink at that path is replaced by a real directory.
    if op.islink(mri_dir):
        os.unlink(mri_dir)
    os.makedirs(mri_dir, exist_ok=True)

    fs_scan_dir_old = op.join(fs_dir_old, f"{scan['subj']}_MRI_T1_{scan['mri_date']}")
    fs_scan_dir_new = op.join(mri_dir, "freesurfer_7p1")
    fs_scan_link_new = op.join(mri_dir, "freesurfer")

    # Nothing to copy if the source FreeSurfer directory is missing
    if not op.isdir(fs_scan_dir_old):
        return
    if op.isdir(fs_scan_dir_new) and rm_existing:
        shutil.rmtree(fs_scan_dir_new)
    # An existing copy is kept as-is
    if op.isdir(fs_scan_dir_new):
        return

    # Copy every top-level entry that is not itself a symlink
    entries = [
        f
        for f in os.listdir(fs_scan_dir_old)
        if not op.islink(op.join(fs_scan_dir_old, f))
    ]
    if len(entries) > 0:
        os.makedirs(fs_scan_dir_new, exist_ok=True)
        for f in entries:
            src = op.join(fs_scan_dir_old, f)
            dst = op.join(fs_scan_dir_new, f)
            if op.isdir(src):
                shutil.copytree(src, dst, symlinks=True)
            else:
                # copytree only accepts directories; plain files need copy2
                shutil.copy2(src, dst)

    # Add a symlink to the new freesurfer directory, but only if something
    # was actually copied (otherwise the link would dangle)
    if op.isdir(fs_scan_dir_new):
        if op.islink(fs_scan_link_new):
            os.unlink(fs_scan_link_new)
        os.symlink(fs_scan_dir_new, fs_scan_link_new)

In [112]:
from concurrent.futures import ThreadPoolExecutor

# Copy one FreeSurfer directory per unique (subject, MRI date) in parallel
max_workers = 16
mris_to_copy = raw_scans.drop_duplicates(["subj", "mri_date"]).to_dict(orient="records")
print(f"Copying FreeSurfer directories for {len(mris_to_copy)} MRIs")
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    copy_jobs = [
        (scan, executor.submit(copy_freesurfer, scan)) for scan in mris_to_copy
    ]

# All jobs have finished once the executor block exits. Report any failures;
# with a bare executor.map() whose results are never read, exceptions raised
# in the workers are silently discarded.
copy_failures = [
    (scan, job.exception()) for scan, job in copy_jobs if job.exception() is not None
]
for scan, exc in copy_failures:
    print(
        f"WARNING: failed to copy FreeSurfer directory for {scan['subj']} MRI on {scan['mri_date']}: {exc!r}"
    )
if copy_failures:
    print(f"{len(copy_failures)} of {len(mris_to_copy)} FreeSurfer copies failed")

Copying FreeSurfer directories for 1092 MRIs


In [67]:
# Copy FreeSurfer directories for a single subject (useful for spot checks).
# This reuses copy_freesurfer() instead of repeating its body inline.
cp_freesurfer = True
# Kept for reference: copy_freesurfer() reads from this same location
fs_dir_old = "/mnt/coredata/Projects/LEADS/data_f7p1/freesurfer_processing"
for idx, scan in raw_scans.drop_duplicates(["subj", "mri_date"]).iterrows():
    if scan["subj"] != "LDS0370008":
        continue

    if cp_freesurfer:
        # Creates the processed MRI directory, copies the FreeSurfer
        # directory into it (unless a copy already exists), and adds the
        # "freesurfer" symlink
        copy_freesurfer(scan)
    else:
        # Only create the processed MRI directory, if it doesn't already exist
        if op.islink(scan["mri_dir"]):
            os.unlink(scan["mri_dir"])
        os.makedirs(scan["mri_dir"], exist_ok=True)

# Consolidate dirs in raw

In [67]:
# Names of the scan-type subdirectories (not referenced again in this cell)
subdirs = ["fbb", "fdg", "ftp", "mri"]
# for d in subdirs:
#     os.makedirs(op.join(PATHS["processed"], d), exist_ok=True)
# Display the raw directory path
PATHS["raw"]

'/mnt/coredata/processing/leads/data/raw'

In [24]:
# Move raw scan directories from raw/<scan_type>/<subj> to raw/<subj>
# Only the known scan-type directories are walked. Walking every directory in
# raw would, on a second run, treat the already-moved raw/<subj> directories
# as scan-type directories and move their contents up another level.
# NOTE(review): names taken from `subdirs` in the preceding cell; confirm the
# list is complete.
scan_type_names = ("fbb", "fdg", "ftp", "mri")
scan_type_dirs = [
    op.join(PATHS["raw"], name)
    for name in scan_type_names
    if op.isdir(op.join(PATHS["raw"], name))
]
for scan_type_dir in scan_type_dirs:
    subj_dirs_old = [
        op.join(scan_type_dir, f)
        for f in os.listdir(scan_type_dir)
        if op.isdir(op.join(scan_type_dir, f))
    ]
    for subj_dir_old in subj_dirs_old:
        subj_dir_new = op.join(PATHS["raw"], op.basename(subj_dir_old))
        subj_scan_dirs_old = [
            op.join(subj_dir_old, f)
            for f in os.listdir(subj_dir_old)
            if op.isdir(op.join(subj_dir_old, f))
        ]
        # Make the new subject directory if it doesn't already exist
        if subj_scan_dirs_old:
            os.makedirs(subj_dir_new, exist_ok=True)
        for subj_scan_dir_old in subj_scan_dirs_old:
            subj_scan_dir_new = op.join(subj_dir_new, op.basename(subj_scan_dir_old))
            # Don't clobber (or crash on) a scan directory that is already there
            if op.exists(subj_scan_dir_new):
                print(
                    f"Skipping {subj_scan_dir_old}: {subj_scan_dir_new} already exists"
                )
                continue
            # Move the scan directory to its new location
            shutil.move(subj_scan_dir_old, subj_scan_dir_new)

In [64]:
def move_newdata_to_raw(
    newdata_dir, raw_dir, overwrite=False, cleanup=True, verbose=True
):
    """Move scans from newdata to raw, keeping file hierarchies intact.

    Every directory in newdata_dir that directly contains a nifti or
    dicom file is moved (with all of its contents) to the same relative
    location in raw_dir. Scan files sitting directly in newdata_dir are
    not moved, since they have no <subj> directory to map to.

    Parameters
    ----------
    newdata_dir : str
        The directory containing the new scan data. Must format like:
            <newdata_dir>/<subj>/<...>/<nifti_or_dicom_files>
    raw_dir : str
        The directory to move the new scan data to. Structure after
        moving will be:
            <raw_dir>/<subj>/<...>/<nifti_or_dicom_files>
    overwrite : bool
        If True, overwrite existing scan directories in raw_dir with
        directories from newdata_dir. If False, skip existing
        directories.
    cleanup : bool
        If True, remove all files and folders from newdata_dir after
        moving everything eligible to be moved to raw_dir. This
        includes directories that were skipped because they already
        exist in raw_dir. If False, newdata_dir is never emptied.
    verbose : bool
        If True, print messages about what is happening as the function
        runs.

    Returns
    -------
    None
    """

    def do_cleanup():
        """Remove all files and folders from newdata."""
        if verbose:
            print(f"  Cleaning up {newdata_dir}")
        for file in os.listdir(newdata_dir):
            filepath = op.join(newdata_dir, file)
            if op.isdir(filepath):
                shutil.rmtree(filepath)
            else:
                os.remove(filepath)

    # Ensure the base directory paths are absolute and normalized
    newdata_dir = op.abspath(newdata_dir)
    raw_dir = op.abspath(raw_dir)

    # Find all nifti and dicom files in newdata
    check_exts = (".nii", ".nii.gz", ".IMA", ".dcm")
    glob_files = []
    for ext in check_exts:
        glob_files.extend(glob(op.join(newdata_dir, f"**/*{ext}"), recursive=True))

    if verbose:
        title = "Moving newdata to raw"
        print(title, "-" * len(title), sep="\n")
    if len(glob_files) == 0:
        if verbose:
            print(f"  No nifti or dicom files found in {newdata_dir}")
        # Respect cleanup=False even when there is nothing to move
        if cleanup:
            do_cleanup()
        return

    # Find all unique nifti- or dicom-containing directories in newdata
    source_dirs = {op.dirname(f) for f in glob_files}
    if verbose:
        print(
            f"  Found {len(source_dirs)} nifti- or dicom-containing directories in {newdata_dir}"
        )

    # Process parents before their subdirectories (shallowest first) so the
    # result doesn't depend on set iteration order: moving a parent takes its
    # subdirectories with it, and those are then skipped below
    for source_dir in sorted(source_dirs, key=lambda d: (d.count(op.sep), d)):
        rel_dir = op.relpath(source_dir, newdata_dir)

        # Files directly in newdata would map onto raw_dir itself; never
        # move (or, with overwrite, delete) the whole raw directory
        if rel_dir == op.curdir:
            if verbose:
                print(
                    f"  Skipping scan files found directly in {newdata_dir} (expected <subj>/... subdirectories)"
                )
            continue

        # Already moved along with a parent directory
        if not op.isdir(source_dir):
            continue

        # Create a matching file hierarchy in raw as in newdata
        target_dir = op.join(raw_dir, rel_dir)

        # Check if the target directory exists
        if op.exists(target_dir):
            # If overwrite is True, remove the existing directory
            if overwrite:
                if verbose:
                    print(f"  Overwriting existing raw directory: {target_dir}")
                shutil.rmtree(target_dir)
            else:
                if verbose:
                    print(f"  Skipping existing raw directory: {target_dir}")
                continue

        # Create the necessary directory structure, then move source to
        # target
        os.makedirs(op.dirname(target_dir), exist_ok=True)
        shutil.move(source_dir, target_dir)
        if verbose:
            print(f"  Moved {source_dir} to {target_dir}")

    # Remove everything that is left in newdata
    if cleanup:
        do_cleanup()

In [66]:
# Trial run against scratch copies of newdata and raw.
# With cleanup=True everything left in newdata_dir is deleted afterwards.
newdata_dir = "/home/mac/dschonhaut/tmp/leads/restructuring/newdata"
raw_dir = "/home/mac/dschonhaut/tmp/leads/restructuring/raw"

move_newdata_to_raw(newdata_dir, raw_dir, overwrite=False, cleanup=True, verbose=True)

Moving newdata to raw
---------------------
  Found 7 nifti- or dicom-containing directories in /home/mac/dschonhaut/tmp/leads/restructuring/newdata
  Skipping existing raw directory: /home/mac/dschonhaut/tmp/leads/restructuring/raw/b/aa
  Skipping existing raw directory: /home/mac/dschonhaut/tmp/leads/restructuring/raw/a
  Skipping existing raw directory: /home/mac/dschonhaut/tmp/leads/restructuring/raw/c/aa/aaa
  Skipping existing raw directory: /home/mac/dschonhaut/tmp/leads/restructuring/raw/c/aa/aab
  Skipping existing raw directory: /home/mac/dschonhaut/tmp/leads/restructuring/raw/b/ab
  Skipping existing raw directory: /home/mac/dschonhaut/tmp/leads/restructuring/raw/c/ab/aab
  Skipping existing raw directory: /home/mac/dschonhaut/tmp/leads/restructuring/raw/c/ab/aaa
  Cleaning up /home/mac/dschonhaut/tmp/leads/restructuring/newdata


# Setup PET directories

In [None]:
import general.basic.helper_funcs as hf
import general.basic.str_methods as strm

In [24]:
# Read the most recently modified raw_pet_scans_*.csv log
raw_scansf = glob_sort_mtime(
    op.join(PATHS["metadata"], "log", "raw_pet_scans_*.csv")
)[0]

# Remove rows with missing data, then drop the row at index 935
# NOTE(review): the reason for dropping row 935 is not recorded here, and the
# index depends on the exact contents of the log that was loaded - confirm.
raw_scans = (
    pd.read_csv(raw_scansf)
    .dropna()
    .reset_index(drop=True)
    .drop(935)
    .reset_index(drop=True)
)

# Fix raw PET filepaths: strip the scan-type directory level from the path
replace_vals = {f"raw/{tracer}": "raw" for tracer in ("fbb", "fdg", "ftp")}
raw_scans["raw_petf"] = raw_scans["raw_petf"].apply(
    lambda petf: strm.str_replace(petf, replace_vals)
)

print(f"reading {raw_scansf}")
print(f"raw_scans: {raw_scans.shape}")

reading /mnt/coredata/processing/leads/metadata/log/raw_pet_scans_2024-03-27-22-54-15.csv
raw_scans: (2065, 12)


In [49]:
# Fix raw PET filepaths: strip the scan-type directory level from the path
replace_vals = {f"raw/{tracer}": "raw" for tracer in ("fbb", "fdg", "ftp")}
raw_scans["raw_petf"] = raw_scans["raw_petf"].apply(
    lambda petf: strm.str_replace(petf, replace_vals)
)

In [50]:
# Flag, per scan, whether the raw PET file, the processed MRI directory, and
# the processed PET directory exist on disk
raw_scans["raw_petf_exists"] = raw_scans["raw_petf"].apply(op.isfile)
raw_scans["mri_dir_exists"] = raw_scans["mri_dir"].apply(op.isdir)
raw_scans["proc_pet_dir_exists"] = raw_scans["proc_pet_dir"].apply(op.isdir)

In [68]:
# Build a "<subj>_<scan_type>_<scan_date>" tag for every scan
raw_scans["scan_tag"] = [
    "_".join(tag_parts)
    for tag_parts in zip(
        raw_scans["subj"], raw_scans["scan_type"], raw_scans["scan_date"]
    )
]

In [69]:
# Preview the first rows of raw_scans
raw_scans.head()

Unnamed: 0,subj,dx,scan_date,scan_type,visit,raw_petf,mri_date,days_to_mri,fs_dir,mri_dir,proc_pet_dir,need_to_process,raw_petf_exists,mri_dir_exists,proc_pet_dir_exists,scan_tag
0,LDS0070120,CN,2019-06-19,FBB,1,/mnt/coredata/processing/leads/data/raw/LDS007...,2019-06-20,1,/mnt/coredata/processing/leads/data/freesurfer...,/mnt/coredata/processing/leads/data/processed/...,/mnt/coredata/processing/leads/data/processed/...,True,True,True,False,LDS0070120_FBB_2019-06-19
1,LDS0070120,CN,2021-07-14,FDG,1,/mnt/coredata/processing/leads/data/raw/LDS007...,2021-07-13,1,/mnt/coredata/processing/leads/data/freesurfer...,/mnt/coredata/processing/leads/data/processed/...,/mnt/coredata/processing/leads/data/processed/...,True,True,True,False,LDS0070120_FDG_2021-07-14
2,LDS0070120,CN,2019-06-20,FTP,1,/mnt/coredata/processing/leads/data/raw/LDS007...,2019-06-20,0,/mnt/coredata/processing/leads/data/freesurfer...,/mnt/coredata/processing/leads/data/processed/...,/mnt/coredata/processing/leads/data/processed/...,True,True,True,False,LDS0070120_FTP_2019-06-20
3,LDS0070166,EOAD,2019-08-22,FBB,1,/mnt/coredata/processing/leads/data/raw/LDS007...,2019-08-22,0,/mnt/coredata/processing/leads/data/freesurfer...,/mnt/coredata/processing/leads/data/processed/...,/mnt/coredata/processing/leads/data/processed/...,True,True,True,False,LDS0070166_FBB_2019-08-22
4,LDS0070166,EOAD,2020-09-16,FBB,2,/mnt/coredata/processing/leads/data/raw/LDS007...,2020-09-16,0,/mnt/coredata/processing/leads/data/freesurfer...,/mnt/coredata/processing/leads/data/processed/...,/mnt/coredata/processing/leads/data/processed/...,True,True,True,False,LDS0070166_FBB_2020-09-16


In [74]:
# Compare the total number of scans with the number flagged need_to_process
raw_scans.shape, raw_scans.query("(need_to_process==True)").shape

((2064, 16), (2064, 16))

In [76]:
import os
import os.path as op
import shutil


def setup_pet_proc_dirs(raw_scans=None, overwrite=False, verbose=True):
    """Create processed PET directories and link to associated MRIs.

    For each scan that needs to be processed, there must already be:
    1. A raw PET file in .nii format
    2. A processed MRI directory that will be linked to

    This function then does the following for each scan:
    1. Creates new processed PET directory
    2. Copies raw PET file to processed PET directory and renames it
    3. Creates symbolic link from processed PET directory to the
       associated MRI directory

    Scans with a missing (e.g. NaN) raw PET path, MRI directory, or
    processed PET directory are skipped.

    Parameters
    ----------
    raw_scans : DataFrame
        A pandas DataFrame with columns 'raw_petf', 'scan_tag',
        'mri_dir', 'proc_pet_dir', and 'need_to_process', which hold
        paths to raw PET .nii files, scan tags
        ("<subj>_<tracer>_<scan_date>"), processed MRI directories that
        will be used to process each PET scan, target directories for
        processed PET data that will be created by this function, and
        a flag selecting the scans to set up, respectively. If None,
        the most recently modified raw_pet_scans_*.csv in the metadata
        log directory is loaded.
    overwrite : bool, optional
        If True, overwrite existing processed PET directories if they
        exist. If False, skip scans with existing processed PET
        directories.
    verbose : bool, optional
        If True, output status messages during execution.

    Returns
    -------
    None
    """

    def _is_path(value):
        """Return True if value is a non-empty string (not NaN/None)."""
        return isinstance(value, str) and len(value) > 0

    # Print the welcome message (blank line, title, underline of equal length)
    if verbose:
        title = "Creating processed PET directories"
        print("", title, "-" * len(title), sep="\n")

    # Load the most recently saved raw_scans spreadsheet if not provided
    if raw_scans is None:
        raw_scansf = glob_sort_mtime(
            op.join(PATHS["metadata"], "log", "raw_pet_scans_*.csv")
        )[0]
        if verbose:
            print(f"  Reading {raw_scansf}")
        raw_scans = pd.read_csv(raw_scansf)

    # Filter scans that need to be processed
    raw_scans = raw_scans.query("(need_to_process==True)").reset_index(drop=True)
    if verbose:
        print(f"  {raw_scans.shape[0]} scans to process")

    # Loop over each scan and do directory setup
    for idx, scan in raw_scans.iterrows():
        # Make sure the raw PET file exists. The string check comes first
        # because op.isfile raises TypeError on NaN.
        if not _is_path(scan["raw_petf"]) or not op.isfile(scan["raw_petf"]):
            if verbose:
                print(
                    f"  Skipping {scan['scan_tag']} due to missing raw PET file: {scan['raw_petf']}"
                )
            continue
        elif not scan["raw_petf"].endswith(".nii"):
            if verbose:
                print(
                    f"  Skipping {scan['scan_tag']} as raw PET file does not end in .nii: {scan['raw_petf']}"
                )
            continue
        # Make sure the MRI directory exists
        if not _is_path(scan["mri_dir"]) or not op.isdir(scan["mri_dir"]):
            if verbose:
                print(
                    f"  Skipping {scan['scan_tag']} due to missing MRI directory: {scan['mri_dir']}"
                )
            continue
        # Make sure there is somewhere to put the processed data
        if not _is_path(scan["proc_pet_dir"]):
            if verbose:
                print(
                    f"  Skipping {scan['scan_tag']} due to missing processed PET directory path: {scan['proc_pet_dir']}"
                )
            continue
        # Remove existing processed PET directories if overwrite is True
        if op.isdir(scan["proc_pet_dir"]):
            if overwrite:
                if verbose:
                    print(
                        f"  Removing existing directory and its contents: {scan['proc_pet_dir']}"
                    )
                shutil.rmtree(scan["proc_pet_dir"])
            else:
                if verbose:
                    print(
                        f"  Skipping {scan['scan_tag']} due to existing processed PET directory: {scan['proc_pet_dir']}"
                    )
                continue

        # Create the processed PET directory
        os.makedirs(scan["proc_pet_dir"])

        # Copy the raw PET file to the processed PET directory, renamed to
        # <scan_tag>.nii
        infile = scan["raw_petf"]
        outfile = op.join(scan["proc_pet_dir"], f"{scan['scan_tag']}.nii")
        shutil.copy(infile, outfile)

        # Create a symlink to the processed MRI directory
        link_src = scan["mri_dir"]
        link_dst = op.join(scan["proc_pet_dir"], "mri")
        os.symlink(link_src, link_dst)

    if verbose:
        print("")

In [51]:
# Summarize, per scan type and visit, how many of the expected files and
# directories exist (hf.count_pct formats each column's tally)
raw_scans.groupby(["scan_type", "visit"]).agg(
    {
        exists_col: hf.count_pct
        for exists_col in ("raw_petf_exists", "mri_dir_exists", "proc_pet_dir_exists")
    }
)

Unnamed: 0_level_0,Unnamed: 1_level_0,raw_petf_exists,mri_dir_exists,proc_pet_dir_exists
scan_type,visit,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FBB,1,614/614 (100.0%),614/614 (100.0%),0/614 (0.0%)
FBB,2,238/238 (100.0%),238/238 (100.0%),0/238 (0.0%)
FBB,3,94/94 (100.0%),94/94 (100.0%),0/94 (0.0%)
FBB,4,26/26 (100.0%),26/26 (100.0%),0/26 (0.0%)
FDG,1,156/156 (100.0%),156/156 (100.0%),0/156 (0.0%)
FTP,1,604/604 (100.0%),604/604 (100.0%),0/604 (0.0%)
FTP,2,219/219 (100.0%),219/219 (100.0%),0/219 (0.0%)
FTP,3,84/84 (100.0%),84/84 (100.0%),0/84 (0.0%)
FTP,4,29/29 (100.0%),29/29 (100.0%),0/29 (0.0%)


In [52]:
# Count scans by processing flag, scan type, and visit
raw_scans.groupby(["need_to_process", "scan_type", "visit"]).size()

need_to_process  scan_type  visit
True             FBB        1        614
                            2        238
                            3         94
                            4         26
                 FDG        1        156
                 FTP        1        604
                            2        219
                            3         84
                            4         29
dtype: int64

In [56]:
# Show the processed PET directory of the first scan
raw_scans.iloc[0]["proc_pet_dir"]

'/mnt/coredata/processing/leads/data/processed/LDS0070120/FBB_2019-06-19'

# Copy mask files from old PET directories to MRI directories (work in progress)

In [33]:
# Unfinished: map of files in an old processed PET directory to their new
# names in the processed MRI directory, keyed by scan type.
# NOTE(review): pet_dir_old and mri_dir are only assigned inside the loop
# below, so this dict raises NameError on a fresh kernel. The repeated
# op.join(pet_dir_old, "") placeholder keys are identical, so they collapse to
# a single dict entry. Fill in real filenames and build the map per scan.
file_renaming_map = {
    "FBB": {
        op.join(pet_dir_old, "compwm_ref_mask.nii"): op.join(mri_dir, ""),
        op.join(pet_dir_old, ""): op.join(mri_dir, ""),
        op.join(pet_dir_old, ""): op.join(mri_dir, ""),
        op.join(pet_dir_old, ""): op.join(mri_dir, ""),
        op.join(pet_dir_old, ""): op.join(mri_dir, ""),
        op.join(pet_dir_old, ""): op.join(mri_dir, ""),
    },
    "FTP": {
        op.join(pet_dir_old, ""): op.join(mri_dir, ""),
        op.join(pet_dir_old, ""): op.join(mri_dir, ""),
    },
    "MRI-T1": {},
}

# copy mask files from PET to MRI proc dirs
# Only FBB and FTP scans are visited, in scan_date order
for idx, scan in (
    raw_scans.query("(scan_type==['FBB','FTP'])").sort_values("scan_date").iterrows()
):
    # Locate the old processed PET directory for this scan (first glob match)
    try:
        globstr = f"/mnt/coredata/Projects/LEADS/data_f7p1/processed/{scan['subj']}/Timepoint*/{scan['scan_type']}_{scan['scan_date']}"
        pet_dir_old = glob(globstr)[0]
    except IndexError:
        print(
            f"WARNING: {scan['subj']} {scan['scan_type']} scan on {scan['scan_date']} is missing a processed PET dir in {globstr}"
        )
        continue
    # Locate the old processed MRI directory matched to this scan
    # NOTE(review): the warning below reports scan_date, but the directory
    # searched for is named after mri_date.
    try:
        globstr = f"/mnt/coredata/Projects/LEADS/data_f7p1/processed/{scan['subj']}/Timepoint*/MRI_T1_{scan['mri_date']}"
        mri_dir_old = glob(globstr)[0]
    except IndexError:
        print(
            f"WARNING: {scan['subj']} MRI scan on {scan['scan_date']} is missing a processed MRI dir in {globstr}"
        )
        continue

    # copy mask files from PET to MRI proc dirs
    # NOTE(review): no copy is performed yet; mri_dir_old and
    # file_renaming_map are not used inside this loop.
    mri_dir = scan["mri_dir"]



In [13]:
# Inspect all scans for one subject
raw_scans.query("(subj=='LDS0370008')")

Unnamed: 0,subj,dx,scan_date,scan_type,visit,raw_petf,mri_date,days_to_mri,fs_dir,mri_dir,proc_pet_dir,need_to_process
680,LDS0370008,EOAD,2018-08-15,FBB,1,/mnt/coredata/processing/leads/data/raw/fbb/LD...,2018-08-15,0,/mnt/coredata/processing/leads/data/freesurfer...,/mnt/coredata/processing/leads/data/processed/...,/mnt/coredata/processing/leads/data/processed/...,True
681,LDS0370008,EOAD,2019-09-19,FBB,2,/mnt/coredata/processing/leads/data/raw/fbb/LD...,2019-09-19,0,/mnt/coredata/processing/leads/data/freesurfer...,/mnt/coredata/processing/leads/data/processed/...,/mnt/coredata/processing/leads/data/processed/...,True
682,LDS0370008,EOAD,2020-10-28,FBB,3,/mnt/coredata/processing/leads/data/raw/fbb/LD...,2020-10-29,1,/mnt/coredata/processing/leads/data/freesurfer...,/mnt/coredata/processing/leads/data/processed/...,/mnt/coredata/processing/leads/data/processed/...,True
683,LDS0370008,EOAD,2021-11-16,FBB,4,/mnt/coredata/processing/leads/data/raw/fbb/LD...,2021-11-03,13,/mnt/coredata/processing/leads/data/freesurfer...,/mnt/coredata/processing/leads/data/processed/...,/mnt/coredata/processing/leads/data/processed/...,True
684,LDS0370008,EOAD,2018-08-27,FTP,1,/mnt/coredata/processing/leads/data/raw/ftp/LD...,2018-08-15,12,/mnt/coredata/processing/leads/data/freesurfer...,/mnt/coredata/processing/leads/data/processed/...,/mnt/coredata/processing/leads/data/processed/...,True
685,LDS0370008,EOAD,2019-10-03,FTP,2,/mnt/coredata/processing/leads/data/raw/ftp/LD...,2019-09-19,14,/mnt/coredata/processing/leads/data/freesurfer...,/mnt/coredata/processing/leads/data/processed/...,/mnt/coredata/processing/leads/data/processed/...,True
686,LDS0370008,EOAD,2020-10-29,FTP,3,/mnt/coredata/processing/leads/data/raw/ftp/LD...,2020-10-29,0,/mnt/coredata/processing/leads/data/freesurfer...,/mnt/coredata/processing/leads/data/processed/...,/mnt/coredata/processing/leads/data/processed/...,True
687,LDS0370008,EOAD,2021-12-09,FTP,4,/mnt/coredata/processing/leads/data/raw/ftp/LD...,2021-11-03,36,/mnt/coredata/processing/leads/data/freesurfer...,/mnt/coredata/processing/leads/data/processed/...,/mnt/coredata/processing/leads/data/processed/...,True


In [157]:
# Count subjects per diagnosis (one row per subject), including missing dx
raw_scans.drop_duplicates("subj")["dx"].value_counts(dropna=False)

EOAD       396
EOnonAD    122
CN          99
NaN          1
Name: dx, dtype: int64

raw_scans: (2067, 12)


0