In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

## load and merge Clinica clinical data, BIDS participant files, and t1/flair/fmri path files

In these steps, we clean up the participant and image modality files created by Clinica. 

We ran Clinica in a subject-specific manner (one subject at a time) because this substantially sped up the process. Now that Clinica has been run on all subjects, we need to merge the per-subject BIDS output directories and files into a single directory and a single file of each type. 

In [None]:
# Combine every per-subject *_sessions.tsv written by Clinica into one table.

# BIDS root that holds one sub-* directory per participant
bids_root = Path("/N/project/statadni/20250922_Saige/adni_db/bids/participants/")

all_dfs = []
missing_subjects = []

# Visit the sub-* directories in sorted order
subject_dirs = sorted(p for p in bids_root.iterdir() if p.is_dir() and p.name.startswith("sub-"))
for sub_dir in subject_dirs:
    sessions_file = sub_dir / f"{sub_dir.name}_sessions.tsv"

    if sessions_file.exists():
        try:
            df = pd.read_csv(sessions_file, sep="\t")

            # Tag the rows with the subject only when the file carries no identifier column
            has_id_column = "participant_id" in df.columns or "subject" in df.columns
            if not has_id_column:
                df["participant_id"] = sub_dir.name

            all_dfs.append(df)
        except Exception as e:
            # Unreadable files are reported and skipped; the loop keeps going
            print(f"Error reading {sessions_file}: {e}")
    else:
        # No sessions file for this subject: remember it for the report below
        missing_subjects.append(sub_dir.name)

# Stack the per-subject tables; columns are the sorted union across files
if not all_dfs:
    print("No sessions.tsv files were successfully read.")
else:
    merged = pd.concat(all_dfs, ignore_index=True, sort=True)

    out_file = bids_root / "all_subjects_sessions.tsv"
    merged.to_csv(out_file, sep="\t", index=False)

    print(f"Saved merged file to: {out_file}")
    print(f"Shape: {merged.shape[0]} rows × {merged.shape[1]} columns")

# List the subjects whose *_sessions.tsv was absent, one per line
if missing_subjects:
    print(f"\nSubjects missing *_sessions.tsv ({len(missing_subjects)}):")
    print("\n".join(missing_subjects))

Saved merged file to: /N/project/statadni/20250922_Saige/adni_db/bids/participants/all_subjects_sessions.tsv
Shape: 11588 rows × 786 columns

Subjects missing *_sessions.tsv (124):
sub-ADNI002S5256
sub-ADNI003S0907
sub-ADNI003S0981
sub-ADNI003S1057
sub-ADNI003S4555
sub-ADNI005S0546
sub-ADNI005S0553
sub-ADNI006S1130
sub-ADNI007S0101
sub-ADNI007S0698
sub-ADNI007S1206
sub-ADNI010S0420
sub-ADNI014S0169
sub-ADNI014S0519
sub-ADNI014S0520
sub-ADNI014S0563
sub-ADNI016S0702
sub-ADNI016S1117
sub-ADNI016S1326
sub-ADNI018S0055
sub-ADNI018S0142
sub-ADNI020S1288
sub-ADNI021S0159
sub-ADNI021S0276
sub-ADNI021S0984
sub-ADNI022S0096
sub-ADNI022S0130
sub-ADNI022S1097
sub-ADNI022S1351
sub-ADNI022S1394
sub-ADNI023S0042
sub-ADNI023S0058
sub-ADNI023S0061
sub-ADNI023S0126
sub-ADNI023S0217
sub-ADNI023S0331
sub-ADNI023S0376
sub-ADNI023S0887
sub-ADNI023S0926
sub-ADNI023S1046
sub-ADNI024S0985
sub-ADNI027S0116
sub-ADNI027S0307
sub-ADNI027S0408
sub-ADNI027S0644
sub-ADNI027S0835
sub-ADNI027S1045
sub-ADNI029S1318
sub

In [None]:
# Root folder where all sub-*/ses-* live
# (same directory as `bids_root` in the cell above, written without the trailing slash);
# the merged output TSVs of the following cells are also written here.
participants_root = Path("/N/project/statadni/20250922_Saige/adni_db/bids/participants")

In [None]:
def extract_sub_ses(tsv_path: Path):
    """
    Return the subject and session directory names that enclose a TSV file.

    The file is expected to sit four levels below the subject directory:
    .../participants/sub-XXX/ses-YYY/conversion_invo/v0/file.tsv

    Parameters
    ----------
    tsv_path : Path
        Path to a TSV file inside a session's conversion folder.

    Returns
    -------
    tuple of (str, str)
        ``(sub_id, ses_id)``: the names of the fourth and third enclosing
        directories, e.g. ``("sub-XXX", "ses-YYY")``.

    Raises
    ------
    IndexError
        If the path has fewer than four ancestors.
    """
    ancestors = tsv_path.parents
    # ancestors[0] is v0, [1] the conversion folder, [2] ses-YYY, [3] sub-XXX
    return ancestors[3].name, ancestors[2].name

In [None]:
# Gather every per-session participants.tsv and stack them into one table that
# carries explicit participant/session ID columns taken from the directory names.
# NOTE(review): the folder is spelled "conversion_invo" here; confirm this matches
# the on-disk name (possibly "conversion_info"), otherwise the glob matches nothing.
pattern = "sub-*/ses-*/conversion_invo/v0/participants.tsv"
all_participants = []
count = 0  # number of files found, including ones that fail to parse

# Sort so the merged row order is reproducible (glob order depends on the filesystem)
for tsv_path in sorted(participants_root.glob(pattern)):
    count += 1
    try:
        df = pd.read_csv(tsv_path, sep="\t")
    except Exception as e:
        # Report and skip unreadable files instead of aborting the whole merge
        print("Error reading:", tsv_path, e)
        continue

    # Add participant and session IDs (derived from the enclosing directories)
    sub_id, ses_id = extract_sub_ses(tsv_path)
    df["participant_id"] = sub_id
    df["session_id"] = ses_id

    all_participants.append(df)

print(f"Read {len(all_participants)} of {count} participants.tsv files")

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that says what was searched for and where.
if not all_participants:
    raise ValueError(
        f"No readable participants.tsv matched {pattern!r} under {participants_root}"
    )

# Merge (columns are the sorted union across files)
merged_participants = pd.concat(all_participants, ignore_index=True, sort=True)

# Drop columns where ALL rows are NaN
merged_participants = merged_participants.dropna(axis=1, how="all")

# Save next to the subject directories
out_file = participants_root / "all_participants.tsv"
merged_participants.to_csv(out_file, sep="\t", index=False)

merged_participants.head(), merged_participants.shape

In [None]:
# Map each imaging modality label to the per-session path file to collect.
modalities = {
    "fmri":  "fmri_paths.tsv",
    "flair": "flair_paths.tsv",
    "t1":    "t1_paths.tsv"
}

all_rows = []

for modality, filename in modalities.items():
    # NOTE(review): the folder is spelled "conversion_invo"; confirm this matches
    # the on-disk name (possibly "conversion_info"), otherwise nothing is matched.
    pattern = f"sub-*/ses-*/conversion_invo/v0/{filename}"

    # Sort so the merged row order is reproducible (glob order depends on the filesystem)
    for tsv_path in sorted(participants_root.glob(pattern)):
        try:
            df = pd.read_csv(tsv_path, sep="\t")
        except Exception as e:
            # Report and skip unreadable files instead of aborting the whole merge
            print("Error reading:", tsv_path, e)
            continue

        # Tag every row with where it came from
        sub_id, ses_id = extract_sub_ses(tsv_path)
        df["participant_id"] = sub_id
        df["session_id"] = ses_id
        df["modality"] = modality

        all_rows.append(df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that says what was searched for and where.
if not all_rows:
    raise ValueError(
        f"No readable {sorted(modalities.values())} files found under {participants_root}"
    )

# Combine everything (columns are the sorted union across files)
merged_paths = pd.concat(all_rows, ignore_index=True, sort=True)

# Drop columns with all NaN
merged_paths = merged_paths.dropna(axis=1, how="all")

# Save next to the subject directories
out_file = participants_root / "all_modalities_paths.tsv"
merged_paths.to_csv(out_file, sep="\t", index=False)

merged_paths.head(), merged_paths.shape

## BIDS-ify the recovered T1w dicoms (where I manually ran dcm2niix successfully)

I manually went through missing_T1.csv and checked the DICOM directories to verify whether T1w DICOMs exist for the scan date that matches the scan date of the resting-state scan. 

I will try to script this process, but due to the inconsistent directory names, it is difficult. 

Out of the 76 subjects who were missing a T1w image (but had a resting-state scan), I was able to manually re-run dcm2niix for 49 subjects. 

These subjects need to be converted into BIDS format and moved from the DICOM directories into the BIDS directory. We also need to add a row to the t1_paths.tsv file for these subjects. 

In [65]:
# Load the hand-curated table of missing-T1w cases that were recovered with dcm2niix
# and still need to be copied into the BIDS tree.
m_t1 = pd.read_csv('/N/project/statadni/20250922_Saige/QC/missing_t1_saved_BIDSify.csv')

In [66]:
# Sanity check: (rows, columns) of the recovered-T1w table
m_t1.shape

(49, 6)

In [67]:
# Column names of the recovered-T1w table
m_t1.columns

Index(['Image_ID', 'Subject_ID', 'VISCODE', 'Path', 'JSON_path', 'Notes'], dtype='object')

In [144]:
# Split each DICOM path into its "/"-separated components, one column per component.
# The paths start with "/", so column 0 is the empty string; the later cells read
# columns 7, 8 and 10 to rebuild the search globs.
m_t1_path = m_t1['Path'].str.split(pat="/", expand=True)

In [146]:
import glob

In [158]:
# Spot-check one subject: list the .nii files two directory levels below its DICOM folder
! ls /N/project/statadni/20231212_ADR012021_UtahBackup/ImagingData/dicomUnzipped/ADNI3_MRI_fMRI_M/114_S_6039/*/*/*.nii

/N/project/statadni/20231212_ADR012021_UtahBackup/ImagingData/dicomUnzipped/ADNI3_MRI_fMRI_M/114_S_6039/Accelerated_Sagittal_MPRAGE/2017-07-21_12_13_00.0/2017-07-21_12_13_00.0_0_3.nii


In [168]:
# Preview the first rows of the recovered-T1w table
m_t1.head()

Unnamed: 0,Image_ID,Subject_ID,VISCODE,Path,JSON_path,Notes
0,879211,114_S_6039,bl,/N/project/statadni/20231212_ADR012021_UtahBac...,/N/project/statadni/20250922_Saige/adni_db/bid...,
1,896824,941_S_4365,m66,/N/project/statadni/20231212_ADR012021_UtahBac...,/N/project/statadni/20250922_Saige/adni_db/bid...,
2,223896,002_S_1261,m48,/N/project/statadni/20231212_ADR012021_UtahBac...,/N/project/statadni/20250922_Saige/adni_db/bid...,
3,233437,002_S_1280,m48,/N/project/statadni/20231212_ADR012021_UtahBac...,/N/project/statadni/20250922_Saige/adni_db/bid...,
4,180734,002_S_2010,bl,/N/project/statadni/20231212_ADR012021_UtahBac...,/N/project/statadni/20250922_Saige/adni_db/bid...,


In [219]:
# For every row of the split-path table, find the glob pattern that locates the NIfTI
# written by dcm2niix. Two layouts are probed for each candidate series name:
#   <root>/<col 7>/<col 8>/<series>/<col 10>/*.nii
#   <root>/<col 7>/<col 8>/<series>/*<col 10 digits up to the first ".">*.nii
# The matching *pattern* (not the resolved file name) is stored; the bash loop at the
# end of the notebook lets the shell expand it.
#
# Exactly one entry is appended per input row (None when nothing matched), so the
# positional concat with `m_t1` in the later cells stays row-aligned. Previously a row
# with zero or several hits shifted every following path onto the wrong subject.
#
# NOTE(review): `t1_names` (the candidate series directory names) is not defined in any
# visible cell; define it before this cell so the notebook runs on a fresh kernel.
dicom_root = "/N/project/statadni/20231212_ADR012021_UtahBackup/ImagingData/dicomUnzipped/"

data_nii = []
for i, row in m_t1_path.iterrows():
    subject_dir = dicom_root + row.loc[7] + "/" + row.loc[8] + "/"
    scan_dir = row.loc[10]
    # e.g. "2017-07-21_12_13_00.0" -> "20170721121300"
    stamp = scan_dir.split(".")[0].replace("-", "").replace("_", "")

    hits = []
    for name in t1_names:
        path = subject_dir + name + "/" + scan_dir + "/" + "*.nii"
        path_go = subject_dir + name + "/*" + stamp + "*.nii"
        for candidate in (path, path_go):
            if glob.glob(candidate):
                print(candidate)
                hits.append(candidate)

    if len(hits) != 1:
        # Surface ambiguous or missing matches instead of silently misaligning rows
        print(f"WARNING: row {i}: expected 1 matching pattern, found {len(hits)}")
    data_nii.append(hits[0] if hits else None)

/N/project/statadni/20231212_ADR012021_UtahBackup/ImagingData/dicomUnzipped/ADNI3_MRI_fMRI_M/114_S_6039/Accelerated_Sagittal_MPRAGE/2017-07-21_12_13_00.0/*.nii
/N/project/statadni/20231212_ADR012021_UtahBackup/ImagingData/dicomUnzipped/ADNI3_MRI_fMRI_M/941_S_4365/Accelerated_Sagittal_MPRAGE/2017-08-28_14_06_46.0/*.nii
/N/project/statadni/20231212_ADR012021_UtahBackup/ImagingData/dicomUnzipped/ADNIGO_MRI_fMRI_F/002_S_1261/MPRAGE/*20110314160431*.nii
/N/project/statadni/20231212_ADR012021_UtahBackup/ImagingData/dicomUnzipped/ADNIGO_MRI_fMRI_F/002_S_1280/MPRAGE/*20110504132634*.nii
/N/project/statadni/20231212_ADR012021_UtahBackup/ImagingData/dicomUnzipped/ADNIGO_MRI_fMRI_F/002_S_2010/MPRAGE/*20100624142128*.nii
/N/project/statadni/20231212_ADR012021_UtahBackup/ImagingData/dicomUnzipped/ADNIGO_MRI_fMRI_F/002_S_2010/MPRAGE/*20101022153121*.nii
/N/project/statadni/20231212_ADR012021_UtahBackup/ImagingData/dicomUnzipped/ADNIGO_MRI_fMRI_F/002_S_2010/MPRAGE/*20110122123155*.nii
/N/project/stat

In [220]:
# One-column frame of the NIfTI glob patterns, in the order they were appended
df_nii = pd.DataFrame(data_nii,columns=['nii_path'])

In [221]:
# For every row of the split-path table, find the glob pattern that locates the JSON
# sidecar written by dcm2niix. Two layouts are probed for each candidate series name:
#   <root>/<col 7>/<col 8>/<series>/<col 10>/*.json
#   <root>/<col 7>/<col 8>/<series>/*<col 10 digits up to the first ".">*.json
# The matching *pattern* (not the resolved file name) is stored.
#
# Exactly one entry is appended per input row (None when nothing matched), so the
# positional concat with `m_t1` in the later cells stays row-aligned. Previously a row
# with zero or several hits shifted every following path onto the wrong subject.
#
# NOTE(review): `t1_names` (the candidate series directory names) is not defined in any
# visible cell; define it before this cell so the notebook runs on a fresh kernel.
dicom_root = "/N/project/statadni/20231212_ADR012021_UtahBackup/ImagingData/dicomUnzipped/"

data_json = []
for i, row in m_t1_path.iterrows():
    subject_dir = dicom_root + row.loc[7] + "/" + row.loc[8] + "/"
    scan_dir = row.loc[10]
    # e.g. "2017-07-21_12_13_00.0" -> "20170721121300"
    stamp = scan_dir.split(".")[0].replace("-", "").replace("_", "")

    hits = []
    for name in t1_names:
        path = subject_dir + name + "/" + scan_dir + "/" + "*.json"
        path_go = subject_dir + name + "/*" + stamp + "*.json"
        for candidate in (path, path_go):
            if glob.glob(candidate):
                print(candidate)
                hits.append(candidate)

    if len(hits) != 1:
        # Surface ambiguous or missing matches instead of silently misaligning rows
        print(f"WARNING: row {i}: expected 1 matching pattern, found {len(hits)}")
    data_json.append(hits[0] if hits else None)

/N/project/statadni/20231212_ADR012021_UtahBackup/ImagingData/dicomUnzipped/ADNI3_MRI_fMRI_M/114_S_6039/Accelerated_Sagittal_MPRAGE/2017-07-21_12_13_00.0/*.json
/N/project/statadni/20231212_ADR012021_UtahBackup/ImagingData/dicomUnzipped/ADNI3_MRI_fMRI_M/941_S_4365/Accelerated_Sagittal_MPRAGE/2017-08-28_14_06_46.0/*.json
/N/project/statadni/20231212_ADR012021_UtahBackup/ImagingData/dicomUnzipped/ADNIGO_MRI_fMRI_F/002_S_1261/MPRAGE/*20110314160431*.json
/N/project/statadni/20231212_ADR012021_UtahBackup/ImagingData/dicomUnzipped/ADNIGO_MRI_fMRI_F/002_S_1280/MPRAGE/*20110504132634*.json
/N/project/statadni/20231212_ADR012021_UtahBackup/ImagingData/dicomUnzipped/ADNIGO_MRI_fMRI_F/002_S_2010/MPRAGE/*20100624142128*.json
/N/project/statadni/20231212_ADR012021_UtahBackup/ImagingData/dicomUnzipped/ADNIGO_MRI_fMRI_F/002_S_2010/MPRAGE/*20101022153121*.json
/N/project/statadni/20231212_ADR012021_UtahBackup/ImagingData/dicomUnzipped/ADNIGO_MRI_fMRI_F/002_S_2010/MPRAGE/*20110122123155*.json
/N/proje

In [222]:
# One-column frame of the JSON sidecar glob patterns, in the order they were appended
df_json = pd.DataFrame(data_json,columns=['json_path'])

In [223]:
# Pair each NIfTI pattern with its JSON pattern. The concat is positional (axis=1 aligns
# on the default integer index), so both frames must have the same number of rows;
# otherwise a NIfTI would be paired with another scan's sidecar.
assert len(df_nii) == len(df_json), (
    f"nii/json row counts differ: {len(df_nii)} vs {len(df_json)}"
)
df_nii_json = pd.concat([df_nii, df_json],axis=1)

In [224]:
# Attach the path patterns to the subject metadata. The concat is positional (axis=1
# aligns on the default integer index), so there must be exactly one path row per
# `m_t1` row; otherwise paths would be attached to the wrong subject/visit.
assert len(df_nii_json) == len(m_t1), (
    f"path rows ({len(df_nii_json)}) do not match m_t1 rows ({len(m_t1)})"
)
df_nii_json_t1 = pd.concat([df_nii_json, m_t1],axis=1)

In [225]:
# Write the combined table to the current working directory (header kept, index dropped).
# Column order is nii_path, json_path, then the m_t1 columns; the bash loop below
# depends on that order when it reads fields c1..c8.
df_nii_json_t1.to_csv('missing_T1w_bidsify_paths.csv', index=False)

Here is the bash script code to copy over the data using the CSV file created in the cell above:

```bash
# Copy every recovered T1w NIfTI into the BIDS tree as <sub>_<ses>_T1w.nii.gz.
# CSV fields: c1=nii_path (glob pattern), c2=json_path, c3=Image_ID, c4=Subject_ID,
#             c5=VISCODE, c6=Path, c7=JSON_path, c8=Notes
# NOTE(review): the JSON sidecar in c2 is not copied; add it if downstream tools need it.
bids_root=/N/project/statadni/20250922_Saige/adni_db/bids/participants

# tail -n +2 skips the CSV header row so it is not treated as a subject
tail -n +2 missing_T1w_bidsify_paths.csv | while IFS=, read -r c1 c2 c3 c4 c5 c6 c7 c8; do
    subid="sub-ADNI${c4//_/}"
    # The session label is the 10th "/"-separated field of JSON_path
    sesid=$(echo "${c7}" | awk -F / '{print $10}')
    anat_dir="${bids_root}/${subid}/${sesid}/anat"

    # c1 is a glob pattern, so it is deliberately left unquoted to let the shell expand it
    matches=( $c1 )
    if [[ -z "${sesid}" || ${#matches[@]} -ne 1 || ! -f "${matches[0]}" ]]; then
        echo "Skipping ${subid} ${sesid}: expected exactly one NIfTI for '${c1}'" >&2
        continue
    fi

    # -p creates the directory only when it is missing (replaces the duplicated if/elif)
    mkdir -p "${anat_dir}"
    # The source is an uncompressed .nii; gzip it so the content matches the .nii.gz name
    gzip -c "${matches[0]}" > "${anat_dir}/${subid}_${sesid}_T1w.nii.gz"
done
```