Filtering PPMI image data

- only include fMRI runs with the expected number of TRs (600 for AP/PA, 240 for LR/RL)
- only include fMRI runs with the expected TR (1.0 for AP/PA, 2.5 for LR/RL)
- only include T1w/fMRI runs with a valid image shape and resolution (within a tolerance)
- only include complete sessions with exactly 1 valid fMRI run and 1 valid T1w image

This results in 1066 sessions from 715 subjects (416 prodromal, 256 PD, 43 control).

In [1]:
import pandas as pd
import numpy as np
import ast

In [2]:
# Load the BIDS index; subject and session IDs are read as strings rather than parsed as numbers.
index_csv = "../metadata/PPMI_BIDS_index.csv"
df = pd.read_csv(index_csv, dtype={"sub": str, "ses": str})
n_subjects = df["sub"].nunique()
print(df.shape)
print(n_subjects)
df.head(10)

(9784, 11)
1482


Unnamed: 0,sub,ses,task,dir,suffix,shape,pixdim,tr,num_trs,path,run
0,100005,20210127,rest,RL,bold,"[64, 64, 40]","[3.5, 3.5, 3.5]",2.5,240.0,sub-100005/ses-20210127/func/sub-100005_ses-20...,
1,100005,20220202,rest,RL,bold,"[64, 64, 40]","[3.5, 3.5, 3.5]",2.5,240.0,sub-100005/ses-20220202/func/sub-100005_ses-20...,
2,100006,20201216,rest,RL,bold,"[64, 64, 40]","[3.5, 3.5, 3.5]",2.5,240.0,sub-100006/ses-20201216/func/sub-100006_ses-20...,
3,100006,20211215,rest,RL,bold,"[64, 64, 40]","[3.5, 3.5, 3.5]",2.5,240.0,sub-100006/ses-20211215/func/sub-100006_ses-20...,
4,100006,20230322,rest,AP,bold,"[66, 66, 52]","[3.05, 3.05, 3.0]",1.0,10.0,sub-100006/ses-20230322/func/sub-100006_ses-20...,
5,100006,20230322,rest,PA,bold,"[66, 66, 52]","[3.05, 3.05, 3.0]",1.0,600.0,sub-100006/ses-20230322/func/sub-100006_ses-20...,
6,100007,20201209,rest,RL,bold,"[64, 64, 40]","[3.5, 3.5, 3.5]",2.5,240.0,sub-100007/ses-20201209/func/sub-100007_ses-20...,
7,100007,20220119,rest,RL,bold,"[64, 64, 40]","[3.5, 3.5, 3.5]",2.5,240.0,sub-100007/ses-20220119/func/sub-100007_ses-20...,
8,100007,20230329,rest,AP,bold,"[66, 66, 52]","[3.05, 3.05, 3.0]",1.0,10.0,sub-100007/ses-20230329/func/sub-100007_ses-20...,
9,100007,20230329,rest,PA,bold,"[66, 66, 52]","[3.05, 3.05, 3.0]",1.0,600.0,sub-100007/ses-20230329/func/sub-100007_ses-20...,


In [3]:
# filter for correct number of trs (600 for AP/PA, 240 for LR/RL)
ntr_table = df.groupby(["suffix", "dir", "num_trs"]).agg({"path": "count"})
print(ntr_table)

# Expected num_trs for each `dir` value. Rows whose `dir` is missing or not listed
# map to NaN, which never compares equal, so they only pass if they are T1w rows.
expected_ntr = df["dir"].map({"AP": 600, "PA": 600, "LR": 240, "RL": 240})
is_t1w = df["suffix"] == "T1w"
ntr_mask = is_t1w | (df["num_trs"] == expected_ntr)

n_rows_kept = ntr_mask.sum()
n_subs_kept = df.loc[ntr_mask, "sub"].nunique()
print(n_rows_kept, n_subs_kept)

                    path
suffix dir num_trs      
bold   AP  10.0      839
           240.0       3
           600.0     119
       LR  10.0      890
           240.0      62
       PA  10.0      121
           172.0       1
           240.0       3
           600.0     838
       RL  10.0       26
           13.0        1
           44.0        1
           56.0        1
           92.0       25
           120.0       1
           240.0     986
           600.0       1
3365 1461


In [4]:
# filter for correct tr (1.0 for AP/PA, 2.5 for LR/RL)
tr_table = df.groupby(["suffix", "dir", "tr"]).agg({"path": "count"})
print(tr_table)

# Expected tr for each `dir` value. Rows whose `dir` is missing or not listed
# map to NaN, which never compares equal, so they only pass if they are T1w rows.
expected_tr = df["dir"].map({"AP": 1.0, "PA": 1.0, "LR": 2.5, "RL": 2.5})
is_t1w = df["suffix"] == "T1w"
tr_mask = is_t1w | (df["tr"] == expected_tr)

n_rows_kept = tr_mask.sum()
n_subs_kept = df.loc[tr_mask, "sub"].nunique()
print(n_rows_kept, n_subs_kept)

                  path
suffix dir tr         
bold   AP  1.00    808
           1.03    123
           1.05      1
           1.10     24
           2.50      5
       LR  1.03      1
           2.50    930
           2.53      2
           2.54     17
           2.69      1
           10.00     1
       PA  1.00    811
           1.03    122
           1.05      1
           1.10     24
           2.50      5
       RL  0.00      2
           0.01      1
           1.03      1
           2.50   1019
           2.52      1
           2.53      1
           2.54     12
           2.55      3
           2.72      1
           5.02      1
4928 1370


In [5]:
# filter for correct image shape (inconsistent shape can indicate bad acquisition)
# The "shape" column holds list literals as strings; parse each one and stack them into an array.
shapes = np.array(df["shape"].map(ast.literal_eval).tolist())

t1w_rows = df.loc[df["suffix"] == "T1w"]
t1w_shape_table = t1w_rows.groupby(["suffix", "shape"]).agg({"path": "count"})
print(t1w_shape_table)

shape_table = df.groupby(["suffix", "dir", "shape"]).agg({"path": "count"})
print(shape_table)


def close_enough(shapes, target, tol=8):
    """Flag rows of `shapes` that match `target` to within `tol` on every element.

    Args:
        shapes: 2-D array with one row per image.
        target: Reference values, broadcast against each row of `shapes`.
        tol: Largest absolute per-element difference still accepted (inclusive).

    Returns:
        Boolean array with one entry per row of `shapes`.
    """
    deviation = np.abs(shapes - target)
    worst_per_row = deviation.max(axis=1)
    return worst_per_row <= tol


# Each rule pairs a row selector with the reference shape those rows must be close to.
shape_rules = [
    (df["suffix"] == "T1w", [192, 256, 256]),
    (df["dir"].isin(["AP", "PA"]), [66, 66, 52]),
    (df["dir"].isin(["LR", "RL"]), [64, 64, 40]),
]

# A row passes when any rule both selects it and accepts its shape.
shape_mask = pd.Series(False, index=df.index)
for applies_to, target_shape in shape_rules:
    shape_mask = shape_mask | (applies_to & close_enough(shapes, target_shape))

print(shape_mask.sum(), df.loc[shape_mask, "sub"].nunique())

                        path
suffix shape                
T1w    [172, 256, 256]     1
       [174, 256, 256]     1
       [176, 256, 256]     1
       [178, 256, 256]     1
       [180, 256, 256]     1
       [182, 256, 256]     1
       [184, 256, 256]    10
       [188, 256, 256]   124
       [192, 256, 256]  1189
       [256, 256, 188]     2
       [256, 256, 191]     1
       [256, 256, 192]     1
       [256, 256, 193]     1
       [384, 512, 512]    22
       [512, 512, 384]     4
                           path
suffix dir shape               
bold   AP  [128, 128, 52]    10
           [64, 64, 40]       5
           [64, 64, 52]      24
           [66, 66, 52]     917
           [66, 66, 64]       5
       LR  [128, 128, 40]     1
           [64, 64, 17]       1
           [64, 64, 40]    1087
           [64, 64, 42]       3
           [64, 64, 43]       1
           [64, 64, 44]      23
           [64, 64, 45]       6
           [64, 64, 46]       5
           [64, 64, 47]    

In [6]:
# filter for correct image resolution
# The "pixdim" column holds list literals as strings; parse each one and stack them into an array.
pixdims = np.array(df["pixdim"].map(ast.literal_eval).tolist())

t1w_rows = df.loc[df["suffix"] == "T1w"]
t1w_pixdim_table = t1w_rows.groupby(["suffix", "pixdim"]).agg({"path": "count"})
print(t1w_pixdim_table)

pixdim_table = df.groupby(["suffix", "dir", "pixdim"]).agg({"path": "count"})
print(pixdim_table)


def close_enough(pixdims, target, tol=0.2):
    """Flag rows of `pixdims` that match `target` to within `tol` on every element.

    Args:
        pixdims: 2-D array with one row per image.
        target: Reference values, broadcast against each row of `pixdims`.
        tol: Largest absolute per-element difference still accepted (inclusive).

    Returns:
        Boolean array with one entry per row of `pixdims`.
    """
    deviation = np.abs(pixdims - target)
    worst_per_row = deviation.max(axis=1)
    return worst_per_row <= tol


# Row selectors for the three acquisition families that share a reference resolution.
is_t1w = df["suffix"] == "T1w"
is_ap_pa = df["dir"].isin(["AP", "PA"])
is_lr_rl = df["dir"].isin(["LR", "RL"])

# T1w is held to a tighter tolerance (0.01); the bold runs use close_enough's default.
t1w_ok = is_t1w & close_enough(pixdims, [1.0, 1.0, 1.0], 0.01)
ap_pa_ok = is_ap_pa & close_enough(pixdims, [3.05, 3.05, 3.0])
lr_rl_ok = is_lr_rl & close_enough(pixdims, [3.5, 3.5, 3.5])

pixdim_mask = t1w_ok | ap_pa_ok | lr_rl_ok
print(pixdim_mask.sum(), df.loc[pixdim_mask, "sub"].nunique())

                        path
suffix pixdim               
T1w    [0.5, 0.5, 0.5]    26
       [1.0, 1.0, 1.0]  1332
       [1.2, 1.0, 1.0]     2
                               path
suffix dir pixdim                  
bold   AP  [1.57, 1.57, 3.0]     10
           [3.03, 3.03, 3.02]    24
           [3.05, 3.05, 3.0]    922
           [3.5, 3.5, 3.5]        4
           [3.75, 3.75, 3.6]      1
       LR  [1.75, 1.75, 3.5]      1
           [3.05, 3.05, 3.0]      1
           [3.38, 3.38, 3.5]      1
           [3.5, 3.5, 3.5]     1051
           [3.5, 3.5, 3.6]       82
           [3.5, 3.5, 4.5]        1
           [3.5, 3.5, 7.0]        1
       PA  [1.57, 1.57, 3.0]     10
           [3.03, 3.03, 3.02]    24
           [3.05, 3.05, 3.0]    924
           [3.5, 3.5, 3.5]        4
           [3.75, 3.75, 3.6]      1
       RL  [2.92, 2.92, 3.5]    240
           [3.05, 3.05, 3.0]      1
           [3.5, 3.5, 17.5]       1
           [3.5, 3.5, 3.3]        1
           [3.5, 3.5, 3.5] 

In [7]:
# final mask
# A row is valid only if it passes every individual filter.
valid_mask = ntr_mask
for stage_mask in (tr_mask, shape_mask, pixdim_mask):
    valid_mask = valid_mask & stage_mask

n_valid_rows = valid_mask.sum()
n_valid_subs = df.loc[valid_mask, "sub"].nunique()
print(n_valid_rows, n_valid_subs)

3121 1340


In [8]:
# check that filters were applied correctly
valid_df = df.loc[valid_mask]

# Select the T1w rows with a mask built from valid_df itself, so the boolean indexer
# shares valid_df's index instead of relying on pandas to realign a mask built from
# the larger, unfiltered df.
valid_t1w_df = valid_df.loc[valid_df["suffix"] == "T1w"]

# Re-print the same summaries used when building each filter, now on the filtered rows.
print(valid_df.groupby(["suffix", "dir", "num_trs"]).agg({"path": "count"}))
print(valid_df.groupby(["suffix", "dir", "tr"]).agg({"path": "count"}))
print(valid_t1w_df.groupby(["suffix", "shape"]).agg({"path": "count"}))
print(valid_df.groupby(["suffix", "dir", "shape"]).agg({"path": "count"}))
print(valid_t1w_df.groupby(["suffix", "pixdim"]).agg({"path": "count"}))
print(valid_df.groupby(["suffix", "dir", "pixdim"]).agg({"path": "count"}))

                    path
suffix dir num_trs      
bold   AP  600.0     119
       LR  240.0      41
       PA  600.0     681
       RL  240.0     959
                path
suffix dir tr       
bold   AP  1.0   119
       LR  2.5    41
       PA  1.0   681
       RL  2.5   959
                        path
suffix shape                
T1w    [184, 256, 256]    10
       [188, 256, 256]   124
       [192, 256, 256]  1187
                         path
suffix dir shape             
bold   AP  [66, 66, 52]   119
       LR  [64, 64, 40]    39
           [64, 64, 44]     2
       PA  [66, 66, 52]   681
       RL  [64, 64, 40]   918
           [64, 64, 41]     1
           [64, 64, 42]     2
           [64, 64, 43]     2
           [64, 64, 44]    22
           [64, 64, 45]     2
           [64, 64, 46]     6
           [64, 64, 47]     4
           [64, 64, 48]     2
                        path
suffix pixdim               
T1w    [1.0, 1.0, 1.0]  1321
                              path
suffix 

In [9]:
# only keep complete sessions with 1 valid t1 and 1 valid bold
# Tally valid T1w and bold rows per (sub, ses) by summing boolean indicator columns.
counts = (
    valid_df.assign(
        t1w=valid_df["suffix"] == "T1w",
        bold=valid_df["suffix"] == "bold",
    )
    .groupby(["sub", "ses"])[["t1w", "bold"]]
    .sum()
    .reset_index()
)

# Number of sessions for each (t1w, bold) count combination. This must be printed:
# a bare expression in the middle of a cell is evaluated and silently discarded.
print(counts.groupby(["t1w", "bold"]).agg({"ses": "count"}))

# A session is complete when it has exactly one valid T1w and exactly one valid bold row.
counts["complete"] = (counts["t1w"] == 1) & (counts["bold"] == 1)
print(counts["complete"].sum(), counts.loc[counts["complete"], "sub"].nunique())

# Distribution of the number of complete sessions per subject.
print(counts.loc[counts["complete"]].groupby("sub").agg({"ses": "count"}).value_counts())

1066 715
ses
1      455
2      169
3       91
Name: count, dtype: int64


In [10]:
# remove incomplete sessions
# (sub, ses) keys of the complete sessions, as a MultiIndex for vectorized membership tests.
complete_index = pd.MultiIndex.from_frame(counts.loc[counts["complete"], ["sub", "ses"]])

# Kept as a set of (sub, ses) tuples because a later cell iterates over it.
complete_sub_ses = set(complete_index)

# Boolean array, one entry per row of valid_df: True when the row's (sub, ses) is complete.
# Replaces a Python-level iterrows() loop with a single vectorized lookup.
complete_mask = pd.MultiIndex.from_frame(valid_df[["sub", "ses"]]).isin(complete_index)

complete_df = valid_df.loc[complete_mask, :]
print(complete_df.shape, complete_df["sub"].nunique(), len(complete_df.groupby(["sub", "ses"])))
complete_df.head(10)

(2132, 11) 715 1066


Unnamed: 0,sub,ses,task,dir,suffix,shape,pixdim,tr,num_trs,path,run
10,100012,20210302,,,T1w,"[192, 256, 256]","[1.0, 1.0, 1.0]",,,sub-100012/ses-20210302/anat/sub-100012_ses-20...,
12,100012,20210302,rest,RL,bold,"[64, 64, 46]","[3.5, 3.5, 3.5]",2.5,240.0,sub-100012/ses-20210302/func/sub-100012_ses-20...,
13,100012,20220207,,,T1w,"[192, 256, 256]","[1.0, 1.0, 1.0]",,,sub-100012/ses-20220207/anat/sub-100012_ses-20...,
15,100012,20220207,rest,RL,bold,"[64, 64, 40]","[3.5, 3.5, 3.5]",2.5,240.0,sub-100012/ses-20220207/func/sub-100012_ses-20...,
16,100012,20230109,,,T1w,"[192, 256, 256]","[1.0, 1.0, 1.0]",,,sub-100012/ses-20230109/anat/sub-100012_ses-20...,
18,100012,20230109,rest,RL,bold,"[64, 64, 40]","[3.5, 3.5, 3.5]",2.5,240.0,sub-100012/ses-20230109/func/sub-100012_ses-20...,
20,100018,20230316,,,T1w,"[192, 256, 256]","[1.0, 1.0, 1.0]",,,sub-100018/ses-20230316/anat/sub-100018_ses-20...,
22,100018,20230316,rest,PA,bold,"[66, 66, 52]","[3.05, 3.05, 3.0]",1.0,600.0,sub-100018/ses-20230316/func/sub-100018_ses-20...,
33,100267,20210219,,,T1w,"[192, 256, 256]","[1.0, 1.0, 1.0]",,,sub-100267/ses-20210219/anat/sub-100267_ses-20...,
35,100267,20210219,rest,RL,bold,"[64, 64, 40]","[3.5, 3.5, 3.5]",2.5,240.0,sub-100267/ses-20210219/func/sub-100267_ses-20...,


In [11]:
# Save the rows of the complete sessions, without writing the DataFrame index.
complete_csv_path = "../metadata/PPMI_BIDS_complete.csv"
complete_df.to_csv(complete_csv_path, index=False)

In [12]:
# check counts per category
meta_df = pd.read_csv("../metadata/PPMI_T1+fMRI_11_07_2025.csv", dtype={"Subject": str})

# De-duplicated (Subject, Group) pairs, indexed by subject for label-based lookup.
subject_groups = meta_df[["Subject", "Group"]].drop_duplicates()
labels = subject_groups.set_index("Subject")

# Look up the group of every subject that has at least one complete session.
complete_subs = complete_df["sub"].unique()
complete_labels = labels.loc[complete_subs]
group_counts = complete_labels.value_counts()
print(group_counts)

Group    
Prodromal    416
PD           256
Control       43
Name: count, dtype: int64


In [13]:
# Write the "path" column of the complete sessions to a text file, one entry per line.
complete_path_list = list(complete_df["path"])
complete_txt_path = "../metadata/PPMI_BIDS_complete.txt"
np.savetxt(complete_txt_path, complete_path_list, fmt="%s")

In [14]:
# Write the complete sessions as sorted "sub ses" lines, newline-terminated.
complete_sub_ses_list = sorted([f"{sub} {ses}" for sub, ses in complete_sub_ses])
sub_ses_text = "\n".join(complete_sub_ses_list) + "\n"
with open("../metadata/PPMI_BIDS_complete_sub_ses.txt", "w") as f:
    f.write(sub_ses_text)