In [1]:
import os
import pandas as pd

def parse_timestamp(dirname):
    """Attempt to parse timestamp from directory name or return None."""
    parts = dirname.split('_')
    if len(parts) > 1 and parts[1].isdigit() and len(parts[1]) == 14:
        return parts[1]
    return None

# Define the path to the RawData directory
base_path = "/data/neuromark2/Data/ABCD/Data_BIDS_5/Raw_Data/"

# Prepare a list to store the data
data = []

# Traverse the RawData directory
for subject_id in os.listdir(base_path):
    subject_path = os.path.join(base_path, subject_id, "Baseline")
    if os.path.isdir(subject_path):
        # List all directories starting with 'anat_'
        scans = [d for d in os.listdir(subject_path) if d.startswith('anat_') and os.path.isdir(os.path.join(subject_path, d))]
        
        # Initialize a variable to hold the path of the earliest scan
        earliest_scan = None
        
        # Check for directories with timestamp
        timestamped_scans = [s for s in scans if parse_timestamp(s) is not None]
        if timestamped_scans:
            # Sort by timestamp
            timestamped_scans.sort(key=parse_timestamp)
            earliest_scan = timestamped_scans[0]
        elif 'anat_NORM' in scans:
            earliest_scan = 'anat_NORM'
        
        # Verify the presence of the smwc1pT1.nii file in the chosen directory
        if earliest_scan and 'smwc1pT1.nii' in os.listdir(os.path.join(subject_path, earliest_scan)):
            # Append the filename to the path
            full_path = os.path.join(subject_path, earliest_scan, 'smwc1pT1.nii')
            # Store the subject ID and the full path to the file
            data.append([subject_id, full_path])

# Convert the list to a DataFrame
df = pd.DataFrame(data, columns=["src_subject_id", "PathToFile"])
df['src_subject_id'] = df['src_subject_id'].str.replace(r'^(NDAR)_?(.+)$', r'\1_\2', regex=True)

# Save the DataFrame to a CSV file
csv_path = "SMRI_no_target.csv"
df.to_csv(csv_path, index=False)

print(f"Data saved to {csv_path}")


Data saved to SMRI_no_target.csv


In [2]:
import pandas as pd

pathdf = pd.read_csv('SMRI_no_target.csv')
qcData = pd.read_csv('/data/neuromark2/Data/ABCD/Data_info/Demo50/abcd-data-release-5.0/core/imaging/mri_y_qc_incl.csv')
cognitionScores = pd.read_csv('/data/neuromark2/Data/ABCD/Data_info/Demo50/abcd-data-release-5.0/core/imaging/mri_y_tfmr_nback_beh.csv')
demo_df = pd.read_csv('/data/neuromark2/Data/ABCD/Data_info/Demo50/abcd-data-release-5.0/core/abcd-general/abcd_p_demo.csv')

## Filter out by QC

In [3]:
qcData.head()

Unnamed: 0,src_subject_id,eventname,imgincl_t1w_include,imgincl_t2w_include,imgincl_dmri_include,imgincl_rsfmri_include,imgincl_mid_include,imgincl_nback_include,imgincl_sst_include
0,NDAR_INV003RTV85,baseline_year_1_arm_1,1,1,1,1,1,1,1
1,NDAR_INV005V6D2C,baseline_year_1_arm_1,1,1,1,0,1,0,0
2,NDAR_INV007W6H7B,baseline_year_1_arm_1,1,1,1,1,1,1,1
3,NDAR_INV00BD7VDC,baseline_year_1_arm_1,1,1,1,1,1,1,1
4,NDAR_INV00CY2MDM,2_year_follow_up_y_arm_1,1,1,1,1,1,1,1


In [5]:
qcBaseLine.head()

Unnamed: 0,src_subject_id,eventname,imgincl_t1w_include,imgincl_t2w_include,imgincl_dmri_include,imgincl_rsfmri_include,imgincl_mid_include,imgincl_nback_include,imgincl_sst_include
0,NDAR_INV003RTV85,baseline_year_1_arm_1,1,1,1,1,1,1,1
1,NDAR_INV005V6D2C,baseline_year_1_arm_1,1,1,1,0,1,0,0
2,NDAR_INV007W6H7B,baseline_year_1_arm_1,1,1,1,1,1,1,1
3,NDAR_INV00BD7VDC,baseline_year_1_arm_1,1,1,1,1,1,1,1
5,NDAR_INV00CY2MDM,baseline_year_1_arm_1,1,1,1,1,1,1,1


In [6]:
qcBaseLine.shape

(11793, 9)

In [7]:
qcData_n_back_good = qcBaseLine[qcBaseLine['imgincl_t1w_include']==1]

In [8]:
qcData_n_back_good.shape

(11394, 9)

In [9]:
## Filter ROI Feature data to get only QC qualified rows
path_df_filtered = pathdf[pathdf['src_subject_id'].isin(qcData_n_back_good['src_subject_id'])]
print(path_df_filtered.shape)
path_df_filtered.head()

(11288, 2)


Unnamed: 0,src_subject_id,PathToFile
0,NDAR_INVVMHC7BEX,/data/neuromark2/Data/ABCD/Data_BIDS_5/Raw_Dat...
1,NDAR_INVP65TV30G,/data/neuromark2/Data/ABCD/Data_BIDS_5/Raw_Dat...
2,NDAR_INV6NYCB4Y6,/data/neuromark2/Data/ABCD/Data_BIDS_5/Raw_Dat...
3,NDAR_INVB5PN305C,/data/neuromark2/Data/ABCD/Data_BIDS_5/Raw_Dat...
4,NDAR_INVNTJ9TYDB,/data/neuromark2/Data/ABCD/Data_BIDS_5/Raw_Dat...


## Add target scores

In [10]:
## Consider all baseline_year1 cognitive scores only

cognitionScores_baseline = cognitionScores[cognitionScores['eventname']=='baseline_year_1_arm_1']
print(cognitionScores_baseline.shape)
cognitionScores_baseline.head()

(11465, 582)


Unnamed: 0,src_subject_id,eventname,tfmri_nb_all_beh_0b_nt,tfmri_nb_all_beh_0bngf_nt,tfmri_nb_all_beh_0bngfl_nt,tfmri_nb_all_beh_0bngfnl_nt,tfmri_nb_all_beh_0bngft_nt,tfmri_nb_all_beh_0bnf_nt,tfmri_nb_all_beh_0bnfl_nt,tfmri_nb_all_beh_0bnfnl_nt,...,tfmri_nb_r2_beh_cplace_stdrt,tfmri_nb_r2_beh_cpf_mrt,tfmri_nb_r2_beh_cpf_nt,tfmri_nb_r2_beh_cpf_rate,tfmri_nb_r2_beh_cpf_stdrt,tfmri_nb_r2_beh_ngf_nt,tfmri_nb_r2_beh_nf_nt,tfmri_nb_r2_beh_place_nt,tfmri_nb_r2_beh_pf_nt,tfmri_nb_r2_beh_total_nt
0,NDAR_INV003RTV85,baseline_year_1_arm_1,80,20,6,10,4,20,6,10,...,274.964102,1059.85,20.0,1.0,373.057534,20.0,20.0,20.0,20.0,80.0
1,NDAR_INV005V6D2C,baseline_year_1_arm_1,80,20,6,10,4,20,6,10,...,188.153155,560.2,10.0,0.5,325.304234,20.0,20.0,20.0,20.0,80.0
2,NDAR_INV007W6H7B,baseline_year_1_arm_1,80,20,6,10,4,20,6,10,...,343.280686,966.052632,19.0,0.95,261.176925,20.0,20.0,20.0,20.0,80.0
3,NDAR_INV00BD7VDC,baseline_year_1_arm_1,80,20,6,10,4,20,6,10,...,355.749778,745.526316,19.0,0.95,226.421281,20.0,20.0,20.0,20.0,80.0
6,NDAR_INV00CY2MDM,baseline_year_1_arm_1,80,20,6,10,4,20,6,10,...,295.177115,1057.315789,19.0,0.95,340.387859,20.0,20.0,20.0,20.0,80.0


In [11]:
#  Keep just the following values: 
#  Processing Speed      i.e tfmri_nb_all_beh_c0b_mrt
#  Response Variability  i.e tfmri_nb_all_beh_c2b_stdrt
#  Attention/Vigilance   i.e tfmri_nb_all_beh_c0b_rate
#  Working Memory        i.e tfmri_nb_all_beh_c2b_rate

cognition_score_targets = cognitionScores_baseline[['src_subject_id', 'tfmri_nb_all_beh_c0b_mrt', 'tfmri_nb_all_beh_c2b_stdrt', 'tfmri_nb_all_beh_c0b_rate', 'tfmri_nb_all_beh_c2b_rate']]
print(cognition_score_targets.shape)
cognition_score_targets.head()

(11465, 5)


Unnamed: 0,src_subject_id,tfmri_nb_all_beh_c0b_mrt,tfmri_nb_all_beh_c2b_stdrt,tfmri_nb_all_beh_c0b_rate,tfmri_nb_all_beh_c2b_rate
0,NDAR_INV003RTV85,1002.77027,328.801944,0.925,0.8375
1,NDAR_INV005V6D2C,680.978261,365.771595,0.575,0.425
2,NDAR_INV007W6H7B,926.74359,289.349291,0.975,0.925
3,NDAR_INV00BD7VDC,837.661972,324.648865,0.8875,0.925
6,NDAR_INV00CY2MDM,1119.535714,299.607143,0.7,0.8875


In [12]:
merged_df_with_targets = pd.merge(path_df_filtered, cognition_score_targets, on='src_subject_id', how='left')

In [13]:
merged_df_with_targets.head()

Unnamed: 0,src_subject_id,PathToFile,tfmri_nb_all_beh_c0b_mrt,tfmri_nb_all_beh_c2b_stdrt,tfmri_nb_all_beh_c0b_rate,tfmri_nb_all_beh_c2b_rate
0,NDAR_INVVMHC7BEX,/data/neuromark2/Data/ABCD/Data_BIDS_5/Raw_Dat...,842.77027,402.945445,0.925,0.85
1,NDAR_INVP65TV30G,/data/neuromark2/Data/ABCD/Data_BIDS_5/Raw_Dat...,840.855263,388.188307,0.95,0.7875
2,NDAR_INV6NYCB4Y6,/data/neuromark2/Data/ABCD/Data_BIDS_5/Raw_Dat...,813.622222,252.688252,0.5625,0.675
3,NDAR_INVB5PN305C,/data/neuromark2/Data/ABCD/Data_BIDS_5/Raw_Dat...,882.5,421.34204,0.725,0.725
4,NDAR_INVNTJ9TYDB,/data/neuromark2/Data/ABCD/Data_BIDS_5/Raw_Dat...,580.769231,343.081438,0.65,0.7375


In [14]:
merged_df_with_targets.shape

(11288, 6)

In [16]:
merged_df_clean = merged_df_with_targets.dropna()

In [17]:
merged_df_clean.isna().sum()

src_subject_id                0
PathToFile                    0
tfmri_nb_all_beh_c0b_mrt      0
tfmri_nb_all_beh_c2b_stdrt    0
tfmri_nb_all_beh_c0b_rate     0
tfmri_nb_all_beh_c2b_rate     0
dtype: int64

In [22]:
merged_df_clean['tfmri_nb_all_beh_c2b_rate_norm'] = (merged_df_clean['tfmri_nb_all_beh_c2b_rate'] - merged_df_clean['tfmri_nb_all_beh_c2b_rate'].mean()) / merged_df_clean['tfmri_nb_all_beh_c2b_rate'].std()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df_clean['tfmri_nb_all_beh_c2b_rate_norm'] = (merged_df_clean['tfmri_nb_all_beh_c2b_rate'] - merged_df_clean['tfmri_nb_all_beh_c2b_rate'].mean()) / merged_df_clean['tfmri_nb_all_beh_c2b_rate'].std()


In [24]:
merged_df_clean.to_csv('SMRI_Dataset_Earliest.csv', index=False)