In [7]:
#from google.colab import drive
#drive.mount('/content/drive')
#%cd drive/MyDrive/MesDossiers/Master CNN/supervised_project/FMRI_ADHD_Classification

In [3]:
%%capture
#!pip install nilearn

In [8]:
import os
import pandas as pd
import urllib.request

# Configuration
pipeline = 'cpac'
strategy = 'filt_global'
derivative = 'func_preproc'
output_dir = "Fmri_Data"
sites = ["NYU"] # Choose the site
num_scans_per_site = 100  # Limit to 10 scans for testing

# URL details for ABIDE
base_url = "https://fcp-indi.s3.amazonaws.com/data/Projects/ABIDE_Initiative/Outputs/"
phenotype_url = "https://fcp-indi.s3.amazonaws.com/data/Projects/ABIDE_Initiative/Phenotypic_V1_0b_preprocessed1.csv"

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Download phenotype data
phenotype_path = os.path.join(output_dir, "Phenotypic_V1_0b_preprocessed1.csv")
if not os.path.exists(phenotype_path):
    urllib.request.urlretrieve(phenotype_url, phenotype_path)

# Load the phenotype data
phenotypes = pd.read_csv(phenotype_path)

# Process data for each site
for site in sites:
    site_dir = os.path.join(output_dir, site)
    os.makedirs(site_dir, exist_ok=True)

    # Filter data by site and limit the number of scans
    site_data = phenotypes[phenotypes['SITE_ID'] == site].head(num_scans_per_site)

    # Save the site's phenotypic CSV
    site_pheno_path = os.path.join(site_dir, f"{site}_phenotypic.csv")
    site_data.to_csv(site_pheno_path, index=False)

    # Download each scan
    for _, row in site_data.iterrows():
        file_id = row['FILE_ID']
        numeric_id = file_id.split('_')[-1]  # Extract the numeric part

        # Create directory for each scan
        scan_dir = os.path.join(site_dir, numeric_id)
        os.makedirs(scan_dir, exist_ok=True)

        # Download the .nii.gz file for each scan
        filename = f"{file_id}_{derivative}.nii.gz"
        download_url = f"{base_url}{pipeline}/{strategy}/{derivative}/{filename}"
        local_file_path = os.path.join(scan_dir, filename)

        # Download if not already present
        if not os.path.exists(local_file_path):
            try:
                print(f"Downloading {filename} for subject {numeric_id}")
                urllib.request.urlretrieve(download_url, local_file_path)
                print(f"Downloaded: {filename}")
            except urllib.error.HTTPError as e:
                print(f"Error downloading {filename}: {e}")
        else:
            print(f"{filename} already exists, skipping download.")


NYU_0050952_func_preproc.nii.gz already exists, skipping download.
Downloading no_filename_func_preproc.nii.gz for subject filename
Error downloading no_filename_func_preproc.nii.gz: HTTP Error 404: Not Found
NYU_0050954_func_preproc.nii.gz already exists, skipping download.
NYU_0050955_func_preproc.nii.gz already exists, skipping download.
NYU_0050956_func_preproc.nii.gz already exists, skipping download.
NYU_0050957_func_preproc.nii.gz already exists, skipping download.
NYU_0050958_func_preproc.nii.gz already exists, skipping download.
NYU_0050959_func_preproc.nii.gz already exists, skipping download.
NYU_0050960_func_preproc.nii.gz already exists, skipping download.
NYU_0050961_func_preproc.nii.gz already exists, skipping download.
Downloading NYU_0050962_func_preproc.nii.gz for subject 0050962
Downloaded: NYU_0050962_func_preproc.nii.gz
Downloading NYU_0050964_func_preproc.nii.gz for subject 0050964
Downloaded: NYU_0050964_func_preproc.nii.gz
Downloading NYU_0050965_func_preproc.ni

In [11]:
import os
import pandas as pd
import shutil
import glob

# Define paths
dataset_directory = "Fmri_Data"  # Source directory with fMRI data
output_folder = "Extracted_Fmri_Data"  # Destination directory for consolidated files
os.makedirs(output_folder, exist_ok=True)  # Ensure output directory exists

# Initialize dataset tracker
dataset_tracker = {"Name": [], "DX": [], "Loc": []}

# Load phenotype data for each site and copy .nii.gz files
for site_folder in glob.glob(os.path.join(dataset_directory, "*")):
    site_name = os.path.basename(site_folder)
    pheno_csv_path = os.path.join(site_folder, f"{site_name}_phenotypic.csv")

    if os.path.exists(pheno_csv_path):
        pheno_data = pd.read_csv(pheno_csv_path)

        for scan_folder in glob.glob(os.path.join(site_folder, "*")):
            if os.path.isdir(scan_folder):
                # Use only the scan ID part for naming
                scan_id = os.path.basename(scan_folder)
                gz_file = glob.glob(os.path.join(scan_folder, "*.nii.gz"))
                print(gz_file)

                if gz_file:
                    # Construct filename and copy file
                    gz_filename = f"{scan_id}.nii.gz"  # Simplified filename with only scan ID
                    destination_path = os.path.join(output_folder, gz_filename)
                    shutil.copy(gz_file[0], destination_path)

                    # Record phenotype data
                    pheno_row = pheno_data[pheno_data['FILE_ID'].str.contains(scan_id)]
                    if not pheno_row.empty:
                        dx_value = pheno_row['DX_GROUP'].values[0]
                        dataset_tracker["Name"].append(gz_filename)
                        dataset_tracker["DX"].append(dx_value)
                        dataset_tracker["Loc"].append(destination_path)

# Convert tracker to DataFrame and save as Total_Data_Pheno.csv
df_total = pd.DataFrame(dataset_tracker)
df_total.to_csv(os.path.join(output_folder, "Total_Data_Pheno.csv"), index=False)

# Split into training and validation sets
df_total = df_total.sample(frac=1).reset_index(drop=True)  # Shuffle data
split_index = int(0.8 * len(df_total))  # 80% train, 20% validation
df_train = df_total.iloc[:split_index]
df_val = df_total.iloc[split_index:]

# Save training and validation DataFrames
df_train.to_csv(os.path.join(output_folder, "Training_Data_Pheno.csv"), index=False)
df_val.to_csv(os.path.join(output_folder, "Validation_Data_Pheno.csv"), index=False)

# Display results
print("Files copied to:", output_folder)
print("Total data entries:", len(df_total))
print("Training entries:", len(df_train))
print("Validation entries:", len(df_val))


['Fmri_Data/NYU/0050952/NYU_0050952_func_preproc.nii.gz']
[]
['Fmri_Data/NYU/0050954/NYU_0050954_func_preproc.nii.gz']
['Fmri_Data/NYU/0050955/NYU_0050955_func_preproc.nii.gz']
['Fmri_Data/NYU/0050956/NYU_0050956_func_preproc.nii.gz']
['Fmri_Data/NYU/0050957/NYU_0050957_func_preproc.nii.gz']
['Fmri_Data/NYU/0050958/NYU_0050958_func_preproc.nii.gz']
['Fmri_Data/NYU/0050959/NYU_0050959_func_preproc.nii.gz']
['Fmri_Data/NYU/0050960/NYU_0050960_func_preproc.nii.gz']
['Fmri_Data/NYU/0050961/NYU_0050961_func_preproc.nii.gz']
['Fmri_Data/NYU/0050962/NYU_0050962_func_preproc.nii.gz']
['Fmri_Data/NYU/0050964/NYU_0050964_func_preproc.nii.gz']
['Fmri_Data/NYU/0050965/NYU_0050965_func_preproc.nii.gz']
['Fmri_Data/NYU/0050966/NYU_0050966_func_preproc.nii.gz']
['Fmri_Data/NYU/0050967/NYU_0050967_func_preproc.nii.gz']
['Fmri_Data/NYU/0050968/NYU_0050968_func_preproc.nii.gz']
['Fmri_Data/NYU/0050969/NYU_0050969_func_preproc.nii.gz']
['Fmri_Data/NYU/0050970/NYU_0050970_func_preproc.nii.gz']
['Fmri_Data