In [1]:
import pandas as pd 
import numpy as np
import os

FILE_PATH = "../tadpole_dataset_8modality/"

# remove modalities here to try diff combos
modality_names = ["mri", "fdg_pet", "av45_pet", "csf", "cognitive_tests", "clinical"]
patients_dict = {}

for m in modality_names:
    df = pd.read_csv(FILE_PATH + m + ".csv", dtype=str)
    
    # eliminate any rows which have the majority of columns as NaN
    df = df.replace(' ', np.nan)   
    df = df.dropna(thresh= 4)
    print(m + ":", len(df), "non-null values")

    # Make unique ID for each patient
    df["unique_ID"] = df["RID"] + "_" + df["VISCODE"]

    patients_dict[m] = df
    

mri: 12664 non-null values
fdg_pet: 9349 non-null values
av45_pet: 6153 non-null values
csf: 2373 non-null values
cognitive_tests: 12741 non-null values
clinical: 12741 non-null values


In [3]:
from functools import reduce
def merge_dfs(left, right):
    return pd.merge(left, right, on=['unique_ID', "RID", "PTID", "VISCODE"], how='inner')

# Merging all dataframes on 'unique_ID'
merged_df = reduce(merge_dfs, patients_dict.values())


In [5]:
df = merged_df
new_df = None

for modality, modality_df in patients_dict.items():
    # Get columns for the current modality
    modality_columns = modality_df.columns

    # Remove columns with more than 70% NaNs
    nan_threshold = 0.7 * len(df)
    columns_to_keep = df[modality_columns].isna().sum() <= nan_threshold
    df_modality_reduced = df[modality_columns[columns_to_keep]]

    # Calculate initially NaN values (before imputation)
    initially_nan = df_modality_reduced.isna()
    
    #groupby is doing a weird thing where its getting rid of the grouped by column, and the usual resetting index is not working
    df_modality_reduced["PTID_copy"] = df_modality_reduced["PTID"]
    imputed_df = df_modality_reduced.groupby('PTID_copy', as_index=False).fillna(method='ffill')
    imputed_df["PTID_copy"] = imputed_df["PTID"]
    imputed_df = imputed_df.groupby('PTID_copy').fillna(method='bfill')
    imputed_df.fillna(-1, inplace=True)

    # Re-apply the mask to find out which values were filled
    newly_filled = initially_nan & ~imputed_df.isna()

    # Merge the imputed modality-specific DataFrame into the new DataFrame
    if new_df is None:
        new_df = imputed_df
    else:
        new_df = pd.merge(new_df, imputed_df, on=['unique_ID', 'RID', 'PTID', 'VISCODE'], how='outer')

    # Count total and imputed values
    total_values = newly_filled.size
    num_imputed_values = newly_filled.sum().sum()
    imputed_percentage = (num_imputed_values / total_values) * 100
    columns_remain = len(newly_filled.columns)

    # Print the statistics
    print(f"For {modality}, {imputed_percentage:.2f}% of the values in the dataframe were imputed. {columns_remain} columns remain.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_modality_reduced["PTID_copy"] = df_modality_reduced["PTID"]


For mri, 34.80% of the values in the dataframe were imputed. 668 columns remain.
For fdg_pet, 25.30% of the values in the dataframe were imputed. 20 columns remain.
For av45_pet, 3.42% of the values in the dataframe were imputed. 246 columns remain.
For csf, 0.02% of the values in the dataframe were imputed. 14 columns remain.
For cognitive_tests, 0.36% of the values in the dataframe were imputed. 20 columns remain.
For clinical, 0.82% of the values in the dataframe were imputed. 31 columns remain.


In [8]:
#grouping according to the challenge instructions:
"""
Forecasts will be limited to three categories: cognitively normal (CN and SMC in ADNI’s DX field); mild cognitive impairment (EMCI or LMCI); probable Alzheimer’s disease (AD).
"""
new_df['DX_bl'] = new_df['DX_bl'].replace({"SMC":"CN", "LMCI": "MCI", "EMCI": "MCI"})
new_df["DX_bl"].value_counts() 

MCI    767
CN     493
AD     143
Name: DX_bl, dtype: int64

In [13]:
#Mapping of DX for future reference: 0:AD, 1:CN, 2:MCI
new_df["DX_bl"] = new_df["DX_bl"].astype('category').cat.codes.astype('int')

In [10]:
# Convert number columns to floats, string columns to integers, date columns to date times
# convert all columns to numeric
new_df = new_df.apply(pd.to_numeric, errors='ignore')

# for all columns of type object, convert to categorical
for col in new_df.select_dtypes(include='object').columns:
    # if col is date, convert to datetime
    if "DATE" in col:
        new_df[col] = pd.to_datetime(new_df[col], errors='coerce')
        new_df[col] = new_df[col].astype('int64') // 10**9
    else:
        new_df[col] = new_df[col].astype('category').cat.codes.astype('int')

  new_df[col] = new_df[col].astype('int64') // 10**9


In [None]:
# Drop columns containing "update_stamp"
new_df = new_df.drop(columns=[col for col in new_df.columns if "update_stamp" in col])

In [12]:
from sklearn.model_selection import train_test_split
import random

# Set a random seed for reproducibility
random_seed = 42
np.random.seed(random_seed)
random.seed(random_seed)

# Group by 'unique_ID' and create a list of DataFrames
grouped = [group for _, group in new_df.groupby('unique_ID')]

# Shuffle the list of DataFrames to ensure random distribution
random.shuffle(grouped)

# Determine the split sizes (e.g., 70% train, 15% validation, 15% test)
train_size = int(0.7 * len(grouped))
val_size = int(0.15 * len(grouped))

# Split the data into train, validation, and test sets
train = pd.concat(grouped[:train_size])
val = pd.concat(grouped[train_size:train_size+val_size])
test = pd.concat(grouped[train_size+val_size:])

# Now you have train, val, and test DataFrames with unique patients in each


In [13]:
print(len(train))
print(len(test))
print(len(val))

982
211
210


In [17]:
# Function to save subsets of data for each modality
def save_modality_subsets(modality_name, train_data, val_data, test_data, save_dir):
    old_columns = patients_dict[modality_name].columns
    new_columns = list(set(old_columns).intersection(train_data.columns))
    train_subset = train_data[new_columns]
    val_subset = val_data[new_columns]
    test_subset = test_data[new_columns]

    train_subset.to_csv(os.path.join(save_dir, f"{modality_name}_train.csv"), index=False)
    val_subset.to_csv(os.path.join(save_dir, f"{modality_name}_val.csv"), index=False)
    test_subset.to_csv(os.path.join(save_dir, f"{modality_name}_test.csv"), index=False)

# Main function to save all subsets
def save_all_subsets(train, val, test, modalities, save_dir):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    for modality in modalities:
        save_modality_subsets(modality, train, val, test, save_dir)


In [18]:
# Directory to save the subsets
save_directory = "split_cleaned_numeric_modalities" # Specify your directory here

modalities = ["mri", "fdg_pet", "av45_pet", "csf", "cognitive_tests", "clinical"]

save_all_subsets(train, val, test, modalities, save_directory)


## Turn into pytorch datasetes

In [9]:
import pandas as pd 
import numpy as np
import os
import torch
import sys
sys.path.append('../..')
from common_files.custom_sets import tadpoleDataset
'''
Create datasets for all modalities and modes
'''
modalities = ["mri", "fdg_pet", "av45_pet", "csf", "cognitive_tests", "clinical"]
modes = ["train", "val", "test"]

for mode in modes:
    labels = pd.read_csv(f"split_cleaned_numeric_modalities/clinical_{mode}.csv")["DX_bl"]

    for modality in modalities:

        df = pd.read_csv(f"split_cleaned_numeric_modalities/{modality}_{mode}.csv")

        # Drop identifying columns
        df = df.drop(["unique_ID", "PTID", "RID", "VISCODE"], axis=1)

        # Drop labels which reveal diagnosis
        if modality == "clinical":
            df = df.drop(["DX_bl", "DX"], axis=1)
        if modality == "cognitive_tests":
            df = df.drop(["MMSE_bl"], axis=1)

        # Create pytorch dataset and save it
        dataset = tadpoleDataset(df, labels)
        if not os.path.exists("tadpole_datasets"):
            os.makedirs("tadpole_datasets")
        torch.save(dataset, f"tadpole_datasets/{modality}_{mode}_dataset.pt")

