In [1]:
import os
import numpy as np
import scipy.io
import pandas as pd
import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from sklearn.model_selection import train_test_split
from torch.optim import Adam
from utils import *  # NeuroGraph
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, confusion_matrix

## Hyperparameters setting

In [2]:
class Args:
    """Static container for the experiment's hyperparameters and paths.

    All values are class attributes, so they can be read either from the
    class itself or from an instance.
    """

    # --- data -------------------------------------------------------------
    # Dataset to use; choose from: ADNI(BOLD), HCP(CORR), BOLD+CORR
    dataset: str = "ADNI"
    # Directory holding the dataset files (string-concatenated with file
    # names elsewhere, hence the trailing slash)
    dataset_dir: str = "../../data/ADNI/"

    # --- model ------------------------------------------------------------
    # Graph convolution to use; choose from:
    # GCNConv, GINConv, SGConv, GeneralConv, GATConv
    model: str = "GCNConv"
    # ADNI is treated as a binary classification task
    num_classes: int = 2
    hidden: int = 32
    hidden_mlp: int = 64
    num_layers: int = 3

    # --- optimisation -----------------------------------------------------
    lr: float = 1e-4
    weight_decay: float = 0.0005
    batch_size: int = 16
    epochs: int = 200
    runs: int = 1
    seed: int = 42

    # --- hardware ---------------------------------------------------------
    # The original expression picked "cpu" for GATConv and "cpu" otherwise,
    # i.e. the device is always the CPU regardless of the chosen model.
    device: str = "cpu"
# Instantiate the hyperparameter container; later cells read `args.<name>`.
args = Args()
# `fix_seed` is not defined in this notebook; it arrives through
# `from utils import *`. Presumably it seeds the random number generators
# for reproducibility -- confirm against utils.
fix_seed(args.seed)

## Reading our Datasets
Use our HCP correlation-matrix dataset together with the train/test split file and the label file.

The HCP data can be downloaded from https://drive.google.com/drive/folders/166wCCtPOEL0O25FxzwB0I8AQA8b6Q9U1?usp=drive_link

The other files are in the data folder.

Use our ADNI dataset.

In [3]:
def read_adni_data():
    """Load the ADNI ``.mat`` files found under ``args.dataset_dir``.

    Reads ``fmri_signal.mat``, ``ICV.mat``, ``AGE.mat``, ``gender.mat`` and
    ``DX.mat`` (in that order). For every file the array stored under the
    file's variable name is taken and the first element of each row is kept.

    Returns:
        dict with the keys
        ``'fMRI_data'``   -- list with one entry per row of the fMRI array,
        ``'ICV_data'``, ``'AGE_data'``, ``'gender_data'``, ``'DX_data'`` --
        two-column DataFrames ``['Image_ID', <value column>]`` where
        ``Image_ID`` counts 1..len(fMRI_data).
    """
    base_dir = args.dataset_dir

    def _first_cells(file_name, variable):
        # loadmat returns a dict of variables; keep element 0 of every row.
        raw = scipy.io.loadmat(base_dir + file_name)[variable]
        return [row[0] for row in raw]

    # Load everything first, in the same order as the files are listed above.
    fMRI_data = _first_cells("fmri_signal.mat", 'fmri_signal')
    ICV_data = pd.DataFrame(_first_cells("ICV.mat", 'ICV'))
    AGE_data = pd.DataFrame(_first_cells("AGE.mat", 'AGE'))
    gender_data = pd.DataFrame(_first_cells("gender.mat", 'gender'))
    DX_data = pd.DataFrame(_first_cells("DX.mat", 'DX'))

    # (frame, name of its value column)
    covariates = (
        (ICV_data, 'EstimatedTotalIntraCranialVol'),
        (AGE_data, 'Age'),
        (gender_data, 'Gender'),
        (DX_data, 'Diagnosis'),
    )

    # Prepend a 1-based Image_ID to every covariate table; the id range is
    # derived from the number of fMRI entries.
    image_ids = range(1, 1 + len(fMRI_data))
    for frame, _ in covariates:
        frame.insert(0, 'Image_ID', image_ids)

    # Give each table its final column names.
    for frame, value_column in covariates:
        frame.columns = ['Image_ID', value_column]

    return {
        'fMRI_data': fMRI_data,
        'ICV_data': ICV_data,
        'AGE_data': AGE_data,
        'gender_data': gender_data,
        'DX_data': DX_data,
    }

In [4]:
# Label path
labels_file = args.dataset_dir + 'y.csv'
# Load labels (the 'IID' and 'Diagnosis' columns are used below)
labels_df = pd.read_csv(labels_file)

# Entries of the connectivity matrix below this percentile are zeroed, i.e.
# roughly the strongest 10% of connections become edges.
EDGE_PERCENTILE = 90
# Only graphs with exactly this many edges are kept (the original code's
# "special case" filter). Presumably 10% of a 100 x 100 matrix -- TODO confirm
# against the model's input requirements.
EXPECTED_NUM_EDGES = 1000

# for ADNI Dataset
if args.dataset == "ADNI":
    adni_data = read_adni_data()
    fMRI_data = adni_data['fMRI_data']
    ICV_data = adni_data['ICV_data']
    AGE_data = adni_data['AGE_data']
    gender_data = adni_data['gender_data']
    DX_data = adni_data['DX_data']

    # Keep only the diagnosis codes 2 and 0 (healthy control and AD), then
    # relabel 2 -> 1 so that the task is binary with labels {0, 1}.
    labels_df = labels_df[labels_df['Diagnosis'].isin([2, 0])].reset_index(drop=True)
    labels_df['Diagnosis'] = labels_df['Diagnosis'].replace({2: 1})

    # NOTE: the previous version also z-scored fMRI_data[IID] here, but the
    # result was never used and the step overwrote zeros in the loaded signals
    # in place. It has been removed; node features come from the connectivity
    # matrix only.
    edge_file_prefix = args.dataset_dir + 'fmri_edge/' + 'cosinecosine_/cosinecosine_'

    dataset = []
    for IID, diagnosis in zip(labels_df['IID'], labels_df['Diagnosis']):
        y = torch.tensor(diagnosis, dtype=torch.long)

        # Work on an explicit, writable copy of the matrix. Mutating the
        # result of DataFrame.to_numpy() in place only works when pandas
        # happens to hand back a writable view.
        connectivity = pd.read_csv(edge_file_prefix + str(IID) + '.csv').to_numpy(dtype=float, copy=True)

        # Node features: each node's full row of the connectivity matrix,
        # taken before the diagonal is cleared.
        x = torch.tensor(connectivity, dtype=torch.float)

        # No self-loops.
        np.fill_diagonal(connectivity, 0)

        # Threshold at the chosen percentile of all matrix entries and zero
        # everything strictly below it.
        threshold = np.percentile(connectivity, EDGE_PERCENTILE)
        connectivity[connectivity < threshold] = 0

        # Edges are the remaining non-zero entries: (2, num_edges) index array.
        edge_index = np.vstack(np.nonzero(connectivity))

        # Skip subjects whose graph does not have the expected edge count.
        if edge_index.shape[1] != EXPECTED_NUM_EDGES:
            continue

        # One weight per edge, as a 1-D tensor aligned with edge_index.
        edge_weight = torch.tensor(connectivity[edge_index[0], edge_index[1]], dtype=torch.float)

        dataset.append(
            Data(
                x=x,
                edge_index=torch.tensor(edge_index, dtype=torch.long),
                edge_attr=edge_weight,
                y=y,
            )
        )

    # get train and test data
    train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

# for HCP
elif args.dataset == "HCP":
    # train_ids = pd.read_csv(args.dataset_dir + 'ids_train.csv')['IID'].values
    # test_ids = pd.read_csv(args.dataset_dir + 'ids_test.csv')['IID'].values

    # train_data = [data for data in (load_mat_data(iid) for iid in train_ids) if data is not None]
    # test_data = [data for data in (load_mat_data(iid) for iid in test_ids) if data is not None]

    # Fail here instead of leaving train_data/test_data undefined for the
    # cells that follow.
    raise NotImplementedError("HCP loading is not implemented in this notebook")

else:
    raise ValueError("Unsupported dataset: {!r}".format(args.dataset))
    

<!-- # # first get a participants df, based on gender, dx, age, image_id
# # Assuming you have the following dataframes: gender_data, DX_data, AGE_data, and ICV_data

# # Merge gender_data, DX_data, AGE_data, and ICV_data on 'Image_ID'
# participants_df = pd.merge(gender_data, DX_data, on='Image_ID')
# participants_df = pd.merge(participants_df, AGE_data, on='Image_ID')

# # add ICV is to remove the NaN values in the ICV column
# participants_df = pd.merge(participants_df, ICV_data, on='Image_ID')

# # Rename the columns
# participants_df.columns = ['Image_ID', 'Gender', 'Diagnosis', 'Age', 'EstimatedTotalIntraCranialVol']

# participants_df = pd.DataFrame(participants_df)
# participants_df = participants_df.dropna()

# # filter parcitipants_df, only keep Gender is M or F, and Diagnosis is CN or 'Dementia', and change M to 0, F to 1, CN to 1, 'Dementia' to 0

# # Create a copy to avoid modifying the original DataFrame
# filtered_participants_df = participants_df[(participants_df['Gender'].isin(['M', 'F'])) & 
#                                            (participants_df['Diagnosis'].isin(['CN', 'MCI', 'Dementia']))].copy()

# # Converting categorical values to numeric codes using .loc
# filtered_participants_df.loc[:, 'Gender'] = filtered_participants_df['Gender'].replace({'M': 0, 'F': 1})
# filtered_participants_df.loc[:, 'Diagnosis'] = filtered_participants_df['Diagnosis'].replace({'CN': 2, 'MCI': 1,'Dementia': 0})

# # delete the EstimatedTotalIntraCranialVol column
# filtered_participants_df = filtered_participants_df.drop(columns=['EstimatedTotalIntraCranialVol'])

# # save this to a participants.tsv file
# filtered_participants_df.to_csv(os.path.join(data_path, 'participants.tsv'), index=False)

# filtered_Image_ID = filtered_participants_df['Image_ID']

# # get HC, MCI, AD in Image_ID
# filtered_Image_ID_HC = filtered_participants_df[filtered_participants_df['Diagnosis'] == 2]['Image_ID']
# # filtered_Image_ID_MCI = filtered_participants_df[filtered_participants_df['Diagnosis'] == 1]['Image_ID']
# filtered_Image_ID_AD = filtered_participants_df[filtered_participants_df['Diagnosis'] == 0]['Image_ID']

# import numpy as np
# import pandas as pd

# def split_data(image_ids, train_frac=0.8, random_seed=42):
#     """
#     Split the data into training and testing sets.
    
#     Parameters:
#     image_ids (pd.Series): Series of image IDs to be split.
#     train_frac (float): Fraction of data to be used for training.
#     random_seed (int): Seed for the random number generator.
    
#     Returns:
#     train_ids (pd.Series): Training set of image IDs.
#     test_ids (pd.Series): Test set of image IDs.
#     """
#     train_ids = image_ids.sample(frac=train_frac, random_state=random_seed)
#     test_ids = image_ids.drop(train_ids.index)
#     return train_ids, test_ids

# # Apply the function to each category
# training_Image_ID_HC, test_Image_ID_HC = split_data(filtered_Image_ID_HC)
# # training_Image_ID_MCI, test_Image_ID_MCI = split_data(filtered_Image_ID_MCI)
# training_Image_ID_AD, test_Image_ID_AD = split_data(filtered_Image_ID_AD)


# # Combine the training and test sets
# # training_Image_ID = pd.concat([training_Image_ID_HC, training_Image_ID_MCI, training_Image_ID_AD])
# # test_Image_ID = pd.concat([test_Image_ID_HC, test_Image_ID_MCI, test_Image_ID_AD])

# training_Image_ID = pd.concat([training_Image_ID_HC, training_Image_ID_AD])
# test_Image_ID = pd.concat([test_Image_ID_HC, test_Image_ID_AD])
 -->


<!-- # import pandas as pd

# # Load the labels data
# labels_file = data_path + 'y.csv'
# labels_df = pd.read_csv(labels_file)

# # Shuffle the data before splitting
# labels_df = labels_df.sample(frac=1, random_state=42).reset_index(drop=True)

# # Define the train-test split ratio
# train_ratio = 0.8
# train_size = int(len(labels_df) * train_ratio)

# # Split the data into train and test sets
# train_df = labels_df[:train_size]
# test_df = labels_df[train_size:]

# # Extract the 'IID' values for train and test sets
# train_ids = train_df['IID'].values
# test_ids = test_df['IID'].values

# # Display the first few values of train and test IDs
# label_train = train_df['Diagnosis'].values
# label_test = test_df['Diagnosis'].values
# print('train value counts:', np.unique(label_train, return_counts=True))
# print('test value counts:', np.unique(label_test, return_counts=True))

# # save them to ids_train.csv and ids_test.csv, columns: 'IID'. only ids
# # train_df['IID'].to_csv('./data_ADNI/ids_train.csv', index=False)
# # test_df['IID'].to_csv('./data_ADNI/ids_test.csv', index=False)
# train_df.to_csv(data_path + '/ids_train.csv', index=False)
# test_df.to_csv(data_path + '/ids_test.csv', index=False) -->

<!-- # Data paths
# labels_file = args.dataset_dir + 'y.csv'

# Load labels
# labels_df = pd.read_csv(labels_file)

# # only keep healthy control and AD. namely 2 and 0
# labels_df = labels_df[labels_df['Diagnosis'].isin([2, 0])].reset_index(drop=True)

# # change all 2 to 1
# labels_df['Diagnosis'] = labels_df['Diagnosis'].replace({2: 1})

# dataset = []

        
# traverse the labels_df by i


# for i in range(len(labels_df)):
#     # print('i:', i)
#     IID = labels_df['IID'][i]
#     y = labels_df['Diagnosis'][i]
#     # turn y to <class 'torch.Tensor'>
#     y = torch.tensor(y, dtype=torch.long)
#     # z-score normalization for each column of each subject
#     subject_data = fMRI_data[i]
#     # fill 0 with 1
#     subject_data[subject_data == 0] = 1
#     subject_data = (subject_data - np.mean(subject_data, axis=0)) / np.std(subject_data, axis=0)
#     x = torch.tensor(subject_data[:100, :], dtype=torch.float)
#     edge_attr = pd.read_csv(args.dataset_dir + 'fmri_edge/' + 'pearsonpearson_' + str(i + 1) + '.csv')

#     np.fill_diagonal(edge_attr.to_numpy(), 0)


#     # 获取10%最大的元素的阈值
#     threshold = np.percentile(edge_attr, 90)
    
#     # 只保留大于阈值的元素，其他置为0
#     edge_attr[edge_attr < threshold] = 0


#     edge_index = np.vstack(np.nonzero(edge_attr.to_numpy()))

#     data = Data(x=x, edge_index=torch.tensor(edge_index, dtype=torch.long), edge_attr=torch.tensor(np.nonzero(edge_attr.to_numpy()), dtype=torch.float), y=y)

#     if data.edge_index[0].shape[0] != 1000:
#         continue    

#     dataset.append(data) -->

## Train

In [5]:
# `train_data` and `test_data` come from the dataset-building cell (80/20
# split). Here 12.5% of the training part is held out for validation.
# NOTE: this rebinds `train_data`, so running the cell twice without
# re-running the dataset cell shrinks the training set again.
train_data, val_data = train_test_split(
    train_data,
    test_size=0.125,
    random_state=123,
)

# Mini-batch loaders; only the training loader shuffles.
train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=args.batch_size, shuffle=False)
test_loader = DataLoader(test_data, batch_size=args.batch_size, shuffle=False)

# Loss used by the training loop.
criterion = torch.nn.CrossEntropyLoss()

In [7]:
def train(train_loader):
    """Run one optimisation pass over ``train_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion`` and
    ``args.device``. The model is switched to training mode and its weights
    are updated, so this must only be called on training data.

    Args:
        train_loader: PyG ``DataLoader`` yielding batches with a ``y`` label.

    Returns:
        0-dim ``torch.Tensor`` holding the mean loss per graph over the whole
        loader (callers use ``.item()`` on it).
    """
    model.train()
    # Accumulate detached values only: summing the live `loss` tensors keeps
    # autograd bookkeeping attached to the running total.
    total_loss = torch.zeros((), device=args.device)
    for data in train_loader:
        data = data.to(args.device)
        # Clear gradients before the forward/backward pass so that nothing
        # left over from an earlier (possibly interrupted) step is applied.
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out, data.y)
        loss.backward()
        optimizer.step()
        # `criterion` averages over the batch, so weight by the batch size to
        # make the final division by the sample count a true per-graph mean.
        total_loss += loss.detach() * data.num_graphs
    return total_loss / len(train_loader.dataset)

@torch.no_grad()
def test(loader):
    """Evaluate the notebook-level ``model`` on ``loader`` (binary task).

    Labels are expected to be 0/1; class 1 is treated as the positive class.

    Args:
        loader: PyG ``DataLoader`` yielding batches with a ``y`` label.

    Returns:
        dict with the keys ``'accuracy'``, ``'auroc'``, ``'sensitivity'``,
        ``'specificity'`` and ``'f1_score'``. A metric that is undefined for
        the given data (AUROC when only one class is present, or a rate whose
        denominator is zero) is reported as NaN instead of raising.
    """
    model.eval()
    batch_preds, batch_probs, batch_labels = [], [], []
    for data in loader:
        data = data.to(args.device)
        out = model(data)
        # Class probabilities; only the positive-class column is kept.
        probs = torch.softmax(out, dim=1)
        batch_preds.append(out.argmax(dim=1).cpu().numpy())
        batch_probs.append(probs[:, 1].cpu().numpy())
        batch_labels.append(data.y.cpu().numpy())

    all_preds = np.concatenate(batch_preds)
    all_probs = np.concatenate(batch_probs)
    all_labels = np.concatenate(batch_labels)

    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    # roc_auc_score raises when y_true holds a single class, which can happen
    # on a small validation split.
    if np.unique(all_labels).size < 2:
        auroc = float('nan')
    else:
        auroc = roc_auc_score(all_labels, all_probs)

    # Fix the label set so the matrix is always 2 x 2; without it a split in
    # which only one class occurs yields a 1 x 1 matrix and the unpack fails.
    tn, fp, fn, tp = confusion_matrix(all_labels, all_preds, labels=[0, 1]).ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else float('nan')
    specificity = tn / (tn + fp) if (tn + fp) > 0 else float('nan')

    return {
        'accuracy': accuracy,
        'auroc': auroc,
        'sensitivity': sensitivity,
        'specificity': specificity,
        'f1_score': f1,
    }

# test for multiclass
def test_multiclass(loader):
    """Evaluate the notebook-level ``model`` on ``loader`` (multi-class task).

    Args:
        loader: PyG ``DataLoader`` yielding batches with a ``y`` label.

    Returns:
        dict with the keys ``'accuracy'``, ``'auroc'`` (one-vs-rest),
        ``'sensitivity'`` and ``'specificity'`` (mean of the per-class
        values, ignoring classes for which the value is undefined) and
        ``'f1_score'`` (weighted average).
    """
    model.eval()
    batch_preds, batch_probs, batch_labels = [], [], []
    with torch.no_grad():
        for data in loader:
            data = data.to(args.device)
            out = model(data)
            # Probabilities for every class
            probs = torch.softmax(out, dim=1)
            batch_preds.append(out.argmax(dim=1).cpu().numpy())
            batch_probs.append(probs.cpu().numpy())
            batch_labels.append(data.y.cpu().numpy())

    all_preds = np.concatenate(batch_preds)
    all_probs = np.concatenate(batch_probs, axis=0)
    all_labels = np.concatenate(batch_labels)

    # Compute the metrics
    accuracy = accuracy_score(all_labels, all_preds)
    # 'ovr' selects the one-vs-rest strategy for multi-class AUROC
    auroc = roc_auc_score(all_labels, all_probs, multi_class='ovr')
    # Weighted average over the classes
    f1 = f1_score(all_labels, all_preds, average='weighted')
    # Rows are true classes, columns are predicted classes
    cm = confusion_matrix(all_labels, all_preds)

    # Per-class one-vs-rest counts, computed for all classes at once.
    tp = np.diag(cm).astype(float)
    fn = cm.sum(axis=1) - tp
    fp = cm.sum(axis=0) - tp
    tn = cm.sum() - (tp + fn + fp)

    # A class that is predicted but never occurs in the labels has
    # tp + fn == 0; its sensitivity is undefined (NaN). Such entries are left
    # out of the mean instead of turning the whole average into NaN.
    with np.errstate(divide='ignore', invalid='ignore'):
        sensitivities = tp / (tp + fn)
        specificities = tn / (tn + fp)

    return {
        'accuracy': accuracy,
        'auroc': auroc,
        'sensitivity': np.nanmean(sensitivities),
        'specificity': np.nanmean(specificities),
        'f1_score': f1,
    }


checkpoints_dir = './checkpoints/'
os.makedirs(checkpoints_dir, exist_ok=True)

val_acc_history, test_acc_history, test_loss_history = [], [], []
# Logged next to the results by the final evaluation cell.
seed = args.seed


@torch.no_grad()
def evaluate_loss(loader):
    """Return the mean per-graph loss of ``model`` on ``loader``.

    Runs in eval mode without gradients and never touches the optimizer, so
    it is safe to call on validation or test data.
    """
    model.eval()
    loss_sum = 0.0
    for data in loader:
        data = data.to(args.device)
        # `criterion` averages over the batch; weight by the batch size.
        loss_sum += criterion(model(data), data.y).item() * data.num_graphs
    return loss_sum / len(loader.dataset)


for index in range(args.runs):
    # NOTE(review): `eval` resolves the layer class from its configured name.
    # `args.model` is a constant set in this notebook; never feed it
    # untrusted input.
    gnn = eval(args.model)
    model = ResidualGNNs(args, train_data, args.hidden, args.hidden_mlp, args.num_layers, gnn).to(args.device) ## apply GNN*
    print(model)
    # Not used by the loop; kept for interactive inspection.
    total_params = sum(p.numel() for p in model.parameters())
    optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)

    best_checkpoint = checkpoints_dir+args.dataset+args.model+'task-checkpoint-best-auroc.pkl'
    # Start below any reachable AUROC so the first finite value always writes
    # a checkpoint for this run (otherwise a stale file from an earlier run
    # could be loaded below).
    best_val_auroc = float('-inf')
    for epoch in range(args.epochs):
        loss = train(train_loader)
        train_metrics = test(train_loader)
        val_metrics = test(val_loader)
        test_metrics = test(test_loader)
        print("epoch: {}, loss: {}, \ntrain_metrics:{}, \nval_metrics:{}, \ntest_metrics:{}".format(epoch, np.round(loss.item(),6), train_metrics, val_metrics, test_metrics))

        # Model selection uses the validation AUROC only.
        if val_metrics['auroc'] > best_val_auroc:
            best_val_auroc = val_metrics['auroc']
            torch.save(model.state_dict(), best_checkpoint)

    # Test the best checkpoint of this run.
    model.load_state_dict(torch.load(best_checkpoint, map_location=args.device))
    model.eval()
    test_acc = test(test_loader)['accuracy']
    # The previous version called train(test_loader) here, which ran
    # optimisation steps on the test set. Measure the loss without updating
    # the model instead.
    test_loss = evaluate_loss(test_loader)
    test_acc_history.append(test_acc)
    test_loss_history.append(test_loss)

ResidualGNNs(
  (convs): ModuleList(
    (0): GCNConv(100, 32)
    (1-2): 2 x GCNConv(32, 32)
  )
  (aggr): MeanAggregation()
  (bn): BatchNorm1d(5050, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bnh): BatchNorm1d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (mlp): Sequential(
    (0): Linear(in_features=5146, out_features=64, bias=True)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.5, inplace=False)
    (8): Linear(in_features=32, out_features=32, bias=True)
    (9): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Dropout(p=0.5, inplace=False)
    (12): Linear(in_features=32, out_features=2, bias=True

In [8]:
# Reload the checkpoint with the best validation AUROC and evaluate it once.
model.load_state_dict(
    torch.load(
        checkpoints_dir+args.dataset+args.model+'task-checkpoint-best-auroc.pkl',
        map_location=args.device,
    )
)
model.eval()
test_metrics = test(test_loader)
test_acc = test_metrics['accuracy']

# Test loss, measured without gradients and without touching the optimizer.
# The previous version obtained it from train(test_loader), which performed
# optimisation steps on the test set.
test_loss_sum = 0.0
with torch.no_grad():
    for data in test_loader:
        data = data.to(args.device)
        # `criterion` averages over the batch; weight by the batch size so the
        # result is the mean loss per graph.
        test_loss_sum += criterion(model(data), data.y).item() * data.num_graphs
test_loss = test_loss_sum / len(test_loader.dataset)

# The training cell already appended this run's results to
# test_acc_history / test_loss_history, so nothing is appended here; that
# also keeps this cell safe to re-run.
print('test_metrics:', test_metrics)

# save the results to result.txt (the `with` block closes the file)
with open('result.txt', 'a') as f:
    f.write('dataset: {}, model: {}, seed: {}, test_acc: {}, test_loss: {}, test_metrics: {}\n'.format(args.dataset, args.model, args.seed, test_acc, test_loss, test_metrics))
    f.write('--------------------------------------------\n')

test_metrics: {'accuracy': 0.853448275862069, 'auroc': 0.913472485768501, 'sensitivity': 0.8588235294117647, 'specificity': 0.8387096774193549, 'f1_score': 0.8957055214723927}
