In [15]:
# import torch
# import torch.nn as nn
# from torch_geometric.nn import GCNConv, global_mean_pool as gep

# # GNN Model definition for molecule and protein graphs
# class GNNNet(torch.nn.Module):
#     def __init__(self, n_output=1, num_features_pro=52, num_features_mol=78, output_dim=128, dropout=0.2):
#         super(GNNNet, self).__init__()

#         print('GNNNet Loaded')
#         # Molecule GNN layers
#         self.mol_conv1 = GCNConv(num_features_mol, num_features_mol)
#         self.mol_conv2 = GCNConv(num_features_mol, num_features_mol * 2)
#         self.mol_conv3 = GCNConv(num_features_mol * 2, num_features_mol * 4)
#         self.mol_fc_g1 = torch.nn.Linear(num_features_mol * 4, 1024)
#         self.mol_fc_g2 = torch.nn.Linear(1024, output_dim)

#         # Protein GNN layers
#         self.pro_conv1 = GCNConv(num_features_pro, num_features_pro)
#         self.pro_conv2 = GCNConv(num_features_pro * 2, num_features_pro * 4)
#         self.pro_fc_g1 = torch.nn.Linear(num_features_pro * 4, 1024)
#         self.pro_fc_g2 = torch.nn.Linear(1024, output_dim)

#         self.relu = nn.ReLU()
#         self.dropout = nn.Dropout(dropout)

#         # Combined dense layers
#         self.fc1 = nn.Linear(2 * output_dim, 1024)
#         self.fc2 = nn.Linear(1024, 512)
#         self.out = nn.Linear(512, n_output)

#     def forward(self, data_mol, data_pro):
#         # Molecule forward pass
#         mol_x, mol_edge_index, mol_batch = data_mol.x, data_mol.edge_index, data_mol.batch
#         mol_x = self.mol_conv1(mol_x, mol_edge_index)
#         mol_x = self.relu(mol_x)
#         mol_x = self.mol_conv2(mol_x, mol_edge_index)
#         mol_x = self.relu(mol_x)
#         mol_x = self.mol_conv3(mol_x, mol_edge_index)
#         mol_x = gep(mol_x, mol_batch)  # global pooling
#         mol_x = self.relu(self.mol_fc_g1(mol_x))
#         mol_x = self.dropout(mol_x)
#         mol_x = self.mol_fc_g2(mol_x)

#         # Protein forward pass
#         pro_x, pro_edge_index, pro_batch = data_pro.x, data_pro.edge_index, data_pro.batch
#         pro_x = self.pro_conv1(pro_x, pro_edge_index)
#         pro_x = self.relu(pro_x)
#         pro_x = self.pro_conv2(pro_x, pro_edge_index)
#         pro_x = self.relu(pro_x)
#         pro_x = gep(pro_x, pro_batch)  # global pooling
#         pro_x = self.relu(self.pro_fc_g1(pro_x))
#         pro_x = self.dropout(pro_x)
#         pro_x = self.pro_fc_g2(pro_x)

#         # Concatenate molecule and protein features
#         xc = torch.cat((mol_x, pro_x), dim=1)
#         xc = self.fc1(xc)
#         xc = self.relu(xc)
#         xc = self.dropout(xc)
#         xc = self.fc2(xc)
#         xc = self.relu(xc)
#         xc = self.dropout(xc)
#         out = self.out(xc)

#         return out


In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv, global_max_pool as gmp, global_add_pool as gap,global_mean_pool as gep,global_sort_pool
from torch_geometric.utils import dropout_adj


# GCN based model
class GNNNet(torch.nn.Module):
    def __init__(self, n_output=1, num_features_pro=54, num_features_mol=78, output_dim=128, dropout=0.2):
        super(GNNNet, self).__init__()

        print('GNNNet Loaded')
        self.n_output = n_output
        self.mol_conv1 = GCNConv(num_features_mol, num_features_mol)
        self.mol_conv2 = GCNConv(num_features_mol, num_features_mol * 2)
        self.mol_conv3 = GCNConv(num_features_mol * 2, num_features_mol * 4)
        self.mol_fc_g1 = torch.nn.Linear(num_features_mol * 4, 1024)
        self.mol_fc_g2 = torch.nn.Linear(1024, output_dim)

        # self.pro_conv1 = GCNConv(embed_dim, embed_dim)
        self.pro_conv1 = GCNConv(num_features_pro, num_features_pro)
        self.pro_conv2 = GCNConv(num_features_pro, num_features_pro * 2)
        self.pro_conv3 = GCNConv(num_features_pro * 2, num_features_pro * 4)
        # self.pro_conv4 = GCNConv(embed_dim * 4, embed_dim * 8)
        self.pro_fc_g1 = torch.nn.Linear(num_features_pro * 4, 1024)
        self.pro_fc_g2 = torch.nn.Linear(1024, output_dim)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

        # combined layers
        self.fc1 = nn.Linear(2 * output_dim, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.out = nn.Linear(512, self.n_output)

    def forward(self, data_mol, data_pro):
        # get graph input
        mol_x, mol_edge_index, mol_batch = data_mol.x, data_mol.edge_index, data_mol.batch
        # get protein input
        target_x, target_edge_index, target_batch = data_pro.x, data_pro.edge_index, data_pro.batch

        # target_seq=data_pro.target

        # print('size')
        # print('mol_x', mol_x.size(), 'edge_index', mol_edge_index.size(), 'batch', mol_batch.size())
        # print('target_x', target_x.size(), 'target_edge_index', target_batch.size(), 'batch', target_batch.size())

        x = self.mol_conv1(mol_x, mol_edge_index)
        x = self.relu(x)

        # mol_edge_index, _ = dropout_adj(mol_edge_index, training=self.training)
        x = self.mol_conv2(x, mol_edge_index)
        x = self.relu(x)

        # mol_edge_index, _ = dropout_adj(mol_edge_index, training=self.training)
        x = self.mol_conv3(x, mol_edge_index)
        x = self.relu(x)
        x = gep(x, mol_batch)  # global pooling

        # flatten
        x = self.relu(self.mol_fc_g1(x))
        x = self.dropout(x)
        x = self.mol_fc_g2(x)
        x = self.dropout(x)

        xt = self.pro_conv1(target_x, target_edge_index)
        xt = self.relu(xt)

        # target_edge_index, _ = dropout_adj(target_edge_index, training=self.training)
        xt = self.pro_conv2(xt, target_edge_index)
        xt = self.relu(xt)

        # target_edge_index, _ = dropout_adj(target_edge_index, training=self.training)
        xt = self.pro_conv3(xt, target_edge_index)
        xt = self.relu(xt)

        # xt = self.pro_conv4(xt, target_edge_index)
        # xt = self.relu(xt)
        xt = gep(xt, target_batch)  # global pooling

        # flatten
        xt = self.relu(self.pro_fc_g1(xt))
        xt = self.dropout(xt)
        xt = self.pro_fc_g2(xt)
        xt = self.dropout(xt)

        # print(x.size(), xt.size())
        # concat
        xc = torch.cat((x, xt), 1)
        # add some dense layers
        xc = self.fc1(xc)
        xc = self.relu(xc)
        xc = self.dropout(xc)
        xc = self.fc2(xc)
        xc = self.relu(xc)
        xc = self.dropout(xc)
        out = self.out(xc)
        return out


In [3]:
# import os
# import torch
# import pickle
# import pandas as pd

# def load_graph(path, is_pickle=True):
#     """
#     Load a molecule graph (.pkl) or a protein graph (.pt).
#     If is_pickle is True, use pickle to load the file; otherwise, use torch.load.
#     """
#     if is_pickle:
#         with open(path, 'rb') as f:
#             return pickle.load(f)
#     else:
#         return torch.load(path)

# def prepare_dataset_incremental(filtered_dataset, molecule_graph_dir, protein_graph_dir, output_file):
#     """
#     Incrementally prepares the dataset to avoid memory issues.
#     Processes one protein and its associated molecules at a time, and appends the results to the output file.
    
#     Args:
#     - filtered_dataset: The filtered KIBA dataset (DataFrame).
#     - molecule_graph_dir: Directory where molecule graphs are stored.
#     - protein_graph_dir: Directory where protein graphs are stored.
#     - output_file: File to save the prepared dataset incrementally.
#     """
#     current_protein = None
#     dataset = []
    
#     for index, row in filtered_dataset.iterrows():
#         protein_id = row['Target_ID']
#         chembl_id = row['Drug_ID']
        
#         # If the protein changes, save the dataset for the previous protein
#         if current_protein is not None and current_protein != protein_id:
#             with open(output_file, 'ab') as f:  # Append to the output file
#                 pickle.dump(dataset, f)
#             print(f"Processed and saved data for protein {current_protein}.")
#             dataset = []  # Reset dataset for the next protein
        
#         current_protein = protein_id
        
#         # Load the protein graph (.pt)
#         pro_graph_path = os.path.join(protein_graph_dir, f"{protein_id}_graph.pt")
#         if not os.path.exists(pro_graph_path):
#             print(f"Protein graph not found: {protein_id}")
#             continue
#         pro_graph = load_graph(pro_graph_path, is_pickle=False)
        
#         # Load the molecule graph (.pkl)
#         mol_graph_path = os.path.join(molecule_graph_dir, f"{chembl_id}_graph.pkl")
#         if not os.path.exists(mol_graph_path):
#             print(f"Molecule graph not found: {chembl_id}")
#             continue
#         mol_graph = load_graph(mol_graph_path)

#         # Load target (affinity value)
#         target = torch.tensor([row['Y']], dtype=torch.float)
        
#         # Append the (molecule, protein, target) tuple to the dataset
#         dataset.append((mol_graph, pro_graph, target))
    
#     # Save the last batch (for the final protein)
#     if len(dataset) > 0:
#         with open(output_file, 'ab') as f:
#             pickle.dump(dataset, f)
#         print(f"Processed and saved data for protein {current_protein}.")

# # Example usage for incremental dataset preparation
# molecule_graph_dir = 'molecule_graphs/'  # Directory where molecule graphs are stored
# protein_graph_dir = 'ProteinGraphs/'  # Directory where protein graphs are stored
# filtered_dataset_path = 'filtered_KibaDataSet.csv'  # Path to the filtered dataset CSV

# # Load filtered dataset CSV
# filtered_dataset = pd.read_csv(filtered_dataset_path)

# # Prepare the dataset incrementally, saving after each protein
# output_file = 'incremental_prepared_dataset.pkl'  # Output file to save dataset incrementally
# prepare_dataset_incremental(filtered_dataset, molecule_graph_dir, protein_graph_dir, output_file)

# print("Dataset preparation completed.")


In [1]:
# import pickle

# def load_prepared_dataset(filepath):
#     """
#     Load the prepared dataset saved as a pickle file.
    
#     Args:
#     - filepath: Path to the file where the prepared dataset is saved.
    
#     Returns:
#     - dataset: The complete list of samples from the dataset.
#     """
#     dataset = []
    
#     with open(filepath, 'rb') as f:
#         while True:
#             try:
#                 # Load a batch of samples
#                 batch = pickle.load(f)
#                 dataset.extend(batch)  # Add the batch to the full dataset
#             except EOFError:
#                 # End of file reached
#                 break

#     return dataset

# # Example usage:
# prepared_dataset_path = 'incremental_prepared_dataset.pkl'  # Path to the saved dataset

# # Load the dataset
# loaded_dataset = load_prepared_dataset(prepared_dataset_path)

# # Print the total number of samples
# print(f"Total number of samples in the dataset: {len(loaded_dataset)}")


In [2]:
# import os
# import torch
# import pickle
# import pandas as pd
# from torch_geometric.data import Data

# def load_graph(path):
#     # Load graphs from .pkl for molecules and .pt for proteins
#     with open(path, 'rb') as f:
#         return pickle.load(f)

# def prepare_dataset(filtered_dataset, molecule_graph_dir, protein_graph_dir):
#     dataset = []
    
#     for index, row in filtered_dataset.iterrows():
#         # Load molecule graph based on Drug_ID
#         mol_graph_path = os.path.join(molecule_graph_dir, f"{row['Drug_ID']}_graph.pkl")
#         mol_graph = load_graph(mol_graph_path)

#         # Load protein graph based on Target_ID
#         pro_graph_path = os.path.join(protein_graph_dir, f"{row['Target_ID']}_graph.pt")
#         pro_graph = torch.load(pro_graph_path)
        
#         # Load target (affinity value)
#         target = torch.tensor([row['Y']], dtype=torch.float)
        
#         # Append tuple (mol_graph, pro_graph, target) to dataset
#         dataset.append((mol_graph, pro_graph, target))
    
#     return dataset

# # Example usage for dataset preparation
# molecule_graph_dir = 'molecule_graphs/'  # Directory where molecule graphs are stored
# protein_graph_dir = 'ProteinGraphs/'  # Directory where protein graphs are stored
# filtered_dataset_path = 'filtered_KibaDataSet.csv'  # Path to the filtered dataset CSV

# # Load filtered dataset CSV
# filtered_dataset = pd.read_csv(filtered_dataset_path)

# # Prepare the dataset with molecule, protein graphs, and affinity scores
# prepared_dataset = prepare_dataset(filtered_dataset, molecule_graph_dir, protein_graph_dir)

# # Save the prepared dataset for later usage
# torch.save(prepared_dataset, 'prepared_dataset.pt')

# print(f"Dataset prepared with {len(prepared_dataset)} samples.")


In [4]:
# from sklearn.model_selection import KFold
# from torch_geometric.data import DataLoader
# import torch.optim as optim
# from torch.nn import MSELoss

# def train_5fold_cross_validation(prepared_dataset, num_epochs=1000, n_splits=5, lr=0.001):
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     print(f"Running on {device}.")

#     kfold = KFold(n_splits=n_splits, shuffle=True)
    
#     results = []
#     loss_fn = MSELoss()
    
#     for fold, (train_idx, test_idx) in enumerate(kfold.split(prepared_dataset)):
#         print(f'Fold {fold + 1}/{n_splits}')
        
#         # Split the dataset into training and testing based on indices
#         train_data = [prepared_dataset[i] for i in train_idx]
#         test_data = [prepared_dataset[i] for i in test_idx]
        
#         # Create DataLoader for training and testing
#         train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
#         test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

#         # Initialize the GNN model
#         model = GNNNet().to(device)
#         optimizer = optim.Adam(model.parameters(), lr=lr)

#         for epoch in range(num_epochs):
#             model.train()
#             for mol_data, pro_data, target in train_loader:
#                 mol_data = mol_data.to(device)
#                 pro_data = pro_data.to(device)
#                 target = target.to(device)
                
#                 optimizer.zero_grad()
#                 output = model(mol_data, pro_data)
#                 loss = loss_fn(output, target)
#                 loss.backward()
#                 optimizer.step()

#             print(f"Epoch {epoch+1}/{num_epochs} - Loss: {loss.item()}")

#         # Evaluation on the test set
#         model.eval()
#         total_preds, total_labels = [], []
#         with torch.no_grad():
#             for mol_data, pro_data, target in test_loader:
#                 mol_data = mol_data.to(device)
#                 pro_data = pro_data.to(device)
#                 target = target.to(device)
                
#                 output = model(mol_data, pro_data)
#                 total_preds.append(output.cpu().numpy())
#                 total_labels.append(target.cpu().numpy())

#         mse = get_mse(total_labels, total_preds)
#         ci = get_ci(total_labels, total_preds)
#         pearson = get_pearson(total_labels, total_preds)
#         print(f"Fold {fold+1} - MSE: {mse}, CI: {ci}, Pearson: {pearson}")

#         # Store results for this fold
#         results.append((mse, ci, pearson))

#     return results

# # Load the prepared dataset
# prepared_dataset = torch.load('prepared_dataset.pt')

# # Run 5-fold cross-validation training with the pre-prepared dataset
# results = train_5fold_cross_validation(prepared_dataset)


In [7]:
# import os
# import torch

# def load_prepared_dataset(directory):
#     """
#     Load all individual samples from the directory where each sample is saved as a .pt file.
    
#     Parameters:
#     directory (str): Path to the directory containing the individual .pt files
    
#     Returns:
#     list: A list of loaded samples
#     """
#     dataset = []
    
#     # Iterate over all files in the directory
#     for filename in os.listdir(directory):
#         if filename.endswith('.pt'):
#             file_path = os.path.join(directory, filename)
#             sample = torch.load(file_path)
#             dataset.append(sample)
    
#     return dataset

# # Load the prepared dataset from individual .pt files
# prepared_samples_dir = 'prepared_samples'  # The directory containing individual sample .pt files
# prepared_dataset = load_prepared_dataset(prepared_samples_dir)

# # Check how many samples are loaded
# print(f"Loaded {len(prepared_dataset)} samples.")


In [17]:
import os
import torch
import torch.nn.functional as F
from torch_geometric.data import Data, Batch
from torch_geometric.nn import GCNConv, global_mean_pool
import torch.optim as optim
from torch.nn import MSELoss
import numpy as np
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.model_selection import KFold

def get_mse(labels, preds):
    return mean_squared_error(labels, preds)

def get_pearson(labels, preds):
    return pearsonr(labels, preds)[0]

def get_ci(labels, preds):
    # Concordance Index (CI) implementation
    n = 0
    h_sum = 0
    for i in range(len(labels)):
        for j in range(i + 1, len(labels)):
            if labels[i] != labels[j]:
                n += 1
                if (preds[i] < preds[j] and labels[i] < labels[j]) or (preds[i] > preds[j] and labels[i] > labels[j]):
                    h_sum += 1
                elif preds[i] == preds[j]:
                    h_sum += 0.5
    return h_sum / n if n > 0 else 0.5

def load_sample(path):
    # Load individual sample from file
    sample = torch.load(path)
    mol_data = sample[0]
    pro_data = sample[1]
    target = sample[2]

    # Convert dictionaries to Data objects if necessary
    if isinstance(mol_data, dict):
        mol_data = Data(**mol_data)
    if isinstance(pro_data, dict):
        pro_data = Data(**pro_data)

    # Ensure that 'x' attribute is set
    if not hasattr(mol_data, 'x') or mol_data.x is None:
        if hasattr(mol_data, 'features'):
            mol_data.x = mol_data.features
            del mol_data.features
        else:
            raise ValueError("mol_data does not have 'x' or 'features' attribute")

    if not hasattr(pro_data, 'x') or pro_data.x is None:
        if hasattr(pro_data, 'features'):
            pro_data.x = pro_data.features
            del pro_data.features
        else:
            raise ValueError("pro_data does not have 'x' or 'features' attribute")

    # Ensure 'x' is a float tensor
    if not isinstance(mol_data.x, torch.Tensor):
        mol_data.x = torch.tensor(mol_data.x)
    if not isinstance(pro_data.x, torch.Tensor):
        pro_data.x = torch.tensor(pro_data.x)

    if mol_data.x.dtype != torch.float:
        mol_data.x = mol_data.x.float()
    if pro_data.x.dtype != torch.float:
        pro_data.x = pro_data.x.float()

    # **Adjust 'edge_index' for mol_data**
    # Ensure 'edge_index' is a tensor of type torch.long
    if not isinstance(mol_data.edge_index, torch.Tensor):
        mol_data.edge_index = torch.tensor(mol_data.edge_index, dtype=torch.long)
    else:
        mol_data.edge_index = mol_data.edge_index.long()

    # Ensure 'edge_index' has shape [2, num_edges]
    if mol_data.edge_index.shape[0] != 2:
        mol_data.edge_index = mol_data.edge_index.t()

    # **Adjust 'edge_index' for pro_data**
    if not isinstance(pro_data.edge_index, torch.Tensor):
        pro_data.edge_index = torch.tensor(pro_data.edge_index, dtype=torch.long)
    else:
        pro_data.edge_index = pro_data.edge_index.long()

    if pro_data.edge_index.shape[0] != 2:
        pro_data.edge_index = pro_data.edge_index.t()

    # Set 'num_nodes' attribute to suppress warnings
    mol_data.num_nodes = mol_data.x.size(0)
    pro_data.num_nodes = pro_data.x.size(0)

    return (mol_data, pro_data, target)


def batch_loader(sample_files, sample_dir, batch_size):
    for i in range(0, len(sample_files), batch_size):
        batch_files = sample_files[i:i + batch_size]
        batch_samples = []
        for file_name in batch_files:
            sample_path = os.path.join(sample_dir, file_name)
            batch_samples.append(load_sample(sample_path))
        yield batch_samples

def train_5fold_cross_validation(sample_dir, num_epochs=1000, n_splits=5, lr=0.001):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Running on {device}.")

    sample_files = [f for f in os.listdir(sample_dir) if f.endswith('.pt')]
    kfold = KFold(n_splits=n_splits, shuffle=True)

    results = []
    loss_fn = MSELoss()

    for fold, (train_idx, test_idx) in enumerate(kfold.split(sample_files)):
        print(f'Fold {fold + 1}/{n_splits}')

        train_files = [sample_files[i] for i in train_idx]
        test_files = [sample_files[i] for i in test_idx]

        # Determine input feature dimensions from your data
        sample = load_sample(os.path.join(sample_dir, train_files[0]))
        mol_data = sample[0]
        pro_data = sample[1]

        num_features_mol = mol_data.x.size(1)  # Should be 78
        num_features_pro = pro_data.x.size(1)  # Should be 52

        # Initialize the GNN model with correct input dimensions
        model = GNNNet(
            num_features_mol=num_features_mol,
            num_features_pro=num_features_pro
        ).to(device)
        optimizer = optim.Adam(model.parameters(), lr=lr)

        for epoch in range(num_epochs):
            model.train()
            running_loss = 0.0

            # Train in batches
            for batch_samples in batch_loader(train_files, sample_dir, batch_size=4):
                mol_data_list = []
                pro_data_list = []
                target_list = []

                for sample in batch_samples:
                    mol_data = sample[0]
                    pro_data = sample[1]
                    target = sample[2]

                    mol_data_list.append(mol_data)
                    pro_data_list.append(pro_data)
                    target_list.append(target)

                mol_batch = Batch.from_data_list(mol_data_list).to(device)
                pro_batch = Batch.from_data_list(pro_data_list).to(device)
                target = torch.tensor(target_list, dtype=torch.float32).to(device)

                optimizer.zero_grad()
                output = model(mol_batch, pro_batch)
                loss = loss_fn(output.view(-1), target)
                loss.backward()
                optimizer.step()
                running_loss += loss.item() * len(batch_samples)

            avg_loss = running_loss / len(train_files)
            print(f"Fold {fold+1}, Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss}")

        # Evaluation on the test set
        model.eval()
        total_preds, total_labels = [], []
        with torch.no_grad():
            for batch_samples in batch_loader(test_files, sample_dir, batch_size=4):
                mol_data_list = []
                pro_data_list = []
                target_list = []

                for sample in batch_samples:
                    mol_data = sample[0]
                    pro_data = sample[1]
                    target = sample[2]

                    mol_data_list.append(mol_data)
                    pro_data_list.append(pro_data)
                    target_list.append(target)

                mol_batch = Batch.from_data_list(mol_data_list).to(device)
                pro_batch = Batch.from_data_list(pro_data_list).to(device)
                target = torch.tensor(target_list, dtype=torch.float32).to(device)

                output = model(mol_batch, pro_batch)
                total_preds.append(output.cpu().numpy())
                total_labels.append(target.cpu().numpy())

        # Convert lists to numpy arrays for evaluation
        total_preds = np.concatenate(total_preds)
        total_labels = np.concatenate(total_labels)

        mse = get_mse(total_labels, total_preds)
        ci = get_ci(total_labels, total_preds)
        pearson = get_pearson(total_labels, total_preds)
        print(f"Fold {fold+1} - MSE: {mse}, CI: {ci}, Pearson: {pearson}")

        # Store results for this fold
        results.append((mse, ci, pearson))

    return results



# Path to the directory containing prepared individual samples
sample_dir = 'prepared_samples'  # Adjust the path to where the .pt files are stored

# Place the debugging code here
if __name__ == "__main__":
    # Define your sample directory
    sample_dir = 'prepared_samples'  # Adjust this path as needed

    # List all sample files
    sample_files = [f for f in os.listdir(sample_dir) if f.endswith('.pt')]

    # Test the first sample (or multiple samples)
    sample_path = os.path.join(sample_dir, sample_files[0])
    mol_data, pro_data, target = load_sample(sample_path)

    print(f"mol_data.edge_index dtype: {mol_data.edge_index.dtype}")
    print(f"mol_data.edge_index shape: {mol_data.edge_index.shape}")
    print(f"pro_data.edge_index dtype: {pro_data.edge_index.dtype}")
    print(f"pro_data.edge_index shape: {pro_data.edge_index.shape}")

    # Optionally, check multiple samples
    for i in range(5):  # Adjust the range as needed
        sample_path = os.path.join(sample_dir, sample_files[i])
        mol_data, pro_data, target = load_sample(sample_path)
        print(f"Sample {i}:")
        print(f"  mol_data.edge_index dtype: {mol_data.edge_index.dtype}")
        print(f"  mol_data.edge_index shape: {mol_data.edge_index.shape}")
        print(f"  pro_data.edge_index dtype: {pro_data.edge_index.dtype}")
        print(f"  pro_data.edge_index shape: {pro_data.edge_index.shape}")


# Run 5-fold cross-validation training
results = train_5fold_cross_validation(sample_dir)


mol_data.edge_index dtype: torch.int64
mol_data.edge_index shape: torch.Size([2, 50])
pro_data.edge_index dtype: torch.int64
pro_data.edge_index shape: torch.Size([2, 5264])
Sample 0:
  mol_data.edge_index dtype: torch.int64
  mol_data.edge_index shape: torch.Size([2, 50])
  pro_data.edge_index dtype: torch.int64
  pro_data.edge_index shape: torch.Size([2, 5264])
Sample 1:
  mol_data.edge_index dtype: torch.int64
  mol_data.edge_index shape: torch.Size([2, 66])
  pro_data.edge_index dtype: torch.int64
  pro_data.edge_index shape: torch.Size([2, 2710])
Sample 2:
  mol_data.edge_index dtype: torch.int64
  mol_data.edge_index shape: torch.Size([2, 66])
  pro_data.edge_index dtype: torch.int64
  pro_data.edge_index shape: torch.Size([2, 3916])
Sample 3:
  mol_data.edge_index dtype: torch.int64
  mol_data.edge_index shape: torch.Size([2, 54])
  pro_data.edge_index dtype: torch.int64
  pro_data.edge_index shape: torch.Size([2, 5264])
Sample 4:
  mol_data.edge_index dtype: torch.int64
  mol_d

KeyboardInterrupt: 

In [17]:
import os
import torch
from torch_geometric.data import DataLoader
import torch.optim as optim
from torch.nn import MSELoss
import numpy as np
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.model_selection import KFold

def get_mse(labels, preds):
    return mean_squared_error(labels, preds)

def get_pearson(labels, preds):
    return pearsonr(labels, preds)[0]

def get_ci(labels, preds):
    # Concordance Index (CI) implementation
    n = 0
    h_sum = 0
    for i in range(len(labels)):
        for j in range(i + 1, len(labels)):
            if labels[i] != labels[j]:
                n += 1
                if (preds[i] < preds[j] and labels[i] < labels[j]) or (preds[i] > preds[j] and labels[i] > labels[j]):
                    h_sum += 1
                elif preds[i] == preds[j]:
                    h_sum += 0.5
    return h_sum / n if n > 0 else 0.5

def load_sample(path):
    # Load individual sample from file
    return torch.load(path)

def batch_loader(sample_indices, sample_dir, batch_size=32):
    """
    Generator to load batches of samples from disk.

    Parameters:
    sample_indices (list): List of sample indices for the current fold.
    sample_dir (str): Directory where individual samples are saved.
    batch_size (int): Number of samples per batch.
    """
    for i in range(0, len(sample_indices), batch_size):
        batch_files = sample_indices[i:i + batch_size]
        batch_samples = []
        for file_name in batch_files:
            sample_path = os.path.join(sample_dir, file_name)
            batch_samples.append(load_sample(sample_path))
        yield batch_samples

def train_5fold_cross_validation(sample_dir, num_epochs=1000, n_splits=5, lr=0.001):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Running on {device}.")

    sample_files = [f for f in os.listdir(sample_dir) if f.endswith('.pt')]
    kfold = KFold(n_splits=n_splits, shuffle=True)

    results = []
    loss_fn = MSELoss()

    for fold, (train_idx, test_idx) in enumerate(kfold.split(sample_files)):
        print(f'Fold {fold + 1}/{n_splits}')

        train_files = [sample_files[i] for i in train_idx]
        test_files = [sample_files[i] for i in test_idx]

        # Initialize the GNN model
        model = GNNNet().to(device)
        optimizer = optim.Adam(model.parameters(), lr=lr)

        for epoch in range(num_epochs):
            model.train()
            running_loss = 0.0
            
            # Train in batches
            for batch_samples in batch_loader(train_files, sample_dir, batch_size=32):
                mol_data_list, pro_data_list, target_list = zip(*batch_samples)

                mol_data = torch.stack(mol_data_list).to(device)
                pro_data = torch.stack(pro_data_list).to(device)
                target = torch.stack(target_list).to(device)

                optimizer.zero_grad()
                output = model(mol_data, pro_data)
                loss = loss_fn(output, target)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()

            print(f"Fold {fold+1}, Epoch {epoch+1}/{num_epochs}, Loss: {running_loss / len(train_files)}")

        # Evaluation on the test set
        model.eval()
        total_preds, total_labels = [], []
        with torch.no_grad():
            for batch_samples in batch_loader(test_files, sample_dir, batch_size=32):
                mol_data_list, pro_data_list, target_list = zip(*batch_samples)

                mol_data = torch.stack(mol_data_list).to(device)
                pro_data = torch.stack(pro_data_list).to(device)
                target = torch.stack(target_list).to(device)

                output = model(mol_data, pro_data)
                total_preds.append(output.cpu().numpy())
                total_labels.append(target.cpu().numpy())

        # Convert lists to numpy arrays for evaluation
        total_preds = np.concatenate(total_preds)
        total_labels = np.concatenate(total_labels)

        mse = get_mse(total_labels, total_preds)
        ci = get_ci(total_labels, total_preds)
        pearson = get_pearson(total_labels, total_preds)
        print(f"Fold {fold+1} - MSE: {mse}, CI: {ci}, Pearson: {pearson}")

        # Store results for this fold
        results.append((mse, ci, pearson))

    return results

# Path to the directory containing prepared individual samples
sample_dir = 'prepared_samples'  # Adjust the path to where the .pt files are stored

# Run 5-fold cross-validation training
results = train_5fold_cross_validation(sample_dir)


Running on cpu.
Fold 1/5
GNNNet Loaded


TypeError: expected Tensor as element 0 in argument 0, but got dict

In [5]:
from sklearn.model_selection import KFold
from torch_geometric.data import DataLoader
import torch.optim as optim
from torch.nn import MSELoss
import torch
import numpy as np
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr

def get_mse(labels, preds):
    return mean_squared_error(labels, preds)

def get_pearson(labels, preds):
    return pearsonr(labels, preds)[0]

def get_ci(labels, preds):
    # Concordance Index (CI) implementation
    n = 0
    h_sum = 0
    for i in range(len(labels)):
        for j in range(i + 1, len(labels)):
            if labels[i] != labels[j]:
                n += 1
                if (preds[i] < preds[j] and labels[i] < labels[j]) or (preds[i] > preds[j] and labels[i] > labels[j]):
                    h_sum += 1
                elif preds[i] == preds[j]:
                    h_sum += 0.5
    return h_sum / n if n > 0 else 0.5

def train_5fold_cross_validation(prepared_dataset, num_epochs=1000, n_splits=5, lr=0.001):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Running on {device}.")

    kfold = KFold(n_splits=n_splits, shuffle=True)

    results = []
    loss_fn = MSELoss()
    
    for fold, (train_idx, test_idx) in enumerate(kfold.split(prepared_dataset)):
        print(f'Fold {fold + 1}/{n_splits}')
        
        # Split the dataset into training and testing based on indices
        train_data = [prepared_dataset[i] for i in train_idx]
        test_data = [prepared_dataset[i] for i in test_idx]
        
        # Create DataLoader for training and testing
        train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
        test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

        # Initialize the GNN model
        model = GNNNet().to(device)
        optimizer = optim.Adam(model.parameters(), lr=lr)

        for epoch in range(num_epochs):
            model.train()
            running_loss = 0.0
            for batch_idx, (mol_data, pro_data, target) in enumerate(train_loader):
                mol_data = mol_data.to(device)
                pro_data = pro_data.to(device)
                target = target.to(device)
                
                optimizer.zero_grad()
                output = model(mol_data, pro_data)
                loss = loss_fn(output, target)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()

            print(f"Fold {fold+1}, Epoch {epoch+1}/{num_epochs}, Loss: {running_loss / len(train_loader)}")
            
            # Save model checkpoints after every few epochs if needed
            if (epoch + 1) % 100 == 0:
                torch.save(model.state_dict(), f"model_checkpoint_fold{fold+1}_epoch{epoch+1}.pt")

        # Evaluation on the test set
        model.eval()
        total_preds, total_labels = [], []
        with torch.no_grad():
            for mol_data, pro_data, target in test_loader:
                mol_data = mol_data.to(device)
                pro_data = pro_data.to(device)
                target = target.to(device)
                
                output = model(mol_data, pro_data)
                total_preds.append(output.cpu().numpy())
                total_labels.append(target.cpu().numpy())

        # Convert lists to numpy arrays for evaluation
        total_preds = np.concatenate(total_preds)
        total_labels = np.concatenate(total_labels)

        mse = get_mse(total_labels, total_preds)
        ci = get_ci(total_labels, total_preds)
        pearson = get_pearson(total_labels, total_preds)
        print(f"Fold {fold+1} - MSE: {mse}, CI: {ci}, Pearson: {pearson}")

        # Store results for this fold
        results.append((mse, ci, pearson))

    return results

# Load the prepared dataset
prepared_dataset = torch.load('prepared_samples')  # Adjust the path if you saved each sample individually

# Run 5-fold cross-validation training with the pre-prepared dataset
results = train_5fold_cross_validation(prepared_dataset)


IsADirectoryError: [Errno 21] Is a directory: 'prepared_samples'

In [3]:
import numpy as np
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr

def get_mse(labels, preds):
    return mean_squared_error(labels, preds)

def get_pearson(labels, preds):
    return pearsonr(labels, preds)[0]

def get_ci(labels, preds):
    # Concordance Index (CI) implementation
    n = 0
    h_sum = 0
    for i in range(len(labels)):
        for j in range(i + 1, len(labels)):
            if labels[i] != labels[j]:
                n += 1
                if (preds[i] < preds[j] and labels[i] < labels[j]) or (preds[i] > preds[j] and labels[i] > labels[j]):
                    h_sum += 1
                elif preds[i] == preds[j]:
                    h_sum += 0.5
    return h_sum / n if n > 0 else 0.5




In [3]:
from torch_geometric.data import DataLoader
from sklearn.model_selection import KFold
import torch.optim as optim
from torch.nn import MSELoss

def train_5fold_cross_validation(data, molecule_graphs, protein_graphs, num_epochs=1000, n_splits=5, lr=0.001):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Print if the model is on GPU or CPU
    if torch.cuda.is_available():
        print("Model is running on GPU.")
    else:
        print("Model is running on CPU.")

    
    kfold = KFold(n_splits=n_splits, shuffle=True)
    
    # Prepare dataset and dataloaders
    results = []
    loss_fn = MSELoss()
    
    for fold, (train_idx, test_idx) in enumerate(kfold.split(data)):
        print(f'Fold {fold + 1}/{n_splits}')
        
        train_data = data.iloc[train_idx]
        test_data = data.iloc[test_idx]
        
        train_loader = DataLoader(prepare_dataset(train_data, molecule_graphs, protein_graphs), batch_size=32, shuffle=True)
        test_loader = DataLoader(prepare_dataset(test_data, molecule_graphs, protein_graphs), batch_size=32, shuffle=False)

        # Initialize model and optimizer
        model = GNNNet().to(device)
        optimizer = optim.Adam(model.parameters(), lr=lr)

        for epoch in range(num_epochs):
            model.train()
            for batch in train_loader:
                mol_data, pro_data, target = batch
                optimizer.zero_grad()
                output = model(mol_data.to(device), pro_data.to(device))
                loss = loss_fn(output, target.to(device))
                loss.backward()
                optimizer.step()

            print(f"Epoch {epoch+1}/{num_epochs} - Loss: {loss.item()}")

        # Evaluate on test set
        model.eval()
        total_preds, total_labels = [], []
        with torch.no_grad():
            for batch in test_loader:
                mol_data, pro_data, target = batch
                output = model(mol_data.to(device), pro_data.to(device))
                total_preds.append(output.cpu().numpy())
                total_labels.append(target.cpu().numpy())

        mse = get_mse(total_labels, total_preds)
        ci = get_ci(total_labels, total_preds)
        pearson = get_pearson(total_labels, total_preds)
        print(f"Fold {fold+1} - MSE: {mse}, CI: {ci}, Pearson: {pearson}")

        # Save the results for this fold
        results.append((mse, ci, pearson))
    
    return results


In [4]:
import os
import torch
import pickle
from torch_geometric.data import Data

def load_graph(path):
    with open(path, 'rb') as f:
        return pickle.load(f)
    print("graph is loaded ")

def prepare_dataset(data, molecule_graphs, protein_graphs):
    dataset = []
    
    for index, row in data.iterrows():
        mol_graph_path = os.path.join(molecule_graphs, f"{row['Drug_ID']}_graph.pkl")
        pro_graph_path = os.path.join(protein_graphs, f"{row['Target_ID']}_graph.pt")
        
        mol_graph = load_graph(mol_graph_path)
        pro_graph = torch.load(pro_graph_path)
        target = torch.tensor([row['Y']], dtype=torch.float)
        
        dataset.append((mol_graph, pro_graph, target))

    print("Dataset is ready")
    
    return dataset


In [None]:
# Example usage
molecule_graphs = 'molecule_graphs/'
protein_graphs = 'ProteinGraphs/'
filtered_dataset_path = 'filtered_KibaDataSet.csv'

# Load filtered dataset
import pandas as pd
data = pd.read_csv(filtered_dataset_path)

# Run 5-fold cross-validation training
results = train_5fold_cross_validation(data, molecule_graphs, protein_graphs)


Model is running on GPU.
Fold 1/5


  pro_graph = torch.load(pro_graph_path)
