In [3]:
# import os
# import torch
# import pickle
# import pandas as pd

# def load_graph(path, is_pickle=True):
#     """
#     Load a molecule graph (.pkl) or a protein graph (.pt).
#     If is_pickle is True, use pickle to load the file; otherwise, use torch.load.
#     """
#     if is_pickle:
#         with open(path, 'rb') as f:
#             return pickle.load(f)
#     else:
#         return torch.load(path)

# def prepare_dataset_incremental(filtered_dataset, molecule_graph_dir, protein_graph_dir, output_file):
#     """
#     Incrementally prepares the dataset to avoid memory issues.
#     Processes one protein and its associated molecules at a time, and appends the results to the output file.
    
#     Args:
#     - filtered_dataset: The filtered KIBA dataset (DataFrame).
#     - molecule_graph_dir: Directory where molecule graphs are stored.
#     - protein_graph_dir: Directory where protein graphs are stored.
#     - output_file: File to save the prepared dataset incrementally.
#     """
#     current_protein = None
#     dataset = []
    
#     for index, row in filtered_dataset.iterrows():
#         protein_id = row['Target_ID']
#         chembl_id = row['Drug_ID']
        
#         # If the protein changes, save the dataset for the previous protein
#         if current_protein is not None and current_protein != protein_id:
#             with open(output_file, 'ab') as f:  # Append to the output file
#                 pickle.dump(dataset, f)
#             print(f"Processed and saved data for protein {current_protein}.")
#             dataset = []  # Reset dataset for the next protein
        
#         current_protein = protein_id
        
#         # Load the protein graph (.pt)
#         pro_graph_path = os.path.join(protein_graph_dir, f"{protein_id}_graph.pt")
#         if not os.path.exists(pro_graph_path):
#             print(f"Protein graph not found: {protein_id}")
#             continue
#         pro_graph = load_graph(pro_graph_path, is_pickle=False)
        
#         # Load the molecule graph (.pkl)
#         mol_graph_path = os.path.join(molecule_graph_dir, f"{chembl_id}_graph.pkl")
#         if not os.path.exists(mol_graph_path):
#             print(f"Molecule graph not found: {chembl_id}")
#             continue
#         mol_graph = load_graph(mol_graph_path)

#         # Load target (affinity value)
#         target = torch.tensor([row['Y']], dtype=torch.float)
        
#         # Append the (molecule, protein, target) tuple to the dataset
#         dataset.append((mol_graph, pro_graph, target))
    
#     # Save the last batch (for the final protein)
#     if len(dataset) > 0:
#         with open(output_file, 'ab') as f:
#             pickle.dump(dataset, f)
#         print(f"Processed and saved data for protein {current_protein}.")

# # Example usage for incremental dataset preparation
# molecule_graph_dir = 'molecule_graphs/'  # Directory where molecule graphs are stored
# protein_graph_dir = 'ProteinGraphs/'  # Directory where protein graphs are stored
# filtered_dataset_path = 'filtered_KibaDataSet.csv'  # Path to the filtered dataset CSV

# # Load filtered dataset CSV
# filtered_dataset = pd.read_csv(filtered_dataset_path)

# # Prepare the dataset incrementally, saving after each protein
# output_file = 'incremental_prepared_dataset.pkl'  # Output file to save dataset incrementally
# prepare_dataset_incremental(filtered_dataset, molecule_graph_dir, protein_graph_dir, output_file)

# print("Dataset preparation completed.")


In [5]:
# import os
# import torch
# import pickle
# import pandas as pd

# def load_graph(path, is_pickle=True):
#     """
#     Load a molecule graph (.pkl) or a protein graph (.pt).
#     If is_pickle is True, use pickle to load the file; otherwise, use torch.load.
#     """
#     if is_pickle:
#         with open(path, 'rb') as f:
#             return pickle.load(f)
#     else:
#         return torch.load(path)

# def prepare_dataset_individual_save(filtered_dataset, molecule_graph_dir, protein_graph_dir, output_dir):
#     """
#     Incrementally prepares the dataset and saves each (molecule, protein, target) tuple as a separate file.
    
#     Args:
#     - filtered_dataset: The filtered KIBA dataset (DataFrame).
#     - molecule_graph_dir: Directory where molecule graphs are stored.
#     - protein_graph_dir: Directory where protein graphs are stored.
#     - output_dir: Directory to save the prepared dataset incrementally.
#     """
#     if not os.path.exists(output_dir):
#         os.makedirs(output_dir)
    
#     for index, row in filtered_dataset.iterrows():
#         protein_id = row['Target_ID']
#         chembl_id = row['Drug_ID']
        
#         # Load the protein graph (.pt)
#         pro_graph_path = os.path.join(protein_graph_dir, f"{protein_id}_graph.pt")
#         if not os.path.exists(pro_graph_path):
#             print(f"Protein graph not found: {protein_id}")
#             continue
#         pro_graph = load_graph(pro_graph_path, is_pickle=False)
        
#         # Load the molecule graph (.pkl)
#         mol_graph_path = os.path.join(molecule_graph_dir, f"{chembl_id}_graph.pkl")
#         if not os.path.exists(mol_graph_path):
#             print(f"Molecule graph not found: {chembl_id}")
#             continue
#         mol_graph = load_graph(mol_graph_path)

#         # Load target (affinity value)
#         target = torch.tensor([row['Y']], dtype=torch.float)
        
#         # Create the sample as a tuple (molecule graph, protein graph, target)
#         sample = (mol_graph, pro_graph, target)
        
#         # Save the sample as a separate file
#         sample_path = os.path.join(output_dir, f"sample_{index}.pkl")
#         with open(sample_path, 'wb') as f:
#             pickle.dump(sample, f)

#         print(f"Saved sample {index} as {sample_path}")

# # Example usage for individual saving
# molecule_graph_dir = 'molecule_graphs/'  # Directory where molecule graphs are stored
# protein_graph_dir = 'ProteinGraphs/'  # Directory where protein graphs are stored
# filtered_dataset_path = 'filtered_KibaDataSet.csv'  # Path to the filtered dataset CSV
# output_dir = 'prepared_samples/'  # Directory to save individual samples

# # Load filtered dataset CSV
# filtered_dataset = pd.read_csv(filtered_dataset_path)

# # Prepare the dataset incrementally, saving each sample individually
# prepare_dataset_individual_save(filtered_dataset, molecule_graph_dir, protein_graph_dir, output_dir)

# print("Dataset preparation completed.")


In [3]:
import os
import random
import torch
from torch_geometric.data import Data

# Directory that is scanned for '.pt' sample files further down in this cell.
# Adjust this path as needed.
sample_dir = 'prepared_samples'

def _ensure_node_features(graph, label, path):
    """
    Make sure `graph.x` is populated, falling back to `graph.features`.

    If `graph` has no 'x' attribute (or it is None) and a 'features'
    attribute exists, 'features' is copied into 'x'. If neither is available
    a diagnostic line is printed and the graph is left unchanged.

    Args:
    - graph: Object holding the graph attributes (modified in place).
    - label: Name used in the diagnostic message ('mol_data' or 'pro_data').
    - path: Path of the sample file, used only in the diagnostic message.
    """
    if getattr(graph, 'x', None) is not None:
        return
    if hasattr(graph, 'features'):
        graph.x = graph.features
    else:
        print(f"Sample at {path} {label} has no 'x' or 'features' attribute.")


def load_sample(path):
    """
    Load one prepared sample and return it as (mol_data, pro_data, target).

    The file is read with torch.load and indexed as a sequence whose first
    three items are the molecule graph, the protein graph and the target.
    Graphs stored as plain dictionaries are converted to torch_geometric
    Data objects, and a missing 'x' attribute is filled from 'features'
    when that attribute is present.

    Args:
    - path: Path to a sample file readable by torch.load.

    Returns:
    - Tuple (mol_data, pro_data, target).
    """
    # The sample is an arbitrary pickled Python object, not a tensor-only
    # checkpoint, so the restricted unpickler cannot be used. Passing
    # weights_only explicitly keeps this working regardless of torch's
    # default and silences the warning about the implicit value.
    # SECURITY: full unpickling can execute code -- only load trusted files.
    sample = torch.load(path, weights_only=False)
    mol_data = sample[0]
    pro_data = sample[1]
    target = sample[2]

    # Convert dictionaries to Data objects if necessary
    if isinstance(mol_data, dict):
        mol_data = Data(**mol_data)
    if isinstance(pro_data, dict):
        pro_data = Data(**pro_data)

    # Ensure that the 'x' attribute is set on both graphs
    _ensure_node_features(mol_data, 'mol_data', path)
    _ensure_node_features(pro_data, 'pro_data', path)

    return mol_data, pro_data, target

# List all sample files. The listing is sorted because os.listdir returns
# entries in arbitrary order, which would defeat the seeded draw below.
sample_files = sorted(f for f in os.listdir(sample_dir) if f.endswith('.pt'))

# Randomly select up to 10 samples. The count is clamped because
# random.sample raises ValueError when asked for more items than exist.
# A dedicated seeded generator makes the selection reproducible on re-run
# without touching the global random state.
num_preview = min(10, len(sample_files))
random_samples = random.Random(42).sample(sample_files, num_preview)

if not random_samples:
    print(f"No '.pt' sample files found in {sample_dir}")

# Iterate over the random samples and print feature information
for idx, file_name in enumerate(random_samples):
    sample_path = os.path.join(sample_dir, file_name)
    mol_data, pro_data, target = load_sample(sample_path)
    print(f"Sample {idx+1}: {file_name}")
    print(f"  Target value: {target}")

    # Molecule features (only the first 5 nodes are shown to keep output short)
    if hasattr(mol_data, 'x') and mol_data.x is not None:
        print(f"  Molecule data 'x' shape: {mol_data.x.shape}")
        print(f"  Molecule data 'x' features (first 5 nodes):\n{mol_data.x[:5]}")
    else:
        print("  Molecule data has no 'x' attribute.")

    # Protein features (only the first 5 nodes are shown to keep output short)
    if hasattr(pro_data, 'x') and pro_data.x is not None:
        print(f"  Protein data 'x' shape: {pro_data.x.shape}")
        print(f"  Protein data 'x' features (first 5 nodes):\n{pro_data.x[:5]}")
    else:
        print("  Protein data has no 'x' attribute.")

    print("-" * 80)


Sample 1: sample_2154.pt
  Target value: tensor([11.3816])
  Molecule data 'x' shape: (32, 78)
  Molecule data 'x' features (first 5 nodes):
[[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
  0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
  0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
  0 0 0 0 0 1]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
  0 0 0 0 0 1]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
  0 0 0 0 0 1]]
  Protein data 'x' shape: torch.Size([591, 

  sample = torch.load(path)


In [None]:
import os
import torch
import pickle
import pandas as pd

def load_graph(path, is_pickle=True):
    """
    Load a molecule graph (.pkl) or a protein graph (.pt).

    Args:
    - path: Path of the graph file to read.
    - is_pickle: If True, read the file with pickle.load; otherwise read it
      with torch.load.

    Returns:
    - The object stored in the file.

    SECURITY: both branches perform full unpickling, which can execute
    arbitrary code. Only use this on graph files you produced yourself.
    """
    if is_pickle:
        with open(path, 'rb') as f:
            return pickle.load(f)

    # weights_only is passed explicitly: the stored graph may be an arbitrary
    # Python object rather than a tensor-only checkpoint, and relying on the
    # implicit default triggers a warning and breaks once that default is True.
    return torch.load(path, weights_only=False)

def prepare_dataset_individual_save_as_pt(filtered_dataset, molecule_graph_dir, protein_graph_dir, output_dir):
    """
    Incrementally prepares the dataset and saves each (molecule, protein, target) tuple as a separate .pt file.

    For every row, the protein graph '<Target_ID>_graph.pt' and the molecule
    graph '<Drug_ID>_graph.pkl' are loaded, combined with the row's 'Y' value
    as a float tensor, and written to '<output_dir>/sample_<index>.pt', where
    <index> is the row's DataFrame index. Rows whose protein or molecule graph
    file does not exist are reported with a printed message and skipped.

    Args:
    - filtered_dataset: The filtered KIBA dataset (DataFrame) with columns
      'Target_ID', 'Drug_ID' and 'Y'.
    - molecule_graph_dir: Directory where molecule graphs are stored.
    - protein_graph_dir: Directory where protein graphs are stored.
    - output_dir: Directory to save the prepared dataset incrementally
      (created if it does not exist).
    """
    # exist_ok replaces the separate exists() check and avoids failing if the
    # directory appears between the check and the creation.
    os.makedirs(output_dir, exist_ok=True)

    # Keep the most recently loaded protein graph so that consecutive rows
    # for the same protein do not re-read the same file from disk. Only one
    # graph is held at a time, so memory use does not grow with the dataset.
    not_loaded = object()
    last_protein_id = not_loaded
    last_pro_graph = None

    for index, row in filtered_dataset.iterrows():
        protein_id = row['Target_ID']
        chembl_id = row['Drug_ID']

        # Load the protein graph (.pt) unless it is the one already in memory
        if last_protein_id is not_loaded or protein_id != last_protein_id:
            pro_graph_path = os.path.join(protein_graph_dir, f"{protein_id}_graph.pt")
            if not os.path.exists(pro_graph_path):
                # The cache is left untouched, so every row of a missing
                # protein is reported, exactly as before.
                print(f"Protein graph not found: {protein_id}")
                continue
            last_pro_graph = load_graph(pro_graph_path, is_pickle=False)
            last_protein_id = protein_id
        pro_graph = last_pro_graph

        # Load the molecule graph (.pkl)
        mol_graph_path = os.path.join(molecule_graph_dir, f"{chembl_id}_graph.pkl")
        if not os.path.exists(mol_graph_path):
            print(f"Molecule graph not found: {chembl_id}")
            continue
        mol_graph = load_graph(mol_graph_path)

        # Load target (affinity value)
        target = torch.tensor([row['Y']], dtype=torch.float)

        # Create the sample as a tuple (molecule graph, protein graph, target)
        sample = (mol_graph, pro_graph, target)

        # Save the sample as a .pt file
        sample_path = os.path.join(output_dir, f"sample_{index}.pt")
        torch.save(sample, sample_path)

        print(f"Saved sample {index} as {sample_path}")

# --- Input / output locations for the per-sample export ---
# Molecule graphs (.pkl) and protein graphs (.pt) are read from these folders.
molecule_graph_dir = 'molecule_graphs/'
protein_graph_dir = 'ProteinGraphs/'
# CSV holding the filtered dataset rows.
filtered_dataset_path = 'filtered_KibaDataSet.csv'
# Folder that receives one .pt file per sample.
output_dir = 'prepared_samples/'

# Read the filtered dataset into a DataFrame.
filtered_dataset = pd.read_csv(filtered_dataset_path)

# Export every row as its own .pt sample file.
prepare_dataset_individual_save_as_pt(
    filtered_dataset=filtered_dataset,
    molecule_graph_dir=molecule_graph_dir,
    protein_graph_dir=protein_graph_dir,
    output_dir=output_dir,
)

print("Dataset preparation completed.")


  return torch.load(path)


Saved sample 0 as prepared_samples/sample_0.pt
Saved sample 1 as prepared_samples/sample_1.pt
Saved sample 2 as prepared_samples/sample_2.pt
Saved sample 3 as prepared_samples/sample_3.pt
Saved sample 4 as prepared_samples/sample_4.pt
Saved sample 5 as prepared_samples/sample_5.pt
Saved sample 6 as prepared_samples/sample_6.pt
Saved sample 7 as prepared_samples/sample_7.pt
Saved sample 8 as prepared_samples/sample_8.pt
Saved sample 9 as prepared_samples/sample_9.pt
Saved sample 10 as prepared_samples/sample_10.pt
Saved sample 11 as prepared_samples/sample_11.pt
Saved sample 12 as prepared_samples/sample_12.pt
Saved sample 13 as prepared_samples/sample_13.pt
Saved sample 14 as prepared_samples/sample_14.pt
Saved sample 15 as prepared_samples/sample_15.pt
Saved sample 16 as prepared_samples/sample_16.pt
Saved sample 17 as prepared_samples/sample_17.pt
Saved sample 18 as prepared_samples/sample_18.pt
Saved sample 19 as prepared_samples/sample_19.pt
Saved sample 20 as prepared_samples/samp