In [1]:
!pip install torch_geometric pandas numpy matplotlib



In [6]:
import sys
import os
import torch
import numpy as np
import pandas as pd

# The custom Dataset class lives in Dataset.py inside the GEMS repository,
# which must be cloned onto this VM before running the notebook:
#   git clone https://github.com/camlab-ethz/GEMS.git
# Root path of the cloned GEMS repo.
GEMS_REPO_ROOT = os.path.expanduser('~/GEMS')

# Make Dataset.py importable by adding the repo root to Python's search path.
# The membership check keeps the cell idempotent when it is re-run.
if GEMS_REPO_ROOT not in sys.path:
    sys.path.append(GEMS_REPO_ROOT)
    print(f"Added {GEMS_REPO_ROOT} to sys.path")

# Import the custom GEMS Dataset class and PyG's Data class.
try:
    from Dataset import Dataset
    from torch_geometric.data import Data as PyGData
except ImportError as e:
    print(f"Error importing GEMS Dataset: {e}")
    # Re-raise so a "Run All" stops here instead of continuing without the
    # classes and failing later with a less obvious error.
    raise
else:
    print("Successfully imported custom Dataset class")


Added /home/jupyter/GEMS to sys.path
Successfully imported custom Dataset class


In [7]:
# Path of the serialized dataset file (.pt) to load.
DATA_PATH="/home/jupyter/gcs_pdbbind_mount/00AEPL_casf2013.pt"

try:
    # weights_only=False is passed explicitly: the file contains pickled
    # torch_geometric Data objects rather than plain tensors, and PyTorch >= 2.6
    # defaults to weights_only=True, which refuses to unpickle such objects.
    # SECURITY: unpickling can execute arbitrary code, so only load files from
    # a trusted source.
    data_list = torch.load(DATA_PATH, weights_only=False)
except Exception as e:
    print(f"Error loading data: {e}.")
    # Re-raise: without data_list every later cell would fail with a
    # confusing NameError instead of the real cause.
    raise

# Quick sanity summary of what was loaded.
print(f"Loaded {len(data_list)} complexes")
print(f"Example data object type: {type(data_list[0])}")
print(f"Number of features: {data_list[0].x.size(1)}")

Loaded 194 complexes
Example data object type: <class 'torch_geometric.data.data.Data'>
Number of features: 60


In [8]:
# Build one summary row per complex and collect the rows into a DataFrame.

# Zero-based column of data.x that is summed per complex (i.e. the 10th feature).
# NOTE(review): what this feature column encodes is not visible here — confirm
# against the GEMS featurization before interpreting 'Count_Feature_10'.
FEATURE_10_INDEX = 9

metrics = []
for i, data in enumerate(data_list):
    # Target label and identifier.
    # NOTE(review): the values printed for this column fall between 0 and 1,
    # which suggests y is a scaled label rather than a raw pKi/pKd — confirm.
    affinity = data.y.item()
    # Fall back to a positional name when the object carries no `id` attribute.
    pdb_id = getattr(data, 'id', f'Complex_{i}')

    # Graph size: x is [num_nodes, num_features], edge_index is [2, num_edges].
    num_nodes = data.x.size(0)
    num_edges = data.edge_index.size(1)

    # Sum of the selected node-feature column over all nodes of the graph.
    num_feat_10 = torch.sum(data.x[:, FEATURE_10_INDEX]).item()

    # Edges per node; NaN for a graph without nodes instead of raising
    # ZeroDivisionError and aborting the whole loop.
    # NOTE(review): this is an edges-per-node ratio, not the textbook graph
    # density E / (N * (N - 1)) — confirm that this is the intended metric.
    density = num_edges / num_nodes if num_nodes else float('nan')

    metrics.append({
        'PDB_ID': pdb_id,
        'Affinity_pKi_pKd': affinity,
        'Num_Atoms': num_nodes,
        'Num_Interactions': num_edges,
        'Density': density,
        'Count_Feature_10': num_feat_10
    })

df = pd.DataFrame(metrics)

print('DataFrame created successfully')
# Bare last expression so the notebook renders the frame with its rich display.
df.head()

DataFrame created successfully
  PDB_ID  Affinity_pKi_pKd  Num_Atoms  Num_Interactions   Density  \
0   3f3c          0.376250         32               278  8.687500   
1   1w3l          0.392500         51               431  8.450980   
2   2hb1          0.237500         33               269  8.151515   
3   2v00          0.228750         32               252  7.875000   
4   1os0          0.376875         60               520  8.666667   

   Count_Feature_10  
0               6.0  
1              18.0  
2               5.0  
3              12.0  
4              18.0  
