# PaMNet

GitHub: https://github.com/XieResearchGroup/Physics-aware-Multiplex-GNN

Place this file in the PaMNet repository, alongside the other primary files.

```bash
env_name=PaMNet
conda env list | cut -d' ' -f1 | grep -q ${env_name} && echo "${env_name} already installed" || conda create --name ${env_name} python=3.10 -y
conda activate ${env_name}
# TODO: install the packages required by the PaMNet repository here

```


In [None]:
# Download and install?
!git clone https://github.com/XieResearchGroup/Physics-aware-Multiplex-GNN
!cd Physics-aware-Multiplex-GNN

## Data Preprocessing

In [1]:
import os
import numpy as np
from tqdm import tqdm
from rdkit import Chem


def load_molecule(molecule_file):
    """Read a structure file with RDKit and return its coordinates and molecule.

    The reader is selected by looking for ".mol2", ".sdf" or ".pdb" (checked in
    that order) anywhere in the path. Molecules are loaded with
    ``sanitize=False`` and ``removeHs=True``; for SDF files only the first
    record is used.

    Args:
        molecule_file: Path of the file to read, as a ``str`` or a
            ``pathlib.Path``-like object.

    Returns:
        A tuple ``(xyz, mol)`` where ``xyz`` is the result of
        ``get_xyz_from_mol(mol)`` and ``mol`` is the RDKit molecule.

    Raises:
        ValueError: If the file type is not recognised, or if RDKit could not
            produce a molecule from the file.
    """
    # Normalise once so that Path objects work in every branch; the substring
    # test below raises TypeError when applied to a Path directly.
    molecule_path = str(molecule_file)

    if ".mol2" in molecule_path:
        my_mol = Chem.MolFromMol2File(molecule_path, sanitize=False, removeHs=True)
    elif ".sdf" in molecule_path:
        suppl = Chem.SDMolSupplier(molecule_path, sanitize=False, removeHs=True)
        # An SDF without records would otherwise fail with an IndexError.
        my_mol = suppl[0] if len(suppl) > 0 else None
    elif ".pdb" in molecule_path:
        my_mol = Chem.MolFromPDBFile(
            molecule_path, sanitize=False, removeHs=True)
    else:
        raise ValueError("Unrecognized file type for %s" % molecule_path)

    if my_mol is None:
        raise ValueError("Unable to read a molecule from %s" % molecule_path)

    xyz = get_xyz_from_mol(my_mol)
    return xyz, my_mol

def get_xyz_from_mol(mol):
    """Return the atom positions of ``mol``'s conformer as an ``(n_atoms, 3)`` array.

    Args:
        mol: Object exposing ``GetNumAtoms()`` and ``GetConformer()``; the
            conformer must expose ``GetNumAtoms()`` and ``GetAtomPosition(i)``
            returning a point with ``x``, ``y`` and ``z`` attributes.

    Returns:
        A float NumPy array with one row per atom and columns x, y, z.
    """
    coords = np.zeros((mol.GetNumAtoms(), 3))
    conformer = mol.GetConformer()
    for atom_idx in range(conformer.GetNumAtoms()):
        point = conformer.GetAtomPosition(atom_idx)
        coords[atom_idx] = (point.x, point.y, point.z)
    return coords

def get_rms(molecule_file):
    """Read the ``rms`` value stored after the first ``TER`` line of a file.

    The file is scanned up to the first line containing ``TER``; the lines that
    follow are searched for one whose first whitespace-separated field is
    ``rms``, and the last field of that line is returned as a float.

    Args:
        molecule_file: Path of the text file to read.

    Returns:
        The value of the ``rms`` record as a ``float``.

    Raises:
        ValueError: If no ``rms`` record follows a ``TER`` line, or if the
            record's last field is not a number.
    """
    with open(molecule_file) as f:
        # Consume everything up to and including the first TER line.
        for line in f:
            if 'TER' in line:
                break
        for line in f:
            fields = line.split()
            # Blank lines split to an empty list; skip them instead of
            # failing on fields[0].
            if fields and fields[0] == 'rms':
                return float(fields[-1])
    # Failing loudly avoids silently returning a number taken from whatever
    # line happened to be last in the file.
    raise ValueError("No 'rms' record found after TER in %s" % str(molecule_file))

def _append_rows(path, rows, fmt):
    """Append ``rows`` to the text file at ``path`` with ``np.savetxt``."""
    with open(path, 'ab') as handle:
        np.savetxt(handle, rows, fmt=fmt, delimiter=', ')


def construct_graphs(data_dir, save_dir, data_name, save_name):
    """Convert every structure file in a directory into five raw graph text files.

    For each file in ``<data_dir>/<data_name>`` the molecule is loaded with
    ``load_molecule`` and its label with ``get_rms``. Only carbon, nitrogen and
    oxygen atoms are kept. The results are appended to these files under
    ``<save_dir>/<save_name>/raw``:

    * ``<save_name>_graph_indicator.txt``  - 1-based graph id, one line per kept atom
    * ``<save_name>_node_labels.txt``      - element class (C=0, N=1, O=2) per kept atom
    * ``<save_name>_node_attributes.txt``  - x, y, z per kept atom (3 decimals)
    * ``<save_name>_graph_labels.txt``     - rms label per graph (3 decimals)
    * ``<save_name>_graph_names.txt``      - source file name per graph

    Any of these files left over from a previous run is deleted first.

    Args:
        data_dir: Directory that contains the ``data_name`` folder.
        save_dir: Directory under which ``<save_name>/raw`` is created.
        data_name: Name of the folder holding the input structure files.
        save_name: Output folder name and file-name prefix.
    """
    print("Preprocessing", data_name)

    data_dir_full = os.path.join(data_dir, data_name)
    save_dir_full = os.path.join(save_dir, save_name, "raw")
    os.makedirs(save_dir_full, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sorting makes the graph
    # ids (and therefore all output files) reproducible across machines.
    name_list = sorted(os.listdir(data_dir_full))

    # Atomic number -> node label. Atoms of any other element are dropped.
    types = {
        6: 0,   #C
        7: 1,   #N
        8: 2,   #O
    }

    suffixes = ('node_labels', 'graph_indicator', 'node_attributes',
                'graph_labels', 'graph_names')
    paths = {suffix: os.path.join(save_dir_full, save_name + '_' + suffix + '.txt')
             for suffix in suffixes}

    # Rows are appended below, so stale files must be removed up front.
    for path in paths.values():
        if os.path.isfile(path):
            os.remove(path)

    for graph_id, name in enumerate(tqdm(name_list), start=1):
        rna_file = os.path.join(data_dir_full, name)

        rna_coords, rna_mol = load_molecule(rna_file)
        rna_label = get_rms(rna_file)

        atomic_nums = [atom.GetAtomicNum() for atom in rna_mol.GetAtoms()]
        kept = [idx for idx, number in enumerate(atomic_nums) if number in types]

        rna_x = np.array([types[atomic_nums[idx]] for idx in kept])
        rna_pos = np.array(rna_coords[kept])

        # One indicator row per kept atom, all carrying this graph's 1-based id.
        indicator = np.ones((rna_x.shape[0], 1)) * graph_id

        _append_rows(paths['graph_indicator'], indicator, '%i')
        _append_rows(paths['node_labels'], rna_x, '%i')
        _append_rows(paths['node_attributes'], rna_pos, '%.3f')
        _append_rows(paths['graph_labels'], [rna_label], '%.3f')
        _append_rows(paths['graph_names'], np.array(name).reshape(-1, 1), '%s')


def main():
    """Run ``construct_graphs`` for the example training and validation folders."""
    rna_root = os.path.join(".", "data", "RNA-Puzzles")
    source_root = os.path.join(rna_root, "classics_train_val")

    # (input folder name, output split name)
    for split_dir, split_name in (("example_train", "train"), ("example_val", "val")):
        construct_graphs(source_root, rna_root, split_dir, split_name)


# Only run the preprocessing when this code is executed directly.
if __name__ == "__main__":
    main()

Preprocessing example_train


100%|█████████████████████████████████████████████████████████████████████████████████████████| 14000/14000 [12:08<00:00, 19.21it/s]


Preprocessing example_val


100%|███████████████████████████████████████████████████████████████████████████████████████████| 4000/4000 [03:16<00:00, 20.34it/s]


## Model Training

Set the training hyperparameters.

In [2]:
import argparse

# Training options, packed into a Namespace so they can be read as attributes
# (args.lr, args.dim, ...) by the training code.
args = argparse.Namespace(**{
    "gpu": 0,                     # CUDA device index
    "seed": 40,                   # random seed
    "dataset": "RNA-Puzzles",     # dataset folder name
    "epochs": 15,                 # number of training epochs
    "lr": 1e-4,                   # learning rate
    "wd": 0,                      # weight decay
    "n_layer": 1,                 # model setting: number of layers
    "dim": 16,                    # model setting: hidden dimension
    "batch_size": 8,              # graphs per batch
    "cutoff_l": 2.6,              # model setting: local cutoff
    "cutoff_g": 20.0,             # model setting: global cutoff
    "flow": "target_to_source",   # model setting: message flow direction
})

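Before running the next cell, edit `sbf.py` in the repository and replace `numpy.math` with the standard-library `math` module (`numpy.math` was removed in NumPy 2.0).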

In [2]:
import os
import os.path as osp
import argparse
import numpy as np
import pandas as pd
import random
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch_geometric.data import DataLoader

from models import PAMNet, Config
from datasets import TUDataset

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    Also sets the cuDNN ``deterministic`` flag to True and the ``benchmark``
    flag to False.
    """
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all,
                    np.random.seed, random.seed):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

def test(model, loader, device):
    model.eval()

    pred_list = []
    y_list = []

    for data in loader:
        data = data.to(device)
        pred = model(data)
        pred_list += pred.reshape(-1).tolist()
        y_list += data.y.reshape(-1).tolist()

    pred = np.array(pred_list).reshape(-1,)
    pred = torch.tensor(pred).to(device)

    y = np.array(y_list).reshape(-1,)
    y = torch.tensor(y).to(device)

    loss = F.smooth_l1_loss(pred, y)
    return loss.item(), np.array(pred_list).reshape(-1,)

def main(args):
    """Train PAMNet on the 'train' split and checkpoint the best validation epoch.

    After every epoch the smooth L1 loss is measured on both the training and
    validation loaders with ``test``. Whenever the validation loss improves,
    the model's ``state_dict`` is written to ``./save/pamnet_rna_best.pt``.

    Args:
        args: Namespace providing ``gpu``, ``seed``, ``dataset``,
            ``batch_size``, ``dim``, ``n_layer``, ``cutoff_l``, ``cutoff_g``,
            ``flow``, ``lr``, ``wd`` and ``epochs``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if torch.cuda.is_available():
        torch.cuda.set_device(args.gpu)

    set_seed(args.seed)

    # Create dataset
    path = osp.join('.', 'data', args.dataset)
    train_dataset = TUDataset(path, name='train', use_node_attr=True).shuffle()
    val_dataset = TUDataset(path, name='val', use_node_attr=True)

    # Load dataset
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)
    print("Data loaded!")

    config = Config(dataset=args.dataset, dim=args.dim, n_layer=args.n_layer, cutoff_l=args.cutoff_l,
                    cutoff_g=args.cutoff_g, flow=args.flow)

    model = PAMNet(config).to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd, amsgrad=False)

    # The checkpoint folder does not change between epochs, so create it once
    # instead of re-checking it in every iteration.
    save_folder = os.path.join(".", "save")
    os.makedirs(save_folder, exist_ok=True)
    checkpoint_path = os.path.join(save_folder, "pamnet_rna_best.pt")

    print("Start training!")
    best_val_loss = None
    for epoch in range(args.epochs):
        # test() leaves the model in eval mode, so re-enable training mode
        # at the start of every epoch.
        model.train()

        for data in train_loader:
            data = data.to(device)
            optimizer.zero_grad()

            output = model(data)
            # NOTE(review): assumes output and data.y have matching shapes - confirm.
            loss = F.smooth_l1_loss(output, data.y)
            loss.backward()
            optimizer.step()

        train_loss, _ = test(model, train_loader, device)
        val_loss, _ = test(model, val_loader, device)

        print('Epoch: {:03d}, Train Loss: {:.7f}, Val Loss: {:.7f}'.format(epoch+1, train_loss, val_loss))

        if best_val_loss is None or val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)

# Training options, packed into a Namespace so main() can read them as attributes.
args = argparse.Namespace(**{
    "gpu": 0,                     # CUDA device index passed to torch.cuda.set_device
    "seed": 40,                   # value given to set_seed
    "dataset": "RNA-Puzzles",     # folder under ./data holding the train/val splits
    "epochs": 15,                 # number of passes over the training loader
    "lr": 1e-4,                   # Adam learning rate
    "wd": 0,                      # Adam weight decay
    "n_layer": 1,                 # forwarded to Config
    "dim": 16,                    # forwarded to Config
    "batch_size": 8,              # batch size of both data loaders
    "cutoff_l": 2.6,              # forwarded to Config
    "cutoff_g": 20.0,             # forwarded to Config
    "flow": "target_to_source",   # forwarded to Config
})

# Start training with the options above.
main(args)

  self.data, self.slices = torch.load(self.processed_paths[0])


Data loaded!
Start training!


Please either pass the dim explicitly or simply use torch.linalg.cross.
The default value of dim will change to agree with that of linalg.cross in a future release. (Triggered internally at C:\cb\pytorch_1000000000000\work\aten\src\ATen\native\Cross.cpp:66.)
  b = torch.cross(pos_ji, pos_kj).norm(dim=-1)


KeyboardInterrupt: 

The code execution was interrupted due to exceeding the allowed time limit.

## Run Inference

In [5]:
import os
import os.path as osp
import argparse
import numpy as np
import pandas as pd
import random
import torch
from torch_geometric.data import DataLoader

from models import PAMNet, Config
from datasets import TUDataset

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    Also sets the cuDNN ``deterministic`` flag to True and the ``benchmark``
    flag to False.
    """
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all,
                    np.random.seed, random.seed):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

def main(args):
    """Score every structure of a preprocessed dataset with a saved PAMNet model.

    Loads the dataset ``args.dataset`` from ``./data/RNA-Puzzles``, restores the
    weights stored in ``./save/<args.saved_model>``, predicts one value per
    graph, prints the predictions with summary statistics, and writes them to
    ``./rna_puzzles_predictions/<args.dataset>.csv``.

    Args:
        args: Namespace providing ``gpu``, ``seed``, ``dataset``,
            ``batch_size``, ``dim``, ``n_layer``, ``cutoff_l``, ``cutoff_g``,
            ``flow`` and ``saved_model``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``PAMNet`` (prediction),
        ``tag`` (structure name) and ``puzzle_number``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if torch.cuda.is_available():
        torch.cuda.set_device(args.gpu)

    set_seed(args.seed)

    # Create dataset
    path = osp.join('.', 'data', 'RNA-Puzzles')
    test_dataset = TUDataset(path, name=args.dataset, use_node_attr=True)

    # Load dataset
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False)
    print("Data loaded!")

    config = Config(dataset=args.dataset, dim=args.dim, n_layer=args.n_layer, cutoff_l=args.cutoff_l,
                    cutoff_g=args.cutoff_g, flow=args.flow)

    model = PAMNet(config).to(device)
    # torch.load unpickles the file; weights_only=True restricts loading to
    # tensors and plain containers, which is all a state_dict needs.
    state_dict = torch.load(osp.join('.', 'save', args.saved_model),
                            map_location=device, weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()

    print("Model loaded. Start prediction!")
    y_hat_list = []

    # Inference only: disable autograd so no graph is built per batch.
    with torch.no_grad():
        for data in test_loader:
            data = data.to(device)
            output = model(data)
            y_hat_list += output.reshape(-1).tolist()

    y_hat = np.array(y_hat_list).reshape(-1,)

    # The converter drops the last four characters of each stored file name.
    # ndmin=1 keeps the result one-dimensional even when the file holds a
    # single name.
    names_file = osp.join('.', 'data', 'RNA-Puzzles', args.dataset, 'raw',
                          args.dataset + '_graph_names.txt')
    name_list = np.loadtxt(names_file, dtype=str,
                           converters={0: lambda s: s[:-4]}, ndmin=1)

    # NOTE(review): the slice drops the first five characters of the dataset
    # name (e.g. 'rna_native' -> 'ative'); confirm this is the intended value.
    df = pd.DataFrame({
        'PAMNet': y_hat,
        'tag': name_list,
        'puzzle_number': args.dataset[5:],
    })

    # Print predictions with corresponding names
    print("\nPredictions:")
    print("=" * 50)
    print(f"{'Structure Name':<30} {'Prediction':<10}")
    print("-" * 50)
    for tag, prediction in zip(df['tag'], df['PAMNet']):
        print(f"{tag:<30} {prediction:>.4f}")
    print("=" * 50)

    # Print summary statistics
    print("\nSummary Statistics:")
    print(f"Mean prediction: {df['PAMNet'].mean():.4f}")
    print(f"Min prediction: {df['PAMNet'].min():.4f}")
    print(f"Max prediction: {df['PAMNet'].max():.4f}")
    print(f"Total structures evaluated: {len(df)}")

    output_dir = osp.join('.', 'rna_puzzles_predictions')
    os.makedirs(output_dir, exist_ok=True)

    file_name = osp.join(output_dir, args.dataset + '.csv')
    df.to_csv(file_name, sep=',', index=False)

    print("\nPrediction saved to:", file_name)

    return df  # Return the dataframe for further analysis if needed

# Inference options, packed into a Namespace so main() can read them as
# attributes. epochs, lr and wd are not read by the inference main().
args = argparse.Namespace(**{
    "gpu": 0,                       # CUDA device index passed to torch.cuda.set_device
    "seed": 40,                     # value given to set_seed
    "dataset": "rna_native",        # dataset folder under ./data/RNA-Puzzles
    "epochs": 150,
    "lr": 1e-4,
    "wd": 0,
    "n_layer": 1,                   # forwarded to Config
    "dim": 16,                      # forwarded to Config
    "batch_size": 16,               # batch size of the test loader
    "cutoff_l": 2.6,                # forwarded to Config
    "cutoff_g": 20.0,               # forwarded to Config
    "flow": "target_to_source",     # forwarded to Config
    "saved_model": "pamnet_rna.pt", # checkpoint file name inside ./save
})

# Run inference and keep the resulting DataFrame for later cells.
predictions_df = main(args)

  self.data, self.slices = torch.load(self.processed_paths[0])


Data loaded!


  model.load_state_dict(torch.load("./save/" + args.saved_model, map_location=device))


Model loaded. Start prediction!

Predictions:
Structure Name                 Prediction
--------------------------------------------------
rna_puzzle_20_NATIVE_5y87_RNA  2.6660
rna_puzzle_14_free_NATIVE_14_5ddo_free_solution_rpr 3.6808
rna_puzzle_14_bound_NATIVE_14_5ddp_bound_solution_rpr 3.6472
rna_puzzle_10_NATIVE_10_0_solution_4LCK_rpr 3.2907
rna_puzzle_1_NATIVE_1_solution_0_rpr 2.2482
rna_puzzle_7_NATIVE_7_0_solution_4r4v_rpr 3.3068
rna_puzzle_21_NATIVE_21_5nwq_solution_0_rpr 2.9127
rna_puzzle_6_NATIVE_6_0_solution_4GXY_rpr 3.5250
rna_puzzle_19_NATIVE_19_5t5a_solution_0_rpr 2.6696
rna_puzzle_15_NATIVE_15_solution_0_rpr 3.2712
rna_puzzle_12_NATIVE_12_4qln_solution_rpr 2.9578
rna_puzzle_17_NATIVE_17_5k7c_solution_rpr 3.2965
rna_puzzle_4_with_3IQP_NATIVE_4_0_solution_3V7E_rpr 3.3117
rna_puzzle_2_NATIVE_hacked_on  3.4877
rna_puzzle_18_with_4PQV_NATIVE_18_0_solution_5TPY_rpr 3.1077
rna_puzzle_9_2xnw_NATIVE_5kpy  3.2752
rna_puzzle_3_NATIVE_3_solution_0_rpr 2.8389
rna_puzzle_11_NATIVE    