In [1]:
import os
import numpy as np
import lmdb
import pickle
from pathlib import Path

from copy import deepcopy


import dgl
from dgl import heterograph
from dgl import DGLGraph

import torch
from torch.utils.data import Dataset
import pytorch_lightning as pl

from bondnet.data.reaction_network import ReactionNetworkLMDB, ReactionLMDB
from bondnet.data.utils import construct_rxn_graph_empty, create_rxn_graph
from bondnet.model.training_utils import load_model_lightning
from bondnet.test_utils import get_defaults
from bondnet.data.dataset import ReactionDatasetLMDBDataset, ReactionNetworkLMDBDataset, LmdbMoleculeDataset, LmdbReactionDataset
from bondnet.data.dataloader import DataLoaderReactionNetworkLMDB, DataLoaderReactionLMDB, collate_parallel_lmdb, collate_parallel_lmdb_network, collate_parallel_lmdb_test
from bondnet.data.lmdb import TransformMol

In [2]:
env = lmdb.open(
            str("/home/santiagovargas/dev/bondnet/bondnet/scripts/helpers/test_rapter/reaction.lmdb"),
            subdir=False,
            readonly=False,
            lock=False,
            readahead=True,
            meminit=False,
            max_readers=1,
    )

length_entry = env.begin().get("length".encode("ascii"))
            

In [3]:
env = lmdb.open(
            str("/home/santiagovargas/dev/bondnet/bondnet/scripts/helpers/test_rapter/molecule.lmdb"),
            subdir=False,
            readonly=False,
            lock=False,
            readahead=True,
            meminit=False,
            max_readers=1,
    )

In [4]:
length_entry = env.begin().get("length".encode("ascii"))


In [5]:
elements = env.begin().get("elements".encode("ascii"))
pickle.loads(elements)

{'C', 'Cl', 'F', 'H', 'Li', 'Mg', 'N', 'O', 'P', 'S'}

In [6]:
config = {
    "src": "/home/santiagovargas/dev/bondnet/bondnet/scripts/helpers/test_rapter/molecule.lmdb"
}
mol = LmdbMoleculeDataset(config=config, transform=TransformMol)

config = {
    "src": "/home/santiagovargas/dev/bondnet/bondnet/scripts/helpers/test_rapter/reaction.lmdb"
}
reaction = LmdbReactionDataset(config=config)


In [7]:
reaction.feature_name

{'atom': ['total_degree',
  'total_H',
  'is_in_ring',
  'ring_size_3',
  'ring_size_4',
  'ring_size_5',
  'ring_size_6',
  'ring_size_7',
  'chemical_symbol_C',
  'chemical_symbol_F',
  'chemical_symbol_H',
  'chemical_symbol_N',
  'chemical_symbol_O',
  'chemical_symbol_Mg',
  'chemical_symbol_Li',
  'chemical_symbol_S',
  'chemical_symbol_Cl',
  'chemical_symbol_P',
  'chemical_symbol_O',
  'chemical_symbol_Br'],
 'bond': ['metal bond',
  'ring inclusion',
  'ring size_3',
  'ring size_4',
  'ring size_5',
  'ring size_6',
  'ring size_7',
  'bond_length'],
 'global': ['num atoms', 'num bonds', 'molecule weight']}

In [8]:
#rxn_ntwk.reactions.feature_size

In [9]:
#rxn_ntwk.reactions.feature_name["bond"]

In [10]:
#print features
#[print(rxn_ntwk.molecules.__getitem__(i)["molecule_graph"].ndata["ft"]["global"][0][-5:]) for i in range(10)]

In [11]:
#rxn_ntwk.molecules.__getitem__(0)["molecule_graph"]

In [12]:
#rxn_ntwk.reactions[0]

In [13]:
rxn_ntwk = ReactionLMDB(mol, reaction)
dataset = ReactionDatasetLMDBDataset(rxn_ntwk)

In [14]:
#rxn_ntwk.reactions[0]

In [29]:
dataloader = DataLoaderReactionLMDB(
    dataset, batch_size=100, shuffle=True, collate_fn=collate_parallel_lmdb_test
)

In [30]:
#dataset.reactions[0]["reaction_molecule_info"]["reactants"]

In [31]:
#dataset.__getitem__(0)[0]["reaction_molecule_info"]#["reactants"]

In [32]:
sample = next(iter(dataloader))
#print((sample[1]["reaction"][0]["reaction_molecule_info"]["reactants"].keys()))

In [33]:
len(sample)

2

In [34]:


config = get_defaults()

config = {
    "model": {
        "extra_features": [],
        "extra_info": [],
        "debug": False,
        "classifier": False,
        "classif_categories": 3,
        "filter_species": [3, 6],
        "filter_outliers": False,
        "filter_sparse_rxns": False,
        "restore": False,
    },
    "optim": {
        "val_size": 0.1,
        "test_size": 0.1,
        "batch_size": 4,
        "num_workers": 1,
    },
}

dataset_loc = "../../../tests/data/testdata/barrier_100.json"
config = {
    "dataset": {
        "data_dir": dataset_loc,
        "target_var": "ts",
    },
    "model": {
        "extra_features": [],
        "extra_info": [],
        "debug": False,
        "classifier": False,
        "classif_categories": 3,
        "filter_species": [3, 6],
        "filter_outliers": False,
        "filter_sparse_rxns": False,
        "restore": False,
    },
    "optim": {
        "val_size": 0.2,
        "test_size": 0.2,
        "batch_size": 4,
        "num_workers": 1,
    },
}
config_model = get_defaults()
# update config with model settings
for key, value in config_model["model"].items():
    config["model"][key] = value
for key, value in config_model["model"].items():
    config["model"][key] = value
    
#from bondnet.data.datamodule import BondNetLightningDataModule
#dm = BondNetLightningDataModule(config)
# feat_size, feat_name = dm.prepare_data()
# config["model"]["in_feats"] = feat_size
# config["model"]["in_feats"] = feat_size
# config = get_defaults()
#config["model"]["in_feats"] = dataset.feature_info["feature_size"]
#reaction = dataset.reaction_network.reactions[0]
config["model"]["in_feats"] = reaction.feature_size
model = load_model_lightning(config["model"], load_dir="./test_lmdb/")

NB: using GatedGCNConv
NB: using Set2SetThenCat
:::NO INITIALIZER USED:::


In [35]:

device = "cuda" if torch.cuda.is_available() else "cpu"
nodes = ["atom", "bond", "global"]
for it, batch in enumerate(dataloader):
    print(it)
    batched_graph, label = batch
    nodes = ["atom", "bond", "global"]
    feats = {nt: batched_graph.nodes[nt].data["ft"] for nt in nodes}
    target = label["value"].view(-1)
    target_aug = label["value_rev"].view(-1)
    empty_aug = torch.isnan(target_aug).tolist()
    empty_aug = True in empty_aug
    norm_atom = label["norm_atom"]
    norm_bond = label["norm_bond"]
    stdev = label["scaler_stdev"]
    mean = label["scaler_mean"]
    reactions = label["reaction"]

    if model.stdev is None:
        model.stdev = stdev[0]
    # stdev = stdev.to(device)

    #print(label["reaction"][0]["reaction_molecule_info"]["mappings"].keys())
    #print(label["reaction"][0]["reaction_molecule_info"]["mappings"]["num_bonds_total"])
    #print(label["reaction"][0]["reaction_molecule_info"]["mappings"]["num_atoms_total"])
    
    
    #for nt, ft in feats.items():
        #print(nt, ft.shape)
        #batched_graph.nodes[nt].data.update({"ft": ft})
    #print(batched_graph)
    graphs = dgl.unbatch(batched_graph)
    #print("atom")
    #[print(g.number_of_nodes("atom")) for g in graphs]
    #print("bond")
    #[print(g.number_of_nodes("bond")) for g in graphs]
    #reaction[0]
    #print(reaction[0]["reaction_graph"])
    #print(reaction[0]["reaction_feature"])

    #print(reactions[0]["mappings"])
    #print(reactions[0]["reaction_graph"])
    #print(reactions[0]["reaction_feature"])


    
    model(
        graph=batched_graph,
        feats=feats,
        reactions=reactions,
        norm_atom=norm_atom,
        norm_bond=norm_bond,
        reverse=False,
    )
    """
    for ind, rxn in enumerate(reactions):
        #print("rxn {}".format(ind))
        #print(rxn["reaction_molecule_info"])

        reactants = [
            graphs[i] for i in rxn["reaction_molecule_info"]["reactants"]["reactants"]
        ]
        products = [
            graphs[i] for i in rxn["reaction_molecule_info"]["products"]["products"]
        ]
        # print(rxn["reaction_molecule_info"]["products"]["products"])
        # print(len(products))

        #print(reactants)
        #print(products)
        mappings = rxn["mappings"]
        # print the sum of all the mappings lengths
        sum_length = sum([len(m) for m in mappings["atom_map"][0]])
        print("atom map items: {}".format(sum_length))
        print("atom map items: {}".format([len(m) for m in mappings["atom_map"][0]]))
        print("atom map items: {}".format([len(m) for m in mappings["atom_map"][1]]))

        print("bond map items: {}".format(sum_length))
        print("bond map items: {}".format([len(m) for m in mappings["bond_map"][0]]))
        print("bond map items: {}".format([len(m) for m in mappings["bond_map"][1]]))
        
        #reactant_atom_map = rxn["reaction_molecule_info"]["reactants"]["atom_map"]
        #product_atom_map = rxn["reaction_molecule_info"]["products"]["atom_map"]
        #reactant_bond_map = rxn["reaction_molecule_info"]["reactants"]["bond_map"]
        #product_bond_map = rxn["reaction_molecule_info"]["products"]["bond_map"]
        
        #print(mappings)
        
        g, fts = create_rxn_graph(
            reactants=reactants,
            products=products,
            mappings=mappings,
            device=device,
            has_bonds=None,
            reverse=False,
            reactant_only=False,
            empty_graph_fts=None,
        )
    """
    


0
1
2
3
4
5
6
7
8
9
10
11


In [23]:
reactions[0].keys()

dict_keys(['reaction_index', 'reaction_molecule_info', 'label', 'reverse_label', 'extra_info', 'mappings'])

In [32]:
project_name = "test_multi_gpu"


trainer = pl.Trainer(
    max_epochs=2,
    accelerator="gpu",
    devices=[0],
    accumulate_grad_batches=5,
    enable_progress_bar=True,
    gradient_clip_val=1.0,
    enable_checkpointing=True,
    precision=32,
)

trainer.fit(model, dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


/home/santiagovargas/anaconda3/envs/bondnet_new/lib/python3.11/site-packages/pytorch_lightning/trainer/configuration_validator.py:74: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
You are using a CUDA device ('NVIDIA RTX A5000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

   | Name            | Type              | Params
-------------------------------------------------------
0  | embedding       | UnifySize         | 124   
1  | gated_layers    | ModuleList        | 1.6 K 
2  | readout_layer   | Set2SetThenCat    | 2.6 K 
3  | fc_layers       | ModuleList        | 3.3 K 
4  | loss            | MeanSquaredError  | 0     
5  | train_r2        | R2Sc

Epoch 0:   0%|          | 0/1189 [00:00<?, ?it/s] reactant ft  torch.Size([15, 10]) torch.Size([15, 10]) 15 15 1 atom
reactant ft  torch.Size([18, 10]) torch.Size([17, 10]) 17 17 1 bond
Epoch 0:   0%|          | 1/1189 [00:00<06:22,  3.11it/s, v_num=20]reactant ft  torch.Size([9, 10]) torch.Size([9, 10]) 9 9 1 atom
reactant ft  torch.Size([10, 10]) torch.Size([8, 10]) 8 8 1 bond
Epoch 0:   0%|          | 2/1189 [00:00<03:27,  5.71it/s, v_num=20]reactant ft  torch.Size([17, 10]) torch.Size([17, 10]) 17 17 1 atom
reactant ft  torch.Size([18, 10]) torch.Size([16, 10]) 16 16 1 bond
Epoch 0:   0%|          | 3/1189 [00:00<02:30,  7.86it/s, v_num=20]reactant ft  torch.Size([21, 10]) torch.Size([21, 10]) 21 21 1 atom
reactant ft  torch.Size([22, 10]) torch.Size([21, 10]) 21 21 1 bond
Epoch 0:   0%|          | 4/1189 [00:00<02:01,  9.75it/s, v_num=20]reactant ft  torch.Size([9, 10]) torch.Size([9, 10]) 9 9 1 atom
reactant ft  torch.Size([9, 10]) torch.Size([9, 10]) 9 9 1 bond
Epoch 0:   0%|   

MisconfigurationException: ReduceLROnPlateau conditioned on metric val_loss which is not available. Available metrics are: ['train_loss', 'train_r2', 'train_l1', 'train_mse']. Condition can be set using `monitor` key in lr scheduler dict

In [20]:
from bondnet.model.training_utils import get_grapher
from bondnet.data.dataset import ReactionNetworkDatasetGraphs
from bondnet.data.dataloader import DataLoaderReactionNetworkParallel, collate_parallel

In [21]:

dataset_loc = "/home/santiagovargas/dev/bondnet/bondnet/dataset/rapter_new_parse/qtaim/test_rapter_filtered_species.pkl"


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

extra_keys = {"bond": ["bond_length"]}
precision = "32"

if precision == "16" or precision == "32":
    precision = int(precision)

extra_keys = {}

dataset = ReactionNetworkDatasetGraphs(
    grapher=get_grapher(extra_keys),
    file=dataset_loc,
    target="ts",
    classifier=False,
    classif_categories=3,
    filter_species=[5, 5],
    filter_outliers=False,
    filter_sparse_rxns=False,
    debug=False,
    extra_keys={"bond":["bond_length"]},
    extra_info={}
)

fg_list None
reading file from: /home/santiagovargas/dev/bondnet/bondnet/dataset/rapter_new_parse/qtaim/test_rapter_filtered_species.pkl
rxn raw len: 1188
Program finished in 6.885734417941421 seconds
.............failures.............
reactions len: 1188
valid ind len: 1188
bond break fail count: 		0
default fail count: 		0
sdf map fail count: 		0
product bond fail count: 	0
about to group and organize
number of grouped reactions: 1188
---> generating grouped reactions


grouped reactions:  43%|████▎     | 508/1188 [00:08<00:10, 65.52it/s]

grouped reactions:  45%|████▍     | 532/1188 [00:08<00:10, 62.14it/s]

In [24]:
dataset.feature_size

{'atom': 20, 'bond': 8, 'global': 3}

In [25]:
dataloader_normal = DataLoaderReactionNetworkParallel(
    dataset, batch_size=100, shuffle=True, collate_fn=collate_parallel
)

NameError: name 'DataLoaderReactionNetworkParallel' is not defined

In [26]:
next(iter(dataloader_normal))

NameError: name 'dataloader_normal' is not defined

In [27]:
config = get_defaults()

config = {
    "model": {
        "extra_features": [],
        "extra_info": [],
        "debug": False,
        "classifier": False,
        "classif_categories": 3,
        "filter_species": [3, 6],
        "filter_outliers": False,
        "filter_sparse_rxns": False,
        "restore": False,
    },
    "optim": {
        "val_size": 0.1,
        "test_size": 0.1,
        "batch_size": 4,
        "num_workers": 1,
    },
}

dataset_loc = "../../../tests/data/testdata/barrier_100.json"
config = {
    "dataset": {
        "data_dir": dataset_loc,
        "target_var": "ts",
    },
    "model": {
        "extra_features": [],
        "extra_info": [],
        "debug": False,
        "classifier": False,
        "classif_categories": 3,
        "filter_species": [3, 6],
        "filter_outliers": False,
        "filter_sparse_rxns": False,
        "restore": False,
    },
    "optim": {
        "val_size": 0.2,
        "test_size": 0.2,
        "batch_size": 4,
        "num_workers": 1,
    },
}
config_model = get_defaults()
# update config with model settings
for key, value in config_model["model"].items():
    config["model"][key] = value
for key, value in config_model["model"].items():
    config["model"][key] = value
    
#from bondnet.data.datamodule import BondNetLightningDataModule
#dm = BondNetLightningDataModule(config)
# feat_size, feat_name = dm.prepare_data()
# config["model"]["in_feats"] = feat_size
# config["model"]["in_feats"] = feat_size
# config = get_defaults()
#config["model"]["in_feats"] = dataset.feature_info["feature_size"]
#reaction = dataset.reaction_network.reactions[0]
config["model"]["in_feats"] = dataset.feature_size
model = load_model_lightning(config["model"], load_dir="./test_lmdb/")

NB: using GatedGCNConv
NB: using Set2SetThenCat
:::NO INITIALIZER USED:::


In [28]:
device = "cuda" if torch.cuda.is_available() else "cpu"
nodes = ["atom", "bond", "global"]
for it, (batched_graph, label) in enumerate(dataloader_normal):
    feats = {nt: batched_graph.nodes[nt].data["ft"] for nt in nodes}
    target = label["value"].view(-1).to(device)
    norm_atom = None
    norm_bond = None
    stdev = torch.tensor([1.0])
    # print(feats.keys())
    # if device is not None:
    # feats = {k: v.to(device) for k, v in feats.items()}
    # target = target.to(device)
    # norm_atom = norm_atom.to(device)
    # norm_bond = norm_bond.to(device)
    # stdev = stdev.to(device)

    #print(label["reaction"][0]["reaction_molecule_info"]["mappings"].keys())
    #print(label["reaction"][0]["reaction_molecule_info"]["mappings"]["num_bonds_total"])
    #print(label["reaction"][0]["reaction_molecule_info"]["mappings"]["num_atoms_total"])
    
    #reactions = label["reaction"]
    #for nt, ft in feats.items():
    #    batched_graph.nodes[nt].data.update({"ft": ft})

    #graphs = dgl.unbatch(batched_graph)
    #reaction[0]
    #print(reaction[0]["reaction_graph"])
    #print(reaction[0]["reaction_feature"])

    #print(reactions[0]["mappings"])
    #print(reactions[0]["reaction_graph"])
    #print(reactions[0]["reaction_feature"])

    #print(reactions[0]["reaction_feature"]["global"].shape)
    #print(reactions[0]["reaction_feature"]["bond"].shape)
    #print(reactions[0]["reaction_feature"]["atom"].shape)
    #print(reactions[0]["mappings"])
    #print(reactions[0]["reaction_feature"])
    #print(reactions[0]["reaction_feature"])
    model(
        graph=batched_graph,
        feats=feats,
        reactions=reactions,
        norm_atom=norm_atom,
        norm_bond=norm_bond,
        reverse=False,
    )

NameError: name 'dataloader_normal' is not defined

In [None]:
reactions[0]

<bondnet.data.reaction_network.ReactionInNetwork at 0x7f002bd2cc90>