In [1]:
import numpy as np
import pandas as pd
from copy import deepcopy
import networkx as nx

import torch
import dgl
from tqdm import tqdm
from qtaim_embed.utils.grapher import get_grapher

from qtaim_embed.data.molwrapper import mol_wrappers_from_df
from qtaim_embed.utils.tests import get_data
from qtaim_embed.core.dataset import HeteroGraphNodeLabelDataset
from qtaim_embed.data.dataloader import DataLoaderMoleculeNodeTask
from qtaim_embed.models.node_level.base_gcn import GCNNodePred

In [2]:
# Build the heterograph dataset from the labelled xyz_qm8 pickle.
# (An unlabelled alternative, "molecules_full.pkl", sits in the same folder.)
labelled_pickle = (
    "/home/santiagovargas/dev/qtaim_embed/data/xyz_qm8/molecules_qtaim_labelled.pkl"
)

# Columns handed to the dataset as `extra_keys`, grouped by node type.
# Other bond columns seen in this notebook but not enabled here:
# "extra_feat_bond_ellip_e_dens", "extra_feat_bond_eta".
feature_keys = {
    "atom": ["extra_feat_atom_esp_total"],
    "bond": ["bond_length", "extra_feat_bond_esp_total"],
    "global": ["extra_feat_global_E1_CAM"],
}

# Columns handed to the dataset as `target_dict`; only a "global" entry is
# requested (atom/bond targets are left out).
label_keys = {"global": ["extra_feat_global_E1_CAM"]}

dataset = HeteroGraphNodeLabelDataset(
    file=labelled_pickle,
    extra_keys=feature_keys,
    target_dict=label_keys,
    extra_dataset_info={},
    allowed_ring_size=[3, 4, 5, 6, 7],
    allowed_charges=None,
    self_loop=True,
    debug=True,
    # scaling switches
    log_scale_features=True,
    log_scale_targets=False,
    standard_scale_features=True,
    standard_scale_targets=True,
)

# Candidate bond-level columns; not referenced again in the visible cells.
bond = [
    "extra_feat_bond_esp_total",
    "extra_feat_bond_ellip_e_dens",
    "extra_feat_bond_eta",
]

ModuleNotFoundError: No module named 'pickle5'

In [3]:
# Report how many samples the dataset holds.
print(len(dataset))

100


In [4]:
# Split the dataset 50/50 into train and validation subsets. The split is
# driven by a seeded generator so it is reproducible across kernel restarts.
from torch.utils.data import random_split

generator = torch.Generator().manual_seed(42)
train_size = int(0.5 * len(dataset))
val_size = len(dataset) - train_size  # remainder, so the two sizes always sum up
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=generator
)


train_dataloader = DataLoaderMoleculeNodeTask(
    train_dataset, batch_size=100, shuffle=True, num_workers=4
)

# One training batch, kept around for the interactive forward-pass checks below.
batch_graph, batch_label = next(iter(train_dataloader))

# Validation data is iterated in a fixed order: shuffling brings no benefit for
# evaluation and Lightning warns when a validation loader has shuffling enabled.
val_dataloader = DataLoaderMoleculeNodeTask(
    val_dataset, batch_size=100, shuffle=False, num_workers=4
)

In [5]:
# Input widths per node type, as reported by the dataset.
len_dict = dataset.feature_size()
atom_input_size, bond_input_size, global_input_size = (
    len_dict[node_type] for node_type in ("atom", "bond", "global")
)

model = GCNNodePred(
    # input sizes and prediction target
    atom_input_size=atom_input_size,
    bond_input_size=bond_input_size,
    global_input_size=global_input_size,
    target_dict={"global": ["extra_feat_global_E1_CAM"]},
    # convolution stack settings
    conv_fn="ResidualBlock",
    n_conv_layers=10,
    resid_n_graph_convs=3,
    activation="ReLU",
    batch_norm=True,
    dropout=0.2,
    bias=True,
    # learning-rate / regularisation settings
    lr=0.02,
    lr_plateau_patience=10,
    weight_decay=0.0001,
)

{'a2b': {'in_feats': 13, 'out_feats': 9, 'norm': 'both', 'bias': True, 'activation': ReLU(), 'allow_zero_in_degree': True, 'dropout': 0.2, 'batch_norm_tf': True}, 'b2a': {'in_feats': 9, 'out_feats': 13, 'norm': 'both', 'bias': True, 'activation': ReLU(), 'allow_zero_in_degree': True, 'dropout': 0.2, 'batch_norm_tf': True}, 'a2g': {'in_feats': 13, 'out_feats': 3, 'norm': 'both', 'bias': True, 'activation': ReLU(), 'allow_zero_in_degree': True, 'dropout': 0.2, 'batch_norm_tf': True}, 'b2g': {'in_feats': 9, 'out_feats': 3, 'norm': 'both', 'bias': True, 'activation': ReLU(), 'allow_zero_in_degree': True, 'dropout': 0.2, 'batch_norm_tf': True}, 'g2a': {'in_feats': 3, 'out_feats': 13, 'norm': 'both', 'bias': True, 'activation': ReLU(), 'allow_zero_in_degree': True, 'dropout': 0.2, 'batch_norm_tf': True}, 'g2b': {'in_feats': 3, 'out_feats': 9, 'norm': 'both', 'bias': True, 'activation': ReLU(), 'allow_zero_in_degree': True, 'dropout': 0.2, 'batch_norm_tf': True}, 'a2a': {'in_feats': 13, 'out_

In [6]:
# Disabled scratch code kept inside a bare string literal: nothing in it runs,
# and evaluating the cell only echoes the string back as the cell output.
"""
feats = batch_graph.ndata["feat"]
for layer in model.conv_layers:
    feats = layer(batch_graph, feats)


for key in ["atom", "bond", "global"]:
    feats[key] = _split_batched_output(batch_graph, feats[key], key)


def get_targets(self, targets_feats):
    targets = {}
    for k, v in self.target_dict.items():
        # if v is None or [] skip 
        if not (v is None or len(v) == 0):
            targets[k] = targets_feats[k]
    #[print(i) for i in list(targets.values())]
    # concat dict of tensors into one tensor
    list(targets.values())
    targets = torch.cat(list(targets.values()), dim=1)
    return targets    

targets = get_targets(model, feats)
"""

'\nfeats = batch_graph.ndata["feat"]\nfor layer in model.conv_layers:\n    feats = layer(batch_graph, feats)\n\n\nfor key in ["atom", "bond", "global"]:\n    feats[key] = _split_batched_output(batch_graph, feats[key], key)\n\n\ndef get_targets(self, targets_feats):\n    targets = {}\n    for k, v in self.target_dict.items():\n        # if v is None or [] skip \n        if not (v is None or len(v) == 0):\n            targets[k] = targets_feats[k]\n    #[print(i) for i in list(targets.values())]\n    # concat dict of tensors into one tensor\n    list(targets.values())\n    targets = torch.cat(list(targets.values()), dim=1)\n    return targets    \n\ntargets = get_targets(model, feats)\n'

In [7]:
# Smoke-test a single forward pass on the sampled training batch.
# The module is called directly instead of via `.forward(...)` so the call goes
# through `nn.Module.__call__` and any registered hooks are honoured.
feats = model(batch_graph, batch_graph.ndata["feat"])

In [8]:
# Inspect the shape of the "atom" entry produced by the forward pass.
feats["atom"].shape

torch.Size([804, 13])

In [9]:
# Disabled scratch code kept inside a bare string literal: nothing in it runs,
# and evaluating the cell only echoes the string back as the cell output.
"""
optimizer, lr_scheduler = model.configure_optimizers()
optimizer = optimizer[0]
lr_scheduler = lr_scheduler[0]
"""

'\noptimizer, lr_scheduler = model.configure_optimizers()\noptimizer = optimizer[0]\nlr_scheduler = lr_scheduler[0]\n'

In [10]:
import pytorch_lightning as pl

torch.set_float32_matmul_precision("high")

# Trainer settings gathered in one mapping, then expanded into the constructor.
trainer_options = dict(
    accelerator="gpu",
    devices=1,
    precision="bf16",
    max_epochs=100,
    gradient_clip_val=3.0,
    log_every_n_steps=10,
    enable_progress_bar=True,
    default_root_dir="./test/",
)
trainer_transfer = pl.Trainer(**trainer_options)

# Fit on the training loader, validating on the held-out loader.
trainer_transfer.fit(model, train_dataloader, val_dataloader)

  rank_zero_warn(
Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

   | Name            | Type               | Params
--------------------------------------------------------
0  | conv_layers     | ModuleList         | 10.1 K
1  | loss            | MultioutputWrapper | 0     
2  | train_r2        | MultioutputWrapper | 0     
3  | train_torch_l1  | MultioutputWrapper | 0     
4  | train_torch_mse | MultioutputWrapper | 0     
5  | val_r2          | MultioutputWrapper | 0     
6  | val_torch_l1    | MultioutputWrapper | 0     
7  | val_torch_mse   | MultioutputWrapper | 0     
8  | test_r2         | MultioutputWrapper | 0     
9  | test_torch_l1   | MultioutputWrapper | 0     
10 | test_torch_mse  | MultioutputWrapper | 0     
--------------------------------------------------------
10.1 K    Tr

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]



Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 00012: reducing learning rate of group 0 to 1.0000e-02.


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 00023: reducing learning rate of group 0 to 5.0000e-03.


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f2926bb51c0>
Traceback (most recent call last):
  File "/home/santiagovargas/anaconda3/envs/qtaim_embed/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/home/santiagovargas/anaconda3/envs/qtaim_embed/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 1442, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/home/santiagovargas/anaconda3/envs/qtaim_embed/lib/python3.11/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
          ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/santiagovargas/anaconda3/envs/qtaim_embed/lib/python3.11/multiprocessing/popen_fork.py", line 40, in wait
    if not wait([self.sentinel], timeout):
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/santiagovargas/anaconda3/envs/qtaim_embed/lib/python3.11/multiprocessing/connection.py", line 9

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 00034: reducing learning rate of group 0 to 2.5000e-03.


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 00045: reducing learning rate of group 0 to 1.2500e-03.


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 00056: reducing learning rate of group 0 to 6.2500e-04.


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 00067: reducing learning rate of group 0 to 3.1250e-04.


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 00078: reducing learning rate of group 0 to 1.5625e-04.


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 00089: reducing learning rate of group 0 to 7.8125e-05.


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=100` reached.


Epoch 00100: reducing learning rate of group 0 to 3.9063e-05.


In [None]:
# Load the pickled QTAIM table that the following cells extend with label columns.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df = pd.read_pickle(
    "/home/santiagovargas/dev/qtaim_embed/data/xyz_qm8/molecules_qtaim.pkl"
)

In [None]:
# Display the "names" column; the labelling loop uses each entry as an xyz file name.
df.names

In [None]:
# Attach the labels from the CSV to `df` as "extra_feat_global_*" columns.
# For each molecule, the id on line 2 of its xyz file is matched against the
# "gdb9_index" column of the label table.
root_xyz = "../../../data/xyz_qm8/xyz/"
csv = "../../../data/xyz_qm8/qm8.sdf.csv"

df_labels = pd.read_csv(csv)
col_names_labels = list(df_labels.columns)
gdb9_index = df_labels["gdb9_index"]  # loop-invariant, so looked up once

data_added = []
# Walk the names positionally: the new columns are assigned from plain lists
# (i.e. by position), so the loop must not depend on df having a 0..n-1 index.
for name in tqdm(df["names"].tolist()):
    # Only the second line of the xyz file is needed; skip the first one.
    with open(root_xyz + name, "r") as f:
        f.readline()
        line = f.readline()
    # The id is the second whitespace-separated token on that line.
    mol_id = float(line.split()[1])

    row_hit = df_labels[gdb9_index == mol_id]
    if row_hit.empty:
        # Same exception type as the bare `[0]` lookup would raise, but with
        # enough context to find the offending molecule.
        raise IndexError(
            f"no row in {csv!r} with gdb9_index == {mol_id} (xyz file {name!r})"
        )
    # Keep the first matching row as a plain list of values.
    data_added.append(row_hit.values.tolist()[0])

# One new column per label column, prefixed with "extra_feat_global_" and with
# "." and "-" replaced by "_".
for i, col_name in enumerate(col_names_labels):
    df[
        "extra_feat_global_" + col_name.replace(".", "_").replace("-", "_")
    ] = [row[i] for row in data_added]

In [None]:
# List the columns of df, e.g. to confirm the "extra_feat_global_*" additions.
df.columns

In [None]:
# Persist the labelled table; this is the same path the dataset cell reads via `file=`.
df.to_pickle(
    "/home/santiagovargas/dev/qtaim_embed/data/xyz_qm8/molecules_qtaim_labelled.pkl"
)

In [4]:
import pandas as pd

# Write the first 100 rows of the labelled table out as a fixture under tests/data.
labelled_pkl = (
    "/home/santiagovargas/dev/qtaim_embed/data/xyz_qm8/molecules_qtaim_labelled.pkl"
)
fixture_pkl = "/home/santiagovargas/dev/qtaim_embed/tests/data/labelled_data.pkl"

df = pd.read_pickle(labelled_pkl).head(100)
df.to_pickle(fixture_pkl)