In [1]:
import os

import numpy as np
import pandas as pd
import schnetpack as spk
import schnetpack.transform as trn
from ase import Atoms
from schnetpack.data import ASEAtomsData
from schnetpack.datasets import QM9
from schnetpack.transform import ASENeighborList

# Reference only: how the stock QM9 data module would be set up instead of the
# custom QM8 files used in this notebook. Kept as `#` comments rather than a
# string literal so the cell does not echo the text as its output.
# qm9data = QM9(
#     "./qm9.db",
#     batch_size=10,
#     num_train=110000,
#     num_val=10000,
#     transforms=[ASENeighborList(cutoff=5.0)],
# )
# qm9data.prepare_data()
# qm9data.setup()


'qm9data = QM9(\n    "./qm9.db",\n    batch_size=10,\n    num_train=110000,\n    num_val=10000,\n    transforms=[ASENeighborList(cutoff=5.0)],\n)\nqm9data.prepare_data()\nqm9data.setup()'

In [2]:
# Load the QM8 training archive. The per-atom arrays R (positions) and Z
# (atomic numbers) store all molecules back to back; N holds the atom count of
# each molecule and is used below to slice them apart.
#
# An NpzFile reads the requested array from the archive on every `[...]`
# access, so each array is materialised exactly once here instead of being
# indexed inside the loop. The context manager also closes the file handle.
with np.load("./qm8_train_dimenet.npz") as npz_file:
    data = {key: npz_file[key] for key in ("N", "R", "Z", "E1_CC2", "E2_CC2")}

numbers = data["N"]
R = data["R"]
Z = data["Z"]
e1_cc2 = data["E1_CC2"]
e2_cc2 = data["E2_CC2"]

atoms_list = []
property_list = []
atom_count = 0  # running offset of the current molecule into R and Z
for mol_ind, mol in enumerate(numbers):
    mol_slice = slice(atom_count, atom_count + mol)
    ats = Atoms(positions=R[mol_slice], numbers=Z[mol_slice])
    atoms_list.append(ats)
    atom_count += mol
    # Each property is wrapped in a one-element list (one value per molecule).
    properties = {
        "E1_CC2": [e1_cc2[mol_ind]],
        "E2_CC2": [e2_cc2[mol_ind]],
    }
    property_list.append(properties)

# Every atom must have been assigned to exactly one molecule.
assert atom_count == len(Z), (
    f"atom counts in N sum to {atom_count}, but Z holds {len(Z)} atoms"
)

print("Properties:", property_list[0])


Properties: {'E1_CC2': [0.1692052], 'E2_CC2': [0.25455461]}


In [4]:
# Build the ASE database for the training molecules once.
# ASEAtomsData.create refuses to overwrite an existing file, so on a re-run
# (Restart Kernel -> Run All) the database already on disk is reused instead.
train_db_path = "./qm8_train_schnet.db"
if os.path.exists(train_db_path):
    new_dataset = ASEAtomsData(train_db_path)
    # Guard against silently training on a database built from other data.
    if len(new_dataset) != len(atoms_list):
        raise RuntimeError(
            f"{train_db_path} holds {len(new_dataset)} systems but "
            f"{len(atoms_list)} molecules were loaded; delete the stale "
            "database (and its split file) and re-run this cell."
        )
else:
    new_dataset = ASEAtomsData.create(
        train_db_path,
        distance_unit="Ang",
        property_unit_dict={"E1_CC2": "Hartree", "E2_CC2": "Hartree"},
    )
    new_dataset.add_systems(property_list, atoms_list)

# Data module for training. Only E1_CC2 is loaded, although the database also
# stores E2_CC2.
custom_data = spk.data.AtomsDataModule(
    train_db_path,
    batch_size=10,
    distance_unit="Ang",
    property_units={
        "E1_CC2": "Hartree",
    },
    num_train=18607,
    num_val=1000,
    transforms=[
        trn.ASENeighborList(cutoff=5.0),
        trn.CastTo32(),
    ],
    num_workers=1,
    split_file="./qm8_split.npz",
    pin_memory=True,  # set to false, when not using a GPU
    load_properties=["E1_CC2"],
)
custom_data.prepare_data()
custom_data.setup()


In [5]:
cutoff = 5.0
n_atom_basis = 30

# Input module that calculates pairwise distances between atoms.
pairwise_distance = spk.atomistic.PairwiseDistances()

# Radial expansion of the distances; the same instance is handed to both
# representations defined below.
radial_basis = spk.nn.GaussianRBF(n_rbf=20, cutoff=cutoff)

# SchNet representation. It is constructed here but not passed to the model
# assembled at the end of this cell.
schnet_cutoff_fn = spk.nn.CosineCutoff(cutoff)
schnet = spk.representation.SchNet(
    n_atom_basis=n_atom_basis,
    n_interactions=3,
    radial_basis=radial_basis,
    cutoff_fn=schnet_cutoff_fn,
)

# Output head writing its prediction under the key "E1_CC2".
pred_e1 = spk.atomistic.Atomwise(n_in=n_atom_basis, output_key="E1_CC2")

# PaiNN representation; this is the one used by the model.
painn_cutoff_fn = spk.nn.CosineCutoff(cutoff)
painn = spk.representation.PaiNN(
    n_atom_basis=n_atom_basis,
    radial_basis=radial_basis,
    cutoff_fn=painn_cutoff_fn,
    n_interactions=3,
    activation=spk.nn.activations.shifted_softplus,
    shared_interactions=True,
    shared_filters=True,
)

# Full model: distances -> PaiNN -> E1_CC2 head, with outputs cast to 64 bit
# by the postprocessor.
nnpot = spk.model.NeuralNetworkPotential(
    representation=painn,
    input_modules=[pairwise_distance],
    output_modules=[pred_e1],
    postprocessors=[trn.CastTo64()],
)


In [6]:
import torch, torchmetrics
import pytorch_lightning as pl
import os

def _regression_output(property_name):
    """Return a ModelOutput for `property_name` with MSE loss and MAE / R2 metrics.

    Every call creates its own loss and metric objects, so outputs built with
    this helper do not share any metric state.
    """
    return spk.task.ModelOutput(
        name=property_name,
        loss_fn=torch.nn.MSELoss(),
        loss_weight=1.0,
        metrics={
            "MAE": torchmetrics.MeanAbsoluteError(),
            "r2": torchmetrics.R2Score(),
        },
    )


output_e1 = _regression_output("E1_CC2")
output_e2 = _regression_output("E2_CC2")


In [7]:
# Training task: the model defined above, supervised on E1_CC2 only and
# optimised with AdamW.
optimizer_settings = {"lr": 1e-3}
task = spk.task.AtomisticTask(
    model=nnpot,
    outputs=[output_e1],
    optimizer_cls=torch.optim.AdamW,
    optimizer_args=optimizer_settings,
)


/home/santiagovargas/anaconda3/envs/schnet/lib/python3.8/site-packages/pytorch_lightning/utilities/parsing.py:198: Attribute 'model' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['model'])`.


In [8]:
# Root directory for TensorBoard logs and the saved model of this run.
qm9tut = "./qm8_test"

logger = pl.loggers.TensorBoardLogger(save_dir=qm9tut)

# Keep only the single model with the lowest validation loss.
best_model_path = os.path.join(qm9tut, "best_inference_model")
checkpoint_callback = spk.train.ModelCheckpoint(
    model_path=best_model_path,
    save_top_k=1,
    monitor="val_loss",
)
callbacks = [checkpoint_callback]

trainer = pl.Trainer(
    callbacks=callbacks,
    logger=logger,
    default_root_dir=qm9tut,
    max_epochs=10,  # for testing, we restrict the number of epochs
)
trainer.fit(task, datamodule=custom_data)


Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA RTX A5000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: ./qm8_test/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name    | Type                   | Params
---------------------------------------------------
0 | model   | NeuralNetworkPotential | 15.5 K
1 | outputs | ModuleList             | 0     
---------------------------------------------------
15.5 K    Trainable params
0         Non-trainable params
15.5 K    Total params
0.062     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/santiagovargas/anaconda3/envs/schnet/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


                                                                           

/home/santiagovargas/anaconda3/envs/schnet/lib/python3.8/site-packages/pytorch_lightning/utilities/data.py:77: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 10. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
/home/santiagovargas/anaconda3/envs/schnet/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Epoch 9: 100%|██████████| 1861/1861 [00:25<00:00, 71.71it/s, v_num=0, val_loss=0.000689]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 1861/1861 [00:25<00:00, 71.71it/s, v_num=0, val_loss=0.000689]


In [9]:
# Load the QM8 test archive. The per-atom arrays R (positions) and Z (atomic
# numbers) store all molecules back to back; N holds the atom count of each
# molecule and is used below to slice them apart.
#
# An NpzFile reads the requested array from the archive on every `[...]`
# access, so each array is materialised exactly once here instead of being
# indexed inside the loop. The context manager also closes the file handle.
with np.load("./qm8_test_dimenet.npz") as npz_file:
    data = {key: npz_file[key] for key in ("N", "R", "Z", "E1_CC2")}

numbers = data["N"]
R = data["R"]
Z = data["Z"]
e1_cc2 = data["E1_CC2"]

atoms_list = []
property_list = []
atom_count = 0  # running offset of the current molecule into R and Z
for mol_ind, mol in enumerate(numbers):
    mol_slice = slice(atom_count, atom_count + mol)
    ats = Atoms(positions=R[mol_slice], numbers=Z[mol_slice])
    atoms_list.append(ats)
    atom_count += mol
    # Only E1_CC2 is stored for the test molecules.
    properties = {
        "E1_CC2": [e1_cc2[mol_ind]],
    }
    property_list.append(properties)

# Every atom must have been assigned to exactly one molecule.
assert atom_count == len(Z), (
    f"atom counts in N sum to {atom_count}, but Z holds {len(Z)} atoms"
)

# Build the ASE database for the test molecules once.
# ASEAtomsData.create refuses to overwrite an existing file, so on a re-run
# (Restart Kernel -> Run All) the database already on disk is reused instead.
test_db_path = "./qm8_test_schnet.db"
if os.path.exists(test_db_path):
    new_dataset = ASEAtomsData(test_db_path)
    # Guard against silently evaluating on a database built from other data.
    if len(new_dataset) != len(atoms_list):
        raise RuntimeError(
            f"{test_db_path} holds {len(new_dataset)} systems but "
            f"{len(atoms_list)} molecules were loaded; delete the stale "
            "database (and its split file) and re-run this cell."
        )
else:
    new_dataset = ASEAtomsData.create(
        test_db_path,
        distance_unit="Ang",
        property_unit_dict={
            "E1_CC2": "Hartree",
        },
    )
    new_dataset.add_systems(property_list, atoms_list)


In [10]:
num_test = len(new_dataset)

# Units and preprocessing for the held-out molecules.
test_property_units = {"E1_CC2": "Hartree"}
test_transforms = [
    trn.ASENeighborList(cutoff=5.0),
    trn.CastTo32(),
]

# The split sizes are given as floats: 0.01 / 0.99 / 0.0 for train / val / test.
# NOTE(review): presumably fractions of the database, chosen so that nearly all
# held-out molecules land in the validation split and can be scored with
# `trainer.validate` -- confirm against the SchNetPack data module docs.
custom_data_test = spk.data.AtomsDataModule(
    "./qm8_test_schnet.db",
    batch_size=10,
    distance_unit="Ang",
    property_units=test_property_units,
    num_train=0.01,
    num_val=0.99,
    num_test=0.0,
    transforms=test_transforms,
    num_workers=1,
    split_file="./qm8_split_test.npz",
    pin_memory=True,  # set to false, when not using a GPU
    load_properties=["E1_CC2"],
)
custom_data_test.prepare_data()
custom_data_test.setup()




In [11]:
# Score the trained task on the validation split of the held-out data module;
# the returned list of metric dicts is displayed as the cell output.
trainer.validate(model=task, datamodule=custom_data_test)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Validation DataLoader 0: 100%|██████████| 216/216 [00:02<00:00, 79.95it/s]


/home/santiagovargas/anaconda3/envs/schnet/lib/python3.8/site-packages/pytorch_lightning/utilities/data.py:77: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 6. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


[{'val_loss': 0.0006873265374451876,
  'val_E1_CC2_MAE': 0.020474446937441826,
  'val_E1_CC2_r2': 0.6275290250778198}]

In [None]:
# Load the best model saved during training and preview its predictions on the
# held-out molecules.
# NOTE: torch.load unpickles a complete Python object; only load files that
# were produced by this notebook.
best_model = torch.load(
    os.path.join(qm9tut, "best_inference_model"), map_location="cpu"
)
best_model.eval()

# Only supervised outputs have a reference value in the batch.
supervised_outputs = [
    output
    for output in task.outputs
    if not isinstance(output, spk.task.UnsupervisedModelOutput)
]
pred_dict = {output.name: [] for output in supervised_outputs}
target_dict = {output.target_property: [] for output in supervised_outputs}

# `custom_data_test` was set up with num_test=0.0, so the held-out molecules
# are served by its validation dataloader.
for batch in custom_data_test.val_dataloader():
    # Collect the reference values before calling the model.
    # NOTE(review): the model may store its outputs in the batch dict under the
    # same keys, which would overwrite the targets -- confirm.
    targets = {
        output.target_property: batch[output.target_property]
        for output in supervised_outputs
    }
    if "considered_atoms" in batch:
        targets["considered_atoms"] = batch["considered_atoms"]

    pred = best_model(batch)

    for output in supervised_outputs:
        pred_dict[output.name].append(pred[output.name].detach().numpy())
        target_dict[output.target_property].append(
            targets[output.target_property].detach().numpy()
        )

    print("Result dictionary:", pred)
    print("Target dictionary:", targets)
    # Preview only the first batch; remove the `break` to collect the whole split.
    break
