In [None]:
!pip install schnetpack

In [None]:
!pip uninstall --y numpy
!pip install numpy==1.23

# Import Libraries

In [None]:
import os
import time
import schnetpack as spk
from schnetpack.datasets import QM9
import schnetpack.transform as trn

import numpy as np
from tqdm import tqdm

from NeuralModel import AtomNeuralNet
from NeuralModel import TrainModel
from NeuralModel import Inference

Instructions for updating:
non-resource variables are not supported in the long term


# Auxiliary Functions

In [None]:
def Euclidian_Distance(vec_a, vec_b):
  """Return the Euclidean (L2) distance between two coordinate vectors.

  Args:
    vec_a: array-like with the coordinates of the first point.
    vec_b: array-like with the coordinates of the second point (same shape as `vec_a`).

  Returns:
    The distance as a NumPy float for 1-D inputs.
  """
  # np.asarray lets plain lists/tuples be passed in, not only NumPy arrays.
  diff = np.asarray(vec_a, dtype=float) - np.asarray(vec_b, dtype=float)
  # Reduce over the first axis, exactly like the builtin sum() did, but in vectorised NumPy code.
  return np.sqrt(np.sum(diff ** 2, axis=0))


def FeatureMatrix(simbols, Max_dim):
  """Build the one-hot element encoding of a molecule, zero-padded to `Max_dim` rows.

  Args:
    simbols: sequence with the atomic number of each atom in the molecule.
    Max_dim: number of rows of the returned matrix; rows past the last atom stay zero.

  Returns:
    Float array of shape (Max_dim, 5); row i has a single 1 in the column assigned
    to the element of atom i.

  Raises:
    KeyError: if an atomic number is not one of 6, 9, 1, 7, 8.
  """
  # Column assigned to each atomic number present in QM9.
  column_of = {6: 0, 9: 1, 1: 2, 7: 3, 8: 4} #QM9
  one_hot = np.zeros(shape=(Max_dim, len(column_of)), dtype=float, order='C')

  for row, atomic_number in enumerate(simbols):
    one_hot[row, column_of[atomic_number]] = 1

  return one_hot

def Distance_Matrix(coordinates, max_dim):
  """Build the pairwise Euclidean distance matrix of a molecule, zero-padded to `max_dim`.

  Args:
    coordinates: array-like of shape (n_atoms, n_coordinates) with one row per atom.
    max_dim: side length of the returned square matrix; must be >= n_atoms.

  Returns:
    Float array of shape (max_dim, max_dim). Entry [i, j] is the distance between
    atoms i and j; rows/columns past the last atom stay zero.

  Raises:
    IndexError: if the molecule has more atoms than `max_dim`.
  """
  coordinates = np.asarray(coordinates, dtype=float)
  n_atoms = coordinates.shape[0]
  if n_atoms > max_dim:
    raise IndexError(
        "molecule has %d atoms but the distance matrix only has room for %d" % (n_atoms, max_dim))

  matrix_dist = np.zeros(shape=(max_dim, max_dim), dtype=float, order='C')

  if n_atoms:
    # Broadcasting builds every (i, j) difference vector at once, replacing the
    # Python-level double loop over atom pairs.
    deltas = coordinates[:, None, :] - coordinates[None, :, :]
    matrix_dist[:n_atoms, :n_atoms] = np.sqrt((deltas ** 2).sum(axis=-1))

  return matrix_dist

###  Choose the Property



In [None]:
# Target property to predict. Keep exactly one assignment active; the label is
# translated into the matching QM9 property key in the "Load Data" cell.
# NOTE(review): in QM9 these labels presumably denote U0 = internal energy at 0 K,
# U = internal energy at 298.15 K, H = enthalpy, G = free energy - confirm against the schnetpack docs.
molecule_property = 'U0'
#molecule_property = 'U'
#molecule_property = 'H'
#molecule_property = 'G'

# Load Data
* Here we use the data available in schnetpack.

In [None]:
# Translate the short property label into the QM9 property key used by schnetpack.
# An explicit lookup fails loudly on an unknown label instead of leaving `aux_prop`
# undefined (or silently stale from a previous run of this cell).
qm9_property_keys = {'U': QM9.U, 'U0': QM9.U0, 'H': QM9.H, 'G': QM9.G}
if molecule_property not in qm9_property_keys:
  raise ValueError(
      "Unknown molecule_property %r; expected one of %s"
      % (molecule_property, sorted(qm9_property_keys)))
aux_prop = qm9_property_keys[molecule_property]

# Working directory that holds the train/validation/test split file.
qm9tut = './qm9tut'
# exist_ok=True makes the call safe to re-run and checks the same path that is created.
os.makedirs(qm9tut, exist_ok=True)

# Build the schnetpack QM9 data module. 110,000 molecules are requested for training and
# 10,000 for validation; the molecules left over make up the test split.
qm9data = QM9(
    './qm9.db',
    batch_size=32,
    num_train=110000,
    num_val=10000,
    transforms=[
        trn.ASENeighborList(cutoff=5.),
        # RemoveOffsets stays disabled on purpose: the atomic reference values and the
        # per-atom mean are removed manually in the "Target Variable transformation" section.
        #trn.RemoveOffsets(aux_prop, remove_mean=True, remove_atomrefs=True),
        trn.CastTo32()
    ],
    property_units={aux_prop: 'eV'},  # request the target property in eV
    num_workers=1,
    split_file=os.path.join(qm9tut, "split.npz"),
    pin_memory=True, # set to False when not using a GPU
    load_properties=[aux_prop], # only load the selected target property
)
# NOTE(review): following the schnetpack data-module convention, prepare_data() presumably
# downloads/creates the database when it is missing and setup() builds the splits - confirm.
qm9data.prepare_data()
qm9data.setup()

100%|██████████| 133885/133885 [02:11<00:00, 1016.41it/s]


# Exploring the dataset
* Checking how the data is available in the dataset

In [None]:
# Index (within the training split) of the example molecule inspected in the cells below.
index_molecule = 20

In [None]:
# Atom positions of the example molecule, one [x, y, z] entry per atom
qm9data.train_dataset[index_molecule]['_positions'].tolist()

[[-0.2540229558944702, 1.381105899810791, -0.12841194868087769],
 [1.0786817073822021, 1.0764003992080688, 0.24904243648052216],
 [0.16221964359283447, -0.011173426173627377, 0.13368827104568481],
 [-0.23729750514030457, -0.6998353004455566, 1.388474464416504],
 [-0.2970220744609833, -0.14717115461826324, 2.533884048461914],
 [-0.719992458820343, -1.1729893684387207, 3.499551773071289],
 [-1.003083348274231, -2.4183449745178223, 2.6593503952026367],
 [-2.005434274673462, -2.314920425415039, 1.5754659175872803],
 [-0.5606759786605835, -2.097254991531372, 1.278766393661499],
 [-0.845435380935669, 1.8904658555984497, 0.6296995878219604],
 [-0.39429640769958496, 1.7099884748458862, -1.1575686931610107],
 [0.3398406505584717, -0.6779918670654297, -0.7070125341415405],
 [0.08816585689783096, -1.3524638414382935, 4.218472957611084],
 [-1.5842136144638062, -0.816853940486908, 4.071690082550049],
 [-0.7190896272659302, -3.39628267288208, 3.0329225063323975],
 [-2.4234111309051514, -3.2138395309

In [None]:
# Atomic number of each atom in the example molecule
qm9data.train_dataset[index_molecule]['_atomic_numbers'].tolist()

[6, 8, 6, 6, 7, 6, 6, 6, 7, 1, 1, 1, 1, 1, 1, 1, 1]

In [None]:
# Target value of the selected property for the example molecule
# (in eV, the unit requested through `property_units` when the dataset was created)
print("Target", aux_prop, qm9data.train_dataset[index_molecule][aux_prop].tolist()[0])

Target energy_U0 -11373.28515625


In [None]:
# Reference values
# Each chemical element has a constant reference value that the model does not need to learn.
# This is why these values are removed from the target during the data preparation.
atomic_number = 6
# `atomrefs[aux_prop]` is indexed by atomic number to obtain the reference value of that element.
atomrefs = qm9data.train_dataset.atomrefs
print("Atomic number:", atomic_number, "    Reference Value:", atomrefs[aux_prop][atomic_number].item())

Atomic number: 6     Reference Value: -1029.863037109375


# Prepare Data for Model Training
* Create the features matrices - one hot encoding based on chemical element
* Create the atoms distance matrix

In [None]:
# Per-molecule containers for the TRAINING split ("treino" = training):
# one-hot features, distance matrices, atom counts, targets and summed atomic references.
lst_features_treino = []
lst_distancias_treino = []
lst_mol_sizes_treino = []
lst_target_treino = []
lst_batch_ref_atms_treino = []

# Same containers for the VALIDATION split.
lst_features_valid = []
lst_distancias_valid = []
lst_mol_sizes_valid = []
lst_target_valid = []
lst_batch_ref_atms_valid = []

In [None]:
def _collect_split(dataset, target_key, max_atoms=29):
  """Convert every molecule of one dataset split into the per-molecule model inputs.

  Args:
    dataset: indexable split (e.g. `qm9data.train_dataset`) whose items are dicts holding
      '_positions', '_atomic_numbers' and `target_key` tensors, and which exposes `atomrefs`.
    target_key: property key of the target to read.
    max_atoms: padding size of the feature and distance matrices (29 is the value used
      throughout this notebook).

  Returns:
    Tuple (features, distances, mol_sizes, targets, ref_sums) of lists with one entry per
    molecule: one-hot feature matrix, distance matrix, number of atoms, raw target value
    and the sum of the atomic reference values of the molecule's atoms.
  """
  # The reference values belong to the split, not to a molecule: look them up once.
  reference_values = dataset.atomrefs[target_key]

  features, distances, mol_sizes, targets, ref_sums = [], [], [], [], []
  for idx in tqdm(range(len(dataset))):
    # Index the dataset once per molecule and reuse the sample; the previous version
    # indexed it five times per molecule, repeating the (costly) sample loading each time.
    sample = dataset[idx]
    atomic_numbers = sample['_atomic_numbers'].tolist()
    positions = np.array(sample['_positions'].tolist())

    # Sum of the atomic reference values of all atoms in the molecule.
    ref_sum = 0
    for atomic_number in atomic_numbers:
      ref_sum = ref_sum + reference_values[atomic_number].item()

    features.append(FeatureMatrix(atomic_numbers, max_atoms))
    distances.append(Distance_Matrix(positions, max_atoms))
    mol_sizes.append(len(atomic_numbers))
    targets.append(sample[target_key].tolist()[0])
    ref_sums.append(ref_sum)

  return features, distances, mol_sizes, targets, ref_sums


print("Loading Training data ...")
inicio = time.time()

# The lists are rebuilt from scratch (not appended to), so re-running this cell
# cannot duplicate molecules.
######################## Train Data #############################
(lst_features_treino, lst_distancias_treino, lst_mol_sizes_treino,
 lst_target_treino, lst_batch_ref_atms_treino) = _collect_split(qm9data.train_dataset, aux_prop)

######################## Validation Data #############################
(lst_features_valid, lst_distancias_valid, lst_mol_sizes_valid,
 lst_target_valid, lst_batch_ref_atms_valid) = _collect_split(qm9data.val_dataset, aux_prop)

fim = time.time()
print("Time(s) for data loading:", fim-inicio)

Loading Training data ...


100%|██████████| 110000/110000 [29:11<00:00, 62.81it/s]
100%|██████████| 10000/10000 [02:40<00:00, 62.25it/s]

Time(s) for data loading: 1911.904590845108





# Target Variable transformation
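* schnetpack provides this standardization process but, for the sake of clarity, let's do it manually.
* $t_{m}$ = target value of molecule $m$ without transformation
* $A_{m}$ = number of atoms in molecule $m$
* $ref_{m}$ = sum of the atomic reference values of the atoms in molecule $m$
* $n$ = number of molecules in the data split
* $\widetilde{t}_{m} = \frac{t_{m} - ref_{m}}{A_{m}}$ (per-atom target without the reference values)
* $\bar{\widetilde{t}} = \frac{1}{n} \cdot \sum_{m=1}^{n} \widetilde{t}_{m}$
* $\sigma_{\widetilde{t}}^{2} = \frac{1}{n} \sum_{m=1}^{n} (\widetilde{t}_{m} - \bar{\widetilde{t}})^{2}$ (the $\frac{1}{n}$ normalization is what `np.std` computes by default)
* $t'_{m} = A_{m} \cdot \frac{\widetilde{t}_{m} - \bar{\widetilde{t}}}{\sigma_{\widetilde{t}}}$ (transformed target used to train the model)
* Inverse transformation: $t_{m} = ref_{m} + A_{m} \cdot \bar{\widetilde{t}} + t'_{m} \cdot \sigma_{\widetilde{t}}$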

In [None]:
# NOTE: this cell overwrites lst_target_treino / lst_target_valid with their transformed
# values, so it must be executed exactly once after the data-loading cell.

### Train Data
# Per-atom target with the atomic reference values removed: (t_m - ref_m) / A_m
per_atom_treino = [
    (target - ref_sum) / n_atoms
    for target, ref_sum, n_atoms in zip(lst_target_treino, lst_batch_ref_atms_treino, lst_mol_sizes_treino)
]
mean_target_aux = np.mean(per_atom_treino)
std_target_aux = np.std(per_atom_treino)
# Standardized target: A_m * (per-atom value - mean) / std
lst_target_treino = [
    n_atoms * (value - mean_target_aux) / std_target_aux
    for value, n_atoms in zip(per_atom_treino, lst_mol_sizes_treino)
]
print("Statistics for TRAIN data:    ", "Mean per Atom:", mean_target_aux, "        Std per Atom:", std_target_aux)


## Validation Data
per_atom_valid = [
    (target - ref_sum) / n_atoms
    for target, ref_sum, n_atoms in zip(lst_target_valid, lst_batch_ref_atms_valid, lst_mol_sizes_valid)
]
# The validation statistics are computed for reporting only.
mean_target_aux_valid = np.mean(per_atom_valid)
std_target_aux_valid = np.std(per_atom_valid)
# The validation targets are standardized with the TRAINING mean/std: TrainModel only
# receives std_target_aux and the inverse transformation at inference time uses the
# training statistics, so both splits must live on the same scale.
lst_target_valid = [
    n_atoms * (value - mean_target_aux) / std_target_aux
    for value, n_atoms in zip(per_atom_valid, lst_mol_sizes_valid)
]

print("Statistics for Validation data:    ", "Mean per Atom:", mean_target_aux_valid, "        Std per Atom:", std_target_aux_valid)

Statistics for TRAIN data:     Mean per Atom: -4.244001794357475         Std per Atom: 0.18930933218981855
Statistics for Validation data:     Mean per Atom: -4.2446318784791615         Std per Atom: 0.1886722538866451


# Model Training

In [None]:
# Instantiate the network (AtomNeuralNet is imported from the NeuralModel module) with its default arguments.
NeuralNetModel = AtomNeuralNet()

In [None]:
# Train on the training lists; the validation lists are passed alongside and the
# per-epoch log below reports the MAE on both splits.
# NOTE(review): std_target_aux is presumably used by TrainModel to report the MAE in the
# target's original units - confirm in NeuralModel.
TrainModel(NeuralNetModel, lst_features_treino,
          lst_target_treino, lst_distancias_treino,
          lst_mol_sizes_treino, std_target_aux,
          lst_features_valid, lst_target_valid,
          lst_distancias_valid, lst_mol_sizes_valid,
          n_epochs = 600, n_batch = 64)

Epoch: 1 MAE Train: 0.6858850358081615       MAE Validation: 0.7284603960203826       Time(s): 89.12833952903748 
Epoch: 2 MAE Train: 0.45743298338104604       MAE Validation: 0.4112506195961745       Time(s): 86.41237926483154   Weights Updated :-)
Epoch: 3 MAE Train: 0.5319186738220291       MAE Validation: 0.5079303921740761       Time(s): 86.06351971626282 
Epoch: 4 MAE Train: 0.2839173914819447       MAE Validation: 0.36509796196531247       Time(s): 86.33245897293091   Weights Updated :-)
Epoch: 5 MAE Train: 0.2012293204347878       MAE Validation: 0.22442379500399448       Time(s): 86.39337873458862   Weights Updated :-)
Epoch: 6 MAE Train: 0.19752579424588576       MAE Validation: 0.1607577544906828       Time(s): 86.43060731887817   Weights Updated :-)
Epoch: 7 MAE Train: 0.21646791999685522       MAE Validation: 0.16591543325490463       Time(s): 86.17360281944275 
Epoch: 8 MAE Train: 0.22297193145536187       MAE Validation: 0.2328965601368758       Time(s): 86.0674161911010

# Inference
* Predict target value in test dataset

In [None]:
# Per-molecule containers for the TEST split: raw targets, one-hot features,
# distance matrices and atom counts.
lst_target_test = []
lst_features_test = []
lst_distancias_test = []
lst_mol_sizes_test = []

# Sum of the atomic reference values of each test molecule.
lst_mol_reference_test = []

In [None]:
# Start from empty lists so that re-running this cell does not append duplicate molecules.
lst_target_test = []
lst_features_test = []
lst_distancias_test = []
lst_mol_sizes_test = []
lst_mol_reference_test = []

# The reference values belong to the split, not to a molecule: look them up once.
atomrefs = qm9data.test_dataset.atomrefs
reference_values = atomrefs[aux_prop]

for idx in tqdm(range(len(qm9data.test_dataset))):
  # Index the dataset once per molecule and reuse the sample; the previous version
  # indexed it five times per molecule, repeating the (costly) sample loading each time.
  sample = qm9data.test_dataset[idx]

  val_target = sample[aux_prop].tolist()[0]
  val_atm_numbers = sample['_atomic_numbers'].tolist()
  val_mol_size = len(val_atm_numbers)
  # Features and distances are wrapped in an extra leading axis of size 1 (one entry per molecule).
  val_dist = np.array([Distance_Matrix(np.array(sample['_positions'].tolist()), 29)])

  # Sum of the atomic reference values of all atoms in the molecule.
  aux_ref_target_val = 0
  for atm_aux in val_atm_numbers:
    aux_ref_target_val = aux_ref_target_val + reference_values[atm_aux].item()

  lst_features_test.append(np.array([FeatureMatrix(val_atm_numbers, 29)]))
  lst_distancias_test.append(val_dist)
  lst_mol_sizes_test.append(val_mol_size)
  lst_target_test.append(val_target)
  lst_mol_reference_test.append(aux_ref_target_val)

100%|██████████| 13885/13885 [03:39<00:00, 63.23it/s]


In [None]:
# Predict the transformed target of every test molecule.
pdct_test = Inference(NeuralNetModel, lst_features_test, lst_distancias_test, lst_mol_sizes_test)

# Undo the target transformation with the training statistics:
# t_m = ref_m + A_m * mean + t'_m * std
reference_term = np.array(lst_mol_reference_test)
mean_term = np.array(lst_mol_sizes_test)*mean_target_aux
scaled_prediction = np.array(pdct_test)*std_target_aux
pdct_test_inverse_transformation = reference_term + mean_term + scaled_prediction

# Mean absolute error between the raw test targets and the back-transformed predictions.
absolute_errors = abs(np.array(lst_target_test) - pdct_test_inverse_transformation)
print("Mean Absolute Error:", np.mean(absolute_errors))

Mean Absolute Error: 0.016819780126037118
