In [None]:
!pip install schnetpack

In [None]:
!pip uninstall --y numpy
!pip install numpy==1.23

# Import Libraries

In [None]:
import os
import time
import schnetpack as spk
from schnetpack.datasets import QM9
import schnetpack.transform as trn

import numpy as np
from tqdm import tqdm

from NeuralModel import AtomNeuralNet
from NeuralModel import TrainModel
from NeuralModel import Inference

Instructions for updating:
non-resource variables are not supported in the long term


# Auxiliary Functions

In [None]:
def Euclidian_Distance(vec_a, vec_b):
  """Return the Euclidean (L2) distance between two coordinate vectors.

  Both arguments must support element-wise subtraction and squaring,
  e.g. 1-D numpy arrays of equal length.
  """
  squared_diff = (vec_a - vec_b) ** 2
  # Add up the squared components, then take the square root.
  return sum(squared_diff) ** 0.5


def FeatureMatrix(simbols, Max_dim):
  """Build a zero-padded one-hot element matrix for one molecule.

  Args:
    simbols: sequence of atomic numbers, one entry per atom.
    Max_dim: number of rows of the result; rows past len(simbols) stay zero.

  Returns:
    Float array of shape (Max_dim, 5). Row i holds a single 1 in the column
    assigned to the element of atom i.

  Raises:
    KeyError: if an atomic number is not one of 1, 6, 7, 8, 9.
    IndexError: if there are more atoms than Max_dim.
  """
  # Column index assigned to each atomic number (C, F, H, N, O).
  column_of = {6: 0, 9: 1, 1: 2, 7: 3, 8: 4} #QM9
  one_hot = np.zeros(shape=(Max_dim, len(column_of)), dtype=float, order='C')

  for row, atomic_number in enumerate(simbols):
    one_hot[row, column_of[atomic_number]] = 1

  return one_hot

def Distance_Matrix(coordinates, max_dim):
  """Return the zero-padded matrix of pairwise Euclidean distances.

  The distances are computed with a single broadcast operation instead of a
  Python double loop, which matters because this function is called once per
  molecule of the dataset.

  Args:
    coordinates: array-like of shape (n_atoms, n_dims) with one row per atom.
    max_dim: side length of the returned square matrix.

  Returns:
    Float array of shape (max_dim, max_dim). Entry [i, j] is the distance
    between atoms i and j for i, j < n_atoms; all other entries are zero.

  Raises:
    IndexError: if there are more atoms than max_dim (same exception type the
      loop-based version raised when indexing out of bounds).
  """
  coords = np.asarray(coordinates, dtype=float)
  n_atoms = coords.shape[0]
  if n_atoms > max_dim:
    raise IndexError(
        "coordinates holds %d atoms but max_dim is only %d" % (n_atoms, max_dim))

  matrix_dist = np.zeros(shape=(max_dim, max_dim), dtype=float, order='C')
  if n_atoms == 0:
    # Nothing to fill in; also avoids indexing an empty 1-D array below.
    return matrix_dist

  # diff[i, j, :] = coords[i] - coords[j] for every pair of atoms.
  diff = coords[:, np.newaxis, :] - coords[np.newaxis, :, :]
  matrix_dist[:n_atoms, :n_atoms] = np.sqrt((diff ** 2).sum(axis=-1))

  return matrix_dist

###  Choose the Property



In [None]:
# Name of the target property to learn.
# Only 'mu' (dipole moment) is wired up by the data-loading cell; the other
# options are kept here for reference and are disabled there as well.
#molecule_property = 'U0'
#molecule_property = 'U'
#molecule_property = 'H'
#molecule_property = 'G'
molecule_property = 'mu'

# Load Data
* Here we use the data available in schnetpack.

In [None]:
# Resolve the selected property name to the schnetpack QM9 property key.
# Only the dipole moment is supported: the unit below is hard-coded to Debye
# and the energy targets (U0, U, H, G) are intentionally disabled.
if molecule_property == 'mu':
  aux_prop = QM9.mu
else:
  # Fail here with a clear message instead of a NameError on aux_prop later.
  raise ValueError(
      "Unsupported molecule_property %r: only 'mu' is supported" % (molecule_property,))

# Working directory that holds the train/validation/test split file.
qm9tut = './qm9tut'
os.makedirs(qm9tut, exist_ok=True)

qm9data = QM9(
    './qm9.db',
    batch_size=32,
    num_train=110000,
    num_val=10000,
    transforms=[
        trn.ASENeighborList(cutoff=5.),
        # Offsets are not removed here; the target is normalized manually
        # in the "Target Variable transformation" section.
        #trn.RemoveOffsets(aux_prop, remove_mean=True, remove_atomrefs=True),
        trn.CastTo32()
    ],
    property_units={aux_prop: 'Debye'},
    num_workers=1,
    split_file=os.path.join(qm9tut, "split.npz"),
    pin_memory=True, # set to False when not using a GPU
    load_properties=[aux_prop], # only load the selected target property
)
qm9data.prepare_data()
qm9data.setup()

100%|██████████| 133885/133885 [02:10<00:00, 1027.80it/s]


# Exploring the dataset
* Checking how the data is available in the dataset

In [None]:
# Index of the training-set molecule inspected in the following cells
index_molecule = 20

In [None]:
# Atom positions: one [x, y, z] coordinate triple per atom of the selected molecule
qm9data.train_dataset[index_molecule]['_positions'].tolist()

[[0.22812213003635406, 1.5452567338943481, -0.12812365591526031],
 [0.0543658472597599, 0.060254618525505066, -0.021933341398835182],
 [-1.0649378299713135, -0.6777413487434387, 0.06716495007276535],
 [-2.5113894939422607, -0.2891347110271454, 0.12123972922563553],
 [-2.7000572681427, 1.1019022464752197, 0.3184383809566498],
 [-0.7582699656486511, -2.145333766937256, 0.05133020132780075],
 [-0.11224034428596497, -2.661831855773926, -1.2204532623291016],
 [0.7495883107185364, -2.250807762145996, -0.05494976416230202],
 [1.2920067310333252, -0.8277568221092224, -0.026578545570373535],
 [0.8608453869819641, 1.917217493057251, 0.6893472671508789],
 [0.7498617768287659, 1.8058675527572632, -1.058944821357727],
 [-0.7232884764671326, 2.0712363719940186, -0.10123038291931152],
 [-3.0004045963287354, -0.6263546943664551, -0.8107518553733826],
 [-2.9884562492370605, -0.86479252576828, 0.9350160956382751],
 [-3.6438446044921875, 1.279582142829895, 0.2791091203689575],
 [-1.3508384227752686, -2.8

In [None]:
# Atomic numbers: one entry per atom of the selected molecule
qm9data.train_dataset[index_molecule]['_atomic_numbers'].tolist()

[6, 6, 6, 6, 8, 6, 6, 6, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [None]:
# Target value of the selected property for the inspected molecule
# (first element of the stored per-molecule entry)
target_value = qm9data.train_dataset[index_molecule][aux_prop].tolist()[0]
print("Target", aux_prop, target_value)

Target dipole_moment 1.4221999645233154


# Prepare Data for Model Training
* Create the features matrices - one hot encoding based on chemical element
* Create the atoms distance matrix

In [None]:
# Per-molecule containers for the training split:
# one-hot element features, distance matrices, atom counts and target values.
lst_features_treino = []
lst_distancias_treino = []
lst_mol_sizes_treino = []
lst_target_treino = []


# Same containers for the validation split.
lst_features_valid = []
lst_distancias_valid = []
lst_mol_sizes_valid = []
lst_target_valid = []

In [None]:
def _collect_split(dataset, prop, max_atoms=29):
  """Convert one dataset split into the per-molecule lists used for training.

  Each sample is fetched from the dataset exactly once per molecule, which
  avoids repeating the (presumably expensive) dataset lookup four times.

  Args:
    dataset: indexable split (e.g. qm9data.train_dataset) whose items provide
      the '_positions', '_atomic_numbers' and `prop` entries.
    prop: key of the target property inside each sample.
    max_atoms: padding size for the feature and distance matrices
      (29 in this notebook -- presumably the largest molecule in QM9).

  Returns:
    Tuple (features, targets, distances, mol_sizes) of equally long lists.
  """
  features, targets, distances, mol_sizes = [], [], [], []
  for idx in tqdm(range(len(dataset))):
    sample = dataset[idx]
    atomic_numbers = sample['_atomic_numbers'].tolist()
    positions = np.array(sample['_positions'].tolist())

    features.append(FeatureMatrix(atomic_numbers, max_atoms))
    targets.append(sample[prop].tolist()[0])
    distances.append(Distance_Matrix(positions, max_atoms))
    mol_sizes.append(len(atomic_numbers))
  return features, targets, distances, mol_sizes


print("Loading Training data ...")
inicio = time.time()

# The lists are rebuilt from scratch, so re-running this cell does not append
# duplicate molecules to the previous contents.
######################## Train Data #############################
(lst_features_treino, lst_target_treino,
 lst_distancias_treino, lst_mol_sizes_treino) = _collect_split(qm9data.train_dataset, aux_prop)

######################## Validation Data #############################
(lst_features_valid, lst_target_valid,
 lst_distancias_valid, lst_mol_sizes_valid) = _collect_split(qm9data.val_dataset, aux_prop)

fim = time.time()
print("Time(s) for data loading:", fim-inicio)

Loading Training data ...


100%|██████████| 110000/110000 [21:59<00:00, 83.34it/s]
100%|██████████| 10000/10000 [02:01<00:00, 82.49it/s]

Time(s) for data loading: 1441.0882353782654





# Target Variable transformation
* schnetpack provides the data standardization process, but for the sake of clarity let's do it manually.
* $t_{m}$ = molecule target value without transformation
* $A_{m}$ = number of atoms in the molecule m
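* $\widetilde{t}_{m} = \frac{t_{m} - ref_{m}}{A_{m}}$, where $ref_{m}$ is the reference (offset) value of molecule m; no offset is removed in this notebook, i.e. $ref_{m} = 0$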
* $\bar{\widetilde{t}} = \frac{1}{n} \cdot \sum_{m=1}^{n} \widetilde{t}_{m}$
* $\sigma_{\widetilde{t}}^{2} = \frac{1}{(n-1)} \sum_{m=1}^{n} (\widetilde{t}_{m} - \bar{\widetilde{t}})^{2}$
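* $t_{m} = ref_{m} + A_{m} \cdot \bar{\widetilde{t}} + t'_{m} \cdot \sigma_{\widetilde{t}}$ (inverse transformation), where $t'_{m} = \frac{A_{m} \cdot (\widetilde{t}_{m} - \bar{\widetilde{t}})}{\sigma_{\widetilde{t}}}$ is the transformed target the model is trained on; the code below uses $\sigma_{\widetilde{t}} = 1$, i.e. the targets are only centered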

In [None]:
# NOTE: run this cell exactly once after loading the data -- it overwrites
# lst_target_treino / lst_target_valid with their transformed values.

### Train Data
sizes_treino = np.asarray(lst_mol_sizes_treino, dtype=float)
per_atom_treino = np.asarray(lst_target_treino, dtype=float) / sizes_treino
mean_target_aux = np.mean(per_atom_treino)
# Sample standard deviation (1/(n-1)), matching the formula in the text above.
std_target_aux = np.std(per_atom_treino, ddof=1)
# The targets are only centered: a standard deviation of 1 is used for scaling.
lst_target_treino = list(sizes_treino * (per_atom_treino - mean_target_aux))
print("Statistics for TRAIN data:    ", "Mean per Atom:", mean_target_aux, "        Std per Atom:", std_target_aux)


## Validation Data
sizes_valid = np.asarray(lst_mol_sizes_valid, dtype=float)
per_atom_valid = np.asarray(lst_target_valid, dtype=float) / sizes_valid
# The validation statistics are reported for information only.
mean_target_aux_valid = np.mean(per_atom_valid)
std_target_aux_valid = np.std(per_atom_valid, ddof=1)
# Center with the TRAINING mean: the inverse transformation applied at
# inference time uses mean_target_aux, so every split has to be transformed
# with the same statistics for the validation error to be comparable.
lst_target_valid = list(sizes_valid * (per_atom_valid - mean_target_aux))

print("Statistics for Validation data:    ", "Mean per Atom:", mean_target_aux_valid, "        Std per Atom:", std_target_aux_valid)

Statistics for TRAIN data:     Mean per Atom: 0.16013792486254508         Std per Atom: 0.10371352371774517
Statistics for Validation data:     Mean per Atom: 0.1608503739785964         Std per Atom: 0.1050803208935484


# Model Training

In [None]:
# Instantiate the network from the local NeuralModel module with its default arguments
NeuralNetModel = AtomNeuralNet()

In [None]:
# Train the network on the transformed training targets; the log printed by
# this call reports the train and validation MAE for every epoch.
# NOTE(review): the bare literal 1 passed after lst_mol_sizes_treino is
# presumably the target standard deviation (scaling disabled) -- confirm
# against NeuralModel.TrainModel.
TrainModel(NeuralNetModel, lst_features_treino,
          lst_target_treino, lst_distancias_treino,
          lst_mol_sizes_treino, 1,
          lst_features_valid, lst_target_valid,
          lst_distancias_valid, lst_mol_sizes_valid,
          n_epochs = 600, n_batch = 64)

Epoch: 1 MAE Train: 0.8045922275027622       MAE Validation: 0.9062172961140247       Time(s): 89.0462155342102 
Epoch: 2 MAE Train: 0.6245461839442263       MAE Validation: 0.9008823170340572       Time(s): 85.28304600715637 
Epoch: 3 MAE Train: 0.6775304720854716       MAE Validation: 0.6853917660615704       Time(s): 85.17449235916138   Weights Updated :-)
Epoch: 4 MAE Train: 0.5666617170642858       MAE Validation: 0.5135594521589402       Time(s): 84.76932859420776   Weights Updated :-)
Epoch: 5 MAE Train: 0.4292939641418927       MAE Validation: 0.5432024912419307       Time(s): 85.14664125442505 
Epoch: 6 MAE Train: 0.40227504586216856       MAE Validation: 0.4656778512536721       Time(s): 85.34650301933289 
Epoch: 7 MAE Train: 0.33936258089239046       MAE Validation: 0.41151858860948404       Time(s): 85.89950561523438   Weights Updated :-)
Epoch: 8 MAE Train: 0.3000416546565401       MAE Validation: 0.21656013842649846       Time(s): 86.4231128692627   Weights Updated :-)
Ep

# Inference
* Predict target value in test dataset

In [None]:
# Per-molecule containers for the test split:
# target values, one-hot element features, distance matrices and atom counts.
lst_target_test = []
lst_features_test = []
lst_distancias_test = []
lst_mol_sizes_test = []

In [None]:
# Start from empty lists so that re-running this cell does not append
# duplicate molecules to the previous contents.
lst_target_test = []
lst_features_test = []
lst_distancias_test = []
lst_mol_sizes_test = []

for idx in tqdm(range(len(qm9data.test_dataset))):
  # Fetch the sample once instead of indexing the dataset four times.
  sample = qm9data.test_dataset[idx]
  val_atm_numbers = sample['_atomic_numbers'].tolist()
  val_positions = np.array(sample['_positions'].tolist())

  # Unlike the train/validation lists, each test matrix is wrapped in a
  # leading axis of length 1 (presumably so Inference receives a batch of
  # one molecule -- confirm against NeuralModel.Inference).
  lst_features_test.append(np.array([FeatureMatrix(val_atm_numbers, 29)]))
  lst_distancias_test.append(np.array([Distance_Matrix(val_positions, 29)]))
  lst_mol_sizes_test.append(len(val_atm_numbers))
  lst_target_test.append(sample[aux_prop].tolist()[0])

100%|██████████| 13885/13885 [02:47<00:00, 82.92it/s]


In [None]:
pdct_test = Inference(NeuralNetModel, lst_features_test, lst_distancias_test, lst_mol_sizes_test)

# Flatten the predictions to one value per molecule. If Inference returned
# column-shaped output (N, 1), combining it with the (N,) size vector would
# silently broadcast to an (N, N) matrix and corrupt the error below.
sizes_test = np.array(lst_mol_sizes_test)
pdct_flat = np.array(pdct_test).reshape(-1)
assert pdct_flat.shape == sizes_test.shape, "Inference must return exactly one prediction per molecule"

# Inverse transformation: t = A * (training mean per atom) + t' * sigma, with sigma = 1.
pdct_test_inverse_transformation = sizes_test*mean_target_aux + pdct_flat*1
print("Mean Absolute Error:", np.mean(np.abs(np.array(lst_target_test) - pdct_test_inverse_transformation)))

Mean Absolute Error: 0.0445490558535163
