In [None]:
!pip install schnetpack

In [None]:
!pip uninstall --y numpy
!pip install numpy==1.23

# Import Libraries

In [None]:
import os
import time
import schnetpack as spk
from schnetpack.datasets import QM9
import schnetpack.transform as trn

import numpy as np
from tqdm import tqdm

from NeuralModel import AtomNeuralNet
from NeuralModel import TrainModel
from NeuralModel import Inference

Instructions for updating:
non-resource variables are not supported in the long term


# Auxiliary Functions

In [None]:
def Euclidian_Distance(vec_a, vec_b):
  """Return the Euclidean (L2) distance between two coordinate vectors.

  Args:
    vec_a: 1-D array-like with the coordinates of the first point.
    vec_b: 1-D array-like with the coordinates of the second point; must be
      broadcast-compatible with ``vec_a``.

  Returns:
    The distance as a NumPy floating-point scalar for 1-D inputs. For inputs
    with more dimensions the squared differences are reduced along axis 0
    only, so an array is returned.
  """
  # np.asarray lets plain lists/tuples be passed in addition to ndarrays.
  diff = np.asarray(vec_a) - np.asarray(vec_b)
  # Reduce inside NumPy instead of with the built-in sum(), which walks the
  # array element by element in Python. axis=0 is the axis the built-in sum()
  # would have reduced over.
  return np.sqrt(np.sum(diff * diff, axis=0))


def FeatureMatrix(simbols, Max_dim):
  """Build a zero-padded one-hot matrix describing the element of each atom.

  Row ``i`` belongs to atom ``i``; the column is selected by atomic number
  (6 -> 0, 9 -> 1, 1 -> 2, 7 -> 3, 8 -> 4). Rows from ``len(simbols)``
  onward stay all-zero.

  Args:
    simbols: sequence of atomic numbers, one per atom.
    Max_dim: number of rows of the returned matrix.

  Returns:
    A float array of shape ``(Max_dim, 5)``.

  Raises:
    KeyError: if an atomic number is not one of the five mapped elements.
    IndexError: if ``simbols`` has more entries than ``Max_dim``.
  """
  # Atomic number -> one-hot column, covering the elements found in QM9.
  column_of = {6: 0, 9: 1, 1: 2, 7: 3, 8: 4}
  one_hot = np.zeros((Max_dim, len(column_of)))

  for row in range(len(simbols)):
    one_hot[row, column_of[simbols[row]]] = 1

  return one_hot

def Distance_Matrix(coordinates, max_dim):
  """Build a zero-padded matrix of pairwise Euclidean distances between atoms.

  Args:
    coordinates: array-like of shape ``(n_atoms, n_dims)`` with one row of
      coordinates per atom. Values are converted to float64 before the
      distances are computed.
    max_dim: side length of the returned square matrix; must be at least
      ``n_atoms``.

  Returns:
    A float array of shape ``(max_dim, max_dim)``. Entry ``[i, j]`` is the
    distance between atoms ``i`` and ``j`` (symmetric, zero diagonal); rows
    and columns from ``n_atoms`` onward are zero padding.

  Raises:
    IndexError: if there are more atoms than ``max_dim``.
  """
  coords = np.asarray(coordinates, dtype=float)
  n_atoms = coords.shape[0]
  if n_atoms > max_dim:
    # Same exception type the element-wise fill would raise, but with a
    # message that names the actual problem.
    raise IndexError(
        f"molecule has {n_atoms} atoms but the padded matrix only holds {max_dim}")

  matrix_dist = np.zeros((max_dim, max_dim), dtype=float)
  if n_atoms == 0:
    # Nothing to fill in; also covers an empty 1-D input such as np.array([]).
    return matrix_dist

  # Broadcasting (n, 1, d) against (1, n, d) yields every pairwise difference
  # at once, replacing the Python-level loop over atom pairs.
  diff = coords[:, np.newaxis, :] - coords[np.newaxis, :, :]
  matrix_dist[:n_atoms, :n_atoms] = np.sqrt(np.sum(diff * diff, axis=-1))

  return matrix_dist

###  Choose the Property



In [None]:
# Name of the QM9 target to learn. The data-loading cell only maps 'homo'
# to a schnetpack property key.
molecule_property = 'homo'

# Load Data
* Here we use the QM9 dataset as provided by SchNetPack.

In [None]:
# Map the chosen property name to the schnetpack QM9 property key.
if molecule_property == 'homo':
  aux_prop = QM9.homo
else:
  # Fail here with a clear message instead of a NameError on `aux_prop` below.
  raise ValueError(
      f"Unsupported molecule_property {molecule_property!r}: only 'homo' is mapped")

# Directory that holds the train/validation/test split file.
qm9tut = './qm9tut'
os.makedirs(qm9tut, exist_ok=True)

qm9data = QM9(
    './qm9.db',
    batch_size=32,
    num_train=110000,
    num_val=10000,
    transforms=[
        trn.ASENeighborList(cutoff=5.),
        #trn.RemoveOffsets(aux_prop, remove_mean=True, remove_atomrefs=True),
        trn.CastTo32()
    ],
    property_units={aux_prop: 'eV'},
    num_workers=1,
    split_file=os.path.join(qm9tut, "split.npz"),
    pin_memory=True, # set to false, when not using a GPU
    load_properties=[aux_prop], # only load the selected target property
)
qm9data.prepare_data()
qm9data.setup()

100%|██████████| 133885/133885 [02:09<00:00, 1033.89it/s]


# Exploring the dataset
* Inspect how a single molecule is stored in the dataset

In [None]:
# Index of the training-set molecule inspected in the exploration cells.
index_molecule = 20

In [None]:
# Atom positions: one [x, y, z] coordinate triple per atom
qm9data.train_dataset[index_molecule]['_positions'].tolist()

[[0.3419451117515564, 1.7519160509109497, 0.07111945003271103],
 [0.18140292167663574, 0.5398504137992859, -0.6405936479568481],
 [1.4276238679885864, 0.24202287197113037, -1.5056687593460083],
 [1.3139313459396362, -0.9606288075447083, -2.2330360412597656],
 [0.899932861328125, -0.8313520550727844, -3.5840370655059814],
 [-0.49261683225631714, -0.4136488437652588, -3.766125202178955],
 [-1.6330076456069946, -0.06889702379703522, -3.9273746013641357],
 [-0.11222393810749054, -0.5874020457267761, 0.2646366357803345],
 [-0.3142189681529999, -1.4425157308578491, 1.0149264335632324],
 [1.0038352012634277, 1.617071270942688, 0.7594314813613892],
 [-0.6796064972877502, 0.6822158694267273, -1.2985310554504395],
 [1.591538667678833, 1.1090978384017944, -2.1584224700927734],
 [2.297991991043091, 0.13994058966636658, -0.8459169268608093],
 [1.5616086721420288, -0.13305342197418213, -4.119875431060791],
 [1.0453487634658813, -1.8224289417266846, -4.025633811950684],
 [-2.64351224899292, 0.2257335

In [None]:
# Atomic numbers: one entry per atom of the molecule
qm9data.train_dataset[index_molecule]['_atomic_numbers'].tolist()

[8, 6, 6, 8, 6, 6, 6, 6, 7, 1, 1, 1, 1, 1, 1, 1]

In [None]:
# Target value of the selected property (first element of the stored entry)
print("Target", aux_prop, qm9data.train_dataset[index_molecule][aux_prop].tolist()[0])

Target homo -7.52122688293457


# Prepare Data for Model Training
* Create the feature matrices: a one-hot encoding of each atom's chemical element
* Create the interatomic distance matrix of each molecule

In [None]:
# Per-molecule accumulators for the training split: one-hot feature matrices,
# distance matrices, atom counts and target values.
lst_features_treino = []
lst_distancias_treino = []
lst_mol_sizes_treino = []
lst_target_treino = []

# The same four accumulators for the validation split.
lst_features_valid = []
lst_distancias_valid = []
lst_mol_sizes_valid = []
lst_target_valid = []

In [None]:
def _build_model_inputs(dataset, target_key, max_atoms=29):
  """Convert every molecule of ``dataset`` into the padded model inputs.

  Args:
    dataset: indexable dataset whose items provide '_positions',
      '_atomic_numbers' and ``target_key`` entries.
    target_key: key of the property used as regression target.
    max_atoms: padding size of the feature and distance matrices. 29 is the
      size used throughout this notebook (presumably the largest QM9
      molecule; confirm if another dataset is used).

  Returns:
    Four lists with one entry per molecule, in this order: feature matrices,
    target values, distance matrices, number of atoms.
  """
  features, targets, distances, mol_sizes = [], [], [], []
  for idx in tqdm(range(len(dataset))):
    # Index the dataset once per molecule and reuse the item; the previous
    # version performed four separate lookups per molecule.
    sample = dataset[idx]
    atomic_numbers = sample['_atomic_numbers'].tolist()
    positions = np.array(sample['_positions'].tolist())

    features.append(FeatureMatrix(atomic_numbers, max_atoms))
    targets.append(sample[target_key].tolist()[0])
    distances.append(Distance_Matrix(positions, max_atoms))
    mol_sizes.append(len(atomic_numbers))

  return features, targets, distances, mol_sizes


print("Loading Training data ...")
inicio = time.time()
# The result lists are rebound (not appended to), so re-running this cell
# does not duplicate molecules.
######################## Train Data #############################
(lst_features_treino, lst_target_treino,
 lst_distancias_treino, lst_mol_sizes_treino) = _build_model_inputs(qm9data.train_dataset, aux_prop)

######################## Validation Data #############################
(lst_features_valid, lst_target_valid,
 lst_distancias_valid, lst_mol_sizes_valid) = _build_model_inputs(qm9data.val_dataset, aux_prop)

fim = time.time()
print("Time(s) for data loading:", fim-inicio)

Loading Training data ...


100%|██████████| 110000/110000 [21:51<00:00, 83.85it/s]
100%|██████████| 10000/10000 [01:59<00:00, 83.45it/s]

Time(s) for data loading: 1431.6578855514526





In [None]:
# Scale value handed to TrainModel alongside the training targets; left at 1
# because no target transformation is applied in this notebook.
# NOTE(review): presumably the target's standard deviation when targets are
# standardized; confirm against NeuralModel.TrainModel.
std_target_aux = 1

# Target Variable transformation
* Not necessary for HOMO and LUMO, so the target values are used unchanged.

# Model Training

In [None]:
# The model output is the mean of the per-atom contributions.

NeuralNetModel = AtomNeuralNet(predict_aggregation = "mean")

In [13]:
# Train the model on the training split; the validation split is passed in
# the second group of arguments. Arguments are positional, so their order
# must match the TrainModel signature in NeuralModel.
TrainModel(NeuralNetModel, lst_features_treino,
          lst_target_treino, lst_distancias_treino,
          lst_mol_sizes_treino, std_target_aux,
          lst_features_valid, lst_target_valid,
          lst_distancias_valid, lst_mol_sizes_valid,
          n_epochs = 600, n_batch = 64)

Epoch: 1 MAE Train: 0.25969548007520965       MAE Validation: 0.27370589874326434       Time(s): 88.96345138549805 
Epoch: 2 MAE Train: 0.3012963384224416       MAE Validation: 0.273262730751006       Time(s): 85.3915946483612 
Epoch: 3 MAE Train: 0.23631087122645383       MAE Validation: 0.2244108885944963       Time(s): 85.42012643814087   Weights Updated :-)
Epoch: 4 MAE Train: 0.18305846733854808       MAE Validation: 0.23435226871599296       Time(s): 85.41820931434631 
Epoch: 5 MAE Train: 0.2181714606724869       MAE Validation: 0.1961727310190559       Time(s): 85.47273993492126   Weights Updated :-)
Epoch: 6 MAE Train: 0.1766576707341367       MAE Validation: 0.21668339305248863       Time(s): 85.56062197685242 
Epoch: 7 MAE Train: 0.2095965280550533       MAE Validation: 0.21065057807922682       Time(s): 85.51133418083191 
Epoch: 8 MAE Train: 0.1787469962715244       MAE Validation: 0.16026766440942383       Time(s): 85.70098328590393   Weights Updated :-)
Epoch: 9 MAE Train:

# Inference
* Predict the target value for every molecule in the test dataset

In [14]:
# Per-molecule accumulators for the test split: target values, one-hot
# feature matrices, distance matrices and atom counts.
lst_target_test = []
lst_features_test = []
lst_distancias_test = []
lst_mol_sizes_test = []

In [15]:
# Start from empty lists so that re-running this cell does not append the
# test molecules a second time.
lst_target_test = []
lst_features_test = []
lst_distancias_test = []
lst_mol_sizes_test = []

for idx in tqdm(range(len(qm9data.test_dataset))):
  # Index the dataset once per molecule and reuse the item; the previous
  # version performed four separate lookups per molecule.
  sample = qm9data.test_dataset[idx]

  val_target = sample[aux_prop].tolist()[0]
  val_atm_numbers = sample['_atomic_numbers'].tolist()
  val_mol_size = len(val_atm_numbers)
  # Each input is wrapped in a leading axis of length 1 (one molecule per
  # entry); 29 is the padding size used throughout this notebook.
  val_dist = np.array([Distance_Matrix(np.array(sample['_positions'].tolist()), 29)])

  lst_features_test.append(np.array([FeatureMatrix(val_atm_numbers, 29)]))
  lst_distancias_test.append(val_dist)
  lst_mol_sizes_test.append(val_mol_size)
  lst_target_test.append(val_target)

100%|██████████| 13885/13885 [02:46<00:00, 83.38it/s]


In [16]:
pdct_test = Inference(NeuralNetModel, lst_features_test, lst_distancias_test, lst_mol_sizes_test)
# No inverse transformation is applied here; the predictions are only
# flattened to 1-D so they line up element-wise with the targets.
pdct_test_inverse_transformation = np.array(pdct_test).reshape(-1)
targets_test = np.array(lst_target_test)
# Guard against silent broadcasting: subtracting (N,) from (N, 1) would build
# an (N, N) matrix and report a meaningless error.
assert pdct_test_inverse_transformation.shape == targets_test.shape, (
    f"prediction/target size mismatch: "
    f"{pdct_test_inverse_transformation.shape} vs {targets_test.shape}")
print("Mean Absolute Error:", np.mean(np.abs(targets_test - pdct_test_inverse_transformation)))

Mean Absolute Error: 0.04429568702839181
