In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model, layers
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import r2_score

In [2]:
import os
# Create the output directory for figures. os.makedirs is portable and raises on
# real failures, whereas shelling out to `mkdir -p` only returns an exit code.
os.makedirs('images', exist_ok=True)

0

## Importing the dataset

In [3]:
# Dataset selection: `name` is the .npz file stem, `mol` is the label used in
# output file names, and `path_to_dataset` is the directory holding the archive.
name, mol = "benzene_old_dft", "Benzene"
path_to_dataset = "../../datasets"

In [4]:
# Open the NumPy archive <path_to_dataset>/<name>.npz.
dataset_file = f"{path_to_dataset}/{name}.npz"
data = np.load(dataset_file)

In [5]:
# 'R' entry of the archive (presumably per-configuration atomic coordinates — verify).
configs = data["R"]
# Unweighted mean over axis 1, kept as a length-1 axis so it broadcasts against `configs`.
COM = np.mean(configs, axis=1, keepdims=True)

In [8]:
# Print every array stored in the archive, preceded by its key.
# (A bare `configs.shape, data.files, data['z']` expression used to sit at the top
# of this cell; since it was not the cell's last statement its value was never
# displayed, so it has been dropped.)
for key in data.files:
    print(key)
    print(data[key])


E
[[-146536.1068]
 [-146536.1192]
 [-146536.099 ]
 ...
 [-146528.199 ]
 [-146528.1944]
 [-146528.1118]]
name
b'qmC6H6'
F
[[[ 4.73847840e-11 -3.82596480e+00 -6.71719680e-13]
  [-3.24928800e+00 -1.94189760e+00 -3.43149120e-13]
  [-3.24928800e+00  1.94189760e+00  6.67416960e-13]
  ...
  [-4.16293920e-12 -6.04474560e+00  5.07123360e-13]
  [-5.21874720e+00 -3.01262400e+00  7.20295200e-14]
  [-5.21874720e+00  3.01262400e+00  1.05965424e-12]]

 [[ 1.11068640e+00 -3.48842880e+00  5.64674400e-01]
  [-2.45894400e+00 -1.13021712e+00  1.18552608e-01]
  [-2.27453760e+00  1.22346144e-01 -3.22531200e-01]
  ...
  [ 1.57695840e-02 -7.33620960e+00  1.13925168e-01]
  [-5.80842720e+00 -3.25497600e+00  1.29719232e-01]
  [-3.06809280e+00  1.98758880e+00  1.15045920e-01]]

 [[ 2.24255520e+00 -3.04914240e+00  1.14397344e+00]
  [-1.59134400e+00 -2.70688320e-01  2.42511840e-01]
  [-1.24654032e+00 -1.75278240e+00 -6.68413440e-01]
  ...
  [ 3.15537120e-02 -8.54546400e+00  2.21202720e-01]
  [-6.32345760e+00 -3.453

In [7]:
# Show the first entry of `configs` (a 12 x 3 array in the recorded output).
print(configs[0])

[[ 0.      1.397   0.    ]
 [ 1.2098  0.6985  0.    ]
 [ 1.2098 -0.6985  0.    ]
 [ 0.     -1.397   0.    ]
 [-1.2098 -0.6985  0.    ]
 [-1.2098  0.6985  0.    ]
 [ 0.      2.481   0.    ]
 [ 2.1486  1.2405  0.    ]
 [ 2.1486 -1.2405  0.    ]
 [ 0.     -2.481   0.    ]
 [-2.1486 -1.2405  0.    ]
 [-2.1486  1.2405  0.    ]]


In [None]:
# 3D scatter of the first configuration, titled with the matching 'E' entry,
# saved under images/.
fig = plt.figure(figsize=(15, 15))

for num in range(1):
    ax = fig.add_subplot(1, 1, num + 1, projection="3d")
    # Split the (n_points, 3) array into its three coordinate columns.
    xs, ys, zs = configs[num].T
    ax.scatter(xs, ys, zs, s=500, c="red")
    energy = data["E"][num][0]
    ax.set_title(f"Energy : {energy}", fontsize=50)
plt.savefig(f"images/{mol}_configurations_paper.png")

In [None]:
# Keep at most the first `total_size` samples; inputs and labels share the same slice.
total_size = 300_000
leading = slice(total_size)
data_points = configs[leading]
data_labels = data["E"][leading]

## Defining the model architecture

In [None]:
# Adam optimizer used by the manual training loop.
optimizer = tf.optimizers.Adam(learning_rate=0.001)

# Small fully connected regressor: a (12, 3) input is flattened, passed through
# two ReLU hidden layers (20 and 10 units) and reduced to a single linear output.
layer_stack = [
    layers.Flatten(input_shape=(12, 3)),
    layers.Dense(20, activation=tf.nn.relu),
    layers.Dense(10, activation=tf.nn.relu),
    layers.Dense(1),
]
model = keras.Sequential(layer_stack)
model.build()

In [None]:
# Sanity check: a forward pass on a few samples must yield the same shape as
# the corresponding labels.
sample_count = 10
predicted_shape = model(data_points[:sample_count]).shape
expected_shape = data_labels[:sample_count].shape
assert predicted_shape == expected_shape, "Check model architecture"

In [None]:
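# Disabled: the training cell below does not use compile()/fit(); it runs a manual
# tf.GradientTape loop with a mean-squared-error loss. Note that the loss named
# here (mean absolute error) differs from the one actually used for training.
# model.compile(
#     optimizer=tf.optimizers.Adam(learning_rate=0.001),
#     loss='mean_absolute_error')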

In [None]:
# Hold out 20% of the samples for testing. The seed is fixed so the split — and
# therefore the test arrays written to disk later in the notebook — is the same
# on every run.
train_data, test_data, train_labels, test_labels = train_test_split(
    data_points, data_labels, test_size=0.2, random_state=42
)

In [None]:
def mean_squared_error(true, pred):
    """Return the mean of the squared element-wise difference ``true - pred``.

    The mean is taken over all elements, so the result is a scalar tensor.
    """
    residual = true - pred
    squared = tf.math.square(residual)
    return tf.reduce_mean(squared)

In [None]:
# Create the log directory without spawning a shell (portable, and real failures
# raise instead of being hidden in an ignored exit code).
os.makedirs("logs", exist_ok=True)
# The handle is kept open on purpose: the training cell writes to it and the
# final cell of the notebook closes it.
f = open("logs/training.log", "w")

In [None]:
# Number of full passes over the training set, and samples per gradient step.
epochs, small_batch_size = 5000, 1000

In [None]:
# Manual training loop. Progress is logged as tab-separated rows
# (step, training MSE, training R2) to both the log file and stdout.
header = "Step\tError\tR2\n"
f.write(header)
f.flush()
print(header)

training_data_size = train_data.shape[0]
# The set of trainable variables does not change during training; look it up once.
trainable_variables = model.trainable_variables

for step in range(epochs):
    # One pass over the training set in consecutive mini-batches (order is not shuffled).
    for start in range(0, training_data_size, small_batch_size):
        batch_inputs = train_data[start:start + small_batch_size]
        batch_targets = train_labels[start:start + small_batch_size]
        with tf.GradientTape() as tape:
            batch_pred = model(batch_inputs)
            batch_error = mean_squared_error(batch_targets, batch_pred)
        gradients = tape.gradient(batch_error, trainable_variables)
        optimizer.apply_gradients(zip(gradients, trainable_variables))

    # Every 10th epoch, evaluate on the full training set and log the result.
    if step % 10 == 0:
        pred = model(train_data)
        # Convert the scalar tensor to a Python float so the log holds a plain
        # number rather than the tensor's "tf.Tensor(..., shape=(), ...)" repr.
        error = float(mean_squared_error(train_labels, pred))
        r2 = r2_score(train_labels, pred.numpy())
        row = "{}\t{}\t{}\n".format(step, error, r2)
        f.write(row)
        f.flush()
        print(row)

### Saving preprocessed test data

In [None]:
# Write the held-out inputs and labels to .npy files in the working directory.
for filename, array in (
    ('test_data.npy', test_data),
    ('test_labels.npy', test_labels),
):
    np.save(filename, array)

## Saving the model for testing later

In [None]:
# Persist the trained model under ./EP for later evaluation.
# NOTE(review): a path without a file suffix relies on the TF2 SavedModel default;
# newer Keras releases expect a `.keras`/`.h5` suffix — confirm against the installed version.
model.save('./EP')

In [None]:
# Close the training log opened earlier in the notebook (logs/training.log).
f.close()