## Analysing training data

In [22]:
import numpy as np
import plotly.express as px
from instance_mongodb import instance_mongodb_sei

from ase import units as ase_units

# Lookup tables that translate the single-letter codes found in the dataset
# labels into element symbols. In this notebook the "X" table is used for the
# leaving species and the "Y" table for the attacking species.
MATCHER = {
    "X": {"A": "F", "B": "Cl", "C": "Br"},
    "Y": {"A": "H", "B": "F", "C": "Cl", "D": "Br"},
}




In [2]:
# Open the 'mlts' database and pick the collection analysed in this notebook.
db = instance_mongodb_sei('mlts')
collection = db['minimal_basis_interpolated_sn2']

# Per-document results; the three lists are filled in lockstep.
attacking_species, leaving_species, barriers = [], [], []

for doc in collection.find({}):
    # The label consists of six underscore-separated fields; only the last
    # two (the X and Y codes) are translated to element symbols here.
    label = doc['tags']['label']
    R1, R2, R3, R4, X, Y = label.split("_")

    leaving_species.append(MATCHER["X"][X])
    attacking_species.append(MATCHER["Y"][Y])

    # Barrier = second entry of 'final_energy' minus the first.
    # NOTE(review): presumably index 0 is the initial state and index 1 the
    # transition state -- confirm against the database schema.
    energies = doc['final_energy']
    barriers.append(energies[1] - energies[0])

# Scale by ASE's Hartree constant (ASE units are eV-based).
# NOTE(review): assumes 'final_energy' is stored in Hartree -- confirm.
barriers = ase_units.Hartree * np.array(barriers)


In [None]:
# Inspect the coefficient matrices of a single document.
# NOTE(review): `doc` is whatever the loading loop left behind (the last
# document iterated); it is undefined if that cell has not run or the
# collection was empty.
coeff_matrix_test = np.array(doc["coeff_matrices"])

# Select the [2, 0] entry of the leading two axes.
# NOTE(review): presumably this picks the alpha-spin coefficients of one
# state -- confirm the axis layout of 'coeff_matrices'.
alpha_coeff_matrix_test = coeff_matrix_test[2, 0]

# Row-wise sum of squared coefficients.
sum_square_alpha = np.square(alpha_coeff_matrix_test).sum(axis=1)
print(sum_square_alpha)

# Show the selected coefficient matrix as a heat map.
fig = px.imshow(alpha_coeff_matrix_test)
fig.show()

In [15]:
# Histogram of how often each attacking species occurs in the dataset.
fig = px.histogram(
    x=attacking_species,
    nbins=20,
    template="simple_white",
)
# Label both axes, fix the figure size (shorter aspect ratio) and render.
(
    fig.update_xaxes(title_text="Attacking species")
    .update_yaxes(title_text="Count in dataset")
    .update_layout(height=500, width=800)
    .show()
)

In [16]:
# Histogram of how often each leaving species occurs in the dataset.
fig = px.histogram(
    x=leaving_species,
    nbins=20,
    template="simple_white",
)
# Label both axes, fix the figure size (shorter aspect ratio) and render.
(
    fig.update_xaxes(title_text="Leaving species")
    .update_yaxes(title_text="Count in dataset")
    .update_layout(height=500, width=800)
    .show()
)

In [20]:
# Distribution of the computed barrier heights across the dataset.
fig = px.histogram(
    x=barriers,
    nbins=20,
    template="simple_white",
)
# Label both axes, fix the figure size (shorter aspect ratio) and render.
(
    fig.update_xaxes(title_text="Barrier height [eV]")
    .update_yaxes(title_text="Count in dataset")
    .update_layout(height=500, width=800)
    .show()
)