## Imports

In [None]:
# Import NFF package from path

import sys
sys.path.append("./NFF")
import nff.data as nff_data

In [None]:
# general imports
import numpy as np
import matplotlib.pyplot as plt
import torch
from sklearn.model_selection import train_test_split

In [None]:
# import self-written SHARC parser 
# (right now this parser assumes that there are only singlets as is the case in this tutorial)
from Sharc_Parser import SharcTrajectoryParser

In [None]:
# ---------------------------------------------------------------------
# Tunable settings for outlier filtering and the neighbour list
# ---------------------------------------------------------------------
# Cutoff radius handed to the NFF neighbour-list generation.
nbrlist_cutoff = 5.0

# Outlier thresholds used when the parsed frames are filtered:
#   force_cutoff  -> a frame is dropped if any gradient component has a
#                    magnitude above this value (kcal/(mol A)).
#   energy_cutoff -> a frame is dropped if its `energy_0` is above this
#                    value (kcal/mol). It is set low on purpose so that
#                    some frames are removed; the project itself uses 750.
# NOTE(review): an earlier note described the energy cutoff as a distance
# from the mean energy, but the filter compares `energy_0` to it
# directly -- confirm which behaviour is intended.
force_cutoff = 1000
energy_cutoff = 450

## Parse Files

In [None]:
# Parse the SHARC training-set file. `results` is unpacked into
# nff_data.concatenate_dict further down, and `Parser.header_data['natom']`
# is read when the gradients are reshaped for the outlier filter.
Parser = SharcTrajectoryParser("TrainingSet_4000_CH2NH2+.dat")
results = Parser.parse()

In [None]:
# Show the parser's header data (its 'natom' entry is used later on)
Parser.header_data

In [None]:
# Merge the per-frame results into one property dictionary
props = nff_data.concatenate_dict(*results)

# Cast the geometries plus every "energy*" entry (this also covers the
# "energy_*_grad" gradients) and every "NAC*" entry to float tensors
for name in list(props):
    if name == 'nxyz' or name.startswith(("energy", "NAC")):
        props[name] = torch.FloatTensor(props[name])

In [None]:
# Convert the dictionaries to a Torch dataset.
# A shallow copy of the dict is passed, so re-assigning entries of `props`
# later on does not change the dict object that was handed to the Dataset.
dataset = nff_data.Dataset(props.copy())

In [None]:
# Check which keys are in the dataset (the outlier filter below reads
# 'energy_0' and the three 'energy_*_grad' entries)
dataset.props.keys()

## Analysis of the dataset (in this case there should be no outliers)

In [None]:
#######################################
# Build a boolean mask that is True for every frame that should be kept
energies = dataset.props['energy_0'].numpy().reshape(-1)
print("Total number of data points: {}".format(energies.shape[0]))

# energy criterion: `energy_0` must not exceed the cutoff
non_outliers = energies <= energy_cutoff

# force criterion: for each of the three gradient sets, every component of
# every atom has to lie within +/- force_cutoff
n_atoms = Parser.header_data['natom']
for key in ['energy_0_grad', 'energy_1_grad', 'energy_2_grad']:
    forces = dataset.props[key].numpy().reshape(-1, n_atoms, 3)
    within_limit = (np.abs(forces) <= force_cutoff).all(axis=(1, 2))
    non_outliers = non_outliers & within_limit

# keep only the energies of the accepted frames
energies = energies[non_outliers]
print("Number of points w/o outloiers: {}".format(energies.shape[0]))

In [None]:
# Drop every frame that was flagged as an outlier (energy or force) from
# all properties.
# NOTE: `props` is overwritten in place here, so the mask only matches a
# freshly built `props`; rebuild `props` before running this cell again.
rejected = ~non_outliers
for key in list(props):
    filtered = np.delete(props[key], rejected, axis=0)
    props[key] = filtered.tolist()

# check whether it worked
print("Number of data points before removal: {}".format(len(dataset)))
dataset_cleaned = nff_data.Dataset(props.copy())
print("Number of data points after removal: {}".format(len(dataset_cleaned)))

# Cast the filtered entries of `props` back to float tensors.
# This happens after `dataset_cleaned` was built from a copy of the dict,
# so it re-binds the entries of `props` only.
for key in list(props):
    if key == 'nxyz' or key.startswith(("energy", "NAC", "force")):
        props[key] = torch.FloatTensor(props[key])

In [None]:
# Figure size; the label sizes and line widths below are tuned for it
plt.rcParams["figure.figsize"] = [7,5]

# Colours and property keys (`colors`, `en_keys` and `grad_keys` are
# reused by later cells)
colors = ['#00429d', '#6792c0', '#fdd3cf']
en_keys = ['energy_0', 'energy_1', 'energy_2']
grad_keys = ['energy_0_grad', 'energy_1_grad', 'energy_2_grad']

# a single set of axes
fig, ax = plt.subplots(1, sharey=True, sharex=True)

# Histogram of each energy entry, shifted by the minimum of the filtered
# `energy_0` values stored in `energies`
e_min = energies.min()
for color, en_key in zip(colors, en_keys):
    energy = dataset_cleaned.props[en_key].numpy().reshape(-1)
    ax.hist(
        energy - e_min,
        bins=100,
        range=[0, 475],
        density=False,
        color=color,
        label=en_key,
    )

#######################################

# Cosmetics: thick frame on all four sides
for side in ('bottom', 'top', 'left', 'right'):
    ax.spines[side].set_linewidth(3)

# larger ticks that point inwards, on both axes
for axis_name in ('y', 'x'):
    ax.tick_params(axis=axis_name, length=6, width=3, labelsize=25,
                   pad=10, direction='in')

ax.set_ylabel(r'Count', fontsize=30)
ax.set_xlabel(r'$E - E_\mathrm{min}$ / kcal mol$^{-1}$', fontsize=30)
ax.legend(frameon=False, fontsize=20)

plt.tight_layout()

plt.show()

In [None]:
# create the single plot
fig, ax = plt.subplots(1, sharey=True, sharex=True)

#######################################
# Histogram of all gradient components, one histogram per gradient key.
# `colors` and `grad_keys` come from the energy-histogram cell above.
for color, key in zip(colors, grad_keys):
    forces = torch.cat(dataset_cleaned.props[key]).numpy().reshape(-1)
    ax.hist(forces, bins=100, range=[-1200, 1200], density=False,
            color=color, label=key)

#######################################

# Make the plot pretty: thick frame on all four sides
for side in ('bottom', 'top', 'left', 'right'):
    ax.spines[side].set_linewidth(3)

# increase tick size and make them point inwards
for axis_name in ('y', 'x'):
    ax.tick_params(axis=axis_name, length=6, width=3, labelsize=25,
                   pad=10, direction='in')

ax.set_ylabel(r'Count', fontsize=30)
# The Angstrom symbol has to sit inside $...$: matplotlib's mathtext only
# interprets TeX commands in math mode, otherwise "\AA" is printed verbatim.
ax.set_xlabel(r'$F_i$ / kcal mol$^{-1}$ $\mathrm{\AA}^{-1}$', fontsize=30)
ax.legend(frameon=False, fontsize=15)

plt.tight_layout()

plt.show()

In [None]:
# add neighbour list, for PaiNN we need a directed list
# (hence undirected=False; the radius is the `nbrlist_cutoff` setting)
dataset_cleaned.generate_neighbor_list(cutoff=nbrlist_cutoff, undirected=False)
print("neighborlist generated")
# list the dataset keys again after the neighbour-list call
dataset_cleaned.props.keys()

In [None]:
# Final summary of the cleaned dataset: minimum, maximum and mean of each
# energy entry listed in `en_keys`
for key in en_keys:
    values = np.array(dataset_cleaned.props[key])
    print("-------------------------------------------------------")
    print(key)
    print(f"Energy minimum: {values.min():4.2f} kcal /mol")
    print(f"Energy maximum: {values.max():4.2f} kcal /mol")
    print(f"Energy mean: {values.mean():4.2f} kcal /mol")



In [None]:
# Split the dataset: 15 % of the frames go to the validation part, the rest
# to the training part; the fixed random_state keeps the split reproducible.
train, val = train_test_split(dataset_cleaned, test_size=0.15, random_state=1234)

# Save the training part: merge the selected frames back into one property
# dictionary, wrap it in a Dataset and write it to disk.
# NOTE: `props` is re-bound here and no longer refers to the filtered dict.
props = nff_data.concatenate_dict(*train)
train_set = nff_data.Dataset(props)
train_set.save('train.pth.tar')

# Same for the validation part
props = nff_data.concatenate_dict(*val)
val_set = nff_data.Dataset(props)
val_set.save('val.pth.tar')

# Redo everything for the test set

In [None]:
# Parse the SHARC test-set file. This re-binds `Parser` and `results`,
# which previously referred to the training-set parse.
Parser = SharcTrajectoryParser("TestSet_770_CH2NH2+.dat")
results = Parser.parse()

In [None]:
# Same pipeline as for the training data, applied to the parsed test set.
# NOTE: this cell relies on `en_keys`, `energy_cutoff`, `force_cutoff` and
# `nbrlist_cutoff` defined in earlier cells.

# Merge the per-frame results into one property dictionary
props = nff_data.concatenate_dict(*results)

# Cast the geometries plus every "energy*" entry (this also covers the
# "energy_*_grad" gradients) and every "NAC*" entry to float tensors
for name in list(props):
    if name == 'nxyz' or name.startswith(("energy", "NAC")):
        props[name] = torch.FloatTensor(props[name])

dataset = nff_data.Dataset(props.copy())

#######################################
# Build a boolean mask that is True for every frame that should be kept
energies = dataset.props['energy_0'].numpy().reshape(-1)
print("Total number of data points: {}".format(energies.shape[0]))

# energy criterion: `energy_0` must not exceed the cutoff
non_outliers = energies <= energy_cutoff

# force criterion: for each of the three gradient sets, every component of
# every atom has to lie within +/- force_cutoff
n_atoms = Parser.header_data['natom']
for key in ['energy_0_grad', 'energy_1_grad', 'energy_2_grad']:
    forces = dataset.props[key].numpy().reshape(-1, n_atoms, 3)
    within_limit = (np.abs(forces) <= force_cutoff).all(axis=(1, 2))
    non_outliers = non_outliers & within_limit

energies = energies[non_outliers]
print("Number of points w/o outloiers: {}".format(energies.shape[0]))

# Drop every flagged frame from all properties (overwrites `props` in place)
rejected = ~non_outliers
for key in list(props):
    filtered = np.delete(props[key], rejected, axis=0)
    props[key] = filtered.tolist()

# check whether it worked
print("Number of data points before removal: {}".format(len(dataset)))
dataset_cleaned = nff_data.Dataset(props.copy())
print("Number of data points after removal: {}".format(len(dataset_cleaned)))

# Cast the filtered entries of `props` back to float tensors.
# This happens after `dataset_cleaned` was built from a copy of the dict,
# so it re-binds the entries of `props` only.
for key in list(props):
    if key == 'nxyz' or key.startswith(("energy", "NAC", "force")):
        props[key] = torch.FloatTensor(props[key])

# add the directed neighbour list
dataset_cleaned.generate_neighbor_list(cutoff=nbrlist_cutoff, undirected=False)

# Final summary: minimum, maximum and mean of each energy entry
for key in en_keys:
    values = np.array(dataset_cleaned.props[key])
    print("-------------------------------------------------------")
    print(key)
    print(f"Energy minimum: {values.min():4.2f} kcal /mol")
    print(f"Energy maximum: {values.max():4.2f} kcal /mol")
    print(f"Energy mean: {values.mean():4.2f} kcal /mol")

# write the cleaned test set to disk
dataset_cleaned.save('test.pth.tar')