# Data Preprocessing


* This notebook serves the purpose of taking data from npz files and 
converting it into h5 format, so that in can be loaded dynamically 
into local memory while running the training model

* It also saves the configuration to be used further into the training


In [1]:
import os

import h5py
import os
import numpy as np
import pickle
import torchani

In [2]:
config = {
    'name' : "Uracil",
    'batches' : [35000, 70000, 105000],
    'testing' : [105000, -1],
    'share_path' : '/share1/shaunak/ML4NS/',
    'dataset_name' : 'uracil_dft.npz',
    'scratch_path' : '/scratch/shaunak/ML4NS',
    'test_h5_filename' : 'testing.h5'
}

def save_config():
    f = open('model_config', 'ab')
    pickle.dump(config, f)
    f.close()

In [3]:
model_dir = '{}/{}'.format(config['scratch_path'], config['name'])
os.system('mkdir -p {}'.format(model_dir))

from_path = '{}/{}/{}'.format(config['share_path'], "datasets", config['dataset_name'])
os.system('rsync -aPs ada:{} {}'.format(from_path, model_dir))

training_h5_path = "{}/{}.h5".format(model_dir, config['name'])
testing_h5_path = "{}/testing.h5".format(model_dir)

In [4]:
# To reset if want to run notebook again, do not do it if you don't wish 
# to delete already generated h5 file
os.system('rm {}'.format(training_h5_path))
os.system('rm {}'.format(testing_h5_path))

256

In [5]:
try:
    path = os.path.dirname(os.path.realpath(__file__))
except NameError:
    path = os.getcwd()

In [6]:
data_path = "{}/{}".format(model_dir, config['dataset_name'])
new_data_file = training_h5_path

In [7]:
molecule = np.load(data_path)
molecule_name = config['name']
batches = config['batches']
names = ["01", "02", "03"]

print("Batches ,", batches)

Batches , [35000, 70000, 105000]


In [8]:
n = molecule['E'].shape[0]
config['testing'][-1] = n
os.system('rm model_config')
save_config()
print(config)

{'name': 'Uracil', 'batches': [35000, 70000, 105000], 'testing': [105000, 133770], 'share_path': '/share1/shaunak/ML4NS/', 'dataset_name': 'uracil_dft.npz', 'scratch_path': '/scratch/shaunak/ML4NS', 'test_h5_filename': 'testing.h5'}


In [9]:
species_map = {
    6 : "C".encode("utf-8"),
    1 : "H".encode("utf-8"),
    8 : "O".encode("utf-8"),
    7 : "N".encode("utf-8"),
}
mult = 627.5094738898777

In [10]:
species = list(map(lambda x:species_map[x], molecule['z']))

In [11]:
print("Species : ", species)

Species :  [b'C', b'C', b'N', b'C', b'N', b'C', b'O', b'O', b'H', b'H', b'H', b'H']


In [12]:
h5_file = h5py.File(new_data_file, 'w')

In [13]:
if molecule_name not in h5_file:
    b = h5_file.create_group(molecule_name)
else:
    b = h5_file[molecule_name]

In [14]:
init = 0
for i in range(len(batches)):
    if names[i] not in b:
        sub_group = b.create_group(names[i])
    else:
        sub_group = b[names[i]]
    last = batches[i]
    print(init, last)
    if "coordinates" not in sub_group:
        sub_group.create_dataset("coordinates", data = molecule['R'][init:last])
    if "energies" not in sub_group:
        sub_group.create_dataset("energies", data = molecule['E'][init:last].flatten() / mult) 
    if "species" not in sub_group:
        sub_group.create_dataset("species", data = species)
    init = last

0 35000
35000 70000
70000 105000


In [15]:
print("Done with writing to h5 file : ")
for i in names:
    print(h5_file[molecule_name][i]['coordinates'])
    print(h5_file[molecule_name][i]['energies'])
    print(h5_file[molecule_name][i]['species'])
    print(" ")

Done with writing to h5 file : 
<HDF5 dataset "coordinates": shape (35000, 12, 3), type "<f8">
<HDF5 dataset "energies": shape (35000,), type "<f8">
<HDF5 dataset "species": shape (12,), type "|S1">
 
<HDF5 dataset "coordinates": shape (35000, 12, 3), type "<f8">
<HDF5 dataset "energies": shape (35000,), type "<f8">
<HDF5 dataset "species": shape (12,), type "|S1">
 
<HDF5 dataset "coordinates": shape (35000, 12, 3), type "<f8">
<HDF5 dataset "energies": shape (35000,), type "<f8">
<HDF5 dataset "species": shape (12,), type "|S1">
 


In [16]:
print("Closing h5 file ...")
h5_file.close()

Closing h5 file ...


## Saving the test data

In [17]:
init, last = config['testing']

In [18]:
print(init, last)

105000 133770


In [19]:
# test_data_file = os.path.join(path, 'tmp_testing.h5')
# test_h5_file = h5py.File(test_data_file, 'w')

In [20]:
test_data_file = testing_h5_path
os.system("rm {}".format(test_data_file))
h5_file = h5py.File(test_data_file, 'w')

In [21]:
if molecule_name not in h5_file:
    b = h5_file.create_group(molecule_name)
else:
    b = h5_file[molecule_name]

In [22]:
name = "test"

In [23]:
if name not in b:
    sub_group = b.create_group(name)
else:
    sub_group = b[name]

if "coordinates" not in sub_group:
    sub_group.create_dataset("coordinates", data = molecule['R'][init:last])
if "energies" not in sub_group:
    sub_group.create_dataset("energies", data = molecule['E'][init:last].flatten() / mult) 
if "species" not in sub_group:
    sub_group.create_dataset("species", data = species)

In [24]:
print("Done with writing to temporary h5 file : ")
h5_file.close()

Done with writing to temporary h5 file : 


In [25]:
print("Transferring generated files back to share1")

share_dir = "{}/{}".format(config['share_path'], config['name'])
os.system('rsync -aPs --rsync-path="mkdir -p {} && rsync"  {} {} ada:{}'.\
          format(share_dir, training_h5_path, testing_h5_path, share_dir))

Transferring generated files back to share1


0