In [3]:
import pandas as pd 

import json
import h5py
import ase 
from ase import io
from pymatgen.io.ase import AseAtomsAdaptor
from pymatgen.analysis.graphs import MoleculeGraph
from pymatgen.core import Molecule, Structure

# Element symbols for atomic numbers 1..17 (H through Cl), in order of atomic number.
_SYMBOLS_BY_Z = (
    "H", "He", "Li", "Be", "B", "C", "N", "O", "F",
    "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl",
)

# Lookup: element symbol -> atomic number.
elem_to_num = {symbol: z for z, symbol in enumerate(_SYMBOLS_BY_Z, start=1)}

# Reverse lookup: atomic number -> element symbol.
num_to_elem = dict(zip(elem_to_num.values(), elem_to_num.keys()))

In [4]:
# Load test_libe.json into a DataFrame; the cells below read the "molecule",
# "energy" and "gradient" columns of its first row.
# NOTE(review): absolute, user-specific path -- the notebook will not run elsewhere
# without editing it.
df = pd.read_json("/home/santiagovargas/dev/berkeley_pes/data/test_libe.json")
# Files under the SpookyNet data/10 directory.
_spookynet_10 = "/home/santiagovargas/dev/SpookyNet/data/10"
file_xyz = f"{_spookynet_10}/rad_qm9_traj_subset_train10.xyz"
statistics = f"{_spookynet_10}/h5/statistics.json"
h5_test = f"{_spookynet_10}/h5/train/train_0.h5"

# radqm9 trajectory exports in Downloads; they differ only in the suffix after the date stamp.
_radqm9_stem = "/home/santiagovargas/Downloads/radqm9_65_10_25_trajectory_full_data_20240807_"
train_chunk_5, train_chunk_10, train_chunk_25 = (
    f"{_radqm9_stem}train_subset_{fraction}.xyz" for fraction in ("0.05", "0.1", "0.25")
)
validation = f"{_radqm9_stem}val.xyz"
test = f"{_radqm9_stem}test.xyz"
ood = f"{_radqm9_stem}ood.xyz"



In [4]:
# Items needed from each record: molecule (sites, charge, spin_multiplicity), energy, gradient

In [5]:
# Unpack the fields of the first record that the following cells work with.
# (The original first statement selected df[["molecule", "energy", "gradient"]]
# and indexed it, but its value was discarded because it was not the cell's last
# expression; a missing column still raises KeyError on the lookups below.)
row = df.iloc[0]
spin = row["molecule"]["spin_multiplicity"]
charge = row["molecule"]["charge"]
# NOTE(review): `force` holds the stored "gradient" value unchanged (no sign
# flip is applied here) -- confirm this matches the convention used downstream.
force = row["gradient"]
energy = row["energy"]

In [6]:

# Pair every site's atomic number with its coordinates, then split the pairs
# into two parallel lists.
pos_elem_list = [
    (elem_to_num[site["name"]], site["xyz"])
    for site in row["molecule"]["sites"]
]
elem = [atomic_number for atomic_number, _ in pos_elem_list]
pos = [xyz for _, xyz in pos_elem_list]

In [14]:

def convert_to_rows(file_xyz):
    """Read every frame of a trajectory file into a DataFrame, one row per frame.

    All frames are read with ``ase.io.read(file_xyz, index=':')``. Each frame
    becomes a record with these columns, in this order:

    * ``molecule`` -- dict with ``sites`` (one ``{'xyz', 'name', 'properties'}``
      dict per atom, ``properties`` always empty), ``charge`` and
      ``spin_multiplicity``.
    * ``gradient`` -- the array returned by the frame's ``get_forces()``.
    * ``relative_energy`` -- the frame's ``info['relative_energy']``.
    * ``mulliken_partial_charges`` / ``mulliken_partial_spins`` -- the per-atom
      arrays stored under those names.

    Parameters
    ----------
    file_xyz
        Path of the trajectory file, passed straight to ``ase.io.read``.

    Returns
    -------
    pandas.DataFrame
        One row per frame; empty if the file yields no frames.

    Raises
    ------
    KeyError
        If a frame lacks ``info['spin']``, ``info['charge']``,
        ``info['relative_energy']`` or either Mulliken per-atom array.
    """
    frames = io.read(file_xyz, index=':')
    list_of_rows = []
    for frame in frames:
        info = frame.info
        # Public attribute holding the per-atom arrays; replaces the previous
        # reach into the object's __dict__.
        per_atom = frame.arrays

        # Ask ASE for the symbols instead of going through the notebook's
        # 17-element lookup table, so elements beyond Cl no longer raise KeyError.
        symbols = frame.get_chemical_symbols()
        positions = frame.get_positions()

        molecule = {
            "sites": [
                {"xyz": xyz, "name": symbol, "properties": {}}
                for symbol, xyz in zip(symbols, positions)
            ],
            "charge": info["charge"],
            # NOTE(review): info['spin'] is stored as-is under
            # "spin_multiplicity" -- confirm it already is a multiplicity.
            "spin_multiplicity": info["spin"],
        }

        # Fields that were commented out in the original and are still not
        # exported: total_energy, calc_resp_dipole_moments, resp_partial_charges.
        list_of_rows.append(
            {
                "molecule": molecule,
                # NOTE(review): forces are stored under "gradient" without a
                # sign change -- confirm downstream expectation.
                "gradient": frame.get_forces(),
                "relative_energy": info["relative_energy"],
                "mulliken_partial_charges": per_atom["mulliken_partial_charges"],
                "mulliken_partial_spins": per_atom["mulliken_partial_spins"],
            }
        )

    # Build the frame once from the list of records.
    return pd.DataFrame(list_of_rows)

# Convert the train_chunk_5 trajectory and write the table as JSON into the
# current working directory. Note: this rebinds the notebook-level `df`.
df = convert_to_rows(file_xyz=train_chunk_5)
df.to_json(path_or_buf="train_chunk_5_radqm9_20240807.json")

In [5]:

# Scratch cell: repeats the body of convert_to_rows on train_chunk_5, but stops
# after the first frame (see the `break` below) and displays that frame's
# per-atom arrays as the cell output.
# NOTE(review): index=':' makes io.read load every frame although only the
# first one is used here.
file_xyz = train_chunk_5
atoms = io.read(file_xyz, index=':')
list_of_rows = []  # never appended to in this cell
for atom in atoms: 
    
    info = atom.info
    # dict of per-atom arrays; this is what the cell displays at the end
    full_info = atom.__dict__["arrays"]
    spin = info['spin']
    charge = info['charge']
    relative_energy = info['relative_energy']
    #total_energy = info['total_energy']
    #resp_dipole_moments = info['calc_resp_dipole_moments']
    atomic_num = atom.get_atomic_numbers()
    forces = atom.get_forces()
    pos = atom.get_positions()
    mulliken_partial_charges = full_info['mulliken_partial_charges']
    mulliken_partial_spins = full_info['mulliken_partial_spins']
    
    # Molecule record: one site dict per atom, plus charge and spin.
    dict_temp = {
        "sites": [
            {
            #'species': [{'element': num_to_elem[atomic_num[i]], 'occu': 1}],
            'xyz': pos[i],
            'name': num_to_elem[atomic_num[i]],
            'properties': {}
            }
            for i in range(len(atomic_num))
        ],
    }
    dict_temp["charge"] = charge
    dict_temp["spin_multiplicity"] = spin
    
    # Full row as convert_to_rows would build it; it is not used further in this cell.
    dict_total = {
        "molecule": dict_temp,
        #"energy": total_energy,
        "gradient": forces, 
        "relative_energy": relative_energy,
        #"calc_resp_dipole_moments": resp_dipole_moments,
        "mulliken_partial_charges": mulliken_partial_charges,
        "mulliken_partial_spins": mulliken_partial_spins,
        #"resp_partial_charges": resp_partial_charges
    }

    # Only the first frame is inspected.
    break 

# Last expression of the cell: shows the first frame's per-atom arrays.
full_info


{'numbers': array([7, 6, 8, 8, 6, 6, 9, 9, 9, 1, 1, 1, 1]),
 'positions': array([[-0.16878954,  1.20249863, -0.04144335],
        [-0.0987566 , -0.12505141,  0.23721661],
        [ 0.55867907, -0.63571692,  1.1090262 ],
        [-0.86547791, -0.81374151, -0.67056466],
        [-0.94176041, -2.21389414, -0.47033705],
        [-2.28648015, -2.56663258,  0.14898815],
        [-3.3061026 , -2.22827849, -0.65013098],
        [-2.35069852, -3.88858071,  0.36197702],
        [-2.46459547, -1.94311631,  1.31851042],
        [ 0.22127548,  1.83095585,  0.63868941],
        [-0.88433481,  1.54121139, -0.66111086],
        [-0.86608158, -2.70271344, -1.44397605],
        [-0.14504256, -2.55092205,  0.19495526]]),
 'mulliken_partial_charges': array([ 0.268207,  1.162587, -0.296609, -0.835803,  0.458088,  1.350123,
        -0.41522 , -0.411446, -0.428372,  0.09002 ,  0.022306, -0.136292,
         0.172412]),
 'mulliken_partial_spins': array([ 6.77695e-01, -1.48258e-01,  4.94031e-01, -4.23600e-03,
 

In [None]:



# Same conversion and export of train_chunk_5 as in the cell that defines
# convert_to_rows; running it again overwrites the same JSON file.
df = convert_to_rows(file_xyz=train_chunk_5)
df.to_json(path_or_buf="train_chunk_5_radqm9_20240807.json")

In [17]:
# Convert the train_chunk_10 trajectory and write it as JSON to the working directory.
df = convert_to_rows(file_xyz=train_chunk_10)
df.to_json(path_or_buf="train_chunk_10_radqm9_20240807.json")

In [18]:
# Convert the train_chunk_25 trajectory and write it as JSON to the working directory.
df = convert_to_rows(file_xyz=train_chunk_25)
df.to_json(path_or_buf="train_chunk_25_radqm9_20240807.json")

In [19]:
# Convert the `test` trajectory and write it as JSON to the working directory.
df = convert_to_rows(file_xyz=test)
df.to_json(path_or_buf="test_radqm9_20240807.json")

In [20]:
# Convert the `ood` trajectory and write it as JSON to the working directory.
df = convert_to_rows(file_xyz=ood)
df.to_json(path_or_buf="ood_radqm9_20240807.json")

In [21]:
# Convert the `validation` trajectory and write it as JSON to the working directory.
df = convert_to_rows(file_xyz=validation)
df.to_json(path_or_buf="validation_radqm9_20240807.json")