In [1]:
import os.path as osp
import os
import ase.io
import ase
import numpy as np
import pandas as pd
import glob
import tqdm

import networkx as nx
from scipy.spatial.distance import cdist, pdist

import sys
sys.path.append("/home/share/DATA/TSDiff/utils")
import alignXYZ
from checkConnectivity import MolGraph, AseAtoms2MolGraph, SanitizationError, CheckConnectivity

IRC computation log directories

seed : 0 ~ 7

sample_idx : 0 ~ 1196

In [2]:
# Collect the per-sample IRC directories of seed 0 and check that the sample
# indices parsed from the directory names form the contiguous range 0..N-1.
directories = glob.glob("/home/share/DATA/TSDiff/dft_results/IRC_results/ensemble_sample_all/ensemble_irc_total/seed0/sample_*")
idx_list = sorted(int(osp.basename(sample_dir).split("_")[1]) for sample_dir in directories)
idx_list == list(range(len(idx_list)))

True

## Checklist

For each log file, we need to check the following:

1. Has the TS converged correctly? (1st-order saddle point?)

2. Has the IRC computation converged?

3. Are the forward and backward equilibrium points consistent with the corresponding reaction graph?

### TS optimization convergence

In [3]:
def check_tsopt_convergence(log_file):
    """Tell whether a TS-optimization log describes a converged first-order saddle point.

    The log passes when both of the following hold:
      * the text "OPTIMIZATION HAS CONVERGED" occurs in the file, and
      * exactly one line following the last "VIBRATIONAL FREQUENCIES" heading
        carries the "***imaginary mode***" tag (when the heading is absent,
        the whole file is scanned for the tag).

    Parameters
    ----------
    log_file : str
        Path of the log file.

    Returns
    -------
    bool
        True when both conditions hold; False otherwise, including when
        ``log_file`` does not exist.
    """
    if not osp.isfile(log_file):
        return False

    with open(log_file, "r") as f:
        text = f.read()

    # Without the convergence banner the frequency block is irrelevant.
    if "OPTIMIZATION HAS CONVERGED" not in text:
        return False

    # Only the text after the last frequency heading is inspected.
    last_freq_section = text.split("VIBRATIONAL FREQUENCIES")[-1]
    imag_cnt = sum(
        "***imaginary mode***" in line for line in last_freq_section.split("\n")
    )
    return imag_cnt == 1

# Evaluate TS-optimization convergence for every (sample, seed) pair.
# Each row holds "sample_index" plus one boolean column per integer seed (0..7).
tsopt_rows = []
for IDX in tqdm.tqdm(range(1197)):
    seed_flags = {
        SEED: check_tsopt_convergence(
            f"/home/share/DATA/TSDiff/dft_results/tsopt_result/ensemble_sample_all/seed{SEED}_even/sample_{IDX}/log"
        )
        for SEED in range(8)
    }
    tsopt_rows.append({"sample_index": IDX, **seed_flags})
tsopt_convergence_result = pd.DataFrame(tsopt_rows)

100%|██████████| 1197/1197 [14:20<00:00,  1.39it/s]


### IRC computation convergence

In [4]:
def check_IRC_convergence(log_file):
    """Tell whether an IRC log reports convergence exactly twice.

    Counts the lines of ``log_file`` that contain "THE IRC HAS CONVERGED".

    Parameters
    ----------
    log_file : str
        Path of the IRC log file.

    Returns
    -------
    bool
        True when exactly two lines contain the marker; False otherwise,
        including when ``log_file`` does not exist.
    """
    if not osp.isfile(log_file):
        return False

    with open(log_file, "r") as f:
        converged_lines = sum(1 for line in f if "THE IRC HAS CONVERGED" in line)

    return converged_lines == 2

# Evaluate IRC convergence for every (sample, seed) pair.
# Each row holds "sample_index" plus one boolean column per integer seed (0..7).
irc_rows = []
for IDX in tqdm.tqdm(range(1197)):
    seed_flags = {
        SEED: check_IRC_convergence(
            f"/home/share/DATA/TSDiff/dft_results/IRC_results/ensemble_sample_all/ensemble_irc_total/seed{SEED}/sample_{IDX}/log"
        )
        for SEED in range(8)
    }
    irc_rows.append({"sample_index": IDX, **seed_flags})
irc_convergence_result = pd.DataFrame(irc_rows)

100%|██████████| 1197/1197 [02:24<00:00,  8.31it/s]


### Consistency between IRC end-points and reaction graph

In [5]:
def check_IRC_consistency(xyz_file, r_atoms, p_atoms):
    """Run ``CheckConnectivity`` on an IRC trajectory, treating a missing file as a failure.

    Parameters
    ----------
    xyz_file : str
        Path of the IRC trajectory file.
    r_atoms, p_atoms
        Reference reactant and product structures, forwarded unchanged to
        ``CheckConnectivity``.

    Returns
    -------
    False when ``xyz_file`` does not exist; otherwise whatever
    ``CheckConnectivity(xyz_file, r_atoms, p_atoms)`` returns.
    """
    if osp.isfile(xyz_file):
        return CheckConnectivity(xyz_file, r_atoms, p_atoms)
    return False

# Reference reactant (R) and product (P) structures, one frame per sample;
# frame IDX of each file is compared against the IRC trajectory of sample IDX.
refRxyz = "/home/share/DATA/TSDiff/data/TS/wb97xd3/random_split_42/sorted_testset/wb97xd3_r_test_nodollar_even.xyz"
refPxyz = "/home/share/DATA/TSDiff/data/TS/wb97xd3/random_split_42/sorted_testset/wb97xd3_p_test_nodollar_even.xyz"
R_atoms = list(ase.io.iread(refRxyz))
P_atoms = list(ase.io.iread(refPxyz))

# Each row holds "sample_index" plus one column per integer seed (0..7)
# containing the result of check_IRC_consistency for that seed.
irc_consistency_rows = []
for IDX in tqdm.tqdm(range(1197)):
    seed_flags = {
        SEED: check_IRC_consistency(
            f"/home/share/DATA/TSDiff/dft_results/IRC_results/ensemble_sample_all/ensemble_irc_total/seed{SEED}/sample_{IDX}/input_IRC_Full_trj.xyz",
            R_atoms[IDX],
            P_atoms[IDX],
        )
        for SEED in range(8)
    }
    irc_consistency_rows.append({"sample_index": IDX, **seed_flags})
irc_consistency_result = pd.DataFrame(irc_consistency_rows)

100%|██████████| 1197/1197 [02:44<00:00,  7.30it/s]


In [6]:
# Extract the per-seed columns (labelled by the integer seed) of each check as
# a (sample, seed) array, then require all three checks to pass at once.
seed_columns = list(range(8))
cond1 = tsopt_convergence_result[seed_columns].to_numpy()  # TS optimization converged
cond2 = irc_convergence_result[seed_columns].to_numpy()    # IRC converged
cond3 = irc_consistency_result[seed_columns].to_numpy()    # IRC end points consistent

cond = np.logical_and(cond1, np.logical_and(cond2, cond3))

In [7]:
# Keep the samples for which at least one seed satisfies `cond`, together with
# the tuple of seeds that do.
#
# Changes relative to the previous version of this cell:
#   * the per-row dict is no longer bound to `_`, which IPython uses for the
#     last cell output;
#   * seeds are stored as plain Python ints instead of NumPy scalars;
#   * the frame always has the 'idx' and 'seed' columns, even when no sample
#     is selected.
selected_rows = []
for IDX in range(1197):
    passing_seeds = np.flatnonzero(cond[IDX]).tolist()
    if not passing_seeds:
        continue
    selected_rows.append({'idx': IDX, 'seed': tuple(passing_seeds)})
selected_samples = pd.DataFrame(selected_rows, columns=['idx', 'seed'])
selected_samples

Unnamed: 0,idx,seed
0,0,"(0,)"
1,1,"(0,)"
2,2,"(0,)"
3,3,"(0,)"
4,4,"(0,)"
...,...,...
1017,1191,"(0,)"
1018,1193,"(0,)"
1019,1194,"(0,)"
1020,1195,"(0,)"


In [8]:
def distinguish_conformers(smarts, xyz_files, log_files):
    """Group near-duplicate structures and return one representative per group.

    Structures i and j are linked when the third value returned by
    ``alignXYZ.get_min_dmae_match`` for the pair is below 0.01 AND their
    "Total thermal energy" values differ by less than 0.1 after multiplying
    the logged number by 627.509 (named ``hartree2kcalmol``). Links are merged
    transitively by taking connected components of the resulting graph.

    Parameters
    ----------
    smarts
        Forwarded to ``alignXYZ.get_substruct_matches``.
    xyz_files : sequence of str
        Structure files read with ``ase.io.read``.
    log_files : sequence of str
        Log files searched (from the end) for a "Total thermal energy" line.
        Presumably given in the same order as ``xyz_files`` -- the code pairs
        them by position.

    Returns
    -------
    tuple of int
        One entry per group. Each entry is a POSITION in ``xyz_files``
        (0 .. len(xyz_files) - 1), not a seed number.
    """
    matches = alignXYZ.get_substruct_matches(smarts)
    coords = [ase.io.read(path).positions for path in xyz_files]
    n_struct = len(coords)

    # Pairwise structural distance; the diagonal stays at zero.
    distance_matrix = np.zeros((n_struct, n_struct))
    for i in range(n_struct):
        for j in range(n_struct):
            if i != j:
                _perm, _origin, dmae = alignXYZ.get_min_dmae_match(matches, coords[i], coords[j])
                distance_matrix[i, j] = dmae

    # Last "Total thermal energy" entry of each log, scaled by hartree2kcalmol.
    hartree2kcalmol = 627.509
    energies = []
    for log in log_files:
        with open(log, "r") as f:
            log_lines = f.readlines()
        for line in reversed(log_lines):
            if "Total thermal energy" in line:
                energies.append(float(line.split()[-2]) * hartree2kcalmol)
                break

    # Pairwise absolute energy differences.
    energy_arr = np.array(energies)
    energy_diff_matrix = np.abs(energy_arr[:, None] - energy_arr[None, :])

    # Link pairs that are close in both geometry and energy; no self-links.
    edge = np.logical_and(distance_matrix < 0.01, energy_diff_matrix < 0.1)
    np.fill_diagonal(edge, False)

    G = nx.Graph()
    n_nodes = len(edge)
    for node in range(n_nodes):
        G.add_node(node)
    for i in range(n_nodes):
        for j in range(n_nodes):
            if edge[i, j]:
                G.add_edge(i, j)

    # One member of every connected component serves as its representative.
    return tuple(list(component)[0] for component in nx.connected_components(G))

# Remove duplicate conformers among the seeds that passed all checks.
#
# Fix: distinguish_conformers returns POSITIONS in the file lists it was given,
# not seed numbers. The previous version stored those positions directly in the
# 'seed' column, so later cells would open seed0, seed1, ... instead of the
# seeds that actually passed. The positions are now mapped back onto `seeds`.
infos = pd.read_csv('/home/share/DATA/TSDiff/data/TS/wb97xd3/random_split_42/sorted_testset/wb97xd3_testset_info.csv')
pruned_rows = []
for IDX, seeds in tqdm.tqdm(
    zip(selected_samples['idx'], selected_samples['seed']),
    total=len(selected_samples),
):
    # Row 2 * IDX of the info table is used for sample IDX -- presumably the
    # table lists two entries per sample; confirm against the CSV.
    info_row = infos.iloc[2 * IDX]
    smarts = info_row['AAM']

    if len(seeds) > 1:
        run_dirs = [
            f"/home/share/DATA/TSDiff/dft_results/tsopt_result/ensemble_sample_all/seed{seed}_even/sample_{IDX}"
            for seed in seeds
        ]
        xyz_files = [f"{run_dir}/input.xyz" for run_dir in run_dirs]
        log_files = [f"{run_dir}/log" for run_dir in run_dirs]

        kept_positions = distinguish_conformers(smarts, xyz_files, log_files)
        seeds = tuple(seeds[pos] for pos in kept_positions)

    pruned_rows.append(
        {'idx': IDX, 'seed': seeds, 'grambow_index': info_row['log_index'], "smarts": smarts}
    )
pruned_samples = pd.DataFrame(pruned_rows)

100%|██████████| 1022/1022 [01:09<00:00, 14.62it/s]


In [9]:
# Export the dataset as one xyz file per entry ("idx{cnt}.xyz") plus a pickle
# recording which file indices belong to the train / valid / test splits.
#
# Fixes relative to the previous version of this cell:
#   * the validation loop bound a misspelled loop variable, so every
#     validation file was written from the last training index; train and
#     validation are now written by one shared loop;
#   * a missing "energies" section no longer silently reuses the energies of
#     the previously processed sample;
#   * a no-op expression statement was removed.
save_dir = "/home/share/DATA/NeuralOpt/SQM_data/TSDifftoDFT"
os.makedirs(save_dir, exist_ok=True)
cnt = 0  # running file index shared by the test, train and validation entries

# --- test split: first and last frame of each TS-optimization trajectory ---
test_dataset_index = []
for sample in pruned_samples.itertuples(index=False):
    IDX = sample.idx
    for SEED in sample.seed:
        run_dir = f"/home/share/DATA/TSDiff/dft_results/tsopt_result/ensemble_sample_all/seed{SEED}_even/sample_{IDX}"

        atoms = list(ase.io.iread(f"{run_dir}/input_trj.xyz"))
        opt_atoms = atoms[-1]
        init_atoms = atoms[0]

        with open(f"{run_dir}/input.opt", "r") as f:
            opt_text = f.read()
        # Use the last '$'-delimited section mentioning "energies"; its first
        # two and last two lines are dropped before parsing the values.
        energies = None
        for sec in opt_text.split('$'):
            if "energies" in sec:
                energies = sec.split("\n")[2:-2]
        if not energies:
            raise ValueError(f"no energies found in {run_dir}/input.opt")
        opt_energy = float(energies[-1])
        init_energy = float(energies[0])

        comment = f'idx={cnt} smarts="{sample.smarts}" sample_index={IDX} seed={SEED} grambow_index={sample.grambow_index} init_energy={init_energy} opt_energy={opt_energy}'
        save_xyz = osp.join(save_dir, f"idx{cnt}.xyz")
        # The file holds two frames: the optimized structure first, then the initial one.
        ase.io.write(save_xyz, opt_atoms, comment=comment)
        ase.io.write(save_xyz, init_atoms, comment=comment, append=True)
        test_dataset_index.append(cnt)
        cnt += 1

# --- train / valid splits: reference structures looked up by index ---
# NOTE(review): read_pickle executes arbitrary code from the file; only use it
# on files from a trusted source.
grambow_index_dict = "/home/share/DATA/TSDiff/data/TS/wb97xd3/random_split_42/index_list.pkl"
grambow_index_dict = pd.read_pickle(grambow_index_dict)
train_index = grambow_index_dict['train_index']
valid_index = grambow_index_dict['valid_index']

dataset = list(ase.io.iread("/home/share/DATA/TSDiff/data/TS/wb97xd3/raw_data/wb97xd_nodollar_ts.xyz"))
dataset_info = pd.read_csv("/home/share/DATA/TSDiff/data/TS/wb97xd3/raw_data/wb97xd_fwd_rev_chemprop.csv")

split_file_indices = {}
for split_name, split_grambow_indices in (("train_index", train_index), ("valid_index", valid_index)):
    file_indices = []
    for grambow_index in split_grambow_indices:
        smarts = dataset_info.AAM[grambow_index]
        comment = f'idx={cnt} smarts="{smarts}" grambow_index={grambow_index}'
        atoms = dataset[grambow_index]
        save_xyz = osp.join(save_dir, f"idx{cnt}.xyz")
        ase.io.write(save_xyz, atoms, comment=comment)

        file_indices.append(cnt)
        cnt += 1
    split_file_indices[split_name] = file_indices

train_dataset_index = split_file_indices["train_index"]
valid_dataset_index = split_file_indices["valid_index"]

dataset_index_dict = {"train_index": train_dataset_index, "valid_index": valid_dataset_index, "test_index": test_dataset_index}
save_pkl = osp.join(save_dir, "data_split.pkl")
pd.to_pickle(dataset_index_dict, save_pkl)