In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from scipy.stats import pearsonr
from ase import Atoms
import glob
import os.path as osp
from ase.build import minimize_rotation_and_translation

In [2]:
# json file read. (MMFFstructure optimization, Modelstructure optimization)
# check the optimization converges to the same structure.
# then 
# 1) check the number of optimization steps between MMFF and Model.
# 2) check the number of SCF steps between MMFF and Model.
# 3) check the total cpu time between MMFF and Model.

In [3]:
json_file = "/home/share/DATA/NeuralOpt/QM9M_results/save_xyz.morse.128steps/results/randomly_select/initial/calculation_log/idx110/input.json"
log_file = "/home/share/DATA/NeuralOpt/QM9M_results/save_xyz.morse.128steps/results/randomly_select/initial/calculation_log/idx110/input.log"

def read_log(json_file, log_file):
    if osp.exists(json_file) == False:
        return None
    
    with open(json_file) as f:
        json_data = json.load(f)
    # check json file
    opt_done = json_data["optimization"]["done"]
    scf_energies = json_data["optimization"]["scf"]["scf energies"]
    xyz = np.array(json_data["atoms"]["coords"]["3d"]).reshape(-1, 3)
    symb = json_data["atoms"]["elements"]["number"]
    atoms = Atoms(symbols=symb, positions=xyz)
    
    data = {"opt_done": opt_done, "scf_energies": scf_energies, "atoms": atoms}
    if not opt_done:
        return data
    
    # check log file
    with open(log_file) as f:
        lines = f.readlines()
    # the index of SCF info line
    line_index = [i for i, line in enumerate(lines) if "SCF Done" in line]
    # number of scf cycles
    scf_cycles = []
    scf_cpu_times = []
    for i in line_index:
        line = lines[i]
        comp = line.strip().split()
        if comp[-1] != "cycles":
            print(log_file)
            raise ValueError("SCF Done line is not correct.")
        assert comp[-1] == "cycles"
        assert lines[i+3].strip().split()[-2] == "cpu:"
        
        n_cycle = int(comp[-2])
        cpu_time = float(lines[i+3].strip().split()[-1])
        scf_cycles.append(n_cycle)
        scf_cpu_times.append(cpu_time)
        
    line = lines[-3]
    # line = "Job cpu time:       0 days  0 hours 33 minutes  0.7 seconds."
    if not "Job cpu time" in line:
        return data
    total_cpu_time = line.split()[-8:]
    total_cpu_time = [float(x) for x in total_cpu_time[::2]]
    assert len(total_cpu_time) == 4
    total_cpu_time = total_cpu_time[0]*24*60*60 + total_cpu_time[1]*60*60 + total_cpu_time[2]*60 + total_cpu_time[3]
    
    data["scf_cycles"] = scf_cycles
    data["scf_cpu_times"] = scf_cpu_times
    data["total_cpu_time"] = total_cpu_time
    return data

In [4]:
files = glob.glob("/home/share/DATA/NeuralOpt/QM9M_results/save_xyz.morse.128steps/results/randomly_select/initial/calculation_log/*")
idx_list = [osp.basename(x) for x in files]
base_dir = "/home/share/DATA/NeuralOpt/QM9M_results/save_xyz.morse.128steps/results/randomly_select/"
data_type = ["initial", "generated"]
data_dirs = [osp.join(base_dir, x, 'calculation_log') for x in data_type]

initial_data = []
generated_data = []

for idx_ in idx_list:
    json_file = osp.join(data_dirs[0], idx_, "input.json")
    log_file = osp.join(data_dirs[0], idx_, "input.log")
    data1 = read_log(json_file, log_file)
    json_file = osp.join(data_dirs[1], idx_, "input.json")
    log_file = osp.join(data_dirs[1], idx_, "input.log")
    data2 = read_log(json_file, log_file)
    if data1 is None or data2 is None:
        continue
    data1["idx"] = idx_
    data2["idx"] = idx_
    initial_data.append(data1)
    generated_data.append(data2)

In [8]:
def check_convergence(data1, data2, threshold=1e-3):
    if not data1["opt_done"] or not data2["opt_done"]:
        return False
    
    a1 = data1["atoms"].copy()
    a2 = data2["atoms"].copy()
    minimize_rotation_and_translation(a1, a2)
    rmsd = np.sqrt(np.mean((a1.positions - a2.positions)**2))
    if rmsd < threshold:
        return True

for d1, d2 in zip(initial_data, generated_data):  
    idx = d1["idx"]
    assert d1["idx"] == d2["idx"]
    
    check = check_convergence(d1, d2)
    if check:
        d1["check"] = True
        d2["check"] = True
    else:
        d1["check"] = False
        d2["check"] = False
        
init_df = pd.DataFrame(initial_data)
gen_df = pd.DataFrame(generated_data)


# set seed for reproducibility
np.random.seed(0)
init_df = init_df.sample(1000).copy()
gen_df = gen_df.sample(1000).copy()

In [9]:
# update dataframes
# 1) number of optimization steps = len(scf_cycles)
# 2) number of SCF steps = sum(scf_cycles)
# 3) scf time = sum(scf_cpu_times)
init_df["n_opt_steps"] = init_df["scf_cycles"].apply(len)
init_df["n_scf_steps"] = init_df["scf_cycles"].apply(sum)
init_df["scf_time"] = init_df["scf_cpu_times"].apply(sum)

gen_df["n_opt_steps"] = gen_df["scf_cycles"].apply(len)
gen_df["n_scf_steps"] = gen_df["scf_cycles"].apply(sum)
gen_df["scf_time"] = gen_df["scf_cpu_times"].apply(sum)

init_df = init_df[init_df.check == True].copy()
gen_df = gen_df[gen_df.check == True].copy()



In [10]:
# visualize statistics of the data. by printing

print("Number of Optimization steps")
init_stat = init_df.n_opt_steps.describe().apply(lambda x: round(x, 2))
gen_stat = gen_df.n_opt_steps.describe().apply(lambda x: round(x, 2))
stats = pd.concat([init_stat, gen_stat], axis=1)
stats.columns = ["MMFF", "R-DSM"]
print(stats)

print("\nNumber of SCF steps")
init_stat = init_df.n_scf_steps.describe().apply(lambda x: round(x, 2))
gen_stat = gen_df.n_scf_steps.describe().apply(lambda x: round(x, 2))
stats = pd.concat([init_stat, gen_stat], axis=1)
stats.columns = ["MMFF", "R-DSM"]
print(stats)

print("\nSCF time")
init_stat = init_df.scf_time.describe().apply(lambda x: round(x, 2))
gen_stat = gen_df.scf_time.describe().apply(lambda x: round(x, 2))
stats = pd.concat([init_stat, gen_stat], axis=1)
stats.columns = ["MMFF", "R-DSM"]
print(stats)


Number of Optimization steps
         MMFF   R-DSM
count  942.00  948.00
mean     9.81    5.24
std      3.56    3.00
min      3.00    3.00
25%      7.00    4.00
50%      9.00    4.00
75%     12.00    6.00
max     27.00   39.00

Number of SCF steps
         MMFF   R-DSM
count  942.00  948.00
mean    93.18   45.03
std     39.30   31.28
min     25.00   20.00
25%     65.00   29.00
50%     83.50   35.00
75%    110.00   48.00
max    292.00  348.00

SCF time
          MMFF    R-DSM
count   942.00   948.00
mean   1219.42   614.43
std     536.18   407.51
min      47.90   107.50
25%     852.07   409.77
50%    1124.45   528.20
75%    1495.07   679.88
max    3990.80  5725.80
