In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from scipy.stats import pearsonr
from ase import Atoms
import glob
import os.path as osp
from ase.build import minimize_rotation_and_translation
import ase.io

In [2]:
# json file read. (MMFFstructure optimization, Modelstructure optimization)
# check the optimization converges to the same structure.
# then 
# 1) check the number of optimization steps between MMFF and Model.
# 2) check the number of SCF steps between MMFF and Model.
# 3) check the total cpu time between MMFF and Model.

In [3]:
json_file = "/home/share/DATA/NeuralOpt/QM9M_results/save_xyz.morse.128steps/results/randomly_select/initial/calculation_log/idx110/input.json"
log_file = "/home/share/DATA/NeuralOpt/QM9M_results/save_xyz.morse.128steps/results/randomly_select/initial/calculation_log/idx110/input.log"

def read_log(json_file, log_file):
    if osp.exists(json_file) == False:
        return None
    
    with open(json_file) as f:
        json_data = json.load(f)
    # check json file
    opt_done = json_data["optimization"]["done"]
    scf_energies = json_data["optimization"]["scf"]["scf energies"][:-1]
    xyz = np.array(json_data["atoms"]["coords"]["3d"]).reshape(-1, 3)
    symb = json_data["atoms"]["elements"]["number"]
    atoms = Atoms(symbols=symb, positions=xyz)
    
    data = {"opt_done": opt_done, "scf_energies": scf_energies, "atoms": atoms}
    if not opt_done:
        return data
    
    # check log file
    with open(log_file) as f:
        lines = f.readlines()
    # the index of SCF info line
    line_index = [i for i, line in enumerate(lines) if "SCF Done" in line]
    # number of scf cycles
    scf_cycles = []
    scf_cpu_times = []
    for i in line_index:
        line = lines[i]
        comp = line.strip().split()
        if comp[-1] != "cycles":
            print(log_file)
            raise ValueError("SCF Done line is not correct.")
        assert comp[-1] == "cycles"
        assert lines[i+3].strip().split()[-2] == "cpu:"
        
        n_cycle = int(comp[-2])
        cpu_time = float(lines[i+3].strip().split()[-1])
        scf_cycles.append(n_cycle)
        scf_cpu_times.append(cpu_time)
        
    line = lines[-3]
    # line = "Job cpu time:       0 days  0 hours 33 minutes  0.7 seconds."
    if not "Job cpu time" in line:
        return data
    total_cpu_time = line.split()[-8:]
    total_cpu_time = [float(x) for x in total_cpu_time[::2]]
    assert len(total_cpu_time) == 4
    total_cpu_time = total_cpu_time[0]*24*60*60 + total_cpu_time[1]*60*60 + total_cpu_time[2]*60 + total_cpu_time[3]
    
    data["scf_cycles"] = scf_cycles[:-1]
    data["scf_cpu_times"] = scf_cpu_times[:-1]
    data["total_cpu_time"] = total_cpu_time
    return data

In [4]:
files = glob.glob("/home/share/DATA/NeuralOpt/QM9M_results/save_xyz.morse.128steps/results/randomly_select/initial/calculation_log/*")
idx_list = [osp.basename(x) for x in files]
base_dir = "/home/share/DATA/NeuralOpt/QM9M_results/save_xyz.morse.128steps/results/randomly_select/"
data_type = ["initial", "generated"]
data_dirs = [osp.join(base_dir, x, 'calculation_log') for x in data_type]

initial_data = []
generated_data = []

for idx_ in idx_list:
    json_file = osp.join(data_dirs[0], idx_, "input.json")
    log_file = osp.join(data_dirs[0], idx_, "input.log")
    data1 = read_log(json_file, log_file)
    json_file = osp.join(data_dirs[1], idx_, "input.json")
    log_file = osp.join(data_dirs[1], idx_, "input.log")
    data2 = read_log(json_file, log_file)
    if data1 is None or data2 is None:
        continue
    data1["idx"] = idx_
    data2["idx"] = idx_
    initial_data.append(data1)
    generated_data.append(data2)

In [5]:
def check_convergence(data1, data2, threshold=1e-2):
    if not data1["opt_done"] or not data2["opt_done"]:
        return False
    
    a1 = data1["atoms"].copy()
    a2 = data2["atoms"].copy()
    minimize_rotation_and_translation(a1, a2)
    rmsd = np.sqrt(np.mean((a1.positions - a2.positions)**2))
    if rmsd < threshold:
        return True

for d1, d2 in zip(initial_data, generated_data):  
    idx = d1["idx"]
    assert d1["idx"] == d2["idx"]
    
    check = check_convergence(d1, d2)
    if check:
        d1["check"] = True
        d2["check"] = True
    else:
        d1["check"] = False
        d2["check"] = False
        
init_df = pd.DataFrame(initial_data)
gen_df = pd.DataFrame(generated_data)


# set seed for reproducibility
np.random.seed(0)
N = len(init_df)
random_index = np.random.choice(N, 1000)
init_df = init_df.iloc[random_index].copy()
gen_df = gen_df.iloc[random_index].copy()

In [6]:
# update dataframes
# 1) number of optimization steps = len(scf_cycles)
# 2) number of SCF steps = sum(scf_cycles)
# 3) scf time = sum(scf_cpu_times)
init_df["n_opt_steps"] = init_df["scf_cycles"].apply(len)
init_df["n_scf_steps"] = init_df["scf_cycles"].apply(sum)
init_df["scf_time"] = init_df["scf_cpu_times"].apply(sum)

gen_df["n_opt_steps"] = gen_df["scf_cycles"].apply(len)
gen_df["n_scf_steps"] = gen_df["scf_cycles"].apply(sum)
gen_df["scf_time"] = gen_df["scf_cpu_times"].apply(sum)

init_df = init_df[init_df.check == True].copy()
gen_df = gen_df[gen_df.check == True].copy()



In [7]:
# visualize statistics of the data. by printing

print("Number of Function Call (NFC)")
init_stat = init_df.n_opt_steps.describe().apply(lambda x: round(x, 1))
gen_stat = gen_df.n_opt_steps.describe().apply(lambda x: round(x, 1))
stats1 = pd.concat([init_stat, gen_stat], axis=1)
stats1.columns = ["NFC_MMFF", "NFC_R-DSM"]

print("Number of SCF Cycles (NSC)")
init_stat = init_df.n_scf_steps.describe().apply(lambda x: round(x, 1))
gen_stat = gen_df.n_scf_steps.describe().apply(lambda x: round(x, 1))
stats2 = pd.concat([init_stat, gen_stat], axis=1)
stats2.columns = ["NSC_MMFF", "NSC_R-DSM"]

print("SCF time (SCFt)")
init_stat = init_df.scf_time.describe().apply(lambda x: round(x, 1))
gen_stat = gen_df.scf_time.describe().apply(lambda x: round(x, 1))
stats3 = pd.concat([init_stat, gen_stat], axis=1)
stats3.columns = ["SCFt_MMFF", "SCFt_R-DSM"]

stats = pd.concat([stats1, stats2, stats3], axis=1)
print(stats)
stats.to_csv("optimization_cost_stats_case1.csv")

Number of Function Call (NFC)
Number of SCF Cycles (NSC)
SCF time (SCFt)
       NFC_MMFF  NFC_R-DSM  NSC_MMFF  NSC_R-DSM  SCFt_MMFF  SCFt_R-DSM
count     964.0      964.0     964.0      964.0      964.0       964.0
mean        9.1        4.4      95.4       45.6     1200.0       573.0
std         3.6        3.0      39.8       31.9      547.3       409.1
min         3.0        2.0      31.0       19.0      166.9        90.9
25%         7.0        3.0      67.0       29.0      850.2       356.4
50%         8.0        3.0      87.0       35.0     1110.0       472.5
75%        11.0        5.0     114.0       49.0     1431.5       643.5
max        27.0       31.0     293.0      302.0     4725.6      3998.4


In [8]:
def check_convergence(data1, data2, threshold=1e-2):
    if not data1["opt_done"] or not data2["opt_done"]:
        return data1["opt_done"], data2["opt_done"]
    # reference atoms path: /home/share/DATA/NeuralOpt/QM9M_results/save_xyz.morse.128steps/idx110.xyz
    reference_path = f"/home/share/DATA/NeuralOpt/QM9M_results/save_xyz.morse.128steps/{data1['idx']}.xyz"
    ref = list(ase.io.iread(reference_path))[0]
    
    a1 = data1["atoms"].copy()
    a2 = data2["atoms"].copy()
    minimize_rotation_and_translation(ref, a1)
    rmsd1 = np.sqrt(np.mean((ref.positions - a1.positions)**2))
    minimize_rotation_and_translation(ref, a2)
    rmsd2 = np.sqrt(np.mean((ref.positions - a2.positions)**2))
    return rmsd1 < threshold, rmsd2 < threshold

for d1, d2 in zip(initial_data, generated_data):  
    idx = d1["idx"]
    assert d1["idx"] == d2["idx"]
    
    c1, c2 = check_convergence(d1, d2)
    d1["check"] = c1
    d2["check"] = c2
        
init_df = pd.DataFrame(initial_data)
gen_df = pd.DataFrame(generated_data)


# set seed for reproducibility
np.random.seed(0)
N = len(init_df)
random_index = np.random.choice(N, 1000)
init_df = init_df.iloc[random_index].copy()
gen_df = gen_df.iloc[random_index].copy()

In [9]:
# update dataframes
# 1) number of optimization steps = len(scf_cycles)
# 2) number of SCF steps = sum(scf_cycles)
# 3) scf time = sum(scf_cpu_times)
init_df["n_opt_steps"] = init_df["scf_cycles"].apply(len)
init_df["n_scf_steps"] = init_df["scf_cycles"].apply(sum)
init_df["scf_time"] = init_df["scf_cpu_times"].apply(sum)

gen_df["n_opt_steps"] = gen_df["scf_cycles"].apply(len)
gen_df["n_scf_steps"] = gen_df["scf_cycles"].apply(sum)
gen_df["scf_time"] = gen_df["scf_cpu_times"].apply(sum)

init_df = init_df[init_df.check == True].copy()
gen_df = gen_df[gen_df.check == True].copy()



In [10]:
# visualize statistics of the data. by printing

print("Number of Function Call (NFC)")
init_stat = init_df.n_opt_steps.describe().apply(lambda x: round(x, 1))
gen_stat = gen_df.n_opt_steps.describe().apply(lambda x: round(x, 1))
stats1 = pd.concat([init_stat, gen_stat], axis=1)
stats1.columns = ["NFC_MMFF", "NFC_R-DSM"]

print("Number of SCF Cycles (NSC)")
init_stat = init_df.n_scf_steps.describe().apply(lambda x: round(x, 1))
gen_stat = gen_df.n_scf_steps.describe().apply(lambda x: round(x, 1))
stats2 = pd.concat([init_stat, gen_stat], axis=1)
stats2.columns = ["NSC_MMFF", "NSC_R-DSM"]

print("SCF time (SCFt)")
init_stat = init_df.scf_time.describe().apply(lambda x: round(x, 1))
gen_stat = gen_df.scf_time.describe().apply(lambda x: round(x, 1))
stats3 = pd.concat([init_stat, gen_stat], axis=1)
stats3.columns = ["SCFt_MMFF", "SCFt_R-DSM"]

stats = pd.concat([stats1, stats2, stats3], axis=1)
print(stats)
stats.to_csv("optimization_cost_stats_case2.csv")

Number of Function Call (NFC)
Number of SCF Cycles (NSC)
SCF time (SCFt)
       NFC_MMFF  NFC_R-DSM  NSC_MMFF  NSC_R-DSM  SCFt_MMFF  SCFt_R-DSM
count     895.0      885.0     895.0      885.0      895.0       885.0
mean        9.0        4.1      94.4       43.0     1195.1       546.2
std         3.6        2.7      39.7       29.3      545.7       379.9
min         3.0        2.0      31.0       19.0      166.9        90.9
25%         7.0        3.0      66.0       29.0      845.5       353.0
50%         8.0        3.0      85.0       34.0     1100.7       462.3
75%        11.0        4.0     113.0       46.0     1431.0       615.2
max        31.0       31.0     332.0      306.0     4725.6      3998.4
