In [None]:
from validation_utils import get_rmsd_between_two_molecules, read_mol
from pathlib import Path
from tqdm import tqdm
import numpy as np
from matplotlib import pyplot as plt
from rdkit.Chem.Descriptors3D import PBF
from rdkit import Chem

# Root directory that is searched for structures (relative to the working directory).
PACHQA_PATH = Path("./data")
# One entry per pubchem_conf.sdf found anywhere beneath PACHQA_PATH. The cells
# below use each of these files as the reference and look for the optimized
# geometries (*.xyz) in the same directory.
structures = [*PACHQA_PATH.glob("**/pubchem_conf.sdf")]

### Calculating RMSD and PBF scores

In [None]:
# Accumulators, one value appended per entry of `structures`:
#   rmsds_*  - RMSD between two consecutive geometry files
#   pbf_*    - PBF score of the hydrogen-stripped geometry at each stage
rmsds_mmff94_xtb2 = []
rmsds_xtb2_r2scan = []
pbf_mmff94 = []
pbf_xtb2 = []
pbf_r2scan = []

for sdf_path in tqdm(structures):
    # The geometry files sit next to the reference SDF.
    conf_dir = sdf_path.parent
    mmff94_xyz = conf_dir / "mmff94.xyz"
    r2scan_xyz = conf_dir / "r2scan.xyz"
    xtb2_xyz = conf_dir / "xtb2.xtbopt.xyz"

    # RMSD of each step: MMFF94 -> xTB2, then xTB2 -> r2SCAN. The SDF is
    # passed as the third (reference) argument in both calls.
    rmsd_mmff94_vs_xtb2 = get_rmsd_between_two_molecules(
        mmff94_xyz, xtb2_xyz, sdf_path, mirror_molecule=False
    )
    rmsds_mmff94_xtb2.append(rmsd_mmff94_vs_xtb2)
    rmsd_xtb2_vs_r2scan = get_rmsd_between_two_molecules(
        xtb2_xyz, r2scan_xyz, sdf_path, mirror_molecule=False
    )
    rmsds_xtb2_r2scan.append(rmsd_xtb2_vs_r2scan)

    # Load every geometry against the reference and strip all hydrogens
    # (without sanitization) before scoring.
    stripped_mols = [
        Chem.RemoveAllHs(read_mol(xyz_path, sdf_path), sanitize=False)
        for xyz_path in (mmff94_xyz, xtb2_xyz, r2scan_xyz)
    ]
    for score_list, stripped in zip((pbf_mmff94, pbf_xtb2, pbf_r2scan), stripped_mols):
        score_list.append(PBF(stripped))
    

### Plotting the distribution of the symmetry-corrected heavy-atom RMSD values between the structures after different steps of geometry optimization

In [None]:
# Histogram of the MMFF94 -> xTB2 RMSDs, 0.01-wide bins up to 1.2.
# The lower bin edge must come from THIS list: plt.hist leaves out any value
# that falls outside the bin range, so taking the minimum of the other RMSD
# list would silently omit the smallest MMFF94 -> xTB2 values from the plot.
plt.figure(figsize=(10, 5))
plt.hist(rmsds_mmff94_xtb2, bins=np.arange(min(rmsds_mmff94_xtb2), 1.2, 0.01))
plt.savefig("mmff94_xtb2.png")

# Histogram of the xTB2 -> r2SCAN RMSDs with the same bin width and upper edge.
plt.figure(figsize=(10, 5))
plt.hist(rmsds_xtb2_r2scan, bins=np.arange(min(rmsds_xtb2_r2scan), 1.2, 0.01))
plt.savefig("xtb2_r2scan.png")

### Plotting the distribution of the plane of best fit (PBF) scores after different steps of geometry optimization

In [None]:
# All three PBF histograms share the same 0.1-wide bin edges starting at 0,
# so the plots are drawn on identical bins.
pbf_bin_edges = np.arange(0, 1.75, 0.1)

# One figure per optimization stage, each saved under its own file name.
for stage_scores, png_name in (
    (pbf_mmff94, "pbf_mmff94.png"),
    (pbf_xtb2, "pbf_xtb2.png"),
    (pbf_r2scan, "pbf_r2scan.png"),
):
    plt.figure(figsize=(5, 4))
    plt.hist(stage_scores, bins=pbf_bin_edges)
    plt.savefig(png_name)

### The distribution of PBF scores in subsets after r2SCAN-3c geometry optimization

In [None]:
# Maps each subset directory (the grandparent of a pubchem_conf.sdf file) to
# the list of PBF scores of the r2scan.xyz geometries found inside it.
pbf_r2scan_by_subset = {}

for compare in tqdm(structures):
    # Path layout used here: <subset>/<structure dir>/pubchem_conf.sdf
    subset = compare.parent.parent
    ref = compare
    r2scan = compare.parent / "r2scan.xyz"
    mol_r2scan = read_mol(r2scan, ref)
    # Strip all hydrogens (without sanitization) before scoring.
    mol_r2scan = Chem.RemoveAllHs(mol_r2scan, sanitize=False)
    # Deliberately not named `pbf_r2scan`: that global holds the list of
    # scores plotted by the PBF histogram cell, and rebinding it to a single
    # float here would break that cell whenever it is re-run afterwards.
    subset_pbf_score = PBF(mol_r2scan)
    pbf_r2scan_by_subset.setdefault(subset, []).append(subset_pbf_score)

In [None]:
# One PBF histogram per subset, titled with and saved under the subset
# directory's name.
for subset, subset_scores in pbf_r2scan_by_subset.items():
    plt.figure(figsize=(5, 4))
    # 0.1-wide bins that start at this subset's smallest score.
    subset_bin_edges = np.arange(min(subset_scores), 1.75, 0.1)
    plt.hist(subset_scores, bins=subset_bin_edges)
    plt.title(subset.name)
    # The "perCl" subset gets fixed y-axis ticks.
    if subset.name == "perCl":
        plt.yticks([0, 5, 10, 15, 20])
    plt.savefig(f"pbf_r2scan_{subset.name}.png")