In [1]:
import os
import shutil
from pathlib import Path

import pandas as pd

from rdkit import Chem
from utils import get_inchikey_from_smiles
from utils import get_pachqa_structures
from utils import get_rmsd_between_two_molecules

In [2]:
# All locations are given relative to the current working directory and are
# turned into absolute paths once, here.
_WORKING_DIR = Path.cwd()

# Local checkout of https://gitlab.com/porannegroup/compas
COMPAS_SOURCE_FOLDER = _WORKING_DIR / "compas"

# Folder that contains props.csv
PACHQA_SOURCE_FOLDER = _WORKING_DIR / "../property_parsing/"

# Folder that contains the PACHQA structures
PACHQA_STRUCTURES_SOURCE_FOLDER = _WORKING_DIR / "data"

# Output files will be written here
TMP_DIR = _WORKING_DIR / "tmp_compas/"

In [3]:
# Create the output folder (and any missing parents). `exist_ok=True` makes the
# cell safe to re-run and avoids the check-then-create race; unlike the plain
# existence check it also fails loudly if a regular file sits at that path.
TMP_DIR.mkdir(parents=True, exist_ok=True)

## reading PACHQA data

In [4]:
# Load the PACHQA property table and collect the distinct values of its `key` column.
pachqa_props_path = PACHQA_SOURCE_FOLDER / "props.csv"
pachqa_data = pd.read_csv(pachqa_props_path)
pachqa_inchikeys = set(pachqa_data["key"])

## Reading COMPAS and merging it with PACHQA by InChIKey

In [5]:
# COMPAS csv file name -> public dataset label.
# The processing loops skip every csv file whose name is not a key of this mapping.
PUBLIC_NAMES = dict(
    [
        ("compas-1x.csv", "COMPAS-1X"),
        ("compas-2x.csv", "COMPAS-2X"),
        ("compas-3x.csv", "COMPAS-3X"),
    ]
)

In [None]:
# For every public COMPAS table: derive a `key` column from the SMILES strings,
# inner-join it with the PACHQA table on that column, and store the joined rows
# in TMP_DIR as "<csv name>_matched.csv".
for filepath in COMPAS_SOURCE_FOLDER.glob("**/*.csv"):
    dataset_file = filepath.name
    if dataset_file in PUBLIC_NAMES:
        print(f"Starting to merge {dataset_file} with PACHQA results")
        compas_data = pd.read_csv(filepath)
        compas_inchikeys = [get_inchikey_from_smiles(smiles) for smiles in compas_data.smiles]
        compas_data["key"] = compas_inchikeys
        # Columns present in both tables are disambiguated with the two suffixes.
        matched_data = pachqa_data.merge(compas_data, on="key", suffixes=("_pachqa", "_compas"))
        matched_rows = matched_data.shape[0]
        unique_keys = len(set(matched_data.key))
        print(f"It is {matched_rows} matched rows by inchikey and {unique_keys} unique inchikeys")
        matched_path = Path(TMP_DIR, dataset_file + "_matched.csv")
        matched_data.to_csv(matched_path, index=False)

## calculate rmsd between pairs

### extract structures
First, manually unpack the corresponding archives. Then rename each folder with structures to the name of its csv dataset plus the suffix `_structures` (for example `compas-1x.csv_structures`) and keep it in the same folder as that csv file.

For COMPAS-2X, unpack `compas-2x.sdf` and save this file as `compas-2x.csv_structures` (here it is a single SDF file, not a folder).

In [None]:
# Column of each matched table that holds the COMPAS structure name.
KEY_STRUCTURE_COLUMN_MAPPED = {
    "compas-1x.csv": "molecule",
    "compas-2x.csv": "name",
    "compas-3x.csv": "molecule",
}

# How the structures of each dataset are provided: a folder with one file per
# structure, or a single multi-molecule SDF file.
KEY_UNPACK_FORMAT_MAPPED = {
    "compas-1x.csv": "folder",
    "compas-2x.csv": "sdf_file",
    "compas-3x.csv": "folder",
}

# File extensions tried, in this order, when looking up a structure in a folder.
SUPPORTED_TYPES = ("xyz", "sdf", )

# Copy the structure of every matched COMPAS molecule into
# TMP_DIR/"<csv name>_structures", renamed to "<inchikey>.<extension>".
for filepath in COMPAS_SOURCE_FOLDER.glob("**/*.csv"):
    if filepath.name not in PUBLIC_NAMES:
        continue
    matched_data = pd.read_csv(Path(TMP_DIR, filepath.name + "_matched.csv"))
    name = KEY_STRUCTURE_COLUMN_MAPPED[filepath.name]
    base_path = filepath.parent
    # The manually unpacked structures live next to the csv file.
    structures_path = Path(base_path, filepath.name + "_structures")
    assert name in matched_data.columns

    extract_dest = Path(TMP_DIR, filepath.name + "_structures")
    # Only create the destination when there is something to extract. Testing
    # `.empty` checks the number of rows instead of the truthiness of the names.
    if not matched_data.empty:
        extract_dest.mkdir(parents=True, exist_ok=True)

    unpack_format = KEY_UNPACK_FORMAT_MAPPED[filepath.name]
    if unpack_format == "folder":
        for structure_name, structure_inchikey in matched_data[[name, "key"]].values:
            # The first supported extension that exists on disk wins.
            for structure_type in SUPPORTED_TYPES:
                structure_path = Path(structures_path, structure_name + "." + structure_type)
                if structure_path.exists():
                    shutil.copyfile(structure_path, Path(extract_dest, structure_inchikey + "." + structure_type))
                    break
            else:
                exception = Exception("Cant find structure.")
                exception.add_note(f"Dataset path: {filepath}")
                exception.add_note(f"Structure name: {structure_name}")
                raise exception
    elif unpack_format == "sdf_file":
        mol_name_inchikey_mapped = dict(zip(matched_data[name], matched_data["key"]))
        # str() keeps this working with RDKit wrappers that only accept string
        # file names rather than pathlib.Path objects.
        # NOTE(review): the supplier is used with its default options; RDKit's
        # default `removeHs=True` drops explicit hydrogens - confirm this is
        # what the RMSD step expects.
        reader = Chem.SDMolSupplier(str(structures_path))
        extracted_inchikeys = set()
        for mol in reader:
            # The supplier yields None for records it cannot parse, and GetProp
            # raises KeyError on records without a "name" tag: skip both.
            if mol is None or not mol.HasProp("name"):
                continue
            mol_name = mol.GetProp("name")
            if mol_name not in mol_name_inchikey_mapped:
                continue
            structure_inchikey = mol_name_inchikey_mapped[mol_name]
            with Chem.SDWriter(str(Path(extract_dest, structure_inchikey + ".sdf"))) as writer:
                writer.write(mol)
            extracted_inchikeys.add(structure_inchikey)

        # Mirror the folder branch: fail here, with context, when a matched
        # molecule got no structure instead of leaving a gap in the output.
        missing_inchikeys = set(mol_name_inchikey_mapped.values()) - extracted_inchikeys
        if missing_inchikeys:
            exception = Exception("Cant find structure.")
            exception.add_note(f"Dataset path: {filepath}")
            exception.add_note(f"Structures file: {structures_path}")
            exception.add_note(f"Inchikeys without structure: {sorted(map(str, missing_inchikeys))}")
            raise exception

                


### calculate rmsd

In [None]:
INCHIKEY_STRUCTURES_MAPPED = get_pachqa_structures(PACHQA_STRUCTURES_SOURCE_FOLDER)
SUPPORTED_TYPES = ("xyz", "sdf", )

# One row per PACHQA geometry compared against COMPAS:
# (output column, prefix of the dumped pair name, geometry file name).
# The row order fixes both the call order and the order of the new columns.
PACHQA_GEOMETRIES = (
    ("rmsd_xtb2", "xtb2_", "xtb2.xtbopt.xyz"),
    ("rmsd_r2scan", "r2scan_", "r2scan.xyz"),
    ("rmsd_mmff94", "mmff94_", "mmff94.xyz"),
)

# For each matched table, compare every COMPAS structure with the PACHQA
# geometries of the same inchikey and append one rmsd column per geometry.
for filepath in TMP_DIR.glob("*_matched.csv"):
    matched_data = pd.read_csv(filepath)
    if filepath.name.count("_matched.csv") > 1:
        raise Exception("Something wrong. There are more than one `_matched.csv` fragments in filepath.")
    dataset_file = filepath.name.replace("_matched.csv", "")
    compas_structures_path = Path(filepath.parent, dataset_file + "_structures")
    dump_pairs_path = Path(filepath.parent, dataset_file + "_pairs")

    if not dump_pairs_path.exists():
        dump_pairs_path.mkdir()

    rmsd_columns = {column: [] for column, _, _ in PACHQA_GEOMETRIES}

    for inchikey in matched_data.key:
        pachqa_structures_path = INCHIKEY_STRUCTURES_MAPPED[inchikey]

        # Pick the first supported COMPAS structure file that exists on disk.
        candidate_paths = (
            Path(compas_structures_path, inchikey + "." + structure_type)
            for structure_type in SUPPORTED_TYPES
        )
        compas_structure_path = next((path for path in candidate_paths if path.exists()), None)
        if compas_structure_path is None:
            raise Exception(f"There are no structure for inchikey {inchikey} in {filepath.name} data")

        # The same reference file is passed as `ref` to every comparison.
        pachqa_mol_path_sdf_ref = Path(pachqa_structures_path, "pubchem_conf.sdf")

        for column, dump_prefix, geometry_file in PACHQA_GEOMETRIES:
            rmsd_value = get_rmsd_between_two_molecules(
                file1=Path(pachqa_structures_path, geometry_file),
                file2=compas_structure_path,
                ref=pachqa_mol_path_sdf_ref,
                dump=True,
                dump_name=Path(dump_pairs_path, dump_prefix + inchikey),
            )
            rmsd_columns[column].append(rmsd_value)

    for column, values in rmsd_columns.items():
        matched_data[column] = values
    matched_data.to_csv(Path(filepath.parent, filepath.name + "_rmsd.csv"), index=False)

