In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# Directories (resolved against the current working directory) that are
# scanned for matched CSV files.
COMPARE_SETS = (
    "tmp_compas",
    "tmp_qmugs",
)

# Directory that receives one comparison CSV per (file, property) pair.
TMP_DIR = Path("tmp_compare").absolute()

# Conversion factors, named once so every table below uses the exact same
# value instead of a repeated literal.
# NOTE(review): the unit interpretation is inferred from the reference column
# names (e.g. "Etot_eV") and from the magnitude of the numbers - confirm
# against the documentation of the reference datasets.
HARTREE_TO_EV = 27.2113961317875
EV_TO_HARTREE = 1 / HARTREE_TO_EV
# Presumably converts cal/(mol*K) to Hartree/K (627.509474 kcal/mol per
# Hartree, times 1000) - TODO confirm the unit of GFN2_ENTROPY_TOT.
QMUGS_ENTROPY_MULTIPLIER = 1 / 627509.474


def _pair(x_column, y_column, y_multiplier=1, x_multiplier=1):
    """Return the spec for one property comparison.

    The result has the shape
    ``{"x": {"column", "multiplier"}, "y": {"column", "multiplier"}}``;
    each column is multiplied by its multiplier before the two are compared.
    """
    return {
        "x": {"column": x_column, "multiplier": x_multiplier},
        "y": {"column": y_column, "multiplier": y_multiplier},
    }


def _compas_pairs():
    """Return a fresh pair table shared by the COMPAS-1X and COMPAS-3X sets.

    A new dict is built on every call so the two datasets never alias the
    same object.
    """
    return {
        "single_point_energy": _pair("E", "Etot_eV", EV_TO_HARTREE),
        "homo": _pair("homo", "HOMO_eV"),
        "lumo": _pair("lumo", "LUMO_eV"),
        "homo_lumo_gap": _pair("gap", "GAP_eV"),
        "total_dipole": _pair("Mu", "Dipmom_Debye"),
        "zero_point_energy": _pair("zpe", "ZPE_eV", EV_TO_HARTREE),
    }


# dataset label -> property name -> column pair to compare.
COMPARE_PAIRS = {
    "QMugs": {
        "single_point_energy": _pair("E", "GFN2_TOTAL_ENERGY"),
        "total_enthalpy": _pair("H", "GFN2_TOTAL_ENTHALPY"),
        "total_entropy": _pair("S", "GFN2_ENTROPY_TOT", QMUGS_ENTROPY_MULTIPLIER),
        "total_gibbs": _pair("G", "GFN2_TOTAL_FREE_ENERGY"),
        "homo": _pair("homo", "GFN2_HOMO_ENERGY", HARTREE_TO_EV),
        "lumo": _pair("lumo", "GFN2_LUMO_ENERGY", HARTREE_TO_EV),
        "homo_lumo_gap": _pair("gap", "GFN2_HOMO_LUMO_GAP", HARTREE_TO_EV),
        "total_dipole": _pair("Mu", "GFN2_DIPOLE_TOT"),
        "polarizability": _pair("alpha", "GFN2_POLARIZABILITY_MOLECULAR"),
    },
    "COMPAS-3X": _compas_pairs(),
    "COMPAS-1X": _compas_pairs(),
}

# dataset label -> values of the "key" column whose rows are dropped before
# any comparison is made.
IGNORE_KEYS = {
    "COMPAS-1X": {
        "SOIXGBGHVSXNKH-UHFFFAOYSA-N",
        "RJXPMUHZQGSTIM-UHFFFAOYSA-N",
        "VEPXTABLWURNHE-UHFFFAOYSA-N",
    }
}

In [None]:
# Maps each matched-CSV file name to the dataset label used as the key in
# COMPARE_PAIRS and IGNORE_KEYS; CSV files not listed here are skipped.
PUBLIC_NAMES = {
    "compas-1x.csv_matched.csv": "COMPAS-1X",
    "compas-3x.csv_matched.csv": "COMPAS-3X",
    "summary.csv_matched.csv": "QMugs",
}


def _build_pair_table(data, pair, source=""):
    """Build the comparison table for one property pair.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a "key" column plus the two columns named in ``pair``.
    pair : dict
        ``{"x": {"column", "multiplier"}, "y": {"column", "multiplier"}}``.
    source : str, optional
        Label (e.g. the file name) included in the error message.

    Returns
    -------
    pd.DataFrame
        Columns "key", the scaled x column, the scaled y column and "diff",
        the absolute difference between the two scaled columns.

    Raises
    ------
    KeyError
        If any required column is absent from ``data``.
    """
    x_spec = pair["x"]
    y_spec = pair["y"]
    x_col = x_spec["column"]
    y_col = y_spec["column"]

    # Fail with a message that names the file and the columns rather than
    # with the bare KeyError pandas raises on the first missing lookup.
    missing = [col for col in ("key", x_col, y_col) if col not in data.columns]
    if missing:
        raise KeyError(f"{source or 'data'} is missing column(s): {missing}")

    pair_data = pd.DataFrame(
        {
            "key": data["key"],
            x_col: data[x_col] * x_spec["multiplier"],
            y_col: data[y_col] * y_spec["multiplier"],
        }
    )
    pair_data["diff"] = np.abs(pair_data[x_col] - pair_data[y_col])
    return pair_data


# exist_ok avoids the exists()/mkdir() race and makes the cell safe to re-run.
TMP_DIR.mkdir(parents=True, exist_ok=True)

for compare_set in COMPARE_SETS:
    print(f"Processing set {compare_set}")

    set_dir = Path(compare_set).absolute()

    # glob() order depends on the filesystem; sort so runs are reproducible.
    for file in sorted(set_dir.glob("*.csv")):
        filename = file.name
        if filename not in PUBLIC_NAMES:
            continue
        filename_public = PUBLIC_NAMES[filename]
        print(f"Processing file {file.name}")
        data = pd.read_csv(file)

        # Drop the rows explicitly excluded for this dataset, if any.
        if filename_public in IGNORE_KEYS:
            data = data[~data["key"].isin(IGNORE_KEYS[filename_public])]

        for pair_name, pair in COMPARE_PAIRS[filename_public].items():
            pair_data = _build_pair_table(data, pair, source=filename)
            # Output name keeps the full input file name (including ".csv")
            # as its prefix; anything reading these files relies on that.
            pair_data.to_csv(TMP_DIR / f"{filename}_{pair_name}.csv", index=False)
            print(f"Saved {filename}_{pair_name}.csv")