In [37]:
import lwreg
from lwreg import standardization_lib
import psycopg2
import numpy as np

import yaml
# Load the solvent table: a mapping keyed by solvent name whose entries are
# used later in the notebook (e.g. the "SMILES" field of each solvent).
# The context manager makes sure the YAML file handle is closed after parsing
# instead of being left open for the lifetime of the kernel.
with open("../Simulation/solvents.yml") as solvents_file:
    solvent_dict = yaml.load(solvents_file, Loader=yaml.FullLoader)[
        "solvent_mapping_dict"
    ]
from rdkit import Chem

In [38]:
# Read the database password from the local key file; surrounding whitespace
# (such as a trailing newline) is stripped.
with open("/home/kpaul/.lwreg_key", "r") as f:
    key = f.read()
key = key.strip()

# Start from lwreg's default configuration and override the entries needed
# for the PostgreSQL "solvent_forces" database.
config = lwreg.utils.defaultConfig()
config_overrides = {
    "standardization": standardization_lib.NoStandardization(),
    "dbname": "solvent_forces",
    "dbtype": "postgresql",
    "removeHs": 0,
    "registerConformers": True,
    "hashConformer": 0,
    # TODO(review): the meaning of numConformerDigits is an open question in
    # this notebook - confirm against the lwreg documentation.
    "numConformerDigits": 3,
    "host": "scotland",
    "user": "kpaul_lwreg",
    "password": key,
}
for option_name, option_value in config_overrides.items():
    config[option_name] = option_value

### Get all external test set molecules from water

In [39]:
solvent = "tip3p"

# Molregnos of all conformers flagged as 'test' in this solvent's
# explicit_calculations table.
# NOTE: the schema name is spliced in unquoted on purpose. PostgreSQL folds
# unquoted identifiers to lower case, so quoting it would change which schema
# is addressed for mixed-case solvent names.
test_molregno_query = (
    "select molregno from public.conformers "
    "inner join solvent_%s.explicit_calculations "
    "on public.conformers.conf_id=solvent_%s.explicit_calculations.conf_id "
    "where solvent_%s.explicit_calculations.usage_flag='test'"
) % (solvent, solvent, solvent)

cn = lwreg.utils.connect(config)  # Connection to the database
try:
    curs = cn.cursor()
    curs.execute(test_molregno_query)
    df = curs.fetchall()
    cn.commit()
finally:
    # Always release the connection, even if the query fails.
    cn.close()

# np.unique already returns the sorted unique values, so no extra sort is needed.
testmolregnos = np.unique(np.array([m[0] for m in df]))
testmolregnos.shape

(1000,)

### Get test-set molregnos for every solvent

In [53]:
# Maps each solvent name to the sorted, unique molregnos of its 'test' conformers.
solvent_mol_regno_dict = {}

# NOTE: the schema name is spliced in unquoted on purpose. PostgreSQL folds
# unquoted identifiers to lower case, so quoting it would change which schema
# is addressed for mixed-case solvent names.
test_molregno_template = (
    "select molregno from public.conformers "
    "inner join solvent_%s.explicit_calculations "
    "on public.conformers.conf_id=solvent_%s.explicit_calculations.conf_id "
    "where solvent_%s.explicit_calculations.usage_flag='test'"
)

# One connection is shared by all solvents and closed afterwards, instead of
# opening a new connection per solvent and never closing any of them.
cn = lwreg.utils.connect(config)  # Connection to the database
try:
    curs = cn.cursor()
    for solvent in solvent_dict.keys():
        curs.execute(test_molregno_template % (solvent, solvent, solvent))
        df = curs.fetchall()
        cn.commit()

        # np.unique already returns the sorted unique values.
        testmolregnos = np.unique(np.array([m[0] for m in df]))
        solvent_mol_regno_dict[solvent] = testmolregnos
finally:
    cn.close()

#### Get water (tip3p) test-set molregnos that are missing from each solvent's test set

In [54]:
# Reference set: every molregno in the water (tip3p) test set.
set_water = set(solvent_mol_regno_dict["tip3p"])

# For each solvent, list the water test-set molregnos that the solvent's own
# test set does not contain.
solvent_missing_molregno_dict = {
    solvent: list(set_water - set(solvent_mol_regno_dict[solvent]))
    for solvent in solvent_dict.keys()
}

In [55]:
# Report, per solvent, how many water test-set molecules are still missing.
for solvent in solvent_dict:
    n_missing = len(solvent_missing_molregno_dict[solvent])
    print(solvent, n_missing)

tip3p 0
Chloroform 0
Methanol 0
DMSO 0
DMPU 0
Diethylether 0
Ethanol 0
DMF 0
DCM 0
Toluol 0
Benzol 0
Hexan 0
acetonitrile 0
acetone 0
aceticacid 0
14dioxane 0
nitrobenzol 0
HMPA 0
MTBE 0
IPA 0
Hexafluorobenzene 0
pyridine 0
THF 0
Ethylacetate 0
Sulfolane 0
nitromethane 0
Butylformate 0
NMP 0
Octanol 0
cyclohexane 0
glycerin 0
carbontetrachloride 0
DME 0
2Nitropropane 0
Trifluorotoluene 0
hexafluroacetone 0
Propionitrile 0
Benzonitrile 0
oxylol 0


In [43]:
import psycopg2

#### Get smiles for missing molregnos

In [44]:
# Maps each solvent with missing test-set molecules to the canonical SMILES
# of those molecules. Solvents with nothing missing get no entry.
missing_smiles = {}
for solvent in solvent_dict.keys():
    missing_ids = solvent_missing_molregno_dict[solvent]
    if len(missing_ids) == 0:
        # Nothing to look up; this also avoids an invalid empty "in ()" clause.
        continue
    cn = psycopg2.connect(
        database=config.get("dbname", None),
        host=config.get("host", None),
        user=config.get("user", None),
        password=config.get("password", None),
    )
    try:
        curs = cn.cursor()  # Command line cursor in postgresql
        # The ids are converted to plain Python ints before being passed as a
        # tuple for the IN clause (presumably because psycopg2 cannot adapt
        # NumPy integer types - confirm).
        curs.execute(
            "select canonical_smiles from hashes where molregno in %s",
            (tuple(int(molregno) for molregno in missing_ids),),
        )
        df = curs.fetchall()
        cn.commit()
    finally:
        # Release the per-solvent connection even if the query fails.
        cn.close()

    missing_smiles[solvent] = [m[0] for m in df]

    # Stored as an object array, so NumPy writes the file via pickle; readers
    # on current NumPy versions need np.load(..., allow_pickle=True).
    np.save("../Simulation/generation_smiles/missing_testset_smiles/missing_smiles_r2_%s.npy" % solvent, np.array(missing_smiles[solvent],dtype=object))

#### Write input for Euler calculations

In [10]:
# Build the shell script text that submits, per solvent, a CPU array job that
# generates starting coordinates and a dependent GPU array job.
# NOTE(review): the second job reads generation_smiles/canonical_smiles.npy
# while the first reads the missing-SMILES file; the r2 cell uses the
# missing-SMILES file for both - confirm which is intended.
# NOTE(review): --array=0-{num_jobs} with num_jobs = n // 50 yields one extra
# array task when n is an exact multiple of 50 - confirm the script tolerates it.
generation_chunks = ["# Setup additional external test set\n"]
for s, (solvent, smiles_for_solvent) in enumerate(missing_smiles.items()):
    num_missing_smiles = len(smiles_for_solvent)
    if not num_missing_smiles:
        continue
    num_jobs = num_missing_smiles // 50
    solvent_smiles = solvent_dict[solvent]["SMILES"]
    solvent_random_seed = 161311 + s

    generation_chunks.append(f"""
mkdir -p /cluster/project/igc/kpaul/Additional_external_test_set/{solvent}_starting_coordinates
mkdir -p additional_external_caches/CACHE_{solvent}_external_test_set/
job{s}=$(sbatch --parsable --array=0-{num_jobs} -n 4 --time=24:00:00 --tmp=1000 --mem-per-cpu=4000 -o Calculated_data/slurm_log/run_generation_external_test_additional_{solvent}_starting_%A_%a.out --wrap='python run_training_set_generation.py -i $SLURM_ARRAY_TASK_ID -n 50 -s "{solvent_smiles}" -r {solvent_random_seed} -f generation_smiles/missing_testset_smiles/missing_smiles_{solvent}.npy --startonly --starttrajloc /cluster/project/igc/kpaul/Additional_external_test_set/{solvent}_starting_coordinates/ --cachelocation additional_external_caches/CACHE_{solvent}_external_test_set/ --oneonly')
sbatch --dependency=afterany:$job{s} --array=0-{num_jobs} -n 4 --time=24:00:00 --tmp=5000 --mem-per-cpu=4000 --gpus=1 -o Calculated_data/Externalt_test_set_additional_{solvent}/slurm_log/run_generation_{solvent}_%A_%a.out --wrap='python run_training_set_generation.py -i $SLURM_ARRAY_TASK_ID -n 50 -s "{solvent_smiles}" -r {solvent_random_seed} -f generation_smiles/canonical_smiles.npy -nf 1 --starttrajloc /cluster/project/igc/kpaul/Additional_external_test_set/{solvent}_starting_coordinates/ -sl Calculated_data/Externalt_test_set_additional_{solvent}/ --cachelocation additional_external_caches/CACHE_{solvent}_external_test_set/ --oneonly'
""")
submission_string = "".join(generation_chunks)


In [45]:
# Build the shell script text for the "r2" round: per solvent, a CPU array job
# that generates starting coordinates, followed by a dependent GPU array job
# that reads the same missing-SMILES file.
# The seed offset `s` is the solvent's position in missing_smiles, so the
# deposition step must enumerate missing_smiles in the same order.
# NOTE(review): --array=0-{num_jobs} with num_jobs = n // 50 yields one extra
# array task when n is an exact multiple of 50 - confirm the script tolerates it.
submission_string = "# Setup additional external test set\n"
for s,solvent in enumerate(missing_smiles.keys()):
    num_missing_smiles = len(missing_smiles[solvent])
    if num_missing_smiles == 0:
        continue
    num_jobs = num_missing_smiles // 50
    solvent_smiles = solvent_dict[solvent]["SMILES"]
    solvent_random_seed = 161311 + s

    # Slurm does not create missing directories for -o targets, so the
    # slurm_log directories used below are created up front.
    submission_string += f"""
mkdir -p /cluster/project/igc/kpaul/Additional_external_test_set_r2/{solvent}_starting_coordinates
mkdir -p additional_external_caches/CACHE_{solvent}_external_test_set_r2/
mkdir -p Calculated_data/Externalt_test_set_additional_r2_{solvent}/
mkdir -p Calculated_data/Externalt_test_set_additional_r2_{solvent}/slurm_log/
mkdir -p Calculated_data/slurm_log/
job{s}=$(sbatch --parsable --array=0-{num_jobs} -n 4 --time=24:00:00 --tmp=1000 --mem-per-cpu=4000 -o Calculated_data/slurm_log/run_generation_external_test_additional_r2_{solvent}_starting_%A_%a.out --wrap='python run_training_set_generation.py -i $SLURM_ARRAY_TASK_ID -n 50 -s "{solvent_smiles}" -r {solvent_random_seed} -f generation_smiles/missing_testset_smiles/missing_smiles_r2_{solvent}.npy --startonly --starttrajloc /cluster/project/igc/kpaul/Additional_external_test_set_r2/{solvent}_starting_coordinates/ --cachelocation additional_external_caches/CACHE_{solvent}_external_test_set_r2/ --oneonly')
sbatch --dependency=afterany:$job{s} --array=0-{num_jobs} -n 4 --time=24:00:00 --tmp=5000 --mem-per-cpu=4000 --gpus=1 -o Calculated_data/Externalt_test_set_additional_r2_{solvent}/slurm_log/run_generation_{solvent}_%A_%a.out --wrap='python run_training_set_generation.py -i $SLURM_ARRAY_TASK_ID -n 50 -s "{solvent_smiles}" -r {solvent_random_seed} -f generation_smiles/missing_testset_smiles/missing_smiles_r2_{solvent}.npy -nf 1 --starttrajloc /cluster/project/igc/kpaul/Additional_external_test_set_r2/{solvent}_starting_coordinates/ -sl Calculated_data/Externalt_test_set_additional_r2_{solvent}/ --cachelocation additional_external_caches/CACHE_{solvent}_external_test_set_r2/ --oneonly'
"""

#### Deposit results in Database

In [30]:
# Build one run_deposit_database.py command per solvent that deposits the
# generated HDF5 results as 'test' data. The seed offset follows the solvent's
# position in missing_smiles, as in the generation cell.
deposit_commands = []
for s, solvent in enumerate(missing_smiles):
    num_jobs = len(missing_smiles[solvent]) // 50
    solvent_smiles = solvent_dict[solvent]["SMILES"]
    solvent_random_seed = 161311 + s

    deposit_commands.append(f"""python run_deposit_database.py -s 0 -e {num_jobs+1} -t test -h5 "Calculated_data/Externalt_test_set_additional_{solvent}/{solvent_smiles}_small_molecules_n_50_id_XXX_seed_{solvent_random_seed}.hdf5" -nc 1 -d solvent_{solvent.lower()}
""")
submission_string = "# Setup deposition external test set\n" + "".join(deposit_commands)

In [48]:
# Build one run_deposit_database.py command per solvent for the "r2" results.
# The seed offset follows the solvent's position in missing_smiles, as in the
# r2 generation cell.
r2_deposit_parts = ["# Setup deposition external test set\n"]
for s, (solvent, smiles_for_solvent) in enumerate(missing_smiles.items()):
    num_missing_smiles = len(smiles_for_solvent)
    num_jobs = num_missing_smiles // 50
    solvent_smiles = solvent_dict[solvent]["SMILES"]
    solvent_random_seed = 161311 + s

    r2_deposit_parts.append(f"""python run_deposit_database.py -s 0 -e {num_jobs+1} -t test -h5 "Calculated_data/Externalt_test_set_additional_r2_{solvent}/{solvent_smiles}_small_molecules_n_50_id_XXX_seed_{solvent_random_seed}.hdf5" -nc 1 -d solvent_{solvent.lower()}
""")
submission_string = "".join(r2_deposit_parts)

In [33]:
# Write whatever submission_string currently holds to the shell script;
# an existing file is overwritten.
script_path = "../Simulation/run_external_test_set_database.sh"
with open(script_path, "w") as script_file:
    script_file.write(submission_string)

In [202]:
# Sanity check: show how many SMILES were stored in the Methanol file.
methanol_missing_smiles = np.load("../Simulation/generation_smiles/missing_testset_smiles/missing_smiles_Methanol.npy")
methanol_missing_smiles.shape

(38,)