### Use the published autoqchem workflow to generate DFT features for the compounds in the dataset

NOTE: This notebook needs to be run with the python environment for autoqchem.

In [1]:
from autoqchem.molecule import molecule
from autoqchem.sge_manager import sge_manager
from autoqchem.draw_utils import draw
from autoqchem.db_functions import descriptors
from rdkit import Chem
import pandas as pd
import logging
# Emit INFO-level log records so that the autoqchem / paramiko progress messages
# (input generation, job submission, database upload) appear in the cell outputs.
logging.basicConfig(level=logging.INFO)

### Load the product smiles strings and create input files for the DFT calculations

In [2]:
# Load the previously saved list of reaction products.
# The first line of the CSV is used as the header row (header=0); per the preview
# below, the file holds one SMILES string per amide product.
df_products = pd.read_csv("./amide_smiles_products.csv",header=0)
# Show the frame as a quick sanity check of the loaded structures.
df_products

Unnamed: 0,0
0,Cc1ccc(NC(=O)c2cc3ccccc3s2)nc1
1,O=C(NCc1ccc(F)cc1F)c1ccco1
2,O=C(NCc1ccc(Cl)cc1)c1cccc(-c2ccccc2)c1
3,COc1ccc(CNC(=O)C(C)c2ccc(-c3ccccc3)c(F)c2)cc1
4,COc1ccc(NC(=O)c2cccc(-c3ccccc3)c2)cn1
...,...
627,COc1ccc(NC(=O)C2CCN(C(=O)OCc3ccccc3)CC2)cn1
628,CON(C)C(=O)c1c(C)cc(C)cc1C
629,Cc1ccc(S(=O)(=O)NC(=O)c2ccc3nc(C)ccc3c2)cc1
630,Cc1ccc(Cl)c(NC(=O)c2ccc3nccnc3c2)c1


In [4]:
# Take the first column of the products table as a pandas Series.
# NOTE(review): assumes this column holds the SMILES strings shown in the preview
# above (i.e. the CSV has no separate saved index column) — confirm.
product_smiles = df_products.iloc[:,0]

In [18]:
# Build one autoqchem molecule object per product SMILES string,
# requesting up to 8 conformers for each structure.
mols = [molecule(smiles, num_conf=8) for smiles in product_smiles]

In [11]:
# Check the compounds by drawing them; here only the first molecule is rendered.
# The output is an interactive widget with a 'confId' dropdown for switching
# between the generated conformers.
draw(mols[0].mol)

interactive(children=(Dropdown(description='confId', options=(0, 1, 2, 3, 4, 5, 6, 7), value=0), Output()), _d…

<function autoqchem.draw_utils._graph_conf(m, confId=0, energies=[])>

In [None]:
# Connect to UCLA's computation cluster (host hoffman2.idre.ucla.edu).
# 'XXXXXX' is presumably a redacted placeholder — replace it with your own
# cluster user name before running.
# NOTE(review): how connect() authenticates (password prompt, SSH key, ...) is not
# visible in this notebook — check the autoqchem documentation.
sm = sge_manager(user='XXXXXX', host='hoffman2.idre.ucla.edu')
sm.connect()

In [13]:
# Create the Gaussian jobs locally: one call per molecule, every molecule
# with the same functional and basis-set settings.
for mol in mols:
    sm.create_jobs_for_molecule(
        mol,
        theory="APFD",
        heavy_basis_set="def2tzvp",
        light_basis_set='def2svp',
        max_light_atomic_number=10,
    )

INFO:autoqchem.gaussian_input_generator:Generating Gaussian input files for 8 conformations.


### Manage the DFT jobs on the cluster

In [14]:
# Submit the locally created jobs to the cluster.
# The log output below reports, for every submitted job, its hash and the
# job_id assigned by the cluster scheduler.
sm.submit_jobs()

INFO:autoqchem.sge_manager:Submitting 8 jobs.
  0%|          | 0/8 [00:00<?, ?it/s]INFO:paramiko.transport.sftp:[chan 1] Opened sftp connection (server version 3)
INFO:autoqchem.sge_manager:Submitted job fc4f274ba8a00b2d9796b95f66e22126, job_id: 10075402.
 12%|█▎        | 1/8 [00:05<00:36,  5.17s/it]INFO:autoqchem.sge_manager:Submitted job 108e22efb88a852b6f11a6f0d8c5c794, job_id: 10075403.
 25%|██▌       | 2/8 [00:09<00:27,  4.52s/it]INFO:autoqchem.sge_manager:Submitted job be25ad7e2824967b3d365c4186e4b947, job_id: 10075404.
 38%|███▊      | 3/8 [00:11<00:17,  3.54s/it]INFO:autoqchem.sge_manager:Submitted job 500c35b8debce52e958d6cc5cf87ad09, job_id: 10075405.
 50%|█████     | 4/8 [00:15<00:15,  3.76s/it]INFO:autoqchem.sge_manager:Submitted job 09bc36231a2a52d4cc0578ecf17a4aa1, job_id: 10075407.
 62%|██████▎   | 5/8 [00:19<00:11,  3.92s/it]INFO:autoqchem.sge_manager:Submitted job 9cca9983cc6b50a7a77267d1b420bf63, job_id: 10075408.
 75%|███████▌  | 6/8 [00:24<00:08,  4.15s/it]INFO:auto

In [None]:
# Resubmit jobs that did not finish properly.
# Per the log output below, each affected input file is regenerated with the last
# checked geometry and a new wall_time (23:59:00) before it is submitted again.
sm.resubmit_incomplete_jobs()

INFO:autoqchem.sge_manager:Substituting last checked geometry in the new input file.
INFO:autoqchem.sge_manager:Substituting wall_time with new value: 23:59:00


C:\Users\Sven\AppData\Local\autoqchem\hoffman2\FMQLCKKYZGOAJI-UHFFFAOYSA-N/FMQLCKKYZGOAJI-UHFFFAOYSA-N_conf_1.log
C:\Users\Sven\AppData\Local\autoqchem\hoffman2\MEYYGMHUQCZVRH-UHFFFAOYSA-N/MEYYGMHUQCZVRH-UHFFFAOYSA-N_conf_1.log


INFO:autoqchem.sge_manager:Substituting last checked geometry in the new input file.
INFO:autoqchem.sge_manager:Substituting wall_time with new value: 23:59:00
INFO:autoqchem.sge_manager:Substituting last checked geometry in the new input file.
INFO:autoqchem.sge_manager:Substituting wall_time with new value: 23:59:00


C:\Users\Sven\AppData\Local\autoqchem\hoffman2\BTGSWNAPQOGQEH-UHFFFAOYSA-N/BTGSWNAPQOGQEH-UHFFFAOYSA-N_conf_5.log
C:\Users\Sven\AppData\Local\autoqchem\hoffman2\GJNKONHEQWFMHU-UHFFFAOYSA-N/GJNKONHEQWFMHU-UHFFFAOYSA-N_conf_4.log


INFO:autoqchem.sge_manager:Substituting last checked geometry in the new input file.
INFO:autoqchem.sge_manager:Substituting wall_time with new value: 23:59:00
INFO:autoqchem.sge_manager:Substituting last checked geometry in the new input file.
INFO:autoqchem.sge_manager:Substituting wall_time with new value: 23:59:00
INFO:autoqchem.sge_manager:Substituting last checked geometry in the new input file.
INFO:autoqchem.sge_manager:Substituting wall_time with new value: 23:59:00


C:\Users\Sven\AppData\Local\autoqchem\hoffman2\GRQZKNNMBDLUHY-UHFFFAOYSA-N/GRQZKNNMBDLUHY-UHFFFAOYSA-N_conf_4.log
C:\Users\Sven\AppData\Local\autoqchem\hoffman2\ZXCXWTFANXRDTI-UHFFFAOYSA-N/ZXCXWTFANXRDTI-UHFFFAOYSA-N_conf_7.log
C:\Users\Sven\AppData\Local\autoqchem\hoffman2\DJYNRZDVOSWMGJ-OAHLLOKOSA-N/DJYNRZDVOSWMGJ-OAHLLOKOSA-N_conf_5.log


INFO:autoqchem.sge_manager:Substituting last checked geometry in the new input file.
INFO:autoqchem.sge_manager:Substituting wall_time with new value: 23:59:00
INFO:autoqchem.sge_manager:Substituting last checked geometry in the new input file.
INFO:autoqchem.sge_manager:Substituting wall_time with new value: 23:59:00
INFO:autoqchem.sge_manager:Substituting last checked geometry in the new input file.
INFO:autoqchem.sge_manager:Substituting wall_time with new value: 23:59:00


C:\Users\Sven\AppData\Local\autoqchem\hoffman2\GUEFVTHWWKCQHQ-UHFFFAOYSA-N/GUEFVTHWWKCQHQ-UHFFFAOYSA-N_conf_6.log
C:\Users\Sven\AppData\Local\autoqchem\hoffman2\BWIBBBDHGLSOEA-UHFFFAOYSA-N/BWIBBBDHGLSOEA-UHFFFAOYSA-N_conf_2.log
C:\Users\Sven\AppData\Local\autoqchem\hoffman2\BWIBBBDHGLSOEA-UHFFFAOYSA-N/BWIBBBDHGLSOEA-UHFFFAOYSA-N_conf_4.log


INFO:autoqchem.sge_manager:Substituting last checked geometry in the new input file.
INFO:autoqchem.sge_manager:Substituting wall_time with new value: 23:59:00


C:\Users\Sven\AppData\Local\autoqchem\hoffman2\MLNIOBYWVOKRQT-UHFFFAOYSA-N/MLNIOBYWVOKRQT-UHFFFAOYSA-N_conf_2.log


INFO:autoqchem.sge_manager:Substituting last checked geometry in the new input file.
INFO:autoqchem.sge_manager:Substituting wall_time with new value: 23:59:00
INFO:autoqchem.sge_manager:Substituting last checked geometry in the new input file.
INFO:autoqchem.sge_manager:Substituting wall_time with new value: 23:59:00
INFO:autoqchem.sge_manager:Substituting last checked geometry in the new input file.
INFO:autoqchem.sge_manager:Substituting wall_time with new value: 23:59:00


C:\Users\Sven\AppData\Local\autoqchem\hoffman2\CDKCREBPFQDNRR-UHFFFAOYSA-N/CDKCREBPFQDNRR-UHFFFAOYSA-N_conf_0.log
C:\Users\Sven\AppData\Local\autoqchem\hoffman2\PQPNMKICXDDJNY-UHFFFAOYSA-N/PQPNMKICXDDJNY-UHFFFAOYSA-N_conf_5.log
C:\Users\Sven\AppData\Local\autoqchem\hoffman2\VTZOWXHKFAFIPM-UHFFFAOYSA-N/VTZOWXHKFAFIPM-UHFFFAOYSA-N_conf_3.log


INFO:autoqchem.sge_manager:Substituting last checked geometry in the new input file.
INFO:autoqchem.sge_manager:Substituting wall_time with new value: 23:59:00
INFO:autoqchem.sge_manager:Substituting last checked geometry in the new input file.
INFO:autoqchem.sge_manager:Substituting wall_time with new value: 23:59:00


C:\Users\Sven\AppData\Local\autoqchem\hoffman2\YTOUIWNZBHDNPD-LLVKDONJSA-N/YTOUIWNZBHDNPD-LLVKDONJSA-N_conf_3.log


  0%|          | 0/15 [00:00<?, ?it/s]INFO:autoqchem.sge_manager:Submitted job f54a4e3091e90689bb2a9aaf27d46ea6, job_id: 9731478.
  7%|▋         | 1/15 [00:06<01:27,  6.22s/it]INFO:autoqchem.sge_manager:Submitted job 97dd4fb0822697d14a487b6967868d29, job_id: 9731479.
 13%|█▎        | 2/15 [00:11<01:12,  5.57s/it]INFO:autoqchem.sge_manager:Submitted job 45b40cc44d2320fb2eb077ffd24bea3d, job_id: 9731480.
 20%|██        | 3/15 [00:16<01:04,  5.36s/it]INFO:autoqchem.sge_manager:Submitted job aaa0e4c55cb52b3098b8760138a851c9, job_id: 9731481.
 27%|██▋       | 4/15 [00:21<00:57,  5.20s/it]INFO:autoqchem.sge_manager:Submitted job cf2168f724fa68633a1d032bb5582a7c, job_id: 9731482.
 33%|███▎      | 5/15 [00:26<00:51,  5.15s/it]INFO:autoqchem.sge_manager:Submitted job e807b1c36b9670e0f8b023bbd7ce4253, job_id: 9731483.
 40%|████      | 6/15 [00:31<00:46,  5.21s/it]INFO:autoqchem.sge_manager:Submitted job c7385b8e7976ee7b3efbdfe76f1f22f7, job_id: 9731484.
 47%|████▋     | 7/15 [00:37<00:43,  5.39s

In [None]:
# Retrieve finished jobs from the cluster.
# NOTE(review): presumably this copies the Gaussian log files into the local
# autoqchem working directory — confirm in the autoqchem documentation.
sm.retrieve_jobs()

### Upload finished calculations to the database

In [5]:
# Upload data for finished compounds to the autoqchem database (autoqchem.org).
# The tag "SVR_Amide_products" is the one used further below to query the descriptors.
# Per the log output, duplicate conformers are removed before the upload.
sm.upload_done_molecules_to_db(tags=["SVR_Amide_products"])

INFO:autoqchem.sge_manager:There are 1 finished molecules ['O=C(Cc1ccc2c(c1)C(=O)c1ccccc1CO2)NCc1ccc2c(c1)OCO2'].
INFO:autoqchem.sge_manager:Molecule O=C(Cc1ccc2c(c1)C(=O)c1ccccc1CO2)NCc1ccc2c(c1)OCO2 has 1 / 8 duplicate conformers.
INFO:autoqchem.sge_manager:Removing 1 / 8 jobs and log files that contain duplicate conformers.
INFO:autoqchem.sge_manager:Uploaded descriptors to DB for smiles: O=C(Cc1ccc2c(c1)C(=O)c1ccccc1CO2)NCc1ccc2c(c1)OCO2, number of conformers: 7, DB molecule id 688da6ce8a568899ce7cca10.


### Get the descriptors from the autoqchem database

In [27]:
# Download the descriptors of all molecules tagged "SVR_Amide_products":
# the "global" preset plus the per-atom "substructure" preset for the atoms
# matched by the amide SMARTS pattern.
data = descriptors(
    tags=["SVR_Amide_products"],
    presets=["global", "substructure"],
    conf_option="boltzmann",
    solvent="None",
    functional="APFD",
    basis_set="def2svp",
    substructure="[C,c]C(=O)N",
)

In [None]:
# # Download the descriptors for the carboxylic acid substrates (database tag "SVR_Amide")
# data = descriptors(tags=["SVR_Amide"],presets=["global","substructure"],conf_option="boltzmann",solvent="None",
#                    functional="APFD",basis_set="def2svp",substructure="[C,c]C(=O)[OH]")

In [None]:
# # Download the descriptors for the amine substrates (database tag "SVR_Amide")
# data = descriptors(tags=["SVR_Amide"],presets=["global","substructure"],conf_option="boltzmann",solvent="None",
#                    functional="APFD",basis_set="def2svp",substructure="[NH2,NH]")

In [28]:
# Process the data so that it is in one dataframe.
# The processed frames are collected in `processed` instead of editing the frames
# inside `data` in place, so this cell can be re-run without downloading the
# descriptors again (an in-place version fails on the second run because the
# dropped columns are already gone).
label_dict={}
processed = {}
for key in data:
    frame = data[key]
    if key != "global":
        # atom descriptor dataframes are by default called atom1, atom2, etc. --> replace with the atom type and a running number (e. g. "C1" and "C2")
        # The atom type is read from the last column of the first row
        # (presumably the "labels" column that is dropped below — TODO confirm).
        atom_type = frame.iloc[0,-1]
        label_dict[atom_type] = label_dict.get(atom_type, 0) + 1
        label = atom_type+str(label_dict[atom_type])
        # drop() returns a new frame, so the original in `data` stays untouched.
        # Remove the "labels", "X", "Y", "Z" columns and prefix the remaining
        # columns with the atom label.
        frame = frame.drop(columns=["labels","X","Y","Z"])
        frame.columns = [f"{label}_{column}" for column in frame.columns]
    else:
        frame = frame.drop(columns=["converged","multiplicity"])
    processed[key] = frame

# Concatenate all frames column-wise. The dict keys become the outer level of a
# column MultiIndex; only the inner level (the actual column name) is kept.
df_combined = pd.concat(processed,axis=1)
df_combined.columns = [multi_column_index[1] for multi_column_index in df_combined.columns]

In [29]:
# Display the combined descriptor table (pandas truncates the view for large frames).
df_combined

Unnamed: 0_level_0,E,ES_root_dipole,ES_root_electronic_spatial_extent,ES_root_molar_volume,E_scf,E_thermal_correction,E_zpe,G,G_thermal_correction,H,...,N1_ES_root_NPA_valence,N1_Mulliken_charge,N1_NMR_anisotropy,N1_NMR_shift,N1_NPA_Rydberg,N1_NPA_charge,N1_NPA_core,N1_NPA_total,N1_NPA_valence,N1_VBur
can,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C#CC(C)(C)NC(=O)C(C)(C)c1ccccc1,-711.561754,2.102901,4174.842003,2383.995411,-711.881745,0.324263,-711.579248,-711.624665,0.261354,-711.560810,...,5.641119,-0.177225,78.125291,132.260424,0.013847,-0.68061,1.999101,7.68061,5.667662,0.719598
C#CC(C)(C)NC(=O)C(c1ccccc1)c1ccccc1,-863.724156,2.375130,6142.366701,1969.705369,-864.070613,0.351615,-863.743612,-863.793593,0.282179,-863.723212,...,5.646138,-0.192679,84.646935,125.449387,0.01348,-0.684047,1.999112,7.684047,5.671463,0.724371
C#CC(C)(C)NC(=O)C1c2ccccc2Oc2ccccc21,-937.659700,2.611506,6183.457968,2753.009551,-937.987955,0.334363,-937.678718,-937.727443,0.266620,-937.658756,...,5.655536,-0.178379,81.777255,127.478451,0.013358,-0.6787,1.99911,7.6787,5.666231,0.70663
C#CC(C)(C)NC(=O)Cc1cc(F)cc(F)c1,-831.309635,2.024240,4580.225811,2219.735501,-831.555615,0.250529,-831.326176,-831.372680,0.187483,-831.308691,...,5.63276,-0.190796,78.517319,127.391646,0.013103,-0.67981,1.999112,7.67981,5.667587,0.686256
C#CC(C)(C)NC(=O)Cc1ccc(C(F)(F)F)cc1,-969.661184,11.647363,6430.481618,2179.502976,-969.929414,0.273863,-969.679705,-969.730975,0.204072,-969.660240,...,5.603478,-0.190433,76.717854,124.844826,0.012994,-0.680447,1.999116,7.680447,5.668337,0.686546
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
O=C(Nc1ccc2nccnc2c1)c1ccnc(C(F)(F)F)c1,-1168.797112,6.153097,11901.917071,2009.322357,-1169.024977,0.244781,-1168.814690,-1168.863686,0.178208,-1168.796168,...,5.617142,-0.142481,96.275989,127.194416,0.009581,-0.630209,1.99909,7.630209,5.62154,0.585588
O=C(Nc1ccc2nccnc2c1)c1cnccn1,-848.277531,3.736413,7061.782897,1767.625480,-848.489788,0.224247,-848.291243,-848.333549,0.168228,-848.276587,...,5.619129,-0.131986,99.778871,130.85278,0.009633,-0.632238,1.999074,7.632238,5.623532,0.579745
O=C(Nc1ccc2scnc2c1)C(c1ccccc1)c1ccccc1,-1390.896350,5.807206,11211.332069,2571.102418,-1391.232147,0.342500,-1390.916207,-1390.968598,0.270251,-1390.895405,...,5.556365,-0.142921,96.610865,116.163513,0.009839,-0.651494,1.999054,7.651494,5.642601,0.653673
O=C(Nc1ccncc1[N+](=O)[O-])C1c2ccccc2Oc2ccccc21,-1194.818834,18.700269,8052.524893,2738.996416,-1195.122634,0.314274,-1194.838397,-1194.889760,0.243348,-1194.817890,...,5.662706,-0.138176,142.350227,125.127712,0.009671,-0.665896,1.999,7.665896,5.657224,0.686044


In [None]:
# # Save the dataset (descriptors of the amine substrates)
# df_combined.to_csv("./../1_Dataset_Generation/Data_For_Individual_Substrates/amide_dft_descr_amines.csv",index=True,header=True)

In [None]:
# # Save the dataset (descriptors of the carboxylic acid substrates)
# df_combined.to_csv("./../1_Dataset_Generation/Data_For_Individual_Substrates/amide_dft_descr_acids.csv",index=True,header=True)

In [None]:
# Save the dataset: write the product descriptors to CSV, keeping the index
# (named "can" in the preview above, holding the SMILES strings) and the header row.
df_combined.to_csv("./../1_Dataset_Generation/Data_For_Individual_Substrates/amide_dft_descr_prods.csv",index=True,header=True)

### The acid and amine substrates were featurized by the same workflow, and their descriptors were stored in the autoqchem database under the tag "SVR_Amide" (see the commented-out cells).