In this notebook, we will generate datasets using DFT substrate descriptors for all the Kolmogorov-Smirnov-unique reaction conditions.

In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join('..','..','..','..')))
from scripts import ScopeBO
import pandas as pd
import numpy as np
import seaborn as sns
from rdkit import Chem
import matplotlib.pyplot as plt
import matplotlib as mpl
import umap
from sklearn.preprocessing import scale
import random

# Notebook-wide matplotlib font sizes, applied once so every figure shares them.
_font_sizes = {
    "axes.titlesize": 20,         # subplot titles
    "axes.labelsize": 16,         # x/y axis labels
    "figure.titlesize": 24,       # figure-level suptitle
    "xtick.labelsize": 14,        # x tick labels
    "ytick.labelsize": 14,        # y tick labels
    "legend.fontsize": 14,        # legend entries
    "legend.title_fontsize": 14,  # legend titles
}
plt.rcParams.update(_font_sizes)

## We generate datasets using different descriptors (DFT, Morfeus, RDKit, Mordred). Let's start with the DFT descriptors.

Import the descriptors for both substrate classes and generate the fully combinatorial scope.

In [2]:
# Descriptor file for each substrate class, mapped to the label passed for that class
# (the column names in the resulting table carry these labels as prefixes).
substrate_descriptor_files = {
    "Data_For_Individual_Substrates/amide_dft_descr_amines.csv": "amine",
    "Data_For_Individual_Substrates/amide_dft_descr_acids.csv": "acid",
}

# Build the full amine x acid reaction space without feature preprocessing;
# the result is also written to the CSV named below.
df_combinatorial = ScopeBO.create_reaction_space(
    reactants=substrate_descriptor_files,
    feature_processing=False,
    filename="amide_fully_combinatorial_reaction_space.csv",
)
df_combinatorial

Generation of reaction space completed!


Unnamed: 0,amine_E,amine_ES_root_dipole,amine_ES_root_electronic_spatial_extent,amine_ES_root_molar_volume,amine_E_scf,amine_E_thermal_correction,amine_E_zpe,amine_G,amine_G_thermal_correction,amine_H,...,acid_O2_ES_root_NPA_valence,acid_O2_Mulliken_charge,acid_O2_NMR_anisotropy,acid_O2_NMR_shift,acid_O2_NPA_Rydberg,acid_O2_NPA_charge,acid_O2_NPA_core,acid_O2_NPA_total,acid_O2_NPA_valence,acid_O2_VBur
C#CC(C)(C)N.CC(C(=O)O)c1ccc(-c2ccccc2)c(F)c1,-250.099329,1.6240,605.4829,756.437,-250.235574,0.137539,-250.106642,-250.136756,0.100111,-250.098385,...,6.694903,-0.194704,172.571034,146.000636,0.010347,-0.703152,1.999672,8.703152,6.693127,0.400812
C#CC(C)(C)N.CC(C(=O)O)c1ccc(CBr)cc1,-250.099329,1.6240,605.4829,756.437,-250.235574,0.137539,-250.106642,-250.136756,0.100111,-250.098385,...,6.691840,-0.193912,174.585581,145.259768,0.010297,-0.701242,1.999671,8.701242,6.691268,0.392830
C#CC(C)(C)N.CC(C(=O)O)c1ccc(CC2CCCC2=O)cc1,-250.099329,1.6240,605.4829,756.437,-250.235574,0.137539,-250.106642,-250.136756,0.100111,-250.098385,...,6.691327,-0.194431,175.233783,145.282640,0.010280,-0.701339,1.999671,8.701339,6.691385,0.391862
C#CC(C)(C)N.CC(C)(C(=O)O)c1ccccc1,-250.099329,1.6240,605.4829,756.437,-250.235574,0.137539,-250.106642,-250.136756,0.100111,-250.098385,...,6.694158,-0.190902,164.611372,148.722432,0.010322,-0.704410,1.999671,8.704410,6.694416,0.439691
C#CC(C)(C)N.CC(C)(C)OC(=O)N1CC2(CC2)CC1C(=O)O,-250.099329,1.6240,605.4829,756.437,-250.235574,0.137539,-250.106642,-250.136756,0.100111,-250.098385,...,6.717655,-0.195209,169.439075,150.302632,0.010466,-0.706136,1.999677,8.706136,6.695997,0.450556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Nc1nccc(Cl)n1.O=C(O)c1ccco1,-778.633755,3.4408,1118.1743,843.856,-778.718761,0.091551,-778.640527,-778.672002,0.053304,-778.632811,...,6.612959,-0.195277,164.221845,159.385032,0.010084,-0.698188,1.999700,8.698188,6.688405,0.372388
Nc1nccc(Cl)n1.O=C(O)c1ccnc(C(F)(F)F)c1,-778.633755,3.4408,1118.1743,843.856,-778.718761,0.091551,-778.640527,-778.672002,0.053304,-778.632811,...,6.707898,-0.188382,161.610093,156.612592,0.009685,-0.699302,1.999690,8.699302,6.689922,0.387680
Nc1nccc(Cl)n1.O=C(O)c1ccncc1Cl,-778.633755,3.4408,1118.1743,843.856,-778.718761,0.091551,-778.640527,-778.672002,0.053304,-778.632811,...,6.700756,-0.178875,156.291415,148.425773,0.009916,-0.693026,1.999687,8.693026,6.683423,0.410526
Nc1nccc(Cl)n1.O=C(O)c1cnccn1,-778.633755,3.4408,1118.1743,843.856,-778.718761,0.091551,-778.640527,-778.672002,0.053304,-778.632811,...,6.683605,-0.167554,152.879709,152.776629,0.010193,-0.680458,1.999683,8.680458,6.670572,0.373191


Load the list of the KS-unique conditions.

In [3]:
# The file is read with an explicit column name ("conds") instead of a header row.
ks_unique_conds = pd.read_csv(
    "./../KS-unique_Datasets/list_of_KS-unique_conds.csv",
    names=["conds"],
)
cond_list = ks_unique_conds["conds"].to_list()
cond_list

[3, 5, 6, 9, 10, 12, 13, 20, 24, 26, 28, 30, 49, 63, 64, 87, 92]

Load the experimental dataset

In [16]:
# Read the complete experimental dataset (first CSV column is used as the index).
data = pd.read_csv(
    "./../../amide_proc_experimental_data.csv",
    index_col=0,
    header=0,
)

# Repeated runs of the same reaction share all of the columns below;
# collapse each such group to a single row holding the mean yield.
match_columns = [
    "condition_id",
    "sub_1_smiles",
    "sub_2_smiles",
    "product_smiles",
    "Activation_ID",
    "Additive_ID",
    "Base_ID",
    "solvent_id",
    "condition_SMILES",
]
grouped_yields = data.groupby(match_columns, as_index=False)["yield"]
data = grouped_yields.mean()

# Substrate-combination key: "<sub_1_smiles>.<sub_2_smiles>"
data["Combination_Smiles"] = data["sub_1_smiles"] + "." + data["sub_2_smiles"]

n_experiments = len(data)
n_products = len(data["product_smiles"].unique())
print(f"The experimental dataset has {n_experiments} unique experiments and "
      f"{n_products} different products.")

data.head(3)

The experimental dataset has 46713 unique experiments and 632 different products.


Unnamed: 0,condition_id,sub_1_smiles,sub_2_smiles,product_smiles,Activation_ID,Additive_ID,Base_ID,solvent_id,condition_SMILES,yield,Combination_Smiles
0,1,COc1ccc(N)cn1,Cc1cc(C(=O)O)cc(Cl)n1,COc1ccc(NC(=O)c2cc(C)nc(Cl)c2)cn1,R1,A1,B7,S1,.CCN(C(C)C)C(C)C.CN(C)C=O,0.001679,COc1ccc(N)cn1.Cc1cc(C(=O)O)cc(Cl)n1
1,2,C#CC(C)(C)N,CC(C)(C(=O)O)c1ccccc1,C#CC(C)(C)NC(=O)C(C)(C)c1ccccc1,R13,A1,B7,S1,CN(C)C(=[N+](C)C)F.F[P-](F)(F)(F)(F)F.CCN(C(C)...,0.008784,C#CC(C)(C)N.CC(C)(C(=O)O)c1ccccc1
2,2,C#CC(C)(C)N,CCCCOc1ccc(C(=O)O)cc1,C#CC(C)(C)NC(=O)c1ccc(OCCCC)cc1,R13,A1,B7,S1,CN(C)C(=[N+](C)C)F.F[P-](F)(F)(F)(F)F.CCN(C(C)...,0.003263,C#CC(C)(C)N.CCCCOc1ccc(C(=O)O)cc1


In [25]:
# Build one dataset per KS-unique condition: the preprocessed descriptor features of
# every substrate combination run under that condition, plus the measured yield.
# `cond_id` is used as the loop variable so the builtin `id` is not shadowed.
for cond_id in cond_list:
    print(f"Working on condition {cond_id}.")

    # get the substrate combinations used by this condition
    cond_data = data.loc[data["condition_id"] == cond_id]
    smiles_list = cond_data["Combination_Smiles"].to_list()

    # Filter the reaction space to the experimentally evaluated reactions
    df_filtered = df_combinatorial.loc[
        df_combinatorial.index.isin(smiles_list)]
    print(f"There are {len(df_filtered)} reactions in this dataset.")

    # The isin() filter keeps only combinations present in the descriptor table, so
    # experimental combinations without descriptors would vanish silently - report them.
    n_without_descriptors = len(set(smiles_list) - set(df_filtered.index))
    if n_without_descriptors:
        print(f"WARNING: {n_without_descriptors} experimental combinations have no "
              "descriptors and are not part of this dataset!")

    # create_reaction_space() (function for preprocessing) needs a csv as input,
    # so the filtered table is written to a scratch file first
    df_filtered.to_csv("preprocessing_dummy.csv", index=True, header=True)
    # run the function to preprocess (the same scratch file is passed as output filename)
    df_processed = ScopeBO().create_reaction_space(reactants=["preprocessing_dummy.csv"],
                                                   feature_processing=True,
                                                   filename="preprocessing_dummy.csv")
    print(f"There are {len(df_processed.columns)} features in the processed dataset.")

    # assign the yields: look them up by substrate combination without mutating
    # the filtered slice of `data` in place
    yield_by_combination = cond_data.set_index("Combination_Smiles")["yield"]
    df_processed["yield"] = df_processed.index.map(yield_by_combination)

    # sanity check for samples with unassigned yields
    if df_processed["yield"].isna().any():
        print("WARNING: There are samples without an assigned yield!")

    df_processed.to_csv(f"./../KS-unique_Datasets/dset_cond{cond_id}.csv", index=True, header=True)
    print(f"Generated the dataset for condition {cond_id}.\n")

Working on condition 3.
There are 520 reactions in this dataset.
Now doing feature preprocessing.
The following features were removed: ['amine_charge', 'acid_charge', 'amine_E_scf', 'amine_E_zpe', 'amine_G', 'amine_G_thermal_correction', 'amine_H', 'amine_H_thermal_correction', 'amine_electronic_spatial_extent', 'amine_number_of_atoms', 'amine_zero_point_correction', 'amine_N1_ES_root_NPA_total', 'amine_N1_ES_root_NPA_valence', 'amine_N1_NPA_total', 'amine_N1_NPA_valence', 'acid_E_scf', 'acid_E_zpe', 'acid_G', 'acid_G_thermal_correction', 'acid_H', 'acid_H_thermal_correction', 'acid_electronic_spatial_extent', 'acid_number_of_atoms', 'acid_zero_point_correction', 'acid_C1_ES_root_NPA_total', 'acid_C1_ES_root_NPA_valence', 'acid_C1_Mulliken_charge', 'acid_C1_NPA_charge', 'acid_C1_NPA_core', 'acid_C1_NPA_total', 'acid_C1_NPA_valence', 'acid_C2_ES_root_NPA_total', 'acid_C2_ES_root_NPA_valence', 'acid_C2_NPA_core', 'acid_C2_NPA_total', 'acid_C2_NPA_valence', 'acid_O1_APT_charge', 'acid_O1_