In [1]:
import pandas as pd
from rdkit.Chem import AllChem
import numpy as np
import xgboost as xgb

from pymatgen.core import Molecule
from rdkit.Chem import rdChemReactions
from rdkit import Chem
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


from skopt.searchcv import BayesSearchCV
from skopt.space import Real, Integer

In [2]:
file_train = "../../dataset/mg_dataset/rapter_clean/train_full_05132023.json"
file_test = "../../dataset/mg_dataset/rapter_clean/test_full_05132023.json"
train_df = pd.read_json(file_train)
test_df = pd.read_json(file_test)

file_train_mg = "../../dataset/mg_dataset/rapter_clean/train_mg_05132023.json"
file_test_mg = "../../dataset/mg_dataset/rapter_clean/test_mg_05132023.json"
train_df_mg = pd.read_json(file_train_mg)
test_df_mg = pd.read_json(file_test_mg)

file_train_hydro = "../../dataset/mg_dataset/rapter_clean/train_hydro_05132023.json"
file_test_hydro = "../../dataset/mg_dataset/rapter_clean/test_hydro_05132023.json"
train_df_hydro = pd.read_json(file_train_hydro)
test_df_hydro = pd.read_json(file_test_hydro)

In [3]:
def get_all_barriers(df): 
    barriers = []
    for i in range(len(df)):
        forward = df.iloc[i]["transition_state_free_energy"] - df.iloc[i]["reactant_free_energy"]
        backward = df.iloc[i]["transition_state_free_energy"] - df.iloc[i]["product_free_energy"]
        if forward > 0 and backward > 0:
            barriers.append(forward)
            barriers.append(backward)
        #barriers.append(forward)
        #barriers.append(backward)
    #print(len(barriers))
    return np.array(barriers)

def get_fingerprints_from_df(df):
    failed = 0
    labels, reaction_fps = [], []
    
    for ind, row in df.iterrows():
        combined_products_graph = row["product_molecule_graph"]
        combined_reactants_graph = row["reactant_molecule_graph"]
        pmg_prod = Molecule.from_dict(combined_products_graph["molecule"])
        pmg_react = Molecule.from_dict(combined_reactants_graph["molecule"])
        
        label = row["transition_state_free_energy"] - row["reactant_free_energy"]
        label_rev = row["transition_state_free_energy"] - row["product_free_energy"]
        reactants_xyz = pmg_react.to(fmt="sdf")
        product_xyz = pmg_prod.to(fmt="sdf")
        mol_reactants = Chem.MolFromMolBlock(
            reactants_xyz, removeHs=False, sanitize=True
        )
        mol_products = Chem.MolFromMolBlock(product_xyz, removeHs=False, sanitize=True)


        array_temp = np.zeros((1, 1024))

        try:
            if type(mol_reactants) != None:
                reactant_fps = [
                    AllChem.GetMorganFingerprintAsBitVect(
                        mol_reactants, 2, nBits=1024
                    )
                ]

            if type(mol_products) != None:
                product_fps = [
                    AllChem.GetMorganFingerprintAsBitVect(
                        mol_products, 2, nBits=1024
                    )
                ]


            for i in reactant_fps:
                array_temp += i
            for i in product_fps:
                array_temp += i

            if label > 0 and label_rev > 0:
                reaction_fps.append(array_temp.reshape(-1))
                reaction_fps.append(array_temp.reshape(-1) * -1)
                labels.append(label)
                labels.append(label_rev)
                
        except:
            failed += 1

    reactant_fps = np.array(reaction_fps)
    labels = np.array(labels)
    print("failed: ", failed)
    return reactant_fps, labels


# Median, Mean Benchmarks


In [4]:
# Mean/ Median Benchmarks
print("\nFull Dataset")
print("Mean/ Median Benchmarks")
barriers_train = get_all_barriers(train_df)
barriers_test = get_all_barriers(test_df)
mean, median = np.mean(barriers_train), np.median(barriers_train)
mean_array = np.full(len(barriers_test), mean)
median_array = np.full(len(barriers_test), median)
print("mean: ", mean, "median: ", median)
print("MAE: ", np.mean(np.abs(barriers_test - mean_array)))
print("MSE: ", np.mean((barriers_test - mean_array)**2))
print("R2: ", 1 - np.sum((barriers_test - mean_array)**2)/np.sum((barriers_test - np.mean(barriers_test))**2))

print("\nMg Dataset")
print("Mean/ Median Benchmarks")
barriers_train_mg = get_all_barriers(train_df_mg)
barriers_test_mg = get_all_barriers(test_df_mg)
mean, median = np.mean(barriers_train_mg), np.median(barriers_train_mg)
mean_array = np.full(len(barriers_test_mg), mean)
median_array = np.full(len(barriers_test_mg), median)
print("mean: ", mean, "median: ", median)
print("MAE: ", np.mean(np.abs(barriers_test_mg - mean_array)))
print("MSE: ", np.mean((barriers_test_mg - mean_array)**2))
print("R2: ", 1 - np.sum((barriers_test_mg - mean_array)**2)/np.sum((barriers_test_mg - np.mean(barriers_test_mg))**2))

print("\nHydro Dataset")
print("Mean/ Median Benchmarks")
barriers_train_hydro = get_all_barriers(train_df_hydro)
barriers_test_hydro = get_all_barriers(test_df_hydro)
mean, median = np.mean(barriers_train_hydro), np.median(barriers_train_hydro)
mean_array = np.full(len(barriers_test_hydro), mean)
median_array = np.full(len(barriers_test_hydro), median)
print("mean: ", mean, "median: ", median)
print("MAE: ", np.mean(np.abs(barriers_test_hydro - mean_array)))
print("MSE: ", np.mean((barriers_test_hydro - mean_array)**2))
print("R2: ", 1 - np.sum((barriers_test_hydro - mean_array)**2)/np.sum((barriers_test_hydro - np.mean(barriers_test_hydro))**2))




Full Dataset
Mean/ Median Benchmarks
mean:  1.233492749389003 median:  0.8435893880996446
MAE:  1.0495153997654096
MSE:  1.5611562296814667
R2:  -3.2349482580018574e-05

Mg Dataset
Mean/ Median Benchmarks
mean:  1.0278281035825187 median:  0.35367518849943735
MAE:  1.0232714121911815
MSE:  1.652331140755717
R2:  -0.00014186589947939865

Hydro Dataset
Mean/ Median Benchmarks
mean:  2.039573829678398 median:  2.0784274534498763
MAE:  0.48992427032593766
MSE:  0.4097343645414167
R2:  -8.011703322585717e-05


# XGBoost


In [5]:
#failed:  341
#failed:  105
#failed:  68
#failed:  21
#failed:  273
#failed:  84
rxn_fps_train, labels_train = get_fingerprints_from_df(train_df)
rxn_fps_test, labels_test = get_fingerprints_from_df(test_df)
rxn_fps_train_mg, labels_train_mg = get_fingerprints_from_df(train_df_mg)
rxn_fps_test_mg, labels_test_mg = get_fingerprints_from_df(test_df_mg)
rxn_fps_train_hydro, labels_train_hydro= get_fingerprints_from_df(train_df_hydro)
rxn_fps_test_hydro, labels_test_hydro = get_fingerprints_from_df(test_df_hydro)

[09:31:34] Explicit valence for atom # 4 N, 4, is greater than permitted
[09:31:34] Explicit valence for atom # 5 N, 4, is greater than permitted
*** Open Babel Error  in TetStereoToWedgeHash
  Failed to set stereochemistry as unable to find an available bond
[09:31:34] Explicit valence for atom # 5 N, 4, is greater than permitted
[09:31:34] Explicit valence for atom # 2 N, 4, is greater than permitted
[09:31:34] Explicit valence for atom # 2 N, 4, is greater than permitted
[09:31:34] Explicit valence for atom # 5 N, 4, is greater than permitted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders

[09:31:34] Explicit valence for atom # 6 N, 4, is greater than permitted
[09:31:34] Explicit valence for atom # 4 C, 5, is greater than permitted
*** Open Babel Error  in TetStereoToWedgeHash
  Failed to set stereochemistry as unable to find an available bond
*** Open Babel Error  in TetStereoToWedgeHash
  Failed to set stereochemistry as unable to find an available bond
[09:31:3

failed:  341


[09:31:50] Explicit valence for atom # 6 N, 4, is greater than permitted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders

[09:31:50] Explicit valence for atom # 2 N, 4, is greater than permitted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders

  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders

[09:31:50] Explicit valence for atom # 0 N, 4, is greater than permitted
[09:31:50] Explicit valence for atom # 6 N, 4, is greater than permitted
[09:31:50] Explicit valence for atom # 0 N, 4, is greater than permitted
*** Open Babel Error  in TetStereoToWedgeHash
  Failed to set stereochemistry as unable to find an available bond
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders

[09:31:51] Explicit valence for atom # 3 N, 4, is greater than permitted
[09:31:51] Explicit valence for atom # 3 N, 4, is greater than permitted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders

*** Open Babel Error  in TetStereoToWedgeHash

failed:  105


[09:31:55] Explicit valence for atom # 4 N, 4, is greater than permitted
[09:31:55] Explicit valence for atom # 5 N, 4, is greater than permitted
*** Open Babel Error  in TetStereoToWedgeHash
  Failed to set stereochemistry as unable to find an available bond
[09:31:55] Explicit valence for atom # 5 N, 4, is greater than permitted
[09:31:55] Explicit valence for atom # 2 N, 4, is greater than permitted
[09:31:55] Explicit valence for atom # 2 N, 4, is greater than permitted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders

[09:31:55] Explicit valence for atom # 5 N, 4, is greater than permitted
[09:31:55] Explicit valence for atom # 6 N, 4, is greater than permitted
[09:31:55] Explicit valence for atom # 4 C, 5, is greater than permitted
*** Open Babel Error  in TetStereoToWedgeHash
  Failed to set stereochemistry as unable to find an available bond
*** Open Babel Error  in TetStereoToWedgeHash
  Failed to set stereochemistry as unable to find an available bond
[09:31:5

failed:  68


*** Open Babel Error  in TetStereoToWedgeHash
  Failed to set stereochemistry as unable to find an available bond
[09:32:07] Explicit valence for atom # 6 N, 4, is greater than permitted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders

[09:32:07] Explicit valence for atom # 2 N, 4, is greater than permitted
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders

  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders

[09:32:07] Explicit valence for atom # 0 N, 4, is greater than permitted
[09:32:07] Explicit valence for atom # 6 N, 4, is greater than permitted
[09:32:08] Explicit valence for atom # 0 N, 4, is greater than permitted
*** Open Babel Error  in TetStereoToWedgeHash
  Failed to set stereochemistry as unable to find an available bond
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders

[09:32:08] Explicit valence for atom # 3 N, 4, is greater than permitted
[09:32:08] Explicit valence for atom # 3 N, 4, is greater than permitt

failed:  21


[09:32:11] Explicit valence for atom # 7 N, 4, is greater than permitted
[09:32:11] Explicit valence for atom # 3 N, 4, is greater than permitted
[09:32:11] Explicit valence for atom # 1 N, 4, is greater than permitted
[09:32:11] Explicit valence for atom # 2 N, 4, is greater than permitted
[09:32:11] Explicit valence for atom # 6 N, 4, is greater than permitted
[09:32:11] Explicit valence for atom # 7 N, 4, is greater than permitted
[09:32:11] Explicit valence for atom # 16 N, 4, is greater than permitted
[09:32:11] Explicit valence for atom # 0 N, 4, is greater than permitted
*** Open Babel Error  in TetStereoToWedgeHash
  Failed to set stereochemistry as unable to find an available bond
[09:32:11] Explicit valence for atom # 2 N, 4, is greater than permitted
[09:32:11] Explicit valence for atom # 4 N, 4, is greater than permitted
[09:32:11] Explicit valence for atom # 4 N, 4, is greater than permitted
[09:32:11] Explicit valence for atom # 5 N, 4, is greater than permitted
[09:32:11

failed:  273


[09:32:15] Explicit valence for atom # 4 N, 4, is greater than permitted
[09:32:15] Explicit valence for atom # 5 N, 4, is greater than permitted
[09:32:15] Explicit valence for atom # 1 N, 4, is greater than permitted
[09:32:15] Explicit valence for atom # 8 N, 4, is greater than permitted
[09:32:15] Explicit valence for atom # 8 N, 4, is greater than permitted
[09:32:15] Explicit valence for atom # 1 N, 4, is greater than permitted
[09:32:15] Explicit valence for atom # 7 N, 4, is greater than permitted
[09:32:15] Explicit valence for atom # 5 N, 4, is greater than permitted
[09:32:15] Explicit valence for atom # 2 N, 4, is greater than permitted
[09:32:15] Explicit valence for atom # 5 N, 4, is greater than permitted
[09:32:15] Explicit valence for atom # 4 N, 4, is greater than permitted
[09:32:15] Explicit valence for atom # 0 N, 4, is greater than permitted
[09:32:15] Explicit valence for atom # 5 N, 4, is greater than permitted
[09:32:15] Explicit valence for atom # 1 N, 4, is g

failed:  84


[09:32:15] Explicit valence for atom # 6 N, 4, is greater than permitted
[09:32:15] Explicit valence for atom # 8 N, 4, is greater than permitted
[09:32:15] Explicit valence for atom # 0 N, 4, is greater than permitted
[09:32:15] Explicit valence for atom # 7 N, 4, is greater than permitted
[09:32:15] Explicit valence for atom # 7 N, 4, is greater than permitted


In [6]:
mean, std = np.mean(labels_train), np.std(labels_train)
print(mean, std)
labels_train = (labels_train - mean) / std
labels_test = (labels_test - mean) / std

mean_mg, std_mg = np.mean(labels_train_mg), np.std(labels_train_mg)
print(mean_mg, std_mg)
labels_train_mg = (labels_train_mg - mean_mg) / std_mg
labels_test_mg = (labels_test_mg - mean_mg) / std_mg

mean_hydro, std_hydro = np.mean(labels_train_hydro), np.std(labels_train_hydro)
print(mean_hydro, std_hydro)
labels_train_hydro = (labels_train_hydro - mean_hydro) / std_hydro
labels_test_hydro = (labels_test_hydro - mean_hydro) / std_hydro

1.2260294410013282 1.2355208170491463
1.0205740223738984 1.2557467353670098
2.1019083829695977 0.6038074126342948


In [7]:
# Use "hist" for training the model.
reg = xgb.XGBRegressor(tree_method="hist", device="cuda")
reg.fit(rxn_fps_train, labels_train)

print(reg.score(rxn_fps_train, labels_train))
print(reg.score(rxn_fps_test, labels_test))

# get performance metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

y_pred = reg.predict(rxn_fps_test)
print("R2: ", r2_score(labels_test * std, y_pred * std))
print("MSE: ", mean_squared_error(labels_test * std, y_pred * std) ** 0.5)
print("MAE: ", mean_absolute_error(labels_test * std, y_pred * std))

Parameters: { "device" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


0.7887414895562983
0.6363562658467921
R2:  0.636356267112024
MSE:  0.7507226656644798
MAE:  0.44029398871223885


In [10]:

xgb_temp = xgb.XGBRegressor()
reg = BayesSearchCV(
    xgb_temp,
    {
        "colsample_bytree": Real(0.5, 0.99),
        "max_depth": Integer(5, 25),
        "lambda": Real(0, 0.25),
        "learning_rate": Real(0.1, 0.25),
        "alpha": Real(0, 0.2),
        "eta": Real(0, 0.1),
        "gamma": Real(0, 0.1),
        "n_estimators": Integer(50, 1000),
        "objective": ["reg:squarederror"],
        "tree_method": ["gpu_hist"],
    },
    n_iter=20,
    verbose=4,
    cv=3,
)
reg.fit(rxn_fps_train, labels_train)
y_pred = reg.predict(rxn_fps_test)

print("MSE: ", mean_squared_error(labels_test * std, y_pred * std))
print("R2: ", r2_score(labels_test * std, y_pred * std))
print("MAE: ", mean_absolute_error(labels_test * std, y_pred * std))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3] END alpha=0.1002811171723653, colsample_bytree=0.9712481284046772, eta=0.016425641543294737, gamma=0.06624432686625746, lambda=0.20032728690943713, learning_rate=0.15913003037480755, max_depth=14, n_estimators=358, objective=reg:squarederror, tree_method=gpu_hist;, score=0.628 total time=  10.0s
[CV 2/3] END alpha=0.1002811171723653, colsample_bytree=0.9712481284046772, eta=0.016425641543294737, gamma=0.06624432686625746, lambda=0.20032728690943713, learning_rate=0.15913003037480755, max_depth=14, n_estimators=358, objective=reg:squarederror, tree_method=gpu_hist;, score=0.612 total time=  10.0s
[CV 3/3] END alpha=0.1002811171723653, colsample_bytree=0.9712481284046772, eta=0.016425641543294737, gamma=0.06624432686625746, lambda=0.20032728690943713, learning_rate=0.15913003037480755, max_depth=14, n_estimators=358, objective=reg:squarederror, tree_method=gpu_hist;, score=-0.007 total time=  10.1s
Fitting 3 folds for ea

In [None]:
#MSE:  0.4820283885520108
#RMSE:  0.694282643
#R2:  0.6889790331002481
#MAE:  0.340998683465289

In [11]:
xgb_temp = xgb.XGBRegressor()
reg = BayesSearchCV(
    xgb_temp,
    {
        "colsample_bytree": Real(0.5, 0.99),
        "max_depth": Integer(5, 25),
        "lambda": Real(0, 0.25),
        "learning_rate": Real(0.1, 0.25),
        "alpha": Real(0, 0.2),
        "eta": Real(0, 0.1),
        "gamma": Real(0, 0.1),
        "n_estimators": Integer(50, 1000),
        "objective": ["reg:squarederror"],
        "tree_method": ["gpu_hist"],
    },
    n_iter=20,
    verbose=4,
    cv=3,
)
reg.fit(rxn_fps_train_mg, labels_train_mg)
y_pred = reg.predict(rxn_fps_test_mg)
print("MSE: ", mean_squared_error(labels_test_mg * std_mg, y_pred * std_mg))
print("R2: ", r2_score(labels_test_mg * std_mg, y_pred * std_mg))
print("MAE: ", mean_absolute_error(labels_test_mg * std_mg, y_pred * std_mg))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3] END alpha=0.022450115475468296, colsample_bytree=0.9245381353147815, eta=0.04797240646193848, gamma=0.07806839119535619, lambda=0.13406972737810138, learning_rate=0.2368939516109716, max_depth=12, n_estimators=266, objective=reg:squarederror, tree_method=gpu_hist;, score=0.633 total time=   5.5s
[CV 2/3] END alpha=0.022450115475468296, colsample_bytree=0.9245381353147815, eta=0.04797240646193848, gamma=0.07806839119535619, lambda=0.13406972737810138, learning_rate=0.2368939516109716, max_depth=12, n_estimators=266, objective=reg:squarederror, tree_method=gpu_hist;, score=0.616 total time=   5.5s
[CV 3/3] END alpha=0.022450115475468296, colsample_bytree=0.9245381353147815, eta=0.04797240646193848, gamma=0.07806839119535619, lambda=0.13406972737810138, learning_rate=0.2368939516109716, max_depth=12, n_estimators=266, objective=reg:squarederror, tree_method=gpu_hist;, score=0.610 total time=   5.4s
Fitting 3 folds for eac

In [None]:
#MSE:  0.5492101757679171
#RMSE: 0.741
#R2:  0.6593315325702007
#MAE:  0.3413843617195234

In [12]:
xgb_temp = xgb.XGBRegressor()
reg = BayesSearchCV(
    xgb_temp,
    {
        "colsample_bytree": Real(0.5, 0.99),
        "max_depth": Integer(5, 25),
        "lambda": Real(0, 0.25),
        "learning_rate": Real(0.1, 0.25),
        "alpha": Real(0, 0.2),
        "eta": Real(0, 0.1),
        "gamma": Real(0, 0.1),
        "n_estimators": Integer(50, 1000),
        "objective": ["reg:squarederror"],
        "tree_method": ["gpu_hist"],
    },
    n_iter=20,
    verbose=4,
    cv=3,
)
reg.fit(rxn_fps_train_hydro, labels_train_hydro)
y_pred = reg.predict(rxn_fps_test_hydro)
print("MSE: ", mean_squared_error(labels_test_hydro * std_hydro, y_pred * std_hydro))
print("R2: ", r2_score(labels_test_hydro * std_hydro, y_pred * std_hydro))
print("MAE: ", mean_absolute_error(labels_test_hydro * std_hydro, y_pred * std_hydro))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3] END alpha=0.17319780664614115, colsample_bytree=0.7989991530884877, eta=0.07264834801271969, gamma=0.046785715877991, lambda=0.14191916621599848, learning_rate=0.20227688708328806, max_depth=5, n_estimators=231, objective=reg:squarederror, tree_method=gpu_hist;, score=0.600 total time=   1.3s
[CV 2/3] END alpha=0.17319780664614115, colsample_bytree=0.7989991530884877, eta=0.07264834801271969, gamma=0.046785715877991, lambda=0.14191916621599848, learning_rate=0.20227688708328806, max_depth=5, n_estimators=231, objective=reg:squarederror, tree_method=gpu_hist;, score=0.591 total time=   1.3s
[CV 3/3] END alpha=0.17319780664614115, colsample_bytree=0.7989991530884877, eta=0.07264834801271969, gamma=0.046785715877991, lambda=0.14191916621599848, learning_rate=0.20227688708328806, max_depth=5, n_estimators=231, objective=reg:squarederror, tree_method=gpu_hist;, score=0.640 total time=   1.2s
Fitting 3 folds for each of 1 ca

In [None]:
#MSE:  0.10153259978392874
#RMSE:  0.318
#R2:  0.7206576284819086
#MAE:  0.17197174277365748