In [1]:
import pandas as pd
from alfabet import model

Using TensorFlow backend.
2021-07-31 12:00:25.805236: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2021-07-31 12:00:25.873865: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f9b055e8010 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2021-07-31 12:00:25.873883: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version


In [2]:
import nfp

## Load the test dataset

In [3]:
# Load the reference BDE table; 'infer' (the pandas default) picks gzip from the '.gz' suffix.
test_data = pd.read_csv('test_data.csv.gz', compression='infer')
# Preview the first five rows.
test_data.head(n=5)

Unnamed: 0,molecule,bond_index,fragment1,fragment2,bde,bond_type,delta_stereo
0,C#C/C(C)=C/CNCC,1,[C]#C,C[C]=CCNCC,132.918956,C-C,0.0
1,C#C/C(C)=C/CNCC,2,C#C/[C]=C/CNCC,[CH3],90.545156,C-C,0.0
2,C#C/C(C)=C/CNCC,4,[CH2]NCC,[CH]=C(C)C#C,92.585188,C-C,0.0
3,C#C/C(C)=C/CNCC,5,C#C/C(C)=C/[CH2],CC[NH],62.904012,C-N,0.0
4,C#C/C(C)=C/CNCC,6,C#C/C(C)=C/C[NH],[CH2]C,83.992707,C-N,0.0


In this test dataset and in the alfabet predictions, `bond_index` corresponds to the bond index assigned by RDKit for a molecule with explicit hydrogens.

In [4]:
from rdkit import Chem

# Parse the SMILES and add explicit hydrogens before looking up a bond by index.
molH = Chem.AddHs(Chem.MolFromSmiles('C#C/C(C)=C/CNCC'))
bond = molH.GetBondWithIdx(5)

# Print the element symbols of the two atoms joined by bond 5.
begin_atom, end_atom = bond.GetBeginAtom(), bond.GetEndAtom()
print(f'{begin_atom.GetSymbol()}-{end_atom.GetSymbol()}')

C-N


Get a list of unique molecules in the test dataset

In [5]:
# Collect each distinct SMILES string from the `molecule` column once.
test_molecules = test_data['molecule'].unique()
# Show the first five entries as a sanity check.
test_molecules[:5]

array(['C#C/C(C)=C/CNCC', 'C#C/C=C/C(=O)O', 'C#C/C=C/C=C/C=C/C',
       'C#C/C=C/COCCCC', 'C#CC#CCCO'], dtype=object)

`model.predict(...)` expects a list-like object of SMILES strings. The slowest step of the calculation is breaking each bond in the molecule and generating the radical SMILES; this step uses the `joblib` library to parallelize the work across multiple cores.

In [6]:
# Run the alfabet model on the unique test SMILES (with `verbose=False`) and keep
# the returned predictions for the comparison against the reference BDEs below.
alfabet_predictions = model.predict(test_molecules, verbose=False)










































































































































































[




































































































[12:01:59




















































































































































[12:0





































































In [7]:
# Preview the first five predicted bonds.
alfabet_predictions.head(n=5)

Unnamed: 0,molecule,bond_index,bond_type,fragment1,fragment2,delta_assigned_stereo,delta_unassigned_stereo,bde_pred,is_valid
0,C#C/C(C)=C/CNCC,1,C-C,[C]#C,C[C]=CCNCC,0,0,132.957993,True
1,C#C/C(C)=C/CNCC,2,C-C,C#C/[C]=C/CNCC,[CH3],0,0,90.212402,True
2,C#C/C(C)=C/CNCC,4,C-C,[CH2]NCC,[CH]=C(C)C#C,0,0,92.262215,True
3,C#C/C(C)=C/CNCC,5,C-N,C#C/C(C)=C/[CH2],CC[NH],0,0,62.995834,True
4,C#C/C(C)=C/CNCC,6,C-N,C#C/C(C)=C/C[NH],[CH2]C,0,0,84.394958,True


Assert that the input molecules fall within the model's domain of validity.

In [8]:
# Actually assert (rather than merely display) that every prediction is flagged
# valid, so a Restart & Run All stops here if any input molecule falls outside
# the model's domain of validity instead of silently continuing to the MAE.
all_predictions_valid = alfabet_predictions.is_valid.all()
assert all_predictions_valid, (
    "Some predictions are flagged is_valid == False; "
    "inspect alfabet_predictions[~alfabet_predictions.is_valid]"
)
# Keep the boolean as the cell output.
all_predictions_valid

True

Merge the test data and model predictions

In [9]:
# Left-join the predictions onto the reference rows, matching each bond by its
# parent molecule and the pair of fragments it produces.
merged_predictions = pd.merge(
    test_data,
    alfabet_predictions,
    how='left',
    on=['molecule', 'fragment1', 'fragment2'],
)

Calculate the mean absolute error (MAE) in kcal/mol

In [10]:
# Absolute difference between the reference and predicted BDE for each bond, then the mean.
(merged_predictions['bde'] - merged_predictions['bde_pred']).abs().mean()

0.5975915681620032

In [11]:
# Display the predictions table as the cell output (rendered with its first and last rows).
alfabet_predictions

Unnamed: 0,molecule,bond_index,bond_type,fragment1,fragment2,delta_assigned_stereo,delta_unassigned_stereo,bde_pred,is_valid
0,C#C/C(C)=C/CNCC,1,C-C,[C]#C,C[C]=CCNCC,0,0,132.957993,True
1,C#C/C(C)=C/CNCC,2,C-C,C#C/[C]=C/CNCC,[CH3],0,0,90.212402,True
2,C#C/C(C)=C/CNCC,4,C-C,[CH2]NCC,[CH]=C(C)C#C,0,0,92.262215,True
3,C#C/C(C)=C/CNCC,5,C-N,C#C/C(C)=C/[CH2],CC[NH],0,0,62.995834,True
4,C#C/C(C)=C/CNCC,6,C-N,C#C/C(C)=C/C[NH],[CH2]C,0,0,84.394958,True
...,...,...,...,...,...,...,...,...,...
9262,c1noc2c1CCCC2,10,C-H,[H],[c]1noc2c1CCCC2,0,0,114.645737,True
9263,c1noc2c1CCCC2,11,C-H,[H],[CH]1CCCc2oncc21,0,0,88.869385,True
9264,c1noc2c1CCCC2,13,C-H,[H],[CH]1CCc2oncc2C1,0,0,97.773438,True
9265,c1noc2c1CCCC2,15,C-H,[H],[CH]1CCc2cnoc2C1,0,0,97.593796,True
