In [1]:
import pandas as pd 
from rdkit import Chem
# data
# data
df = pd.read_excel('../data/Suzuki-Miyaura/aap9112_Data_File_S1.xlsx')
df = df.fillna('None')    

reactant_1_smiles = {
    '6-chloroquinoline': 'C1=C(Cl)C=CC2=NC=CC=C12.CCC1=CC(=CC=C1)CC', 
    '6-Bromoquinoline': 'C1=C(Br)C=CC2=NC=CC=C12.CCC1=CC(=CC=C1)CC', 
    '6-triflatequinoline': 'C1C2C(=NC=CC=2)C=CC=1OS(C(F)(F)F)(=O)=O.CCC1=CC(=CC=C1)CC',
    '6-Iodoquinoline': 'C1=C(I)C=CC2=NC=CC=C12.CCC1=CC(=CC=C1)CC', 
    '6-quinoline-boronic acid hydrochloride': 'C1C(B(O)O)=CC=C2N=CC=CC=12.Cl.O',
    'Potassium quinoline-6-trifluoroborate': '[B-](C1=CC2=C(C=C1)N=CC=C2)(F)(F)F.[K+].O',
    '6-Quinolineboronic acid pinacol ester': 'B1(OC(C(O1)(C)C)(C)C)C2=CC3=C(C=C2)N=CC=C3.O'
}

reactant_2_smiles = {
    '2a, Boronic Acid': 'CC1=CC=C2C(C=NN2C3OCCCC3)=C1B(O)O', 
    '2b, Boronic Ester': 'CC1=CC=C2C(C=NN2C3OCCCC3)=C1B4OC(C)(C)C(C)(C)O4', 
    '2c, Trifluoroborate': 'CC1=CC=C2C(C=NN2C3OCCCC3)=C1[B-](F)(F)F.[K+]',
    '2d, Bromide': 'CC1=CC=C2C(C=NN2C3OCCCC3)=C1Br' 
}

catalyst_smiles = {
    'Pd(OAc)2': 'CC(=O)O~CC(=O)O~[Pd]'
}

ligand_smiles = {
    'P(tBu)3': 'CC(C)(C)P(C(C)(C)C)C(C)(C)C', 
    'P(Ph)3 ': 'c3c(P(c1ccccc1)c2ccccc2)cccc3', 
    'AmPhos': 'CC(C)(C)P(C1=CC=C(C=C1)N(C)C)C(C)(C)C', 
    'P(Cy)3': 'C1(CCCCC1)P(C2CCCCC2)C3CCCCC3', 
    'P(o-Tol)3': 'CC1=CC=CC=C1P(C2=CC=CC=C2C)C3=CC=CC=C3C',
    'CataCXium A': 'CCCCP(C12CC3CC(C1)CC(C3)C2)C45CC6CC(C4)CC(C6)C5', 
    'SPhos': 'COc1cccc(c1c2ccccc2P(C3CCCCC3)C4CCCCC4)OC', 
    'dtbpf': 'CC(C)(C)P(C1=CC=C[CH]1)C(C)(C)C.CC(C)(C)P(C1=CC=C[CH]1)C(C)(C)C.[Fe]', 
    'XPhos': 'P(c2ccccc2c1c(cc(cc1C(C)C)C(C)C)C(C)C)(C3CCCCC3)C4CCCCC4', 
    'dppf': 'C1=CC=C(C=C1)P([C-]2C=CC=C2)C3=CC=CC=C3.C1=CC=C(C=C1)P([C-]2C=CC=C2)C3=CC=CC=C3.[Fe+2]', 
    'Xantphos': 'O6c1c(cccc1P(c2ccccc2)c3ccccc3)C(c7cccc(P(c4ccccc4)c5ccccc5)c67)(C)C',
    'None': ''
}

reagent_1_smiles = {
    'NaOH': '[OH-].[Na+]', 
    'NaHCO3': '[Na+].OC([O-])=O', 
    'CsF': '[F-].[Cs+]', 
    'K3PO4': '[K+].[K+].[K+].[O-]P([O-])([O-])=O', 
    'KOH': '[K+].[OH-]', 
    'LiOtBu': '[Li+].[O-]C(C)(C)C', 
    'Et3N': 'CCN(CC)CC', 
    'None': ''
}

solvent_1_smiles = {
    'MeCN': 'CC#N.O', 
    'THF': 'C1CCOC1.O', 
    'DMF': 'CN(C)C=O.O', 
    'MeOH': 'CO.O', 
    'MeOH/H2O_V2 9:1': 'CO.O', 
    'THF_V2': 'C1CCOC1.O'
}

def make_reaction_smiles(row):
    precursors = f" {reactant_1_smiles[row['Reactant_1_Name']]}.{reactant_2_smiles[row['Reactant_2_Name']]}.{catalyst_smiles[row['Catalyst_1_Short_Hand']]}.{ligand_smiles[row['Ligand_Short_Hand']]}.{reagent_1_smiles[row['Reagent_1_Short_Hand']]}.{solvent_1_smiles[row['Solvent_1_Short_Hand']]} "
    product = 'C1=C(C2=C(C)C=CC3N(C4OCCCC4)N=CC2=3)C=CC2=NC=CC=C12'
#     print(precursors, product)
    can_precursors = Chem.MolToSmiles(Chem.MolFromSmiles(precursors.replace('...', '.').replace('..', '.').replace(' .', '').replace('. ', '').replace(' ', '')))
    can_product = Chem.MolToSmiles(Chem.MolFromSmiles(product))
    
    return f"{can_precursors}>>{can_product}"
     

In [2]:
# data
df['rxn']= [make_reaction_smiles(row) for i, row in df.iterrows()]
df['y'] = df['Product_Yield_PCT_Area_UV']/ 100.
reactions_df = df[['rxn', 'y']]

In [3]:
reactions_df['rxn'].describe

<bound method NDFrame.describe of 0       CC#N.CC(=O)O~CC(=O)O~[Pd].CC(C)(C)P(C(C)(C)C)C...
1       CC#N.CC(=O)O~CC(=O)O~[Pd].CCc1cccc(CC)c1.Cc1cc...
2       CC#N.CC(=O)O~CC(=O)O~[Pd].CCc1cccc(CC)c1.CN(C)...
3       C1CCC(P(C2CCCCC2)C2CCCCC2)CC1.CC#N.CC(=O)O~CC(...
4       CC#N.CC(=O)O~CC(=O)O~[Pd].CCc1cccc(CC)c1.Cc1cc...
                              ...                        
5755    CC(=O)O~CC(=O)O~[Pd].CC(C)(C)P(C1=CC=C[CH]1)C(...
5756    CC(=O)O~CC(=O)O~[Pd].CC(C)c1cc(C(C)C)c(-c2cccc...
5757    CC(=O)O~CC(=O)O~[Pd].CC1(C)OB(c2ccc3ncccc3c2)O...
5758    CC(=O)O~CC(=O)O~[Pd].CC1(C)OB(c2ccc3ncccc3c2)O...
5759    CC(=O)O~CC(=O)O~[Pd].CC1(C)OB(c2ccc3ncccc3c2)O...
Name: rxn, Length: 5760, dtype: object>

### Test Encoding/ Decoding efficiency of Mistral-7B provided tokenizer!

In [4]:
from sentencepiece import SentencePieceProcessor
from tokenizer import MistralTokenizer
# Load the Mistral-7B SentencePiece tokenizer model
#sp = SentencePieceProcessor()
#sp.load('../model_files/mistral-7B-v0.1/tokenizer.model')

sp = MistralTokenizer("../model_files/mistral-7B-v0.1/tokenizer.model")

In [5]:
# Tokenize a SMILES string
import numpy as np
count = 0 
total = float(len(reactions_df['rxn'].to_list()))
print(len(reactions_df['rxn'].to_list()))
for smi in reactions_df['rxn'].to_list() :
    tokenized_smiles = sp.encode(smi) #sp.encode(smi, out_type=str)
    decoded_smiles = sp.decode(tokenized_smiles)
    if count == 0:
        print("\nOriginal SMILES:", smi)
        print(tokenized_smiles)
        print("\nDecoded SMILES:", decoded_smiles)
    if smi == decoded_smiles :
        count += 1 
print("\nEncoding-Decoding accuracy of Mistral tokenizer for the reaction smiles is {} %".format(np.round((count/total)*100.,2)) )

5760

Original SMILES: CC#N.CC(=O)O~CC(=O)O~[Pd].CC(C)(C)P(C(C)(C)C)C(C)(C)C.CCc1cccc(CC)c1.Cc1ccc2c(cnn2C2CCCCO2)c1B(O)O.Clc1ccc2ncccc2c1.O.[Na+].[OH-]>>Cc1ccc2c(cnn2C2CCCCO2)c1-c1ccc2ncccc2c1
[1, 16900, 28771, 28759, 28723, 4020, 28732, 28746, 28762, 28731, 28762, 28845, 4020, 28732, 28746, 28762, 28731, 28762, 28845, 28792, 28753, 28715, 1592, 4020, 28732, 28743, 3847, 28743, 28731, 28753, 28732, 28743, 28732, 28743, 3847, 28743, 28731, 28743, 28731, 28743, 28732, 28743, 3847, 28743, 28731, 28743, 28723, 4020, 28717, 28740, 22827, 28732, 4020, 28731, 28717, 28740, 28723, 28743, 28717, 28740, 588, 28717, 28750, 28717, 28732, 28717, 9472, 28750, 28743, 28750, 4020, 28743, 1998, 28750, 28731, 28717, 28740, 28760, 28732, 28762, 28731, 28762, 28723, 1199, 28717, 28740, 588, 28717, 28750, 28711, 22827, 28750, 28717, 28740, 28723, 28762, 20011, 17552, 28806, 1592, 28792, 28762, 28769, 28733, 28793, 4060, 28743, 28717, 28740, 588, 28717, 28750, 28717, 28732, 28717, 9472, 28750, 28743, 28750