In [None]:
'''
    This notebook is used to merge exported data from Reaxys, 
    clean the data, filter, tokenize and preprocess the dataset 
    for the training of the Enzymatic Transformer available at
    https://github.com/reymond-group/OpenNMT-py

    The environment is detailed on GitHub.

    Initial .xls Reaxys extracted files are placed in /data
    Dataset is output in /dataset

    (The code is not perfectly clean and optimized, some steps might take 
    some time. Need around 4 min for an 83k reaction initial Reaxys extract)

'''

In [1]:
import pandas as pd
import glob, os
from tqdm import tqdm 

In [2]:
'''     COLLECT ALL EXCELS CSV SUB FILES AND MERGE THEM INTO A UNIQUE DATABASE             '''

directory = str(os.getcwd()) + "/Data"
iteration= 0

for filename in os.listdir(directory):
    if filename.endswith(".xls"):
        if iteration == 0:
            df = pd.read_csv('Data/' + filename, sep='\t')
        else:
            df2 = pd.read_csv('Data/' + filename, sep='\t')
            df = df.append(df2, 2)
        iteration +=1

print("Total enteries in merged DF: \t" + str(len(df)))

Total enteries in merged DF: 	83946


In [3]:
'''         FILTER REACTIONS THAT ARE NOT COMPLETE:           '''
#Copy:
df_filter = df.reset_index(drop=True)
df_filter['Drop'] = ''

count_no_Brak = 0
count_not_2 = 0
count_no_reactant = 0
count_no_product = 0
count_no_catalyst_nor_reagent_text = 0

remaining = 0

totalenteries = 0
reaction_smiles = []

for item in range(0, len(df_filter)-1):
    reaction = df_filter.at[int(item), 'Reaction']
    reagents = str(df_filter.at[int(item), 'Reagent'])
    catalysts = str(df_filter.at[int(item), 'Catalyst'])

    totalenteries += 1
    if not ">" in str(reaction):            #at least one >
        count_no_Brak += 1
        df_filter['Drop'][item] = '1'
        continue
    if not ">>" in str(reaction):           #
        count_not_2 += 1
        df_filter['Drop'][item] = '1'
        continue
    if str(reaction).split(">>")[0] == "":
        count_no_reactant += 1
        df_filter['Drop'][item] = '1'
        continue
    if str(reaction).split(">>")[1] == "":
        count_no_product += 1
        df_filter['Drop'][item] = '1'
        continue

    if reagents == "nan":
        reagents = ""
    if catalysts == "nan":
        catalysts = ""

    if reagents == "" and catalysts == "":
        count_no_catalyst_nor_reagent_text += 1
        df_filter['Drop'][item] = '1'
        continue

    remaining += 1
    reaction_smiles.append(reaction)
    
print(str(count_no_Brak) + "\t no braket at all, means not a reaction at all")
print(str(count_not_2) + "\t no '>>' together, means potential reactants between both '>'")
print(str(count_no_reactant) + "\t no reactant at all before '>>")
print(str(count_no_product) + "\t no products at all after '>>'")
print(str(count_no_catalyst_nor_reagent_text) + "\t no catalyst or reagent description")
print('')
print(str(totalenteries) + "\t total enteries")
print(str(remaining) + "\t remaining enteries")
print(str(len(set(reaction_smiles))) + "\t unique reaction SMILES")

2362	 no braket at all, means not a reaction at all
0	 no '>>' together, means potential reactants between both '>'
1461	 no reactant at all before '>>
7446	 no products at all after '>>'
775	 no catalyst or reagent description

83945	 total enteries
71901	 remaining enteries
57781	 unique reaction SMILES


In [4]:
'''         Delete useless columns      '''

df_filter2 = df_filter[df_filter['Drop'] == ''].reset_index(drop=True)
del df_filter
df_filter2 = df_filter2.drop_duplicates(subset=["Catalyst", "Reagent", "Reaction"]).reset_index(drop=True)

del df_filter2["Reaction: Links to Reaxys"]
del df_filter2["Data Count"]
del df_filter2["Number of Reaction Details"]
del df_filter2["Reaction Rank"]
del df_filter2["Record Type"]
del df_filter2["Bin"]
del df_filter2["Reaction Details: Reaction Classification"]
del df_filter2["Example label"]
del df_filter2["Multi-step Scheme"]
del df_filter2["Multi-step Details"]
del df_filter2["Named Reaction"]
del df_filter2["Type of reaction description (Reaction Details)"]
del df_filter2["Location"]
del df_filter2["References"]
del df_filter2["Unnamed: 41"]
del df_filter2["Links to Reaxys"]

df_filter2.shape

(70096, 27)

In [5]:
'''         CREATE NEW COLUMN: Add to "Enzyme Keyword" the FULL ";" splitted element from Reagent or Catalyst           '''

df_filter4 = df_filter2
df_filter4["Enzyme Keyword"] = ""

#Keyword that are allowed to pass the filter:
White_List = ["Ase", "ase", "lysozyme"]

#For each reaction entery:
for item in range(0, len(df_filter4)):
    reagents = str(df_filter4.at[int(item), 'Reagent'])
    catalysts = str(df_filter4.at[int(item), 'Catalyst'])
    list_reag_cat = []
    
    #Concatenate Catalysts and Reagents:
    for words in reagents.split("; "):
        list_reag_cat.append(words)
    for words in catalysts.split("; "):
        list_reag_cat.append(words)

    for white in White_List:
        for element in list_reag_cat:
            if str(white).casefold() in str(element).casefold():
                if not str(element).casefold() in df_filter4["Enzyme Keyword"][item]:
                    if df_filter4["Enzyme Keyword"][item] == "":
                        df_filter4["Enzyme Keyword"][item] = [str(element).casefold()]
                    else:
                        df_filter4["Enzyme Keyword"][item].append(str(element).casefold())

In [6]:
'''         CREATE NEW COLUMN: EXTRACTED ENZYME SINGLE NAME ONLY            '''

df_filter5 = df_filter4
df_filter5["Enzyme Name"] = ""

#Keyword that are allowed to pass the filter:
White_List = ["Ase", "ase", "lysozyme"]

#List of ENZYME SINGLE WORD:
Enzyme_ASE = []

#For each reaction entery:
for item in range(0, len(df_filter5)):
    reagents = str(df_filter5.at[int(item), 'Reagent'])
    catalysts = str(df_filter5.at[int(item), 'Catalyst'])

    list_reag_cat = []
    
    #Concatenate Catalysts and Reagents:
    for sentenses in reagents.split("; "):
        for word in sentenses.split(" "):
            list_reag_cat.append(word)
    for sentenses in catalysts.split("; "):
        for word in sentenses.split(" "):
            list_reag_cat.append(word)

    #for each element in the White List:
    for white in White_List:
        for element in list_reag_cat:
            if str(white).casefold() in str(element).casefold():
                
                if not str(element).casefold() in df_filter5["Enzyme Name"][item]:
                    Enzyme_ASE.append(element)
                    if df_filter5["Enzyme Name"][item] == "":
                        df_filter5["Enzyme Name"][item] = [str(element).casefold()]
                    else:
                        df_filter5["Enzyme Name"][item].append(str(element).casefold())

print("Enzyme presents: ", len(Enzyme_ASE))
print("UNIQUE Enzyme presents: ", len(set(Enzyme_ASE)))

Enzyme presents:  57860
UNIQUE Enzyme presents:  1722


In [7]:
'''     Replace list by a string for "Enzyme Name"         '''

df_filter5_2 = df_filter5

for element in tqdm(range(0, len(df_filter5_2["Enzyme Name"]))):
    df_filter5_2["Enzyme Name"][element] = ' '.join(df_filter5_2["Enzyme Name"][element])
for element in tqdm(range(0, len(df_filter5_2["Enzyme Keyword"]))):
    df_filter5_2["Enzyme Keyword"][element] = ' '.join(df_filter5_2["Enzyme Keyword"][element])

100%|██████████| 70096/70096 [00:19<00:00, 3506.28it/s]
100%|██████████| 70096/70096 [00:16<00:00, 4143.05it/s]


In [8]:
'''     Cleaning Names        '''

df_filter6 = df_filter5_2

for element in tqdm(range(0, len(df_filter6["Enzyme Name"]))):
    if "(" in df_filter6["Enzyme Name"][element]:
        if not ")" in df_filter6["Enzyme Name"][element]:
            df_filter6["Enzyme Name"][element] = df_filter6["Enzyme Name"][element].replace("(", "")
    elif ")" in df_filter6["Enzyme Name"][element]:
        if not "(" in df_filter6["Enzyme Name"][element]:
            df_filter6["Enzyme Name"][element] = df_filter6["Enzyme Name"][element].replace(")", "")

    df_filter6["Enzyme Name"][element] = df_filter6["Enzyme Name"][element].casefold()

    if "ase," in df_filter6["Enzyme Name"][element]:
        df_filter6["Enzyme Name"][element] = df_filter6["Enzyme Name"][element].replace("ase,", "ase")
    if "ases," in df_filter6["Enzyme Name"][element]:
        df_filter6["Enzyme Name"][element] = df_filter6["Enzyme Name"][element].replace("ases", "ase")

for element in tqdm(range(0, len(df_filter6["Enzyme Keyword"]))):
    if "(" in df_filter6["Enzyme Keyword"][element]:
        if not ")" in df_filter6["Enzyme Keyword"][element]:
            df_filter6["Enzyme Keyword"][element] = df_filter6["Enzyme Keyword"][element].replace("(", "")
    elif ")" in df_filter6["Enzyme Keyword"][element]:
        if not "(" in df_filter6["Enzyme Keyword"][element]:
            df_filter6["Enzyme Keyword"][element] = df_filter6["Enzyme Keyword"][element].replace(")", "")

    df_filter6["Enzyme Keyword"][element] = df_filter6["Enzyme Keyword"][element].casefold()

    if "ase," in df_filter6["Enzyme Keyword"][element]:
        df_filter6["Enzyme Keyword"][element] = df_filter6["Enzyme Keyword"][element].replace("ase,", "ase")
    if "ases," in df_filter6["Enzyme Keyword"][element]:
        df_filter6["Enzyme Keyword"][element] = df_filter6["Enzyme Keyword"][element].replace("ases", "ase")

100%|██████████| 70096/70096 [00:19<00:00, 3526.09it/s]
100%|██████████| 70096/70096 [00:20<00:00, 3458.98it/s]


In [9]:
'''         PREPROCESSING DATASETS            '''

import pandas as pd
import numpy as np
import glob, os
from tqdm import tqdm 

from itertools import groupby
import random

from collections import Counter

import re
from rdkit import Chem


from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors

# Initialize a tokenizer
tokenizer2 = Tokenizer(models.BPE())

# Customize pre-tokenization and decoding
tokenizer2.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer2.decoder = decoders.ByteLevel()
tokenizer2.post_processor = processors.ByteLevel(trim_offsets=True)

# And then train
trainer = trainers.BpeTrainer(vocab_size=9000, min_frequency=2, limit_alphabet=55, special_tokens=['ase', 'hydro', 'mono', 'cyclo', 'thermo', 'im'])
tokenizer2.train(trainer, ["Tokenizer/Enzyme_Name_ForTocken.txt"])

In [10]:
def smi_tokenizer(smi):
    """
    Tokenize a SMILES molecule or reaction
    """
    pattern =  "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
    regex = re.compile(pattern)
    tokens = [token for token in regex.findall(smi)]
    assert smi == ''.join(tokens)
    return ' '.join(tokens)

def enzyme_sentence_tokenizer(sentence):
    '''
    Tokenize a sentenze, optimized for enzyme-like descriptions & names
    '''
    encoded = tokenizer2.encode(sentence)
    my_list = [item for item in encoded.tokens if 'Ġ' != item]
    my_list = [item.replace('Ġ', '_') for item in my_list]
    my_list = ' '.join(my_list)
    return my_list

def Canonicalize_Reaction(smiles):
    
    for elements in smiles.replace('>>', '>').replace('>', '.').split('.'):
        m = Chem.MolFromSmiles(elements, sanitize=False)
        if m is None:
            return False
    
    precursors = smiles.split('>')[0]
    reactants = smiles.split('>')[1]
    products = smiles.split('>')[2]

    can_precursors = []
    can_reactants = []
    can_products = []

    for precurs in precursors.split('.'):
        mol = Chem.MolFromSmiles(precurs)
        can_precursors.append(Chem.MolToSmiles(mol, canonical=True))
    for reactant in reactants.split('.'):
        mol = Chem.MolFromSmiles(reactant)
        can_reactants.append(Chem.MolToSmiles(mol, canonical=True))
    for product in products.split('.'):
        mol = Chem.MolFromSmiles(product)
        can_products.append(Chem.MolToSmiles(mol, canonical=True))

    canon = '.'.join(can_precursors) + '>' + '.'.join(can_reactants) + '>' + '.'.join(can_products)

    return canon

In [11]:
'''         FILTER DATAFRAME      

SELECTION_ENZYME_DESC = 'Enzyme Keyword'    for full sentences        
SELECTION_ENZYME_DESC = 'Enzyme Name'       for "-ase" word only  

'''

SELECTION_ENZYME_DESC = 'Enzyme Keyword'        

df_filter = df_filter6[['Reaction', SELECTION_ENZYME_DESC]].copy()
df_filter.shape

(70096, 2)

In [12]:
'''         Remove reaction with no Enzyme Name     '''

df_filter.dropna(subset = [SELECTION_ENZYME_DESC], inplace=True)
df_filter = df_filter[df_filter['Enzyme Keyword'].str.len() > 0]
df_filter.drop_duplicates(inplace=True)
df_filter.reset_index(inplace=True)
del df_filter['index']
df_filter.shape

(48304, 2)

In [13]:
'''             CANONICALIZE ALL REACTIONS AND REMOVE INVALIDS         '''
count = 0
for element in tqdm(range(0, len(df_filter))):
    try:
        reaction = Canonicalize_Reaction(df_filter['Reaction'][element])
        if reaction != False:
            df_filter['Reaction'][element] = reaction
    except:
        df_filter['Reaction'][element] = 'invalid'
        count += 1

df_filter = df_filter[~df_filter['Reaction'].str.contains('invalid')].reset_index(drop=True)
print(count, " invalid smiles reactions removed")

100%|██████████| 48304/48304 [00:57<00:00, 836.28it/s]
100  invalid smiles reactions removed


In [14]:
'''         CHECK AGAIN FOR CANONICAL DUPLICATES       '''
print("Before ", len(df_filter))
df_filter = df_filter.drop_duplicates(subset=["Enzyme Keyword", "Reaction"]).reset_index(drop=True)
print("After ", len(df_filter))

Before  48204
After  48203


In [15]:
'''                 Remove reaction with Multiple products                                      '''
count = 0

df_filter['Reaction_Multi'] = ""
for element in range(0, len(df_filter)):
    if '.' in df_filter['Reaction'][element].split('>>')[1]:
        df_filter['Reaction_Multi'][element] = 'DROP'
        count += 1  

print('Removed: ', count, ' reactions')

indexNames = df_filter[df_filter['Reaction_Multi'] == 'DROP'].index
df_filter.drop(indexNames, inplace=True)

df_filter.reset_index(inplace=True)
del df_filter['Reaction_Multi']
del df_filter['index']

print("Initially: " + str(len(set(df['Reaction']))) + " reactions")
print("After removing 'NaN' Enzyme Names, remains: " + str(len(set(df_filter['Reaction']))) + " unique reactions, " + str(len(df_filter)) + " reactions in total")

Removed:  16022  reactions
Initially: 63664 reactions
After removing 'NaN' Enzyme Names, remains: 27953 unique reactions, 32181 reactions in total


In [16]:
'''     Combine Enzyme Name and Reaction SMILES     '''

df_filter['Product'] = ''
df_filter['TransformerIn'] = ''
df_filter['TransformerOut'] = ''

for index in tqdm(range(0, len(df_filter))):
    df_filter['Product'][index] = df_filter['Reaction'][index].split('>>')[1]
    df_filter['TransformerIn'][index] = smi_tokenizer(df_filter['Reaction'][index].split('>>')[0]) + " > " + enzyme_sentence_tokenizer(df_filter[SELECTION_ENZYME_DESC][index])
    df_filter['TransformerOut'][index] = smi_tokenizer(df_filter['Product'][index])

100%|██████████| 32181/32181 [00:26<00:00, 1228.58it/s]


In [17]:
print(str(len(set(df_filter['Product']))) + " different Products are presents")

23335 different Products are presents


In [18]:
'''         Distribute the PRODUCTS with weight            '''

Products_org = df_filter['Product']

count_train = 0
count_test = 0
count_val = 0

Products_org = pd.DataFrame.from_dict(Counter(Products_org), orient='index').reset_index().rename(columns={0: 'count'})
df_shuffled = Products_org.sample(len(Products_org)).reset_index()

df_shuffled["set"] = ""

for item in tqdm(range(0, len(df_shuffled))):
    
    #In case 
    if count_train != 0 and count_test != 0 and count_val != 0:
        ratio_train = (count_train / (count_train + count_test + count_val)) / 0.8
        ratio_test = (count_test / (count_train + count_test + count_val)) / 0.1
        ratio_val = (count_val / (count_train + count_test + count_val)) / 0.1

        if ratio_train < 1:
            count_train += df_shuffled['count'][item]
            df_shuffled['set'][item] = "Train"
        elif ratio_test < 1:
            count_test += df_shuffled['count'][item]
            df_shuffled['set'][item] = "Test"
        elif ratio_val < 1:
            count_val += df_shuffled['count'][item]
            df_shuffled['set'][item] = "Val"
        else:
            assignation = random.randint(1, 3)

            if assignation == 1:
                count_train += df_shuffled['count'][item]
                df_shuffled['set'][item] = "Train"
            elif assignation == 2:
                count_test += df_shuffled['count'][item]
                df_shuffled['set'][item] = "Test"
            elif assignation == 3:
                count_val += df_shuffled['count'][item]
                df_shuffled['set'][item] = "Val"

    #In case no assignment yet:
    else:
        assignation = random.randint(1, 3)

        if assignation == 1:
            count_train += df_shuffled['count'][item]
            df_shuffled['set'][item] = "Train"
        elif assignation == 2:
            count_test += df_shuffled['count'][item]
            df_shuffled['set'][item] = "Test"
        elif assignation == 3:
            count_val += df_shuffled['count'][item]
            df_shuffled['set'][item] = "Val"


if count_train != 0 and count_test != 0 and count_val != 0:
    print('Train proportion: ', str(round(count_train / (count_train + count_test + count_val), 4)), '%')
    print('Test proportion: ', str(round(count_test / (count_train + count_test + count_val), 4)), '%')
    print('Val proportion: ', str(round(count_val / (count_train + count_test + count_val), 4)), '%')

df_shuffled = df_shuffled.set_index("index")

100%|██████████| 23335/23335 [00:02<00:00, 8068.61it/s]
Train proportion:  0.7997 %
Test proportion:  0.1002 %
Val proportion:  0.1001 %


In [19]:
'''     Assign the assignment of the Product to the initial DF (df_filter)       '''

df_filter["Set"] = ''

for item_toset in tqdm(range(0, len(df_filter))):
    product = df_filter['Product'][item_toset]
    df_filter['Set'][item_toset] = df_shuffled.loc[[product]]['set'][0]

df_filter.head(5)

100%|██████████| 32181/32181 [00:16<00:00, 1899.49it/s]


Unnamed: 0,Reaction,Enzyme Keyword,Product,TransformerIn,TransformerOut,Set
0,CC(=O)C(C)O>>CC(O)C(C)O,glucose dehydrogenase (r)-specific alcohol deh...,CC(O)C(C)O,C C ( = O ) C ( C ) O > _glucose _de hydro _ge...,C C ( O ) C ( C ) O,Train
1,CC(=O)C(C)O>>CC(O)C(C)O,rabbit 3-hydroxyhexobarbital dehydrogenase (ak...,CC(O)C(C)O,C C ( = O ) C ( C ) O > _rabbit _3 - hydro _x ...,C C ( O ) C ( C ) O,Train
2,CC(=O)C(C)O>>CC(O)C(C)O,"2,3-butanediol dehydrogenase from taiwanofungu...",CC(O)C(C)O,"C C ( = O ) C ( C ) O > _2 , 3 - butanediol _d...",C C ( O ) C ( C ) O,Train
3,O=C1CCCCC1>>O=C1CCCCCO1,glucose dehydrogenase,O=C1CCCCCO1,O = C 1 C C C C C 1 > _glucose _de hydro _gen ase,O = C 1 C C C C C O 1,Val
4,O=C1CCCCC1>>O=C1CCCCCO1,recombinant fusion protein cyclohexanonemonoox...,O=C1CCCCCO1,O = C 1 C C C C C 1 > _recombinant _fusion _pr...,O = C 1 C C C C C O 1,Val


In [20]:
'''     Distribute the Train / Test / Val splits into lists AND write into files          '''

count_train_2 = 0
count_test_2 = 0
count_val_2 = 0

TRAIN = []
TEST = []
VAL = []
for item_toset in range(0, len(df_filter)):
    if df_filter['Set'][item_toset] == 'Train':
        count_train_2+=1
        TRAIN.append(df_filter['TransformerIn'][item_toset] + '¢' + df_filter['TransformerOut'][item_toset])

    if df_filter['Set'][item_toset] == 'Test':
        count_test_2+=1
        TEST.append(df_filter['TransformerIn'][item_toset] + '¢' + df_filter['TransformerOut'][item_toset])

    if df_filter['Set'][item_toset] == 'Val':
        count_val_2+=1
        VAL.append(df_filter['TransformerIn'][item_toset] + '¢' + df_filter['TransformerOut'][item_toset])

np.random.shuffle(TRAIN)
np.random.shuffle(TEST)
np.random.shuffle(VAL)

src_train = []
tgt_train = []

src_test = []
tgt_test = []

src_val = []
tgt_val = []


for element in TRAIN:
    src_train.append(element.split('¢')[0])
    tgt_train.append(element.split('¢')[1])
for element in TEST:
    src_test.append(element.split('¢')[0])
    tgt_test.append(element.split('¢')[1])
for element in VAL:
    src_val.append(element.split('¢')[0])
    tgt_val.append(element.split('¢')[1])

print(count_train_2 / (count_train_2 + count_test_2 + count_val_2))
print(count_test_2 / (count_train_2 + count_test_2 + count_val_2))
print(count_val_2 / (count_train_2 + count_test_2 + count_val_2))

0.7996954724837637
0.10024548646717008
0.10005904104906622


In [21]:
'''             WRITE TO FILES          '''

target_folder_name = 'dataset/ENZR_Dataset_Full_Sentences/'

#WRITE INTO FILES:
with open(target_folder_name + 'src_train.txt', 'w') as f:
    for item in src_train:
        f.write("%s\n" % item)
with open(target_folder_name + 'tgt_train.txt', 'w') as f:
    for item in tgt_train:
        f.write("%s\n" % item)

with open(target_folder_name + 'src_test.txt', 'w') as f:
    for item in src_test:
        f.write("%s\n" % item)
with open(target_folder_name + 'tgt_test.txt', 'w') as f:
    for item in tgt_test:
        f.write("%s\n" % item)

with open(target_folder_name + 'src_val.txt', 'w') as f:
    for item in src_val:
        f.write("%s\n" % item)
with open(target_folder_name + 'tgt_val.txt', 'w') as f:
    for item in tgt_val:
        f.write("%s\n" % item)