In [2]:
import pandas as pd
import numpy as np
from os.path import join
import os
from rdkit import Chem
from rdkit.Chem import Crippen
from rdkit.Chem import Descriptors
from bioservices import *
from data_preprocessing import *
import warnings
from torch import load
warnings.filterwarnings('ignore')

CURRENT_DIR = os.getcwd()

## 1. Merging the kcat datasets from BRENDA, Sabio-RK, and UniProt:

### (a) Loading BRENDA data

In [3]:
df_Brenda = pd.read_pickle(join("..", "..", "data", "kcat_data", "BRENDA_kcat.pkl"))
#adding reaction information:
df_Brenda.rename(columns = {"correct reaction ID" : "BRENDA reaction ID"})


df_Brenda["Uniprot ID"] = np.nan
for ind in df_Brenda.index:
    try:
        df_Brenda["Uniprot ID"][ind] = df_Brenda["UNIPROT_list"][ind][0]
    except IndexError:
        pass
    
df_Brenda = df_Brenda.loc[~pd.isnull(df_Brenda["Uniprot ID"])]

    
df_Brenda.drop(columns = ["index", "ID", "comment", "kcat", "kcat_new", "enzyme",
                         "new", "LITERATURE", "UNIPROT_list", "new enzyme"],
               inplace = True)

df_Brenda.rename(columns = {"correct kcat" : "kcat", "correct reaction ID" : "BRENDA reaction ID",
                           "substrate_ID_list" : "substrate_IDs", 
                           "product_ID_list" : "product_IDs"}, inplace = True)


print("Number of data points: %s" % len(df_Brenda))
print("Number of UniProt IDs: %s" % len(set(df_Brenda["Uniprot ID"])))
print("Number of checked data points: %s" % len(df_Brenda.loc[df_Brenda["checked"]]))
print("Number of unchecked data points: %s" % len(df_Brenda.loc[~df_Brenda["checked"]]))


df_Brenda["from BRENDA"] = 1
df_Brenda["from Uniprot"] = 0
df_Brenda["from Sabio"] = 0
df_Brenda.head()

Number of data points: 8267
Number of UniProt IDs: 3149
Number of checked data points: 3611
Number of unchecked data points: 4656


Unnamed: 0,EC,ORGANISM,PMID,BRENDA reaction ID,kcat,checked,#UIDs,substrate_IDs,product_IDs,Uniprot ID,from BRENDA,from Uniprot,from Sabio
44,1.1.1.363,Leuconostoc mesenteroides,1304341.0,1485,1125.0,True,1,[InChI=1S/C21H27N7O14P2/c22-17-12-19(25-7-24-1...,"[InChI=1S/p+1, InChI=1S/C21H29N7O14P2/c22-17-1...",P11411,1,0,0
45,3.6.1.1,Thermoplasma acidophilum,1327774.0,26801,2200.0,True,1,"[InChI=1S/H2O/h1H2, InChI=1S/H4O7P2/c1-8(2,3)7...","[InChI=1S/H3O4P/c1-5(2,3)4/h(H3,1,2,3,4)/p-3]",P37981,1,0,0
64,2.5.1.7,Escherichia coli,1512209.0,12872,4.75,True,1,"[InChI=1S/C3H5O6P/c1-2(3(4)5)9-10(6,7)8/h1H2,(...","[InChI=1S/H3O4P/c1-5(2,3)4/h(H3,1,2,3,4)/p-3, ...",P0A749,1,0,0
65,1.12.98.2,Methanothermobacter marburgensis,1521540.0,3791,1462.0,True,1,"[InChI=1S/p+1, InChI=1S/C31H45N6O16P/c1-13-22-...","[InChI=1S/H2/h1H, InChI=1S/C31H43N6O16P/c1-13-...",P32440,1,0,0
84,4.1.3.40,Escherichia coli,1644758.0,28554,0.82,True,1,[InChI=1S/C10H10O6/c1-5(9(12)13)16-8-4-6(10(14...,"[InChI=1S/C3H4O3/c1-2(4)3(5)6/h1H3,(H,5,6)/p-1...",P26602,1,0,0


### (b) Loading Sabio data

In [4]:
df_Sabio = pd.read_pickle(join("..", "..", "data", "kcat_data", "Sabio_kcat.pkl"))
df_Sabio.drop(columns = ["unit", "complete", "KEGG ID"], inplace = True)
df_Sabio.rename(columns = {"products_IDs": "product_IDs"}, inplace = True)

print("Number of data points: %s" % len(df_Sabio))
print("Number of UniProt IDs: %s" % len(set(df_Sabio["Uniprot ID"])))

df_Sabio["checked"] = False
df_Sabio["#UIDs"] = 1
df_Sabio["complete"] = True
 
df_Sabio["from BRENDA"] = 0
df_Sabio["from Uniprot"] = 0
df_Sabio["from Sabio"] = 1
df_Sabio.head()

Number of data points: 2830
Number of UniProt IDs: 289


Unnamed: 0,Uniprot ID,kcat,Substrates,Products,PMID,substrate_IDs,product_IDs,checked,#UIDs,complete,from BRENDA,from Uniprot,from Sabio
0,P20932,2.8,(S)-Mandelate;Riboflavin-5-phosphate,Reduced FMN;alpha-Oxo-benzeneacetic acid,15311930,"[C01984, C00061]","[C02137, C01847]",False,1,True,0,0,1
1,P20932,0.05,(S)-Mandelate;Riboflavin-5-phosphate,Reduced FMN;alpha-Oxo-benzeneacetic acid,15311930,"[C01984, C00061]","[C02137, C01847]",False,1,True,0,0,1
2,P20932,0.11,Riboflavin-5-phosphate;(S)-Mandelate,alpha-Oxo-benzeneacetic acid;Reduced FMN,15311930,"[C01984, C00061]","[C02137, C01847]",False,1,True,0,0,1
3,P20932,205.0,Riboflavin-5-phosphate;(S)-Mandelate,alpha-Oxo-benzeneacetic acid;Reduced FMN,15311930,"[C01984, C00061]","[C02137, C01847]",False,1,True,0,0,1
4,P20932,2.3,Riboflavin-5-phosphate;(S)-Mandelate,alpha-Oxo-benzeneacetic acid;Reduced FMN,15311930,"[C01984, C00061]","[C02137, C01847]",False,1,True,0,0,1


### (c) Loading UniProt data:

In [5]:
df_Uniprot = pd.read_pickle(join("..", "..", "data", "kcat_data", "Uniprot_kcat.pkl"))

df_Uniprot.drop(columns = ["unit", "reaction ID"], inplace = True)
df_Uniprot.rename(columns = {"substrate CHEBI IDs" : "Substrates", "product CHEBI IDs" : "Products", 
                            "substrate InChIs" : "substrate_IDs", "product InChIs" : "product_IDs",
                            "kcat [1/sec]" : "kcat"}, inplace = True)

print("Number of data points: %s" % len(df_Uniprot))
print("Number of UniProt IDs: %s" % len(set(df_Uniprot["Uniprot ID"])))

df_Uniprot["checked"] = False
df_Uniprot["#UIDs"] = 1

df_Uniprot["from BRENDA"] = 0
df_Uniprot["from Uniprot"] = 1
df_Uniprot["from Sabio"] = 0
df_Uniprot.head()

Number of data points: 1738
Number of UniProt IDs: 1054


Unnamed: 0,kcat,Uniprot ID,Substrates,Products,substrate_IDs,product_IDs,complete,checked,#UIDs,from BRENDA,from Uniprot,from Sabio
0,30.0,P55217,"[CHEBI:35235, CHEBI:57661]","[CHEBI:15378, CHEBI:58161, CHEBI:30031]","[InChI=1S/C3H7NO2S/c4-2(1-7)3(5)6/h2,7H,1,4H2,...","[InChI=1S/p+1, InChI=1S/C7H14N2O4S/c8-4(6(10)1...",True,False,1,0,1,0
2,114.0,O24721,"[CHEBI:15992, CHEBI:15379]","[CHEBI:58794, CHEBI:15378]",[InChI=1S/C11H8O3/c12-10-8-4-2-1-3-7(8)5-6-9(1...,[InChI=1S/C11H8O5/c12-9(11(15)16)6-5-7-3-1-2-4...,True,False,1,0,1,0
4,32.0,Q79EM7,"[CHEBI:58203, CHEBI:15377, CHEBI:57540]","[CHEBI:15378, CHEBI:57945, CHEBI:17563]",[InChI=1S/C8H6O3/c9-5-6-3-1-2-4-7(6)8(10)11/h1...,"[InChI=1S/p+1, InChI=1S/C21H29N7O14P2/c22-17-1...",True,False,1,0,1,0
5,0.021667,F5BFC8,"[CHEBI:59560, CHEBI:58095, CHEBI:15379]","[CHEBI:15642, CHEBI:78290]",[InChI=1S/C9H15N5O3/c1-3(15)6(16)4-2-11-7-5(12...,[InChI=1S/C9H15N5O4/c1-3(15)5(16)4-2-11-6-9(18...,True,False,1,0,1,0
7,21.0,P77366,[CHEBI:57684],[CHEBI:58247],[InChI=1S/C6H13O9P/c7-1-2-3(8)4(9)5(10)6(14-2)...,"[InChI=1S/C6H13O9P/c7-3-2(1-14-16(11,12)13)15-...",True,False,1,0,1,0


### (d) Merging all three datasets

In [6]:
df_kcat = pd.concat([pd.concat([df_Sabio, df_Brenda], ignore_index = True), df_Uniprot], ignore_index = True)
df_kcat =df_kcat.loc[~pd.isnull(df_kcat["kcat"])]

print("Number of data points: %s" % len(df_kcat))
print("Number of UniProt IDs: %s" % len(set(df_kcat["Uniprot ID"])))
df_kcat.to_pickle(join("..", "..", "data", "kcat_data", "kcat_data_merged.pkl"))
df_kcat.head(2)

Number of data points: 12806
Number of UniProt IDs: 4124


Unnamed: 0,Uniprot ID,kcat,Substrates,Products,PMID,substrate_IDs,product_IDs,checked,#UIDs,complete,from BRENDA,from Uniprot,from Sabio,EC,ORGANISM,BRENDA reaction ID
0,P20932,2.8,(S)-Mandelate;Riboflavin-5-phosphate,Reduced FMN;alpha-Oxo-benzeneacetic acid,15311930,"[C01984, C00061]","[C02137, C01847]",False,1,True,0,0,1,,,
1,P20932,0.05,(S)-Mandelate;Riboflavin-5-phosphate,Reduced FMN;alpha-Oxo-benzeneacetic acid,15311930,"[C01984, C00061]","[C02137, C01847]",False,1,True,0,0,1,,,


### (e) Removing duplicated entries:

In [7]:
df_kcat = pd.read_pickle(join("..", "..", "data", "kcat_data", "kcat_data_merged.pkl"))

#### Searching for identitcal pairs of UniProt IDs and kcat values:

In [8]:
droplist = []

for ind in df_kcat.index:
    UID, kcat = df_kcat["Uniprot ID"][ind], df_kcat["kcat"][ind]
    help_df = df_kcat.loc[df_kcat["Uniprot ID"] == UID].loc[df_kcat["kcat"] == kcat]
    
    df_kcat["from BRENDA"][ind], df_kcat["from Uniprot"][ind], df_kcat["from Sabio"][ind] = max(help_df["from BRENDA"]), max(help_df["from Uniprot"]), max(help_df["from Sabio"])
    df_kcat["checked"][ind] = any(help_df["checked"])
    
    if len(help_df) > 1:
        droplist = droplist + list(help_df.index)[1:]

In [9]:
df_kcat.drop(list(set(droplist)), inplace = True)
print("Dropping %s data points, because they are duplicated." % len(set(droplist)))

Dropping 1050 data points, because they are duplicated.


## 2. Downloading amino acid sequences for all data points:

### (a) Downloading sequences via UniProt IDs:

Creating a txt file with all Uniprot IDs

In [10]:
IDs = list(set(df_kcat["Uniprot ID"]))    
f = open(join("..", "..", "data", "enzyme_data", "UNIPROT_IDs.txt"), "w") 
for ID in list(set(IDs)):
    f.write(str(ID) + "\n")
f.close()

Mapping Uniprot IDs to sequences via the UniProt mapping service and saving the results in the file "UNIPROT_results.tab"

In [11]:
UNIPROT_df = pd.read_csv(join("..", "..", "data", "enzyme_data",  "UNIPROT_results.tab"), sep = "\t")
UNIPROT_df.drop(columns = ["Entry"], inplace = True)
display(UNIPROT_df.head())

df_kcat = df_kcat.merge(UNIPROT_df, how = "left", on = "Uniprot ID")
df_kcat = df_kcat.loc[~pd.isnull(df_kcat["Uniprot ID"])]

Unnamed: 0,Sequence,Uniprot ID
0,MAKEIVKELLPLIRVYKDGSVERLLSSENVAASPEDPQTGVSSKDI...,Q5NUF3
1,MKTRKGIILAGGSGTRLYPVTMAVSKQLLPIYDKPMIYYPLSTLML...,Q9F7K6
2,MVLCPVIGKLLHKRVVLASASPRRQEILSNAGLRFEVVPSKFKEKL...,O95671
3,MSPQTETKASVGFKAGVKDYKLTYYTPEYETQDTDILAAFRVSPQP...,A0A1C3HPN5
4,MTPLLELKDIRRSYPAGDEQVEVLKGISLDIYAGEMVAIVGASGSG...,P75831


In [12]:
print("Number of different amino acid sequences in the dataset: %s" % len(set(df_kcat["Sequence"])))

Number of different amino acid sequences in the dataset: 3948


## 3. Mapping all substrates and products to InChI strings: 
#### Most of the metabolites in our dataset have InChI strings as identifiers and some of them have KEGG Compound IDs. We are trying to map the KEGG Compound IDs to InChI strings as well:

#### (a) Getting an InChI string for all metabolites

In [13]:
kegg_con = KEGG()
chebi_con = ChEBI()

met_IDs = []

for ind in df_kcat.index:
    sub_IDs, pro_IDs = df_kcat["substrate_IDs"][ind], df_kcat["product_IDs"][ind]
    if sub_IDs != "" and pro_IDs != "" :
        try:
            met_IDs = met_IDs +sub_IDs + pro_IDs
        except TypeError:
            pass
        
df_metabolites = pd.DataFrame(data = {"metabolite ID": list(set(met_IDs))})
df_metabolites = df_metabolites.loc[df_metabolites["metabolite ID"] != ""]
df_metabolites["InChI"] = np.nan

for ind in df_metabolites.index:
    met = df_metabolites["metabolite ID"][ind]
    if met[0:5] == "InChI":
        df_metabolites["InChI"][ind] = met
    else:
        try:
            kegg_entry = kegg_con.parse(kegg_con.get(met))
            chebi_entry = chebi_con.getCompleteEntity('CHEBI:' + kegg_entry['DBLINKS']['ChEBI'])
            df_metabolites["InChI"][ind] = chebi_entry.inchi
        except:
            pass
        
df_metabolites.head()

Unnamed: 0,metabolite ID,InChI
1,"InChI=1S/C10H15N4O15P3/c15-5-3(1-26-31(22,23)2...","InChI=1S/C10H15N4O15P3/c15-5-3(1-26-31(22,23)2..."
2,InChI=1S/C6H14N4O2/c7-4(5(11)12)2-1-3-10-6(8)9...,InChI=1S/C6H14N4O2/c7-4(5(11)12)2-1-3-10-6(8)9...
3,"InChI=1S/C5H11O8P/c6-3-2(1-12-14(9,10)11)13-5(...","InChI=1S/C5H11O8P/c6-3-2(1-12-14(9,10)11)13-5(..."
4,C00283,InChI=1S/H2S/h1H2
5,InChI=1S/C8H12N2O2/c1-5-8(12)7(2-9)6(4-11)3-10...,InChI=1S/C8H12N2O2/c1-5-8(12)7(2-9)6(4-11)3-10...


In [14]:
for ind in df_metabolites.index:
    if pd.isnull(df_metabolites["InChI"][ind]):
        try:
            mol = Chem.MolFromMolFile(join("..", "..", "data", "metabolite_data",
                                           "mol-files", df_metabolites["metabolite ID"][ind] + '.mol'))
            df_metabolites["InChI"][ind] = Chem.MolToInchi(mol)
        except:
            pass
        
df_metabolites = df_metabolites.loc[~pd.isnull(df_metabolites["InChI"])]

[10:23:12] ERROR: Unknown element(s): *

[10:23:12] ERROR: Unknown element(s): *



[10:23:12] ERROR: Unknown element(s): *

[10:23:12] ERROR: Unknown element(s): *


[10:23:12] ERROR: Unknown element(s): *


[10:23:12] ERROR: Unknown element(s): *


[10:23:12] ERROR: Unsupported in this mode element '*'




[10:23:13] ERROR: Unknown element(s): *

[10:23:13] ERROR: Unknown element(s): *

[10:23:13] ERROR: Unknown element(s): *

[10:23:13] ERROR: Unknown element(s): *


[10:23:13] ERROR: Unknown element(s): *


[10:23:13] ERROR: Unknown element(s): *

[10:23:13] ERROR: Unknown element(s): *


[10:23:13] ERROR: Unknown element(s): *

[10:23:13] ERROR: Unknown element(s): *



[10:23:13] ERROR: Unknown element(s): *

[10:23:13] ERROR: Unknown element(s): *

[10:23:13] ERROR: Unknown element(s): *

[10:23:13] ERROR: Unknown element(s): *


[10:23:13] ERROR: Unknown element(s): *

[10:23:13] ERROR: Unknown element(s): *

[10:23:13] ERROR: Unknown element(s): *

[10:23:13] ERROR: Unknown el

#### (b) Mapping the InChI strings for all substrates and all products to the kcat values:

In [15]:
df_kcat["substrate_InChI_set"] = ""
df_kcat["product_InChI_set"] = ""

for ind in df_kcat.index:
    sub_IDs, pro_IDs = df_kcat["substrate_IDs"][ind], df_kcat["product_IDs"][ind]
    
    try:
        sub_inchis = []
        pro_inchis = []
        for sub in sub_IDs:
            inchi = list(df_metabolites["InChI"].loc[df_metabolites["metabolite ID"] == sub])[0]
            sub_inchis.append(inchi)
        for pro in pro_IDs:
            inchi = list(df_metabolites["InChI"].loc[df_metabolites["metabolite ID"] == pro])[0]
            pro_inchis.append(inchi)

        df_kcat["substrate_InChI_set"][ind] = set(sub_inchis)
        df_kcat["product_InChI_set"][ind] = set(pro_inchis)
    except:
        pass

## 4. Assigning  IDs to every unique sequence and to every unique reaction in the dataset:

#### (a) Creating DataFrames for all sequences and for all reactions:

In [16]:
df_sequences = pd.DataFrame(data = {"Sequence" : list(set(df_kcat["Sequence"]))})
df_sequences = df_sequences.loc[~pd.isnull(df_sequences["Sequence"])]
df_sequences.reset_index(inplace = True, drop = True)
df_sequences["Sequence ID"] = ["Sequence_" + str(ind) for ind in df_sequences.index]

df_sequences

Unnamed: 0,Sequence,Sequence ID
0,MLALLLSPYGAYLGLALLVLYYLLPYLKRAHLRDIPAPGLAAFTNF...,Sequence_0
1,MFRPSIKFKQSTLSIIARRLKSSAQHQPLRSSFVPSPIVFVAGLAV...,Sequence_1
2,MKKQNDIPQPIRGDKGATVKIPRNIERDRQNPDMLVPPETDHGTVS...,Sequence_2
3,MERTELLKPRTLADLIRILHELFAGDEVNVEEVQAVLEAYESNPAE...,Sequence_3
4,MKRNFKMHLPNPHKQHPKVSKRAWISETALIIGNVSIADDVFVGPN...,Sequence_4
...,...,...
3942,MTENLQDMFESSYRGEAPEQLAARPPWSIGQPQPEILKLIEQGKVH...,Sequence_3942
3943,MDSSVIQRKKVAVIGGGLVGSLQACFLAKRNFQIDVYEAREDTRVA...,Sequence_3943
3944,MPMISCDMRYGRTDEQKRALSAGLLRVISEATGEPRENIFFVIREG...,Sequence_3944
3945,MSPQTETKASVGFKAGVKDYKLTYYTPEYETKDTDILAAFRVTPQP...,Sequence_3945


In [17]:
df_reactions = pd.DataFrame({"substrates": df_kcat["substrate_InChI_set"],
                            "products" : df_kcat["product_InChI_set"]})
df_reactions = df_reactions.loc[df_reactions["substrates"] != set([])]
df_reactions = df_reactions.loc[df_reactions["products"] != set([])]


droplist = []
for ind in df_reactions.index:
    sub_IDs, pro_IDs = df_reactions["substrates"][ind], df_reactions["products"][ind]
    help_df = df_reactions.loc[df_reactions["substrates"] == sub_IDs].loc[df_reactions["products"] == pro_IDs]
    if len(help_df):
        for ind in list(help_df.index)[1:]:
            droplist.append(ind)
            
df_reactions.drop(list(set(droplist)), inplace = True)
df_reactions.reset_index(inplace = True, drop =True)

df_reactions["Reaction ID"] = ["Reaction_" + str(ind) for ind in df_reactions.index]
df_reactions

Unnamed: 0,substrates,products,Reaction ID
0,{InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,{InChI=1S/C8H6O3/c9-7(8(10)11)6-4-2-1-3-5-6/h1...,Reaction_0
1,"{InChI=1S/O2/c1-2, InChI=1S/C17H23N4O9P/c1-7-3...",{InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,Reaction_1
2,{InChI=1S/C19H23N7O6/c20-19-25-15-14(17(30)26-...,"{InChI=1S/p+1, InChI=1S/C21H30N7O17P3/c22-17-1...",Reaction_2
3,{InChI=1S/C16H28N2O11/c1-5(21)17-9-13(25)14(8(...,{InChI=1S/C8H15NO6/c1-3(11)9-5-7(13)6(12)4(2-1...,Reaction_3
4,"{InChI=1S/C3H7O7P/c4-1-2(3(5)6)10-11(7,8)9/h2,...","{InChI=1S/C3H5O6P/c1-2(3(4)5)9-10(6,7)8/h1H2,(...",Reaction_4
...,...,...,...
4434,{InChI=1S/C34H58N7O21P3S/c1-18(58-33-21(43)13-...,"{InChI=1S/H2O2/c1-2/h1-2H, InChI=1S/C34H56N7O2...",Reaction_4434
4435,{InChI=1S/C11H19NO8/c1-4(10(16)17)19-9-7(12-5(...,{InChI=1S/C10H15N5O10P2/c11-8-5-9(13-2-12-8)15...,Reaction_4435
4436,{InChI=1S/C8H15NO6/c1-3(11)9-5-7(13)6(12)4(2-1...,{InChI=1S/C10H15N5O10P2/c11-8-5-9(13-2-12-8)15...,Reaction_4436
4437,{InChI=1S/C16H12O4/c1-19-12-5-2-10(3-6-12)14-9...,{InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,Reaction_4437


#### (b) Calcuating the sum of the molecular weights of all substrates and of all products:

In [18]:
df_reactions["MW_frac"] = np.nan

for ind in df_reactions.index:
    substrates = list(df_reactions["substrates"][ind])
    products = list(df_reactions["products"][ind])
    
    mw_subs = mw_mets(metabolites = substrates)
    mw_pros = mw_mets(metabolites = products)
    if mw_pros != 0:
        df_reactions["MW_frac"][ind] = mw_subs/mw_pros
    else:
        df_reactions["MW_frac"][ind] = np.inf
        
df_reactions












































Unnamed: 0,substrates,products,Reaction ID,MW_frac
0,{InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,{InChI=1S/C8H6O3/c9-7(8(10)11)6-4-2-1-3-5-6/h1...,Reaction_0,1.000000
1,"{InChI=1S/O2/c1-2, InChI=1S/C17H23N4O9P/c1-7-3...",{InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,Reaction_1,1.000000
2,{InChI=1S/C19H23N7O6/c20-19-25-15-14(17(30)26-...,"{InChI=1S/p+1, InChI=1S/C21H30N7O17P3/c22-17-1...",Reaction_2,1.000000
3,{InChI=1S/C16H28N2O11/c1-5(21)17-9-13(25)14(8(...,{InChI=1S/C8H15NO6/c1-3(11)9-5-7(13)6(12)4(2-1...,Reaction_3,2.000000
4,"{InChI=1S/C3H7O7P/c4-1-2(3(5)6)10-11(7,8)9/h2,...","{InChI=1S/C3H5O6P/c1-2(3(4)5)9-10(6,7)8/h1H2,(...",Reaction_4,1.000000
...,...,...,...,...
4434,{InChI=1S/C34H58N7O21P3S/c1-18(58-33-21(43)13-...,"{InChI=1S/H2O2/c1-2/h1-2H, InChI=1S/C34H56N7O2...",Reaction_4434,1.000000
4435,{InChI=1S/C11H19NO8/c1-4(10(16)17)19-9-7(12-5(...,{InChI=1S/C10H15N5O10P2/c11-8-5-9(13-2-12-8)15...,Reaction_4435,1.000000
4436,{InChI=1S/C8H15NO6/c1-3(11)9-5-7(13)6(12)4(2-1...,{InChI=1S/C10H15N5O10P2/c11-8-5-9(13-2-12-8)15...,Reaction_4436,1.000000
4437,{InChI=1S/C16H12O4/c1-19-12-5-2-10(3-6-12)14-9...,{InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,Reaction_4437,0.998668


#### (c) Mapping Sequence and Reaction IDs to kcat_df:

In [19]:
df_kcat = df_kcat.merge(df_sequences, on = "Sequence", how = "left")

In [20]:
df_reactions.rename(columns = {"substrates" : "substrate_InChI_set",
                              "products" : "product_InChI_set"}, inplace = True)

df_kcat["Reaction ID"] = np.nan
df_kcat["MW_frac"] = np.nan
for ind in df_kcat.index:
    sub_set, pro_set = df_kcat["substrate_InChI_set"][ind], df_kcat["product_InChI_set"][ind]
    
    help_df = df_reactions.loc[df_reactions["substrate_InChI_set"] == sub_set].loc[df_reactions["product_InChI_set"] == pro_set]
    if len(help_df) == 1:
        df_kcat["Reaction ID"][ind] = list(help_df["Reaction ID"])[0]
        df_kcat["MW_frac"][ind] = list(help_df["MW_frac"])[0]
df_kcat.head(2)

Unnamed: 0,Uniprot ID,kcat,Substrates,Products,PMID,substrate_IDs,product_IDs,checked,#UIDs,complete,...,from Sabio,EC,ORGANISM,BRENDA reaction ID,Sequence,substrate_InChI_set,product_InChI_set,Sequence ID,Reaction ID,MW_frac
0,P20932,2.8,(S)-Mandelate;Riboflavin-5-phosphate,Reduced FMN;alpha-Oxo-benzeneacetic acid,15311930,"[C01984, C00061]","[C02137, C01847]",False,1,True,...,1,,,,MSQNLFNVEDYRKLRQKRLPKMVYDYLEGGAEDEYGVKHNRDVFQQ...,{InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,{InChI=1S/C8H6O3/c9-7(8(10)11)6-4-2-1-3-5-6/h1...,Sequence_94,Reaction_0,1.0
1,P20932,0.05,(S)-Mandelate;Riboflavin-5-phosphate,Reduced FMN;alpha-Oxo-benzeneacetic acid,15311930,"[C01984, C00061]","[C02137, C01847]",False,1,True,...,1,,,,MSQNLFNVEDYRKLRQKRLPKMVYDYLEGGAEDEYGVKHNRDVFQQ...,{InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,{InChI=1S/C8H6O3/c9-7(8(10)11)6-4-2-1-3-5-6/h1...,Sequence_94,Reaction_0,1.0


#### (d) Creating a new DataFrame with one entry for every unique sequence-reaction pair:

##### (d)(i) Creating the DataFrame:

In [21]:
df_kcat_new = pd.DataFrame(data = {"Reaction ID" : df_kcat["Reaction ID"],
                                  "Sequence ID" : df_kcat["Sequence ID"]})
df_kcat_new = df_kcat_new.loc[~pd.isnull(df_kcat_new["Reaction ID"])].loc[~pd.isnull(df_kcat_new["Sequence ID"])]
df_kcat_new.drop_duplicates(inplace = True)
df_kcat_new.reset_index(inplace = True, drop = True)


df_kcat_new["kcat_values"], df_kcat_new["Uniprot IDs"] = "", ""
df_kcat_new["from_BRENDA"], df_kcat_new["from_Sabio"], df_kcat_new["from_Uniprot"] = "", "", ""
df_kcat_new["checked"] = ""

for ind in df_kcat_new.index:
    RID, SID = df_kcat_new["Reaction ID"][ind], df_kcat_new["Sequence ID"][ind]
    help_df = df_kcat.loc[df_kcat["Reaction ID"] == RID].loc[df_kcat["Sequence ID"] == SID]
    
    df_kcat_new["kcat_values"][ind] = list(help_df["kcat"])
    df_kcat_new["Uniprot IDs"][ind] = list(help_df["Uniprot ID"])
    df_kcat_new["from_BRENDA"][ind] = list(help_df["from BRENDA"])
    df_kcat_new["from_Sabio"][ind] = list(help_df["from Sabio"])
    df_kcat_new["from_Uniprot"][ind] = list(help_df["from Uniprot"])
    df_kcat_new["checked"][ind] = list(help_df["checked"])
df_kcat_new

Unnamed: 0,Reaction ID,Sequence ID,kcat_values,Uniprot IDs,from_BRENDA,from_Sabio,from_Uniprot,checked
0,Reaction_0,Sequence_94,"[2.8, 0.05, 0.11, 205.0, 2.3, 134.0, 360.0]","[P20932, P20932, P20932, P20932, P20932, P2093...","[0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 0]","[False, False, False, False, False, False, False]"
1,Reaction_1,Sequence_94,"[1.2, 3.4, 0.61, 0.07]","[P20932, P20932, P20932, P20932]","[0, 0, 0, 0]","[1, 1, 1, 1]","[0, 0, 0, 0]","[False, False, False, False]"
2,Reaction_2,Sequence_920,"[6.18, 14.5, 11.58, 13.12, 11.9, 13.98, 14.08,...","[P0ABQ4, P0ABQ4, P0ABQ4, P0ABQ4, P0ABQ4, P0ABQ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal..."
3,Reaction_4,Sequence_3142,"[57.1, 19.6, 5.96, 13.6, 26.4, 14.0, 41.1, 11....","[P25704, P25704, P25704, P25704, P25704, P2570...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[False, False, False, False, False, False, Fal..."
4,Reaction_5,Sequence_3626,"[2.98, 0.87]","[P04746, P04746]","[0, 0]","[1, 1]","[0, 0]","[False, False]"
...,...,...,...,...,...,...,...,...
7491,Reaction_4437,Sequence_78,[0.015],[Q6WNR0],[0],[0],[1],[False]
7492,Reaction_3888,Sequence_2870,[0.23],[P90994],[0],[0],[1],[False]
7493,Reaction_3888,Sequence_538,[1.0],[O16228],[0],[0],[1],[False]
7494,Reaction_4408,Sequence_469,"[5.3, 149.0]","[O06644, O06644]","[0, 0]","[0, 0]","[1, 1]","[False, False]"


##### (d)(ii): Adding sequence, substrates, and products to all data points

In [22]:
df_kcat_new["Sequence"], df_kcat_new["substrates"], df_kcat_new["products"], df_kcat_new["MW_frac"] = "", "", "", ""

for ind in df_kcat_new.index:
    RID, SID = df_kcat_new["Reaction ID"][ind], df_kcat_new["Sequence ID"][ind]
    help_df = df_reactions.loc[df_reactions["Reaction ID"] == RID]
    df_kcat_new["substrates"][ind], df_kcat_new["products"][ind] = list(help_df["substrate_InChI_set"])[0], list(help_df["product_InChI_set"])[0]
    df_kcat_new["MW_frac"][ind] = list(help_df["MW_frac"])[0]
    
    help_df = df_sequences.loc[df_sequences["Sequence ID"] == SID]
    df_kcat_new["Sequence"][ind] = list(help_df["Sequence"])[0]
    

##### (d)(iii) Calculating the maximal kcat value for every sequence and for every reaction:

In [23]:
df_all_kcat = pd.read_pickle(join("..", "..", "data", "kcat_data", "kcat_data_merged.pkl"))
df_all_kcat.head()

df_kcat_new["max_kcat_for_UID"]  = ""
df_kcat_new["max_kcat_for_RID"]  = ""

for ind in df_kcat_new.index:
    max_kcat = - np.inf
    UIDs = list(set(df_kcat_new["Uniprot IDs"][ind]))
    for UID in UIDs:
        all_kcat = list(df_all_kcat["kcat"].loc[df_all_kcat["Uniprot ID"] == UID])
        all_kcat = [float(kcat) for kcat in all_kcat]
        max_kcat = max(max_kcat, max(all_kcat))
    df_kcat_new["max_kcat_for_UID"][ind] = max_kcat


for ind in df_kcat_new.index:
    RID = df_kcat_new["Reaction ID"][ind]
    
    help_df = df_kcat_new.loc[df_kcat_new["Reaction ID"] == RID]
    all_kcat = []
    for ind2 in help_df.index:
        all_kcat = all_kcat + list(help_df["kcat_values"][ind2])
    all_kcat = [float(kcat) for kcat in all_kcat]
    max_kcat = max(all_kcat)
    df_kcat_new["max_kcat_for_RID"][ind] = max_kcat
df_kcat_new.head()

Unnamed: 0,Reaction ID,Sequence ID,kcat_values,Uniprot IDs,from_BRENDA,from_Sabio,from_Uniprot,checked,Sequence,substrates,products,MW_frac,max_kcat_for_UID,max_kcat_for_RID
0,Reaction_0,Sequence_94,"[2.8, 0.05, 0.11, 205.0, 2.3, 134.0, 360.0]","[P20932, P20932, P20932, P20932, P20932, P2093...","[0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 0]","[False, False, False, False, False, False, False]",MSQNLFNVEDYRKLRQKRLPKMVYDYLEGGAEDEYGVKHNRDVFQQ...,{InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,{InChI=1S/C8H6O3/c9-7(8(10)11)6-4-2-1-3-5-6/h1...,1.0,360.0,360.0
1,Reaction_1,Sequence_94,"[1.2, 3.4, 0.61, 0.07]","[P20932, P20932, P20932, P20932]","[0, 0, 0, 0]","[1, 1, 1, 1]","[0, 0, 0, 0]","[False, False, False, False]",MSQNLFNVEDYRKLRQKRLPKMVYDYLEGGAEDEYGVKHNRDVFQQ...,"{InChI=1S/O2/c1-2, InChI=1S/C17H23N4O9P/c1-7-3...",{InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,1.0,360.0,3.4
2,Reaction_2,Sequence_920,"[6.18, 14.5, 11.58, 13.12, 11.9, 13.98, 14.08,...","[P0ABQ4, P0ABQ4, P0ABQ4, P0ABQ4, P0ABQ4, P0ABQ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal...",MISLIAALAVDRVIGMENAMPWNLPADLAWFKRNTLNKPVIMGRHT...,{InChI=1S/C19H23N7O6/c20-19-25-15-14(17(30)26-...,"{InChI=1S/p+1, InChI=1S/C21H30N7O17P3/c22-17-1...",1.0,29.0,29.0
3,Reaction_4,Sequence_3142,"[57.1, 19.6, 5.96, 13.6, 26.4, 14.0, 41.1, 11....","[P25704, P25704, P25704, P25704, P25704, P2570...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[False, False, False, False, False, False, Fal...",MAMQKIFAREILDSRGNPTVEVDLHTAKGRFRAAVPSGASTGIYEA...,"{InChI=1S/C3H7O7P/c4-1-2(3(5)6)10-11(7,8)9/h2,...","{InChI=1S/C3H5O6P/c1-2(3(4)5)9-10(6,7)8/h1H2,(...",1.0,57.1,84.0
4,Reaction_5,Sequence_3626,"[2.98, 0.87]","[P04746, P04746]","[0, 0]","[1, 1]","[0, 0]","[False, False]",MKFFLLLFTIGFCWAQYSPNTQQGRTSIVHLFEWRWVDIALECERY...,{InChI=1S/C18H32O16/c19-1-4-7(22)8(23)12(27)17...,{InChI=1S/C12H22O11/c13-1-3-5(15)6(16)9(19)12(...,1.0,2.98,2.98


##### (d)(iv) Calculating the maximal kcat value for every EC number in the dataset:

In [24]:
df_kcat = df_kcat_new.copy()

#Using the txt file and the Uniprot mapping service to get an EC number for every enzyme:
df_EC = pd.read_csv(join("..", "..", "data", "enzyme_data", "Uniprot_results_EC.tab"), sep = "\t")
df_EC.head()

Unnamed: 0,Entry,EC number,Uniprot ID
0,P05091,1.2.1.3,P05091
1,Q5L1B7,1.7.1.13,Q5L1B7
2,A0A1C3HPT0,4.1.1.39,A0A1C3HPT0
3,D0RZL3,1.14.12.10,D0RZL3
4,D0RZL4,1.14.12.10,D0RZL4


In [25]:
df_kcat.head()
df_kcat["ECs"] = ""
for ind in df_kcat.index:
    UID = df_kcat["Uniprot IDs"][ind][0]
    try:
        df_kcat["ECs"][ind] = list(df_EC["EC number"].loc[df_EC["Uniprot ID"] == UID])[0].split("; ")
    except:
        df_kcat["ECs"][ind] = []
df_kcat.head(2)

Unnamed: 0,Reaction ID,Sequence ID,kcat_values,Uniprot IDs,from_BRENDA,from_Sabio,from_Uniprot,checked,Sequence,substrates,products,MW_frac,max_kcat_for_UID,max_kcat_for_RID,ECs
0,Reaction_0,Sequence_94,"[2.8, 0.05, 0.11, 205.0, 2.3, 134.0, 360.0]","[P20932, P20932, P20932, P20932, P20932, P2093...","[0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 0]","[False, False, False, False, False, False, False]",MSQNLFNVEDYRKLRQKRLPKMVYDYLEGGAEDEYGVKHNRDVFQQ...,{InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,{InChI=1S/C8H6O3/c9-7(8(10)11)6-4-2-1-3-5-6/h1...,1.0,360.0,360.0,[1.1.99.31]
1,Reaction_1,Sequence_94,"[1.2, 3.4, 0.61, 0.07]","[P20932, P20932, P20932, P20932]","[0, 0, 0, 0]","[1, 1, 1, 1]","[0, 0, 0, 0]","[False, False, False, False]",MSQNLFNVEDYRKLRQKRLPKMVYDYLEGGAEDEYGVKHNRDVFQQ...,"{InChI=1S/O2/c1-2, InChI=1S/C17H23N4O9P/c1-7-3...",{InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,1.0,360.0,3.4,[1.1.99.31]


In [26]:
all_ECs = []
for ind in df_kcat.index:
    all_ECs = all_ECs + df_kcat["ECs"][ind]

all_ECs = list(set(all_ECs))

df_EC_kcat = pd.DataFrame({"EC" : all_ECs})
df_EC_kcat["max_kcat"] = np.nan

for ind in df_EC_kcat.index:
    try:
        kcat_max = get_max_for_EC_number(EC = df_EC_kcat["EC"][ind])
        df_EC_kcat["max_kcat"][ind] = kcat_max
        print(ind, kcat_max)
    except:
        pass
    
df_EC_kcat.to_pickle(join("..", "..", "data", "enzyme_data", "df_EC_max_kcat.pkl"))

0 16.7
1 667.0
2 2.5
3 38.4
4 158.0
5 nan
6 0.35
7 551.0
9 1.9
10 35.7
11 1570.0
12 nan
13 122.0
14 834.0
15 845.0
16 8.9
17 2.5
18 380.0
19 2208.0
20 5721.0
21 22.0
22 158.4
23 514.0
24 nan
25 nan
26 3.8
27 nan
28 130.0
29 1349.0
30 816.0
31 nan
32 19.4
33 1612.0
34 20.0
35 4080.0
36 200000.0
37 3.67
38 11.6
39 238.0
40 2200.0
41 2.8
42 1.3
43 115.0
44 8.82
45 636.0
46 412000.0
47 83.3
48 0.92
49 18.9
50 13.2
51 nan
52 0.019
53 7.26
54 70.9
55 1550.0
56 0.03
57 1.7
58 2.2
59 1414.0
60 nan
61 22.7
62 0.0475
63 1.7
64 70.0
65 79.0
66 761.0
67 187.5
68 3.6
69 0.0833
70 4410.0
71 770.0
72 nan
73 0.11
74 nan
75 0.072
76 443.8
77 0.038
78 nan
79 654.0
80 1214290.0
81 nan
82 120.0
83 nan
84 0.72
85 88.0
86 128.0
87 78300.0
88 536.0
89 2.97
90 0.83
91 nan
92 9400.0
93 246.0
94 1309.0
95 nan
96 7.18
97 4.853
98 2633.0
99 4.2
100 124.5
101 585.0
102 366000.0
103 4.7
104 1.7
105 467.0
106 nan
107 12.9
108 0.071
109 1800.0
110 8610.0
111 27.0
112 6.8
113 0.18
114 nan
115 8.7
116 330.0
117 5950.0


Mapping max EC kcat value to all data points:

In [27]:
df_EC_kcat = pd.read_pickle(join("..", "..", "data", "enzyme_data", "df_EC_max_kcat.pkl"))
df_kcat["max_kcat_for_EC"] = np.nan

for ind in df_kcat.index:
    ECs = df_kcat["ECs"][ind]
    max_kcat = 0
    for EC in ECs:
        try:
            max_kcat = max(max_kcat, list(df_EC_kcat["max_kcat"].loc[df_EC_kcat["EC"] == EC])[0])
        except:
            pass
    if max_kcat != 0:
        df_kcat["max_kcat_for_EC"][ind] = max_kcat
df_kcat.to_pickle(join("..", "..", "data", "kcat_data", "merged_and_grouped_kcat_dataset.pkl"))     

In [28]:
df_sequences.to_pickle(join("..", "..", "data", "enzyme_data", "all_sequences_with_IDs.pkl"))
df_reactions.to_pickle(join("..", "..", "data", "reaction_data", "all_reactions_with_IDs.pkl"))

## 5. Calculating reaction fingerprints (structural and difference) for every reaction and a ESM-1b/ESM-1b_ts vector for every amino acid sequence:

#### (a) Executing jupyter notebook A2 to calculate the reaction fingerprints and enzyme representations. Then loading the results

In [29]:
df_kcat = pd.read_pickle(join("..", "..", "data", "kcat_data", "merged_and_grouped_kcat_dataset.pkl"))     

In [30]:
df_sequences = pd.read_pickle(join("..", "..", "data", "enzyme_data", "all_sequences_with_IDs_and_ESM1b_ts.pkl"))
df_reactions = pd.read_pickle(join("..", "..", "data", "reaction_data", "all_reactions_with_IDs_and_FPs.pkl"))

#### (b) Mapping ESM-1b vectors and reaction fingerprints to kcat dataset:

In [31]:
df_kcat["structural_fp"], df_kcat["difference_fp"], df_kcat["ESM1b"], df_kcat["ESM1b_ts"] = "", "", "", ""

for ind in df_kcat.index:
    RID, SID = df_kcat["Reaction ID"][ind], df_kcat["Sequence ID"][ind]
    
    help_df = df_reactions.loc[df_reactions["Reaction ID"] == RID]
    df_kcat["structural_fp"][ind], df_kcat["difference_fp"][ind] = list(help_df["structural_fp"])[0], list(help_df["difference_fp"])[0]
    
    help_df = df_sequences.loc[df_sequences["Sequence ID"] == SID]
    df_kcat["ESM1b"][ind] = list(help_df["ESM1b"])[0]
    df_kcat["ESM1b_ts"][ind] = list(help_df["ESM1b_ts"])[0]
    
n = len(df_kcat)
#Remove values with missing reaction fingerprints or enzyme representation
df_kcat = df_kcat.loc[df_kcat["structural_fp"] != ""].loc[df_kcat["ESM1b"] != ""].loc[df_kcat["ESM1b_ts"] != ""]
print("Removing %s enzyme-reaction combinations because they either do not have a ESM1b vector or reaction fingerprint" % (n-len(df_kcat)))
df_kcat.to_pickle(join("..", "..", "data", "kcat_data", "merged_and_grouped_kcat_dataset_with_FPs_and_ESM1bs_ts.pkl"))
df_kcat.head()

Removing 26 enzyme-reaction combinations because they either do not have a ESM1b vector or reaction fingerprint


Unnamed: 0,Reaction ID,Sequence ID,kcat_values,Uniprot IDs,from_BRENDA,from_Sabio,from_Uniprot,checked,Sequence,substrates,products,MW_frac,max_kcat_for_UID,max_kcat_for_RID,ECs,max_kcat_for_EC,structural_fp,difference_fp,ESM1b,ESM1b_ts
0,Reaction_0,Sequence_94,"[2.8, 0.05, 0.11, 205.0, 2.3, 134.0, 360.0]","[P20932, P20932, P20932, P20932, P20932, P2093...","[0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 0]","[False, False, False, False, False, False, False]",MSQNLFNVEDYRKLRQKRLPKMVYDYLEGGAEDEYGVKHNRDVFQQ...,{InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,{InChI=1S/C8H6O3/c9-7(8(10)11)6-4-2-1-3-5-6/h1...,1.0,360.0,360.0,[1.1.99.31],550.0,1100110100001000000000110111010001000001111010...,"[0.0, 0.0, 0.0, 0.0, -10.0, 0.0, 0.0, 0.0, 0.0...","[0.14890721, 0.11512917, -0.09207664, 0.210020...","[0.26987514, -0.26356047, -0.30432516, -0.0128..."
1,Reaction_1,Sequence_94,"[1.2, 3.4, 0.61, 0.07]","[P20932, P20932, P20932, P20932]","[0, 0, 0, 0]","[1, 1, 1, 1]","[0, 0, 0, 0]","[False, False, False, False]",MSQNLFNVEDYRKLRQKRLPKMVYDYLEGGAEDEYGVKHNRDVFQQ...,"{InChI=1S/O2/c1-2, InChI=1S/C17H23N4O9P/c1-7-3...",{InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(1...,1.0,360.0,3.4,[1.1.99.31],550.0,1100010100000001010000110110000001000001111000...,"[0.0, 0.0, 0.0, 0.0, 10.0, 0.0, 0.0, 0.0, 0.0,...","[0.14890721, 0.11512917, -0.09207664, 0.210020...","[0.26987514, -0.26356047, -0.30432516, -0.0128..."
2,Reaction_2,Sequence_920,"[6.18, 14.5, 11.58, 13.12, 11.9, 13.98, 14.08,...","[P0ABQ4, P0ABQ4, P0ABQ4, P0ABQ4, P0ABQ4, P0ABQ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[False, False, False, False, False, False, Fal...",MISLIAALAVDRVIGMENAMPWNLPADLAWFKRNTLNKPVIMGRHT...,{InChI=1S/C19H23N7O6/c20-19-25-15-14(17(30)26-...,"{InChI=1S/p+1, InChI=1S/C21H30N7O17P3/c22-17-1...",1.0,29.0,29.0,[1.5.1.3],284.0,1100111100000001001000110110010001001101111100...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0221064, 0.11822142, 0.08944552, 0.02871313...","[0.8305805, -0.79579866, -0.023942132, -0.1448..."
3,Reaction_4,Sequence_3142,"[57.1, 19.6, 5.96, 13.6, 26.4, 14.0, 41.1, 11....","[P25704, P25704, P25704, P25704, P25704, P2570...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[False, False, False, False, False, False, Fal...",MAMQKIFAREILDSRGNPTVEVDLHTAKGRFRAAVPSGASTGIYEA...,"{InChI=1S/C3H7O7P/c4-1-2(3(5)6)10-11(7,8)9/h2,...","{InChI=1S/C3H5O6P/c1-2(3(4)5)9-10(6,7)8/h1H2,(...",1.0,57.1,84.0,[4.2.1.11],230.0,1000000000000000000000000000000001000000111000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.05887897, 0.14318994, -0.1467509, 0.0979277...","[1.0513978, 0.60419565, -0.12464851, 0.7865547..."
4,Reaction_5,Sequence_3626,"[2.98, 0.87]","[P04746, P04746]","[0, 0]","[1, 1]","[0, 0]","[False, False]",MKFFLLLFTIGFCWAQYSPNTQQGRTSIVHLFEWRWVDIALECERY...,{InChI=1S/C18H32O16/c19-1-4-7(22)8(23)12(27)17...,{InChI=1S/C12H22O11/c13-1-3-5(15)6(16)9(19)12(...,1.0,2.98,2.98,[3.2.1.1],40000.0,1100010100000000000000110110000001000001111000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.044303663, 0.07280991, -0.042034477, 0.065...","[-0.47599328, -0.12329675, -0.36591977, 0.8752..."


## 6.Removing outliers and non-natural reactions:

In [32]:
df_kcat = pd.read_pickle(join("..", "..", "data", "kcat_data", "merged_and_grouped_kcat_dataset_with_FPs_and_ESM1bs_ts.pkl"))

#### (a) Calculating the geometric mean and log10-transforming it for all enzyme-reaction pairs:
To ignore $k_{cat}$ values that were obtained under non-optimal conditions, we exclude values lower than 1\% than the maximal $k_{cat}$ value for the same enzyme-reaction combination.

In [33]:
df_kcat["geomean_kcat"] = np.nan
df_kcat["frac_of_max_UID"] = np.nan
df_kcat["frac_of_max_RID"] = np.nan
df_kcat["frac_of_max_EC"] = np.nan

for ind in df_kcat.index:
    all_kcat = np.array(df_kcat["kcat_values"][ind]).astype(float)
    max_kcat = max(all_kcat)
    all_kcat_top = [kcat for kcat in all_kcat  if kcat/max_kcat > 0.01]
    df_kcat["geomean_kcat"][ind] = np.mean(np.log10(all_kcat_top))
    
    df_kcat["frac_of_max_UID"][ind] =  np.max(np.array(df_kcat["kcat_values"][ind]).astype(float))/df_kcat["max_kcat_for_UID"][ind]
    df_kcat["frac_of_max_RID"][ind] =  np.max(np.array(df_kcat["kcat_values"][ind]).astype(float))/df_kcat["max_kcat_for_RID"][ind]
    df_kcat["frac_of_max_EC"][ind] = np.max(np.array(df_kcat["kcat_values"][ind]).astype(float))/df_kcat["max_kcat_for_EC"][ind]
df_kcat = df_kcat.loc[~pd.isnull(df_kcat["geomean_kcat"])]

len(df_kcat)

7470

#### (b) We are only interested in kcat values that were measured for the natural reaction of an enzyme:
To achieve this we exclude kcat values for an enzyme if another measurement exists for the same enzyme but for different reaction with a kcat value that is more than ten times higher. Furthermore, to exlcude data points measured under non-optimal conditions and for non-natural reactions, we exclude kcat values if we could find a kcat value for the same reaction or same EC number that is more than 100 times higher.

In [34]:
n = len(df_kcat)

df_kcat = df_kcat.loc[df_kcat["frac_of_max_UID"] > 0.1]
df_kcat = df_kcat.loc[df_kcat["frac_of_max_RID"] > 0.01]

df_kcat["frac_of_max_EC"].loc[pd.isnull(df_kcat["frac_of_max_EC"])] = 1
df_kcat = df_kcat.loc[df_kcat["frac_of_max_EC"] < 10]
df_kcat = df_kcat.loc[df_kcat["frac_of_max_EC"] > 0.01]

In [35]:
print("We remove %s data points, because we suspect that these kcat values were not measure for the natural reaction " \
    "of an enzyme or under non-optimal conditions." % (n-len(df_kcat)))

We remove 3103 data points, because we suspect that these kcat values were not measure for the natural reaction of an enzyme or under non-optimal conditions.


#### (c) Removing data points with reaction queations with uneven fraction of molecular weights

In [36]:
n = len(df_kcat)

df_kcat = df_kcat.loc[df_kcat["MW_frac"] < 3]
df_kcat = df_kcat.loc[df_kcat["MW_frac"] > 1/3]

print("We remove %s data points because the sum of molecular weights of substrates does not match the sum of molecular" \
      "weights of the products." % (n-len(df_kcat)))

We remove 52 data points because the sum of molecular weights of substrates does not match the sum of molecularweights of the products.


#### (d) Removing data points with outlying kcat values:

In [37]:
n = len(df_kcat)

df_kcat = df_kcat.loc[~(df_kcat["geomean_kcat"]>5)]
df_kcat = df_kcat.loc[~(df_kcat["geomean_kcat"]<-2.5)]

print("We remove %s data point because their kcat values are outliers." % (n-len(df_kcat)))

We remove 55 data point because their kcat values are outliers.


In [38]:
print("Size of final kcat dataset: %s" % len(df_kcat))
df_kcat.to_pickle(join("..", "..", "data", "kcat_data", "final_kcat_dataset.pkl"))

Size of final kcat dataset: 4260


## 7. Splitting the dataset into training and test set:

#### (a) Splitting the dataset in such a way that the same enzyme does not occur in the training and the test set:

Shuffling DataFrame:

In [39]:
df = df_kcat.copy()
df = df.sample(frac = 1, random_state = 123)
df.reset_index(drop= True, inplace = True)

Splitting dataset

In [40]:
train_df, test_df = split_dataframe_enzyme(frac = 5, df = df.copy())
print("Test set size: %s" % len(test_df))
print("Training set size: %s" % len(train_df))
print("Size of test set in percent: %s" % np.round(100*len(test_df)/ (len(test_df) + len(train_df))))


train_df.reset_index(inplace = True, drop = True)
test_df.reset_index(inplace = True, drop = True)

train_df.to_pickle(join("..", "..", "data", "kcat_data", "splits", "train_df_kcat.pkl"))
test_df.to_pickle(join("..", "..", "data", "kcat_data", "splits", "test_df_kcat.pkl"))

Test set size: 869
Training set size: 3391
Size of test set in percent: 20.0


#### (b) Splitting the training set into 5 folds for 5-fold cross-validations (CVs):
In order to achieve a model that generalizes well during CV, we created the 5 folds in such a way that neither the same enzyme nor the same reaction occurs in two different subsets.

In [41]:
train_df = pd.read_pickle(join("..", "..", "data", "kcat_data", "splits", "train_df_kcat.pkl"))
test_df = pd.read_pickle(join("..", "..", "data", "kcat_data", "splits", "test_df_kcat.pkl"))

In [42]:
data_train2 = train_df.copy()
data_train2["index"] = list(data_train2.index)

data_train2, df_fold = split_dataframe_enzyme(df = data_train2, frac=5)
indices_fold1 = list(df_fold["index"])
print(len(data_train2), len(indices_fold1))#

data_train2, df_fold = split_dataframe_enzyme(df = data_train2, frac=4)
indices_fold2 = list(df_fold["index"])
print(len(data_train2), len(indices_fold2))

data_train2, df_fold = split_dataframe_enzyme(df = data_train2, frac=3)
indices_fold3 = list(df_fold["index"])
print(len(data_train2), len(indices_fold3))

data_train2, df_fold = split_dataframe_enzyme(df = data_train2, frac=2)
indices_fold4 = list(df_fold["index"])
indices_fold5 = list(data_train2["index"])
print(len(data_train2), len(indices_fold4))


fold_indices = [indices_fold1, indices_fold2, indices_fold3, indices_fold4, indices_fold5]

train_indices = [[], [], [], [], []]
test_indices = [[], [], [], [], []]

for i in range(5):
    for j in range(5):
        if i != j:
            train_indices[i] = train_indices[i] + fold_indices[j]
    test_indices[i] = fold_indices[i]
    
    
np.save(join("..", "..", "data", "kcat_data", "splits", "CV_train_indices"), train_indices)
np.save(join("..", "..", "data", "kcat_data", "splits", "CV_test_indices"), test_indices)

2668 723
2000 668
1326 674
680 646


### Adding DRFPs:

In [43]:
train_df = pd.read_pickle(join("..", "..", "data", "kcat_data", "splits", "train_df_kcat.pkl"))
test_df = pd.read_pickle(join("..", "..", "data", "kcat_data", "splits", "test_df_kcat.pkl"))

In [45]:
import pickle5 as p
df_reaction = p.load(open(join("..", "..", "data", "reaction_data", "all_reactions_with_IDs_and_FPs.pkl"), "rb"))

In [46]:
train_df["DRFP"] = [list(df_reaction["DRFP"].loc[df_reaction["Reaction ID"] == R_ID])[0] for R_ID in train_df["Reaction ID"]]
test_df["DRFP"] = [list(df_reaction["DRFP"].loc[df_reaction["Reaction ID"] == R_ID])[0] for R_ID in test_df["Reaction ID"]]

In [47]:
train_df.head(1)

Unnamed: 0,Reaction ID,Sequence ID,kcat_values,Uniprot IDs,from_BRENDA,from_Sabio,from_Uniprot,checked,Sequence,substrates,...,max_kcat_for_EC,structural_fp,difference_fp,ESM1b,ESM1b_ts,geomean_kcat,frac_of_max_UID,frac_of_max_RID,frac_of_max_EC,DRFP
0,Reaction_3749,Sequence_432,[2.0],[Q1NAJ2],[0],[0],[1],[False],MPKIIDAKVIITCPGRNFVTLKIMTDEGVYGLGDATLNGRELAVAS...,{InChI=1S/C6H12O7/c7-1-2(8)3(9)4(10)5(11)6(12)...,...,5.88,1100000000000000000000000000000001000000001000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.009050991, 0.050203744, -0.014417422, 0.020...","[0.8609361, -0.39274246, -0.08324536, 1.029622...",0.30103,1.0,0.5,0.340136,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [48]:
sequences = df_kcat[["Sequence ID","Sequence"]].drop_duplicates().sort_values(by = "Sequence ID").reset_index(drop=True)
sequences = dict(zip(sequences["Sequence ID"].tolist(),sequences["Sequence"].tolist()))
with open("../../data/seq.fasta","w") as out_file:
    for name in sequences.keys():
        out_file.write(f">{name}\n{sequences[name]}\n")

In [50]:
def get_embedding(protein):
    return load(f"../../data/esm_embeddings/{protein}.pt")["mean_representations"][36].tolist()

train_df["ESM2_3B"] = train_df["Sequence ID"].apply(lambda x: get_embedding(x))
test_df["ESM2_3B"] = test_df["Sequence ID"].apply(lambda x: get_embedding(x))

In [51]:
train_df.to_pickle(join("..", "..", "data", "kcat_data", "splits", "train_df_kcat.pkl"))
test_df.to_pickle(join("..", "..", "data", "kcat_data", "splits", "test_df_kcat.pkl"))

In [53]:
train_df.head(1)

Unnamed: 0,Reaction ID,Sequence ID,kcat_values,Uniprot IDs,from_BRENDA,from_Sabio,from_Uniprot,checked,Sequence,substrates,...,structural_fp,difference_fp,ESM1b,ESM1b_ts,geomean_kcat,frac_of_max_UID,frac_of_max_RID,frac_of_max_EC,DRFP,ESM2_3B
0,Reaction_3749,Sequence_432,[2.0],[Q1NAJ2],[0],[0],[1],[False],MPKIIDAKVIITCPGRNFVTLKIMTDEGVYGLGDATLNGRELAVAS...,{InChI=1S/C6H12O7/c7-1-2(8)3(9)4(10)5(11)6(12)...,...,1100000000000000000000000000000001000000001000...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.009050991, 0.050203744, -0.014417422, 0.020...","[0.8609361, -0.39274246, -0.08324536, 1.029622...",0.30103,1.0,0.5,0.340136,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.029736047610640526, -0.031472790986299515,..."
