In [66]:
import pandas as pd
import numpy as np
from indigo import Indigo
import pickle
from pathlib import Path
from openpyxl import load_workbook
from rdkit import Chem

# Inputs

### A dictionary mapping the excel substrates to SMILES

In [71]:
name_2_smiles_orig = {
    "(2E,6E)-FPP": "CC(C)=CCC\\C(C)=C\\CC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O",
    "(2E,6E,10E)-GGPP": "CC(C)=CCC\\C(C)=C\\CC\\C(C)=C\\CC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O",
    "GGPP": "CC(C)=CCC\\C(C)=C\\CC\\C(C)=C\\CC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O",
    "(+)-copalyl diphosphate": "[H][C@@]12CCC(=C)[C@H](CC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O)[C@@]1(C)CCCC2(C)C",
    "(2E,6E)-FPP; isopentenyl PP": "CC(C)=CCC\\C(C)=C\\CC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O.CC(=C)CCOP(=O)(O)OP(=O)(O)O",
    "(2E)-GPP; isopentenyl PP": "CC(C)=CCC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O.CC(=C)CCOP(=O)(O)OP(=O)(O)O",
    "dimethylallyl PP; isopentenyl PP": "CC(=CCOP(=O)([O-])OP(=O)([O-])[O-])C.CC(=C)CCOP(=O)(O)OP(=O)(O)O",
    "(2E)-GPP": "CC(C)=CCC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O",
    "ent-copalyl diphosphate": "[C@@H]1(CC/C(/C)=C/COP(OP(=O)([O-])[O-])(=O)[O-])C(=C)CC[C@]2([C@@]1(C)CCCC2(C)C)[H]",
    "(2Z,6E)-FPP": "CC(C)=CCC\\C(C)=C\\CC\\C(C)=C/COP([O-])(=O)OP([O-])([O-])=O",
    "(S)-2,3-epoxysqualene": "CC(C)=CCC\\C(C)=C\\CC\\C(C)=C\\CC\\C=C(/C)CC\\C=C(/C)CC[C@@H]1OC1(C)C",
    "peregrinol PP": "[C@@]1(CC/C(/C)=C/COP(OP(=O)([O-])[O-])(=O)[O-])([C@@H](CC[C@@]2([C@]1(C)CCCC2(C)C)[H])C)O",
    "copal-8-ol diphosphate(3−)": "[H][C@@]12CC[C@@](C)(O)[C@H](CC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O)[C@@]1(C)CCCC2(C)C",
    "NPP": "CC(=CCC/C(=C\\COP(=O)(O)OP(=O)(O)O)/C)C",
    "9α-copalyl PP": "[H][C@@]12CCC(=C)[C@@H](CC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O)[C@@]1(C)CCCC2(C)C",
    "(2Z,6Z)-FPP": "CC(C)=CCC\\C(C)=C/CC\\C(C)=C/COP([O-])(=O)OP([O-])([O-])=O",
    "ent-copal-8-ol diphosphate(3−)": "[C@@H]1(CC/C(/C)=C/COP(OP(=O)([O-])[O-])(=O)[O-])[C@](CC[C@]2([C@@]1(C)CCCC2(C)C)[H])(O)C",
    "9α-copalyl diphosphate": "[H][C@@]12CCC(=C)[C@@H](CC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O)[C@@]1(C)CCCC2(C)C",
    "(2E,6E,10E,14E)-GFPP": "CC(C)=CCC\\C(C)=C\\CC\\C(C)=C\\CC\\C(C)=C\\CC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O",
    "(R)-tetraprenyl-β-curcumene": "C[C@H](CC\\C=C(/C)CC\\C=C(/C)CC\\C=C(/C)CC\\C=C(/C)CCC=C(C)C)C1=CCC(C)=CC1",
    "(E)-2-MeGPP": "CC(C)=CCC\\C(C)=C(/C)COP([O-])(=O)OP([O-])([O-])=O",
    "dimethylallyl PP; 4 isopentenyl PP": "CC(=C)CCOP(=O)(O)OP(=O)(O)O.CC(=C)CCOP(=O)(O)OP(=O)(O)O.CC(=C)CCOP(=O)(O)OP(=O)(O)O.CC(=C)CCOP(=O)(O)OP(=O)(O)O.CC(=CCOP(=O)([O-])OP(=O)([O-])[O-])C",
    "(3S,22S)-2,3:22,23-diepoxy-2,3,22,23-tetrahydrosqualene": "C(C/C=C(/CC/C=C(/CC[C@H]1C(O1)(C)C)\\C)\\C)/C=C(/CC/C=C(/CC[C@H]2C(O2)(C)C)\\C)\\C",
    "pre-α-onocerin": "C1C[C@@H](C([C@]2([C@]1([C@H](C(CC2)=C)CC/C=C(/CC/C=C(/CC[C@H]3C(O3)(C)C)\\C)\\C)C)[H])(C)C)O",
    "dimethylallyl PP": "CC(=CCOP(=O)([O-])OP(=O)([O-])[O-])C",
    "(R,R)-chrysanthemyl diphosphate": "CC(C)=C[C@@H]1[C@@H](COP([O-])(=O)OP([O-])([O-])=O)C1(C)C",
    "(R)-lavandulyl diphosphate(3−)": "C(=CC[C@@H](COP([O-])(=O)OP(=O)([O-])[O-])C(C)=C)(C)C",
    "PPP": "CC(C)CCC[C@@H](C)CCC[C@@H](C)CCC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O",
    "HexPP": "CC(C)=CCC\\C(C)=C\\CC\\C(C)=C\\CC\\C(C)=C\\CC\\C(C)=C\\CC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O",
    "HepPP": "CC(C)=CCC\\C(C)=C\\CC\\C(C)=C\\CC\\C(C)=C\\CC\\C(C)=C\\CC\\C(C)=C\\CC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O",
    "presqualene PP": "CC(C)=CCC\\C(C)=C\\CC\\C(C)=C\\[C@H]1[C@H](COP([O-])(=O)OP([O-])([O-])=O)[C@@]1(C)CC\\C=C(/C)CCC=C(C)C",
    "isopentenyl PP; (2E,6E,10E)-GGPP": "CC(C)=CCC\\C(C)=C\\CC\\C(C)=C\\CC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O.CC(=C)CCOP(=O)(O)OP(=O)(O)O",
    "isopentenyl PP; (2E,6E)-FPP": "CC(C)=CCC\\C(C)=C\\CC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O.CC(=C)CCOP(=O)(O)OP(=O)(O)O",
    "isopentenyl PP; (2E)-GPP": "CC(C)=CCC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O.CC(=C)CCOP(=O)(O)OP(=O)(O)O",
    "prephytoene PP": "CC(C)=CCC\\C(C)=C\\CC\\C(C)=C\\CC\\C(C)=C\\C1C(COP(O)(=O)OP(O)(O)=O)C1(C)CC\\C=C(/C)CC\\C=C(/C)CCC=C(C)C",
    "(2E,6E,10E)-GGPP; isopentenyl PP": "CC(C)=CCC\\C(C)=C\\CC\\C(C)=C\\CC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O.CC(=C)CCOP(=O)(O)OP(=O)(O)O",
    "Unknown": "Unknown",
    "isopentenyl PP": "CC(=C)CCOP(=O)(O)OP(=O)(O)O",
    "γ-carotene": "CC1=C(C(CCC1)(C)C)C=CC(=CC=CC(=CC=CC=C(C)C=CC=C(C)C=CC=C(C)CCC=C(C)C)C)C",
    "CDP": "CC(=CCOP(=O)(O)OP(=O)(O)O)CCC1C(=C)CCC2C1(CCCC2(C)C)C",
    "(2E)-GPP + IPP": "CC(C)=CCC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O.CC(=C)CCOP(=O)(O)OP(=O)(O)O",
}

### The main excel table.

In [72]:
df_main = pd.read_excel("../data/TPS-database_Nov19_23.xlsx")

In [73]:
df_main.columns

Index(['Uniprot ID', 'Name', 'Amino acid sequence', 'Species',
       'Kingdom (plant, fungi, bacteria)', 'Type (mono, sesq, di, …)',
       'Class (I or II)', 'Substrate (including stereochemistry)',
       'Substrate ChEBI ID', 'Cofactors', 'Name of product', 'Product %',
       'Product is major', 'Chemical formula of product',
       'SMILES of product (including stereochemistry)', 'Product ChEBI ID',
       'Cyclase', 'Fragment', 'Experimentally characterized', 'Reaction',
       'Notes', 'Publication'],
      dtype='object')

In [74]:
df_main.groupby("Type (mono, sesq, di, …)")["Uniprot ID"].nunique()

Type (mono, sesq, di, …)
Negative           6
di               166
di-int            73
fpps              92
gfpps             27
ggpps             90
gpps              69
hemi               4
hsqs               3
meroterpenoid      2
mono             253
mono-int           1
sesq             540
sesquar            2
sester            49
tetra             25
tetra-int          4
tri              171
tri-int           10
Name: Uniprot ID, dtype: int64

### Cleaning the database based on manual comments about the entries.

In [75]:
workbook = load_workbook("../data/TPS-database_Nov19_23.xlsx")
sheet1 = workbook.sheetnames[0]
worksheet = workbook[sheet1]

blacklisted_not_characterized = set()
for row in worksheet.iter_rows():
    cell = row[0]
    if cell.comment:
        if (
            "not characterized" in cell.comment.text
            or "not characetrized" in cell.comment.text
            or "could not verify" in cell.comment.text
            or "not find" in cell.comment.text
            or "ques-tion mark also remains over the direct products"
            in cell.comment.text
            or "wrongly annotated" in cell.comment.text
            or "full length protein was not active" in cell.comment.text
        ):
            blacklisted_not_characterized.add(cell.value)

In [76]:
df_main = df_main[~df_main["Uniprot ID"].isin(blacklisted_not_characterized)]

In [78]:
df_main.isnull().sum()

Uniprot ID                                          0
Name                                               55
Amino acid sequence                                 8
Species                                             4
Kingdom (plant, fungi, bacteria)                    3
Type (mono, sesq, di, …)                            5
Class (I or II)                                  2257
Substrate (including stereochemistry)              15
Substrate ChEBI ID                                 26
Cofactors                                        2180
Name of product                                     7
Product %                                        1700
Product is major                                 2167
Chemical formula of product                        29
SMILES of product (including stereochemistry)      26
Product ChEBI ID                                   42
Cyclase                                            97
Fragment                                         2369
Experimentally characterized

In [79]:
df_main.loc[
    df_main["Type (mono, sesq, di, …)"] == "Negative",
    "Substrate (including stereochemistry)",
] = "Negative"
name_2_smiles_orig.update({"Negative": "Negative"})

In [80]:
df_main["OK row flag"] = (
    ~df_main[
        [
            "Amino acid sequence",
            "Substrate (including stereochemistry)",
            # "SMILES of product (including stereochemistry)",
        ]
    ]
    .isnull()
    .any(axis=1)
    .astype(int)
)
df_main.loc[~df_main["Fragment"].isnull(), "OK row flag"] = 0

In [82]:
df_main["SMILES of substrate"] = df_main["Substrate (including stereochemistry)"].map(
    name_2_smiles_orig
)
df_main["SMILES of substrate"] = df_main["SMILES of substrate"].fillna("Unknown")

In [83]:
name_2_smiles_additional = {
    "(2E,6E)-FPP + IPP": "CC(C)=CCC\\C(C)=C\\CC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O.CC(=C)CCOP([O-])(=O)OP([O-])([O-])=O",
    "(2E)-GPP + IPP": "CC(C)=CCC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O.CC(=C)CCOP([O-])(=O)OP([O-])([O-])=O",
    "dimethylallyl PP+ IPP": "CC(=CCOP(=O)([O-])OP(=O)([O-])[O-])C.CC(=C)CCOP(=O)(O)OP(=O)(O)O",
    "squalene": "CC(C)=CCC\C(C)=C\CC\C(C)=C\CC\C=C(/C)CC\C=C(/C)CCC=C(C)C",
    "(13E)-labda-7,13-dien-15-yl PP": "[C@H]1(CC/C(/C)=C/COP(OP(=O)(O)O)(=O)O)C(C)=CC[C@@]2([C@]1(C)CCCC2(C)C)[H]",
    "all-trans-lycopene": "CC(C)=CCC\C(C)=C\C=C\C(C)=C\C=C\C(C)=C\C=C\C=C(C)\C=C\C=C(C)\C=C\C=C(/C)CCC=C(C)C",
    "dimethylallyl PP + isopentenyl PP": "CC(=CCOP(=O)([O-])OP(=O)([O-])[O-])C.CC(=C)CCOP(=O)(O)OP(=O)(O)O",
    "(2E,6E)-FPP;  isopentenyl PP": "CC(C)=CCC\\C(C)=C\\CC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O.CC(=C)CCOP([O-])(=O)OP([O-])([O-])=O",
    "(2E,6E)-FPP ": "CC(C)=CCC\\C(C)=C\\CC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O",
    "GGPP": "CC(C)=CCC\\C(C)=C\\CC\\C(C)=C\\CC\\C(C)=C\\COP([O-])(=O)OP([O-])([O-])=O",
    "syn-CPP": "C/C(=C\COP(=O)(O)OP(=O)(O)O)/CC[C@@H]1C(=C)CC[C@H]2[C@]1(CCCC2(C)C)C",
    "CPP": "C/C(=C\COP(=O)(O)OP(=O)(O)O)/CC[C@@H]1C(=C)CC[C@H]2[C@]1(CCCC2(C)C)C",
    "syn-CDP": "CC(=CCOP(=O)(O)OP(=O)(O)O)CCC1C(=C)CCC2C1(CCCC2(C)C)C",
    "(2Z,6Z,10Z)-NNPP": "CC(C)=CCCC(/C)=C/CCC(/C)=C/CCC(/C)=C/COP([O-])(OP([O-])([O-])=O)=O",
}
name_2_smiles_orig.update(name_2_smiles_additional)

In [84]:
df_main["SMILES of substrate"] = df_main["Substrate (including stereochemistry)"].map(
    name_2_smiles_orig
)
df_main = df_main[
    np.logical_not(df_main["SMILES of substrate"].isnull())
    | (df_main["Type (mono, sesq, di, …)"] == "Negative")
]

### Adding hard negatives (single-point mutants without TPS activity)

In [86]:
col_2_vals = {
    "Uniprot ID": ["Q9PDX7", "P0C0L4", "P06238", "P20742", "Q9GYW4"],
    "Amino acid sequence": [
        "".join(
            """MRDRVAMMLRPLVRGWIPRAVLLLTVAFSFGCNRNHNGQLPQSSGEPVAVAKEPVKGFVL
VRAYPDQHDGELALALEFSQPLAATQEFDTLVRLEQDSGNHDGGWSLSDDAKTLRYPYVE
ADKHYTVLISAGLLAATGSRLGKPRKEPVYTGELDPVVGFASRGSILPARGSRGVPVVSV
NVPEVDVEFMRVREKALPAFLARYHKAGQRSSWELSNQGNSRKRLSELADPVYVTRFVLD
GKKNERALTYLPIQSIRELREPGLYFAVMKPTGSFSDAFETAFFSVSNIGLHTRAYKDKL
FVHTASLRSGNPYKQVDLLVLDAKGETVLQGATDDNGNALLNYTLNAGHVLVSRNGRDIS
ILPFNQPALDLSEFAVAGRENPWFDVFAWSGRDLYRPGETLRISALLRDRDGKPVKPQPV
FLRLKQPDGKTFRETRLQPAEQGYLEFTQKIPSDAPTGRWRVEFRTDPASKEAVQGLAVR
VEEFLPERMKLELSSAQPVLRAKAPFTLTADAAYLYGAPAAGNRFTANLAVAVEQHPLDN
MPGWFFGDATLQLPRGAKETIDITLGADGHLVHDIVLPEEAKPVSPMAVVVSGSVYESGG
RPVTRSLKRVLWPADALVGVRPLFDVASGADANGMARFELTRVGVDGKPQSAKGLKATLV
RELRDYHWRYSDGRWDYDFTRRFENKETRTVDISTSHTTTLALPVEWGDYRLEVFDPVTG
LTMRYPFRAGWSWGDDNRGLDARPDKVKLALDKTSYRAGDTLKVTITPPHPGKGLLLVES
DKPLYVQAIDANPSTTLEIPVTADWERHDVYVTALVFRGGSASNNTTPARAVGEVYVPMQ
RKDRRVAVGLVVPKQMRPAQSLPVTVSVPELAGKQAHVTISAVDAGILNITGFPVPDAAA
HFFAQRRLSVDAYDIYGRVIESFEGGTGRLKFGGDMALPPLPQAKRPTARSQTVDLFSGA
VKLDAKGNAHIQLPVPDFNGALRVSALVYSDTRYGQRDAETVVRAPILAEASMPRVMAPG
DRSTVTVDVQNFTGKQGKFAVKVEGVGPLAVAEAGRSVTLGIDGKTTLNFPLRALEGNSV
AQVRVRVEGNGSKAERHYDLPVRAAWPQGLRTQAHVLNVLAPIAFDPALAKGLMPDSVNA
RLSVSALAPIPFASVLQGVFEYPYGCAEQTASKGYAALWLDDATIKSLGIHGVTPAQRLE
RLEGALGRLASLQTMNGHFSMCGGNSDVNPVLTPYIAGFLLDAKDAGFAVSDAVLQKALN
RLSEDLLSGAHLFYGNDQSEALMFAHQAWSGYVLARVNRAPLGTLRTLYDNERGKAVSGL
SLVHLGVALSLQGDRKRGEAAIEAGFAKSEGGRPEVFGDYGSVIRDNALMIALVRAHGLA
KPAYEARVMALGRDLEARRRSGWLWLSTQEQVALAQLGRALLVDQKKQVSGTLYVGKQRE
EIAASRLIGRSFDAAALARGVRFVPQGDVPLYASFEVAGIPRQAPVSDDSQLLVVRRWYT
VDGKPWTPGPLKEGQALIVRVSVTSKQNMPDALLTDLLPAGLEIDNFNLGETRQWADVTV
DGIALSERANAADIKHEEFRDDRYVAMLQLTGGRTANLFYLVRAVTPGTYKVPPSLVEDM
YRPALRGTGRVAPATVTVVQP""".split()
        ),
        "".join(
            """MRLLWGLIWASSFFTLSLQKPRLLLFSPSVVHLGVPLSVGVQLQDVPRGQVVKGSVFLRN
PSRNNVPCSPKVDFTLSSERDFALLSLQVPLKDAKSCGLHQLLRGPEVQLVAHSPWLKDS
LSRTTNIQGINLLFSSRRGHLFLQTDQPIYNPGQRVRYRVFALDQKMRPSTDTITVMVEN
SHGLRVRKKEVYMPSSIFQDDFVIPDISEPGTWKISARFSDGLESNSSTQFEVKKYVLPN
FEVKITPGKPYILTVPGHLDEMQLDIQARYIYGKPVQGVAYVRFGLLDEDGKKTFFRGLE
SQTKLVNGQSHISLSKAEFQDALEKLNMGITDLQGLRLYVAAAIIESPGGEMEEAELTSW
YFVSSPFSLDLSKTKRHLVPGAPFLLQALVREMSGSPASGIPVKVSATVSSPGSVPEVQD
IQQNTDGSGQVSIPIIIPQTISELQLSVSAGSPHPAIARLTVAAPPSGGPGFLSIERPDS
RPPRVGDTLNLNLRAVGSGATFSHYYYMILSRGQIVFMNREPKRTLTSVSVFVDHHLAPS
FYFVAFYYHGDHPVANSLRVDVQAGACEGKLELSVDGAKQYRNGESVKLHLETDSLALVA
LGALDTALYAAGSKSHKPLNMGKVFEAMNSYDLGCGPGGGDSALQVFQAAGLAFSDGDQW
TLSRKRLSCPKEKTTRKKRNVNFQKAINEKLGQYASPTAKRCCQDGVTRLPMMRSCEQRA
ARVQQPDCREPFLSCCQFAESLRKKSRDKGQAGLQRALEILQEEDLIDEDDIPVRSFFPE
NWLWRVETVDRFQILTLWLPDSLTTWEIHGLSLSKTKGLCVATPVQLRVFREFHLHLRLP
MSVRRFEQLELRPVLYNYLDKNLTVSVHVSPVEGLCLAGGGGLAQQVLVPAGSARPVAFS
VVPTAAAAVSLKVVARGSFEFPVGDAVSKVLQIEKEGAIHREELVYELNPLDHRGRTLEI
PGNSDPNMIPDGDFNSYVRVTASDPLDTLGSEGALSPGGVASLLRLPRGCGEQTMIYLAP
TLAASRYLDKTEQWSTLPPETKDHAVDLIQKGYMRIQQFRKADGSYAAWLSRDSSTWLTA
FVLKVLSLAQEQVGGSPEKLQETSNWLLSQQQADGSFQDPCPVLDRSMQGGLVGNDETVA
LTAFVTIALHHGLAVFQDEGAEPLKQRVEASISKANSFLGEKASAGLLGAHAAAITAYAL
TLTKAPVDLLGVAHNNLMAMAQETGDNLYWGSVTGSQSNAVSPTPAPRNPSDPMPQAPAL
WIETTAYALLHLLLHEGKAEMADQASAWLTRQGSFQGGFRSTQDTVIALDALSAYWIASH
TTEERGLNVTLSSTGRNGFKSHALQLNNRQIRGLEEELQFSLGSKINVKVGGNSKGTLKV
LRTYNVLDMKNTTCQDLQIEVTVKGHVEYTMEANEDYEDYEYDELPAKDDPDAPLQPVTP
LQLFEGRRNRRRREAPKVVEEQESRVHYTVCIWRNGKVGLSGMAIADVTLLSGFHALRAD
LEKLTSLSDRYVSHFETEGPHVLLYFDSVPTSRECVGFEAVQEVPVGLVQPASATLYDYY
NPERRCSVFYGAPSKSRLLATLCSAEVCQCAEGKCPRQRRALERGLQDEDGYRMKFACYY
PRVEYGFQVKVLREDSRAAFRLFETKITQVLHFTKDVKAAANQMRNFLVRASCRLRLEPG
KEYLIMGLDGATYDLEGHPQYLLDSNSWIEEMPSERLCRSTRQRAACAQLNDFLQEYGTQ
GCQV""".split()
        ),
        "".join(
            """MGKHRLRSLALLPLLLRLLLLLLPTDASAPQKPIYMVMVPSLLHAGTPEKACFLFSHLNE
TVAVRVSLESVRGNQSLFTDLVVDKDLFHCTSFTVPQSSSDEVMFFTVQVKGATHEFRRR
STVLVKKKESLVFAQTDKPIYKPGQTVRFRVVSLDESFHPLNELIPLLYIQDPKNNRIAQ
WQNFNLEGGLKQLSFPLSSEPTQGSYKVVIRTESGRTVEHPFSVEEFVLPKFEVRVTVPE
TITILEEEMNVSVCGIYTYGKPVPGRVTVNICRKYSNPSNCFGEESVAFCEKLSQQLDGR
GCFSQLVKTKSFQLKRQEYEMQLDVHAKIQEEGTGVEETGKGLTKITRTITKLSFVNVDS
HFRQGIPFVGQVLLVDGRGTPIPYETIFIGADEANLYINTTTDKHGLARFSINTDDIMGT
SLTVRAKYKDSNACYGFRWLTEENVEAWHTAYAVFSPSRSFLHLESLPDKLRCDQTLEVQ
AHYILNGEAMQELKELVFYYLMMAKGGIVRAGTHVLPLKQGQMRGHFSILISMETDLAPV
ARLVLYAILPNGEVVGDTAKYEIENCLANKVDLVFRPNSGLPATRALLSVMASPQSLCGL
RAVDQSVLLMKPETELSASLIYDLLPVKDLTGFPQGADQREEDTNGCVKQNDTYINGILY
SPVQNTNEEDMYGFLKDMGLKVFTNSNIRKPKVCERLRDNKGIPAAYHLVSQSHMDAFLE
SSESPTETRRSYFPETWIWDLVVVDSAGVAEVEVTVPDTITEWKAGAFCLSNDTGLGLSP
VVQFQAFQPFFVELTMPYSVIRGEAFTLKATVLNYLPTCIRVAVQLEASPDFLAAPEEKE
QRSHCICMNQRHTASWAVIPKSLGNVNFTVSAEALNSKELCGNEVPVVPEQGKKDTIIKS
LLVEPEGLENEVTFNSLLCPMGAEVSELIALKLPSDVVEESARASVTVLGDILGSAMQNT
QDLLKMPYGCGEQNMVLFAPNIYVLDYLNETQQLTQEIKTKAIAYLNTGYQRQLNYKHRD
GSYSTFGDKPGRNHANTWLTAFVLKSFAQARKYIFIDEVHITQALLWLSQQQKDNGCFRS
SGSLLNNAMKGGVEDEVTLSAYITIALLEMSLPVTHPVVRNALFCLDTAWKSARGGAGGS
HVYTKALLAYAFALAGNQDTKKEILKSLDEEAVKEEDSVHWTRPQKPSVSVALWYQPQAP
SAEVEMTAYVLLAYLTTEPAPTQEDLTAAMLIVKWLTKQQNSHGGFSSTQDTVVALHALS
KYGSATFTRAKKAAQVTIHSSGTFSTKFQVNNNNQLLLQRVTLPTVPGDYTVKVTGEGCV
YLQTSLKYSVLPREEEFPFTVVVQTLPGTCEDPKAHTSFQISLNISYTGSRSESNMAIAD
VKMVSGFIPLKPTVKMLERSVHVSRTEVSNNHVLIYLDKVSNQTVNLSFTVQQDIPIRDL
KPAVVKVYDYYEKDEFAVAKYSAPCSTDYGNA""".split()
        ),
        "".join(
            """MRKDRLLHLCLVLLLILLSASDSNSTEPQYMVLVPSLLHTEAPKKGCVLLSHLNETVTVS
ASLESGRENRSLFTDLVAEKDLFHCVSFTLPRISASSEVAFLSIQIKGPTQDFRKRNTVL
VLNTQSLVFVQTDKPMYKPGQTVRFRVVSVDENFRPRNELIPLIYLENPRRNRIAQWQSL
KLEAGINQLSFPLSSEPIQGSYRVVVQTESGGRIQHPFTVEEFVLPKFEVKVQVPKIISI
MDEKVNITVCGEYTYGKPVPGLATVSLCRKLSRVLNCDKQEVCEEFSQQLNSNGCITQQV
HTKMLQITNTGFEMKLRVEARIREEGTDLEVTANRISEITNIVSKLKFVKVDSHFRQGIP
FFAQVLLVDGKGVPIPNKLFFISVNDANYYSNATTNEQGLAQFSINTTSISVNKLFVRVF
TVHPNLCFHYSWVAEDHQGAQHTANRVFSLSGSYIHLEPVAGTLPCGHTETITAHYTLNR
QAMGELSELSFHYLIMAKGVIVRSGTHTLPVESGDMKGSFALSFPVESDVAPIARMFIFA
ILPDGEVVGDSEKFEIENCLANKVDLSFSPAQSPPASHAHLQVAAAPQSLCALRAVDQSV
LLMKPEAELSVSSVYNLLTVKDLTNFPDNVDQQEEEQGHCPRPFFIHNGAIYVPLSSNEA
DIYSFLKGMGLKVFTNSKIRKPKSCSVIPSVSAGAVGQGYYGAGLGVVERPYVPQLGTYN
VIPLNNEQSSGPVPETVRSYFPETWIWELVAVNSSGVAEVGVTVPDTITEWKAGAFCLSE
DAGLGISSTASLRAFQPFFVELTMPYSVIRGEVFTLKATVLNYLPKCIRVSVQLKASPAF
LASQNTKGEESYCICGNERQTLSWTVTPKTLGNVNFSVSAEAMQSLELCGNEVVEVPEIK
RKDTVIKTLLVEAEGIEQEKTFSSMTCASGANVSEQLSLKLPSNVVKESARASFSVLGDI
LGSAMQNIQNLLQMPYGCGEQNMVLFAPNIYVLNYLNETQQLTQEIKAKAVGYLITGYQR
QLNYKHQDGSYSTFGERYGRNQGNTWLTAFVLKTFAQARSYIFIDEAHITQSLTWLSQMQ
KDNGCFRSSGSLLNNAIKGGVEDEATLSAYVTIALLEIPLPVTNPIVRNALFCLESAWNV
AKEGTHGSHVYTKALLAYAFSLLGKQNQNREILNSLDKEAVKEDNLVHWERPQRPKAPVG
HLYQTQAPSAEVEMTSYVLLAYLTAQPAPTSGDLTSATNIVKWIMKQQNAQGGFSSTQDT
VVALHALSRYGAATFTRTEKTAQVTVQDSQTFSTNFQVDNNNLLLLQQISLPELPGEYVI
TVTGERCVYLQTSMKYNILPEKEDSPFALKVQTVPQTCDGHKAHTSFQISLTISYTGNRP
ASNMVIVDVKMVSGFIPLKPTVKMLERSSSVSRTEVSNNHVLIYVEQVTNQTLSFSFMVL
QDIPVGDLKPAIVKVYDYYETDESVVAEYIAPCSTDTEHGNV""".split()
        ),
        "".join(
            """MWQFIRSRILTVIIFIGAAHGLLVVGPKFIRANQEYTLVISNFNSQLSKVDLLLKLEGET
DNGLSVLNVTKMVDVRRNMNRMINFNMPEELTAGNYKITIDGQRGFSFHKEAELVYLSKS
ISGLIQVDKPVFKPGDTVNFRVILLDTELKPPARVKSVYVTIRDPQRNVIRKWSTAKLYA
GVFESDLQIVPTPMLGVWNISVEVEGEELVSKTFEVKEYVLSTFDVQVMPSVIPLEEHQA
VNLTIEANYHFGKPVQGVAKVELYLDDDKLNQKKELTVYGKGQVELRFDNFAMDADQQDV
RVKVSFIEQYTNRTVVKQSQITVYRYAYRVELIKESPQFRPGLPFKCALQFTHHDGTPAK
GITGKVEVSDVGFETTTTSDNDGLIKLELQPSEGTEQLGINFNAVDGFFFYEDVNKVETV
TDAYIKLELKSPIKRNKLMRFMVTCTERMTFFVYYVMSKGNIIDAGFMRPNKQTKYLLQL
NATEKMIPKAKILIATVAGRTVVYDYADLDFQELRNNFDLSIDEQEIKPGRQIELSMSGR
PGAYVGLAAYDKALLLFNKNHDLFWEDIGQVFDGFHAINENEFDIFHSLGLFARTLDDIL
FDSANEKTGRNALQSGKPIGKLVSYRTNFQESWLWKNVSIGRSGSRKLIEVVPDTTTSWY
LTGFSIDPVYGLGIIKKPIQFTTVQPFYIVENLPYSIKRGEAVVLQFTLFNNLGAEYIAD
VTLYNVANQTEFVGRPDTDLSYTKSVSVPPKVGVPISFLIKARKLGEMAVRVKASIMLGH
ETDALEKVIRVMPESLAQPKMDTSFFCFDDYKNQTFPFNLDINKKADNGSKKIEFRLNPN
LLTMVIKNLDNLLAVPTGCGEQNMVKFVPNILVLDYLYATGSKEQHLIDKATNLLRQGYQ
NQMRYRQTDGSFGVWEKSGSSVFLTAFVATSMQTASKYMNDIDAAMVEKALDWLASKQHS
SGRFDETGKVWHKDMQGGLRNGVALTSYVLTALLENDIAKVKHAVVIQNGMNYLSNQLAF
INNPYDLSIATYAMMLNGHTMKKEALDKLIDMSISDNNKKERYWGTTNQIETTAYALLSF
VMAEKYLDGIPVMNWLVNQRYVTGSFPRTQDTFVGLKALTKLAEKISPSRNDYTVQLKYK
KNTKYFNINSEQIDVQNFLEIPEDTKKLEINVGGIGFGLLEVIYQFDLNLVNFEHRFKLD
LEKQNTGSDYELRLRVCANYIPELTDSQSNMALIEVTLPSGYVVDRNPISEQTTVNPIQN
MEIRYGGTSVVLYYYKMGTERNCFTVTAYRRFKVALKRPAYVVVYDYYNTNLNAIKVYEV
DKQNVCEICEEEDCPAECKK""".split()
        ),
    ],
    "SMILES of substrate": ["Negative" for _ in range(5)],
    "Substrate (including stereochemistry)": ["Negative" for _ in range(5)],
    "Type (mono, sesq, di, …)": ["Negative" for _ in range(5)],
}
negatives2 = pd.DataFrame(
    {col: col_2_vals.get(col, ["Unknown" for _ in range(5)]) for col in df_main.columns}
)
df_main = pd.concat((df_main, negatives2))

### Canonical SMILES

In [87]:
from indigo import Indigo


def _get_canonical_smiles(smiles: str, without_stereo: bool = True):
    try:
        if isinstance(smiles, float) or smiles in {"Unknown", "Negative"}:
            return smiles
        indigo = Indigo()
        smiles_canonical = []
        mol = indigo.loadMolecule(smiles.strip())
        if without_stereo:
            mol.clearCisTrans()
            mol.clearStereocenters()
        return mol.canonicalSmiles()
    except:
        return np.nan


df_main["SMILES_substrate_canonical_no_stereo"] = df_main["SMILES of substrate"].map(
    _get_canonical_smiles
)

In [88]:
df_main["SMILES_product_canonical_no_stereo"] = "Unknown"
bool_idx = df_main["SMILES of product (including stereochemistry)"].map(
    lambda x: isinstance(x, str) and "k" not in x and "t" not in x
)
df_main.loc[bool_idx, "SMILES_product_canonical_no_stereo"] = df_main.loc[
    bool_idx, "SMILES of product (including stereochemistry)"
].map(_get_canonical_smiles)

In [102]:
# fixing multi-molecule substrates
df_main.loc[
    (df_main["Type (mono, sesq, di, …)"] == "tetra")
    & (
        df_main["SMILES_substrate_canonical_no_stereo"]
        == "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O"
    ),
    "SMILES_substrate_canonical_no_stereo",
] = "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O.CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O"

df_main.loc[
    (df_main["Type (mono, sesq, di, …)"] == "tetra")
    & (
        df_main["SMILES_substrate_canonical_no_stereo"]
        == "CC(C)CCCC(C)CCCC(C)CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O"
    ),
    "SMILES_substrate_canonical_no_stereo",
] = "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O.CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O"

df_main.loc[
    (df_main["Type (mono, sesq, di, …)"] == "tetra-int")
    & (
        df_main["SMILES_substrate_canonical_no_stereo"]
        == "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O"
    ),
    "SMILES_substrate_canonical_no_stereo",
] = "CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O.CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O"

df_main.loc[
    (df_main["Type (mono, sesq, di, …)"] == "tri")
    & (
        df_main["SMILES_substrate_canonical_no_stereo"]
        == "CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O"
    ),
    "SMILES_substrate_canonical_no_stereo",
] = "CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O.CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O"

df_main.loc[
    (df_main["Type (mono, sesq, di, …)"] == "tri-int")
    & (
        df_main["SMILES_substrate_canonical_no_stereo"]
        == "CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O"
    ),
    "SMILES_substrate_canonical_no_stereo",
] = "CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O.CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([O-])=O"

### Checking the number of carbons in substrate and product

In [103]:
def get_carbon_number(smiles):
    m = Chem.MolFromSmiles(smiles)
    patt = Chem.MolFromSmarts("[C]")
    pm = m.GetSubstructMatches(patt)
    return len(pm)


# original Jung's functions
def check_if_c_equal_in_2lists(string_list1, string_list2):
    ok_flag_list = []

    for s1, s2 in zip(string_list1, string_list2):
        if isinstance(s1, float) or isinstance(s2, float):
            ok_flag_list.append(False)
        else:
            c1, c2 = s1.lower().count("c"), s2.lower().count("c")
            ok_flag_list.append(True if c1 == c2 else False)
    return np.array(ok_flag_list)


def check_if_double_c_equal_in_2lists(string_list1, string_list2):
    ok_flag_list = []

    for s1, s2 in zip(string_list1, string_list2):
        if isinstance(s1, float) or isinstance(s2, float):
            ok_flag_list.append(False)
        else:
            c1, c2 = s1.lower().count("c"), s2.lower().count("c")
            ok_flag_list.append(True if c1 == 2 * c2 else False)
    return np.array(ok_flag_list)


valid_equal_c_np = check_if_c_equal_in_2lists(
    df_main["SMILES_product_canonical_no_stereo"].values,
    df_main["SMILES_substrate_canonical_no_stereo"].values,
)
df_main = df_main.loc[valid_equal_c_np]

### Cleaning the sequences

In [18]:
df_main = df_main.loc[
    df_main["Amino acid sequence"].map(lambda x: not isinstance(x, float))
]
df_main["Amino acid sequence"] = df_main["Amino acid sequence"].map(
    lambda x: x.replace("*", "").replace('"', "").replace("'", "")
)
df_main["Amino acid sequence"] = df_main["Amino acid sequence"].map(
    lambda x: "".join(x.split())
)

### Removing rows with crucial values missing, preserving full sequences only

In [19]:
df_main["OK row flag"] = (
    (
        ~df_main[
            [
                "Amino acid sequence",
                "SMILES_substrate_canonical_no_stereo",
                # "SMILES_product_canonical_no_stereo",
            ]
        ].isnull()
    )
    .any(axis=1)
    .astype(int)
)
df_main.loc[
    df_main["Fragment"].map(lambda x: isinstance(x, str) and x.lower() == "yes"),
    "OK row flag",
] = 0
df_main = df_main[df_main["OK row flag"] == 1]

In [20]:
df_main["is_substrate_predicted"] = 1
df_main.to_csv("../data/TPS-Nov19_2023_verified_all_reactions.csv", index=None)

### Preserving only major reactions

In [21]:
df_products_count = (
    df_main.groupby(["Uniprot ID", "SMILES_substrate_canonical_no_stereo"])[
        "SMILES_product_canonical_no_stereo"
    ]
    .count()
    .reset_index()
)

In [22]:
single_product_ids = set(
    df_products_count.loc[
        df_products_count["SMILES_product_canonical_no_stereo"] == 1,
        ["Uniprot ID", "SMILES_substrate_canonical_no_stereo"],
    ]
    .apply(
        lambda x: f'{x["Uniprot ID"]}_{x["SMILES_substrate_canonical_no_stereo"]}',
        axis=1,
    )
    .values
)

In [23]:
df_main["Multi-product TPS reaction"] = 1
df_main.loc[
    df_main[["Uniprot ID", "SMILES_substrate_canonical_no_stereo"]]
    .apply(
        lambda x: f'{x["Uniprot ID"]}_{x["SMILES_substrate_canonical_no_stereo"]}',
        axis=1,
    )
    .isin(single_product_ids),
    "Multi-product TPS reaction",
] = 0

In [24]:
major_product_data_bool_idx = (
    (df_main["Multi-product TPS reaction"] == 0)
    & (df_main["Product is major"].str.lower() != "no")
) | (
    (df_main["Multi-product TPS reaction"] == 1)
    & (df_main["Product is major"].str.lower() == "yes")
)

In [25]:
df_main.drop("Multi-product TPS reaction", axis=1, inplace=True)

In [26]:
df_main["product_is_major"] = 0
df_main.loc[major_product_data_bool_idx, "product_is_major"] = 1

In [27]:
df_main["product_is_major"].value_counts()

product_is_major
1    1315
0    1045
Name: count, dtype: int64

In [28]:
id_2_substrate_counts = (
    df_main.loc[~major_product_data_bool_idx]
    .groupby("Uniprot ID")["SMILES_substrate_canonical_no_stereo"]
    .nunique()
)

In [29]:
id_with_single_substrate = set(id_2_substrate_counts[id_2_substrate_counts == 1].index)

In [30]:
df_main["is_substrate_predicted"] = 0
df_main.loc[
    major_product_data_bool_idx
    | df_main["Uniprot ID"].map(lambda x: x in id_with_single_substrate),
    "is_substrate_predicted",
] = 1

substrates_series = df_main.groupby("Uniprot ID")[
    "SMILES_substrate_canonical_no_stereo"
].agg(lambda x: tuple(sorted(set(x))))

In [31]:
substrates_combo_counts = substrates_series.value_counts()
supported_substrates = set()
for substr_combo in substrates_combo_counts.index[substrates_combo_counts >= 10]:
    for substr in substr_combo:
        supported_substrates.add(substr)

In [32]:
(
    df_main["SMILES_substrate_canonical_no_stereo"].isin(supported_substrates)
).sum() / len(df_main)

0.9584745762711865

In [33]:
df_main["is_substrate_predicted"].value_counts()

is_substrate_predicted
1    2154
0     206
Name: count, dtype: int64

In [34]:
df_main = df_main[df_main["is_substrate_predicted"] == 1]

In [35]:
df_main

Unnamed: 0,Uniprot ID,Name,Amino acid sequence,Species,"Kingdom (plant, fungi, bacteria)","Type (mono, sesq, di, …)",Class (I or II),Substrate (including stereochemistry),Substrate ChEBI ID,Cofactors,...,Reaction,Notes,Publication,OK row flag,SMILES of substrate,SMILES_substrate_canonical_no_stereo,SMILES_product_canonical_no_stereo,inconsistent_substrate_product_carbon_number,is_substrate_predicted,product_is_major
0,A0A6P6W6H5,"Limonene synthase, chloroplastic",MAIINLPVPTNSSSEVNKHNHLRSCLPSGRATFTTLSAAAMRSATM...,Coffea arabica,Plantae,sesq,,"(2E,6E)-FPP",175763,,...,"(2E,6E)-farnesyl diphosphate = (E)-beta-farnes...",,,1,CC(C)=CCC\C(C)=C\CC\C(C)=C\COP([O-])(=O)OP([O-...,CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([...,CC(C)=CCCC(C)=CCCC(=C)C=C,1,1,1
1,XsTC-1,,MSEKNVVRIPMKWGRIEREILTQNTIPELVDTNRLISWVKECNLAD...,Xenia sp.,Coral,di,,"(2E,6E,10E)-GGPP",58756,,...,,highly expressed,https://www.nature.com/articles/s41589-022-010...,1,CC(C)=CCC\C(C)=C\CC\C(C)=C\CC\C(C)=C\COP([O-])...,CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP...,CC(C)=CCCC1(C)CC2C1CCC(C)=CCCC2=C,1,1,1
2,XsTC-1,,MSEKNVVRIPMKWGRIEREILTQNTIPELVDTNRLISWVKECNLAD...,Xenia sp.,Coral,sesq,,"(2E,6E)-FPP",175763,,...,,highly expressed,https://www.nature.com/articles/s41589-022-010...,1,CC(C)=CCC\C(C)=C\CC\C(C)=C\COP([O-])(=O)OP([O-...,CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([...,CC1CCC2C(CC2(C)C)C(=C)CCC=1,1,1,0
3,XsTC-1,,MSEKNVVRIPMKWGRIEREILTQNTIPELVDTNRLISWVKECNLAD...,Xenia sp.,Coral,sesq,,"(2E,6E)-FPP",175763,,...,,highly expressed,https://www.nature.com/articles/s41589-022-010...,1,CC(C)=CCC\C(C)=C\CC\C(C)=C\COP([O-])(=O)OP([O-...,CC(C)=CCCC(C)=CCCC(C)=CCOP([O-])(=O)OP([O-])([...,CC(C)(O)C1CC(C(C)=C)C(C)(CC1)C=C,1,1,0
5,X5AHD9,Diterpene synthase TPS4,MSITINLRVIAFPGHGVQSRQGIFAVMEFPRNKNTFKSSFAVKCSL...,Plectranthus barbatus (C. forskohlii),Plantae,di,,(+)-copalyl diphosphate,58635,,...,,,http://www.plantphysiol.org/content/164/3/1222,1,[H][C@@]12CCC(=C)[C@H](CC\C(C)=C\COP([O-])(=O)...,CC1(C)CCCC2(C)C1CCC(=C)C2CCC(C)=CCOP([O-])(=O)...,CC1(C)CCCC2(C)C1CCC1CC(=CCC2=1)C(C)C,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,Q9PDX7,Unknown,MRDRVAMMLRPLVRGWIPRAVLLLTVAFSFGCNRNHNGQLPQSSGE...,Unknown,Unknown,Negative,Unknown,Negative,Unknown,Unknown,...,Unknown,Unknown,Unknown,1,Negative,Negative,Unknown,1,1,1
1,P0C0L4,Unknown,MRLLWGLIWASSFFTLSLQKPRLLLFSPSVVHLGVPLSVGVQLQDV...,Unknown,Unknown,Negative,Unknown,Negative,Unknown,Unknown,...,Unknown,Unknown,Unknown,1,Negative,Negative,Unknown,1,1,1
2,P06238,Unknown,MGKHRLRSLALLPLLLRLLLLLLPTDASAPQKPIYMVMVPSLLHAG...,Unknown,Unknown,Negative,Unknown,Negative,Unknown,Unknown,...,Unknown,Unknown,Unknown,1,Negative,Negative,Unknown,1,1,1
3,P20742,Unknown,MRKDRLLHLCLVLLLILLSASDSNSTEPQYMVLVPSLLHTEAPKKG...,Unknown,Unknown,Negative,Unknown,Negative,Unknown,Unknown,...,Unknown,Unknown,Unknown,1,Negative,Negative,Unknown,1,1,1


In [36]:
df_main.to_csv("../data/TPS-Nov19_2023_verified.csv", index=None)