# Experimental features

In [None]:
from __future__ import annotations

from pathlib import Path
from typing import cast

import deepmolecules
import matplotlib.pyplot as plt
import pandas as pd
from parameteriser import (
    plot_parameter_distributions,
    select_substrate,
)
from parameteriser.brenda.v0 import Brenda


def download_top_n_sequences(ec: str, taxonomy_id: int, n: int = 100) -> pd.DataFrame:
    """Download UniProtKB entries for an EC number within a taxon.

    Sends a single search request to the UniProt REST API for entries that
    match both ``ec`` and ``taxonomy_id``, asking for the accession, organism
    name and sequence fields as a gzip-compressed TSV, and parses the result.

    Args:
        ec: EC number used in the ``ec:`` query term, e.g. ``"4.1.1.39"``.
        taxonomy_id: Value used in the ``taxonomy_id:`` query term.
        n: Value sent as the ``size`` request parameter (requested number of
            results).

    Returns:
        The tab-separated response parsed into a DataFrame.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
    """
    import zlib
    from io import BytesIO
    from urllib.parse import quote

    import requests

    url = (
        "https://rest.uniprot.org/uniprotkb/search?"
        "compressed=true"
        + quote("&fields=accession,organism_name,sequence", safe="&=+")
        + "&format=tsv"
        + quote(f"&query=((ec:{ec})+AND+(taxonomy_id:{taxonomy_id}))", safe="&=+")
        + f"&size={n}"
    )

    response = requests.get(url, timeout=60)
    # Surface HTTP failures as such instead of trying to decompress and parse
    # an error body, which would fail with an unrelated-looking exception.
    response.raise_for_status()

    # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
    tsv_bytes = zlib.decompress(response.content, wbits=16 + zlib.MAX_WBITS)
    return pd.read_csv(BytesIO(tsv_bytes), sep="\t")

In [None]:
# Create the BRENDA wrapper; the database is only read when a local copy
# exists at the expected location in the user's Documents folder.
brenda = Brenda()

path = Path.home() / "Documents" / "brenda_2023_1.json"
if path.exists():
    brenda.read_database(path)

## Check distribution of predicted kms for a wide range of organisms

- Viridiplantae (plants & algae)
  - embryophyta (land plants, taxonomy_id: 3193)
- get protein sequences for all plants / algae?

In [None]:
ec = "4.1.1.39"
brenda_substrate = "CO2"
kegg_substrate = "C00011"  # CO2
embryophyta = 3193  # embryophyta

# Fetch sequences for the enzyme and keep the first entry per organism.
# drop_duplicates(subset=...) selects rows directly, so it does not rely on
# the frame's index labels coinciding with row positions.
seqs = download_top_n_sequences(ec=ec, taxonomy_id=embryophyta, n=100)
seqs = seqs.drop_duplicates(subset="Organism")

# Km values from BRENDA for this EC number, restricted to the chosen substrate.
brenda_kms = cast(
    pd.Series,
    (
        select_substrate(
            brenda.get_kms_and_kcats(ec=ec)[0],
            brenda_substrate,
        )["value"]
    ),
)

# Predict one Km per remaining sequence, pairing each with the same substrate.
predicted_kms = deepmolecules.km.predict(
    [kegg_substrate] * len(seqs),
    seqs["Sequence"].values,
)["KM [mM]"]

# Plot the BRENDA values against the predicted values.
fig, ax = plot_parameter_distributions(
    brenda_kms,
    predicted_kms,
    organism_name=f"{ec} - {brenda_substrate}",
)
plt.show()