In [21]:
import pandas as pd
from io import StringIO
import requests
import lxml
import importlib
import json

from cache import cached_reqest
from cache import generic_cached_reqest

### Configurables
Edit the following options before running the scraper:
- Add the terms to search for on the BRENDA website here.
- Make sure the input and output files are the ones you want.

In [87]:
# Search terms passed to the BRENDA enzyme and ligand searches (see get_all_ecs).
terms = ['FAD','FMN','flavoenzyme','flavin', 'flavoprotein']
# Previously exported database that read_past_data loads as the starting point.
import_file = "export/brenda.json"
# The output filename is deliberately different so the old export cannot be overwritten.
export_file = "export/brenda_new_export.json"

In [4]:
def brenda_request(url):
    """Request ``url`` through ``cached_reqest`` with browser-like headers.

    A desktop Chrome ``User-Agent`` and an ``X-Requested-With: XMLHttpRequest``
    header are sent with every call. The value returned by ``cached_reqest``
    is handed back unchanged.
    """
    browser_headers = dict([
        ("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36"),
        ("X-Requested-With", "XMLHttpRequest"),
    ])
    return cached_reqest(url, headers=browser_headers)
    

In [5]:
def search_ligands_brenda(term):
    """Download BRENDA's result export for the ligand query ``term`` as a DataFrame.

    ``term`` is inserted verbatim into the ``W[3]`` query parameter. The
    tab-separated response is parsed with fixed column names; because the
    names are passed explicitly, pandas keeps the first line of the response
    as an ordinary data row.
    """
    query_url = (
        'https://www.brenda-enzymes.org/result_download.php'
        '?a=13&RN=&RNV=1&os=1&pt=&FNV=1&tt=&SYN=&Textmining='
        '&T[0]=2&T[1]=2&V[1]=1&V[2]=2'
        f'&W[3]={term}&T[3]=2&nolimit=1'
    )
    raw_tsv = brenda_request(query_url)
    return pd.read_csv(
        StringIO(raw_tsv),
        sep='\t',
        names=['Ligand', 'EC Number', 'Role', 'Id', 'Structure', 'Discard'],
    )

In [6]:
def search_enzymes_brenda(term):
    """Download BRENDA's result export for the enzyme query ``term`` as a DataFrame.

    ``term`` is inserted verbatim into the ``W[2]`` query parameter. The
    tab-separated response is parsed with fixed column names; because the
    names are passed explicitly, pandas keeps the first line of the response
    as an ordinary data row.
    """
    query_url = (
        'https://www.brenda-enzymes.org/result_download.php'
        '?a=9&RN=&RNV=1&os=1&pt=&FNV=&tt=&SYN=&Textmining='
        '&T[0]=2&T[1]=2'
        f'&W[2]={term}&T[2]=2&nolimit=1'
    )
    raw_tsv = brenda_request(query_url)
    return pd.read_csv(
        StringIO(raw_tsv),
        sep='\t',
        names=['EC Number', 'Recommended Name', 'Synonyms', 'Commentary', 'Discard'],
    )

In [7]:
def brenda_get_enzyme_data(id):
    """Fetch the BRENDA ``enzyme.php`` page for the EC number ``id``.

    The response of ``brenda_request`` is returned as-is.
    """
    return brenda_request(
        'https://www.brenda-enzymes.info/enzyme.php'
        f'?ecno={id}#NATURAL%20SUBSTRATE'
    )

In [8]:
def search_all_terms(terms, search_fn):
    """Run ``search_fn`` for every term and concatenate the non-empty results.

    Args:
        terms: iterable of search terms.
        search_fn: callable taking one term and returning a DataFrame.

    Returns:
        The concatenation of every result that has at least two rows. If no
        term produced such a result (or ``terms`` is empty), an empty
        DataFrame is returned instead of letting ``pd.concat`` raise on an
        empty list; it keeps the columns of the last search result when one
        is available so column lookups by the caller still work.
    """
    found = []
    last_columns = None

    for term in terms:
        df = search_fn(term)
        last_columns = getattr(df, 'columns', last_columns)
        # Results with fewer than two rows are treated as "nothing found".
        if len(df) < 2:
            print(f'[!] skipping search for {term} since nothing was found')
            continue

        found.append(df)
        print(f'len of {term} in {search_fn} is:{len(df)}')

    if not found:
        # pd.concat([]) raises "ValueError: No objects to concatenate".
        return pd.DataFrame(columns=last_columns)
    return pd.concat(found)

In [9]:
def get_all_ecs(terms):
    """Return the set of EC numbers found by the enzyme and ligand searches.

    The enzyme search runs first, then the ligand search; the 'EC Number'
    columns of both results are merged into one set, whose size is printed.
    """
    ec_numbers = set(search_all_terms(terms, search_enzymes_brenda)['EC Number'])
    ec_numbers.update(search_all_terms(terms, search_ligands_brenda)['EC Number'])

    print(f'total ecs found: {len(ec_numbers)}')
    return ec_numbers

# SOAP helper 

In [10]:
import hashlib
import os

import zeep
from zeep import Client

# BRENDA SOAP credentials. Set the BRENDA_EMAIL / BRENDA_PASSWORD environment
# variables to use your own account without committing it with the notebook.
# The values below (apparently a disposable mailbox account) are only a
# fallback that keeps the notebook runnable as before.
email = os.environ.get('BRENDA_EMAIL', 'si485@dispostable.com')
_plain_password = os.environ.get('BRENDA_PASSWORD', 'si485@dispostable')

wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
# `password` holds the SHA-256 hex digest of the plain-text password; this
# digest (not the plain text) is what the SOAP helper functions send.
password = hashlib.sha256(_plain_password.encode("utf-8")).hexdigest()
client = Client(wsdl)

In [11]:
# A single EC number kept as an example input; presumably used for ad-hoc
# testing of the SOAP helpers — it is not referenced elsewhere in the visible notebook.
poster_child = '1.14.13.2'

In [25]:
def brendaSOAP(parameters, fn):
    """Invoke the BRENDA SOAP operation named ``fn`` through ``generic_cached_reqest``.

    ``parameters`` is forwarded as ``params``, the request is labelled
    ``brenda_<fn>``, and the callable is looked up on the module-level zeep
    ``client``. The helper's return value is passed through unchanged.
    """
    # Direct call noted by the original author: client.service[fn](*parameters)
    return generic_cached_reqest(
        request_name=f'brenda_{fn}',
        params=parameters,
        request_fn=client.service[fn],
    )

def getSynonyms(ecNumber):
    """Call the ``getSynonyms`` SOAP operation for ``ecNumber``.

    Sends the credentials, the ``ecNumber*<value>`` filter and the
    ``organism*``, ``synonyms*``, ``commentary*`` and ``literature*`` fields
    (each with nothing after the ``*``).
    """
    ec_filter = f"ecNumber*{ecNumber}"
    fields = ('organism*', 'synonyms*', 'commentary*', 'literature*')
    return brendaSOAP((email, password, ec_filter) + fields, 'getSynonyms')

def getReactions(ecNumber):
    """Call the ``getReaction`` SOAP operation for ``ecNumber``.

    Sends the credentials, the ``ecNumber*<value>`` filter and the
    ``reaction*``, ``commentary*``, ``literature*`` and ``organism*`` fields
    (each with nothing after the ``*``).
    """
    ec_filter = f"ecNumber*{ecNumber}"
    fields = ("reaction*", "commentary*", "literature*", "organism*")
    return brendaSOAP((email, password, ec_filter) + fields, 'getReaction')

def getSystematicName(ecNumber):
    """Call the ``getSystematicName`` SOAP operation for ``ecNumber``.

    Sends the credentials, the ``ecNumber*<value>`` filter and the
    ``organism*`` and ``systematicName*`` fields (each with nothing after
    the ``*``).
    """
    ec_filter = f"ecNumber*{ecNumber}"
    fields = ("organism*", "systematicName*")
    return brendaSOAP((email, password, ec_filter) + fields, 'getSystematicName')

def getSubstrate(ecNumber):
    """Call the ``getSubstrate`` SOAP operation for ``ecNumber``.

    Sends the credentials, the ``ecNumber*<value>`` filter and the
    ``organism*``, ``substrate*``, ``reactionPartners*`` and
    ``ligandStructureId*`` fields (each with nothing after the ``*``).
    """
    ec_filter = f"ecNumber*{ecNumber}"
    fields = ("organism*", "substrate*", "reactionPartners*", "ligandStructureId*")
    return brendaSOAP((email, password, ec_filter) + fields, 'getSubstrate')

def getProduct(ecNumber):
    """Call the ``getProduct`` SOAP operation for ``ecNumber``.

    Sends the credentials, the ``ecNumber*<value>`` filter and the
    ``organism*``, ``product*``, ``reactionPartners*`` and
    ``ligandStructureId*`` fields (each with nothing after the ``*``).
    """
    ec_filter = f"ecNumber*{ecNumber}"
    fields = ("organism*", "product*", "reactionPartners*", "ligandStructureId*")
    return brendaSOAP((email, password, ec_filter) + fields, 'getProduct')

def getPdb(ecNumber):
    """Call the ``getPdb`` SOAP operation for ``ecNumber``.

    Sends the credentials, the ``ecNumber*<value>`` filter and the
    ``organism*`` and ``pdb*`` fields (each with nothing after the ``*``).
    """
    ec_filter = f"ecNumber*{ecNumber}"
    fields = ("organism*", "pdb*")
    return brendaSOAP((email, password, ec_filter) + fields, 'getPdb')

In [None]:
# Run the enzyme and ligand searches for every configured term and collect
# the resulting EC numbers into one set.
all_ecs = get_all_ecs(terms)

### Initializing BrendaDB

In [42]:
def read_past_data(path=import_file):
    """Load a previously exported database from the JSON file at ``path``.

    Args:
        path: JSON file to read; defaults to the configured ``import_file``.

    Returns:
        The parsed JSON content, or an empty dict when the file cannot be
        opened or parsed (the reason is printed so the fallback is visible).
    """
    try:
        with open(path) as json_file:
            return json.load(json_file)
    except Exception as err:
        # Still best-effort, but unlike a bare `except:` this lets
        # KeyboardInterrupt / SystemExit propagate and reports the cause.
        print(f'[!] could not load "{path}" ({err!r}); starting with an empty database')
        return {}

In [43]:
# Start from the previous export; read_past_data returns {} when the file
# cannot be loaded, so the scrape then starts from scratch.
db = read_past_data(import_file)

# Example of the BrendaDB (each value is the dict built by create_brenda_ec_entry):
# brenda_enzymes = {
#             ecNumber: {
#                 'SYSNAME': "...", # should this be an array?
#                 'REACTIONS': [{reaction},],
#                 'NAME': ['...',],
#                 'SUBSTRATE': [{substrate},],
#                 'PRODUCT': [{product},],
#                 'PDB': [{pdb}]
#             },
#         }

In [46]:
def create_brenda_ec_entry(ec):
    """Build one database record for ``ec`` from the six SOAP lookups.

    The lookups are executed in the order listed below and stored under the
    corresponding keys.
    """
    lookups = (
        ('SYSNAME', getSystematicName),
        ('REACTIONS', getReactions),
        ('NAME', getSynonyms),
        ('SUBSTRATE', getSubstrate),
        ('PRODUCT', getProduct),
        ('PDB', getPdb),
    )
    return {field: fetch(ec) for field, fetch in lookups}
     

In [52]:
def _progress_bar(percent_done, width=10):
    """Render ``percent_done`` (0-100) as a bar that is always ``width`` cells long."""
    filled = min(width, int(percent_done * width / 100))
    return '█' * filled + '░' * (width - filled)


def create_brenda_db(list_of_ecs,db):
    """Fetch every EC number missing from ``db``, then write ``db`` to ``export_file``.

    Args:
        list_of_ecs: sized iterable of EC numbers to process.
        db: dict of already known entries; it is updated in place.

    Returns:
        The same ``db`` object, after it has been dumped as JSON to the
        module-level ``export_file``.

    EC numbers whose lookup fails are reported and skipped. Only ordinary
    exceptions are caught, so interrupting the kernel stops the loop.
    """
    import os

    total = len(list_of_ecs)
    for index, ec in enumerate(list_of_ecs):
        if ec in db:
            continue
        try:
            entry = create_brenda_ec_entry(ec)
            # The record is stored under the ecNumber of the first SYSNAME
            # result. The previous `sorted(...)[0]` raised TypeError as soon
            # as there was more than one (unorderable) mapping record.
            entry_name = list(entry['SYSNAME'])[0]['ecNumber']
        except Exception as err:
            print(f"couldn't fetch info for {ec}: {err!r}")
            continue

        db[entry_name] = entry
        # this is just to show progress done:
        percent_done = round((index + 1) / total * 100)
        print(f'\n\n----> Progress: {_progress_bar(percent_done)} \t{percent_done}% done')

    # Writing out the results to the file (create the target folder if needed,
    # so a long scrape is not lost to a missing directory).
    out_dir = os.path.dirname(export_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(export_file, 'w') as outfile:
        json.dump(db, outfile)
    print(f'\nSuccessfully written out {len(db)} results to "{export_file}"')

    return db

In [None]:
# Process the EC numbers in sorted order and merge any new entries into `db`.
# create_brenda_db takes (list_of_ecs, db); the list was previously not passed.
sorted_ecs = sorted(all_ecs)
db = create_brenda_db(sorted_ecs, db)

In [88]:
# Check whether any entry ever has more than one SYSNAME record.
# NOTE(review): `short_db` is not defined anywhere in the visible notebook —
# presumably a subset of `db` left over from a deleted cell; confirm and
# define it (or use `db`) so this cell survives Restart & Run All.
[short_db[ez_name]['SYSNAME'][0]['ecNumber'] for ez_name in short_db.keys() if len(short_db[ez_name]['SYSNAME']) > 1]

[]

In [60]:
# DataFrame view of the database that is easier to work with: after the
# transpose, each key of `db` is a row and each entry field
# ('SYSNAME', 'REACTIONS', ...) is a column.
df = pd.DataFrame(db).T