In [1]:
import bioservices
import json

In [2]:
k = bioservices.kegg.KEGG()
parser = bioservices.kegg.KEGGParser()

### Configurables
Edit the folling options before running the scraper

In [3]:
keywords = ['FAD', 'FMN', 'flavin','flavoenzyme']
import_file = "export/kegg.json"
export_file = "export/kegg.json"

### Helper functions

In [4]:
def read_past_data(path=import_file):
    try:
        with open('export/kegg.json') as json_file:
            return json.load(json_file)
    except:
        return {}

In [5]:
def get_ids(keyword):
    results = k.find(database='enzyme', query=keyword)
    results_array = results.split('\n')
    ids_array = [i.split('\t')[0] for i in results_array if (i)]
    return ids_array

In [6]:
def get_ec_data(id):
    entry = k.get(id)
    ec_parse = parser.parse(entry)
    return ec_parse

In [7]:
def get_all_data(ids, previous_json, verbose=False):    
    data_dict = previous_json
    for id in ids:
        ec_data = get_ec_data(id)
        data_dict[id] = ec_data
        if verbose: print(". ", end="")    
    return data_dict

In [8]:
def scrape_kegg():
    print(f'Kegg scraping script started...')
    # Reading past results from KEGG
    previous_json = read_past_data()
    print(f'1. Successfully read previous json data, that has total of {len(previous_json)} records')
    prev_ids = list(previous_json.keys())
    
    # Getting IDs of all entries that are missing from past
    all_ids = set()
    for keyword in keywords:
        ids = get_ids(keyword)
        for id in ids:
            if (id not in prev_ids):
                all_ids.add(id)
                
    # If new ids have been found, fetch the data
    if len(all_ids) > 0:
        print(f'2. Following potential flavins are missing from past results:')
        [print(f'{ec}', end=" | ") for ec in all_ids]
    
        # Scraping the data
        print(f'\n3. Fetching the data')
        flavins = get_all_data(all_ids, previous_json, verbose=True)
        
        # Writing out the results to the file
        with open(export_file, 'w') as outfile:
            json.dump(flavins, outfile)
        print(f'\nSuccessfully written out {len(all_ids)} results to "{export_file}"')
    else:
        print("[i] Doesn't look like there are any new flavins on KEGG!")

### Running the program

In [9]:
scrape_kegg()

Kegg scraping script started...
1. Successfully read previous json data, that has total of 0 records
2. Following potential flavins are missing from past results:
ec:1.2.99.7 | ec:6.2.1.57 | ec:1.14.14.5 | ec:1.14.19.33 | ec:1.14.19.43 | ec:1.5.1.40 | ec:2.6.1.114 | ec:1.1.5.4 | ec:1.14.14.9 | ec:3.6.1.18 | ec:1.5.1.39 | ec:1.14.14.27 | ec:1.14.19.34 | ec:6.2.1.50 | ec:3.4.22.61 | ec:1.14.13.111 | ec:1.13.11.79 | ec:1.14.19.3 | ec:1.14.13.32 | ec:1.1.5.9 | ec:1.5.1.38 | ec:1.5.1.41 | ec:1.14.19.42 | ec:4.3.3.5 | ec:1.5.1.37 | ec:4.3.1.32 | ec:3.1.3.102 | ec:1.14.19.35 | ec:1.14.14.8 | ec:1.14.14.3 | ec:2.1.1.349 | ec:2.7.10.2 | ec:4.6.1.15 | ec:2.1.1.343 | ec:1.14.14.34 | ec:2.7.1.26 | ec:1.2.99.8 | ec:1.14.13.8 | ec:1.14.19.31 | ec:2.1.1.148 | ec:1.12.98.1 | ec:1.5.1.36 | ec:1.14.13.148 | ec:1.14.19.22 | ec:2.7.1.180 | ec:6.2.1.42 | ec:1.6.2.4 | ec:2.7.1.42 | ec:1.5.1.30 | ec:2.7.1.161 | ec:1.14.13.113 | ec:1.14.19.44 | ec:6.2.1.51 | ec:1.1.2.3 | ec:1.4.3.4 | ec:2.5.1.9 | ec:1.7.1.17 

In [None]:
data = read_past_data()

In [None]:
list(data.values())[1]

### TODO:
- look at other databases
- might want to ommit `genes` key
- smile strings
- filtering search strategy
 - lose FAD-independent 
- merge the info from other DBs for the same entries

In [None]:
data['ec:1.1.5.3']

{'ENTRY': 'EC 1.1.5.3                  Enzyme',
 'NAME': ['glycerol-3-phosphate dehydrogenase;',
  'alpha-glycerophosphate dehydrogenase;',
  'alpha-glycerophosphate dehydrogenase (acceptor);',
  'anaerobic glycerol-3-phosphate dehydrogenase;',
  'DL-glycerol 3-phosphate oxidase (misleading);',
  'FAD-dependent glycerol-3-phosphate dehydrogenase;',
  'FAD-dependent sn-glycerol-3-phosphate dehydrogenase;',
  'FAD-GPDH;',
  'FAD-linked glycerol 3-phosphate dehydrogenase;',
  'FAD-linked L-glycerol-3-phosphate dehydrogenase;',
  'flavin-linked glycerol-3-phosphate dehydrogenase;',
  'flavoprotein-linked L-glycerol 3-phosphate dehydrogenase;',
  'glycerol 3-phosphate cytochrome c reductase (misleading);',
  'glycerol phosphate dehydrogenase;',
  'glycerol phosphate dehydrogenase (acceptor);',
  'glycerol phosphate dehydrogenase (FAD);',
  'glycerol-3-phosphate CoQ reductase;',
  'glycerol-3-phosphate dehydrogenase (flavin-linked);',
  'glycerol-3-phosphate:CoQ reductase;',
  'glycerophosph