In [83]:
import sys
import glob
import os
import pandas as pd
import json
import urllib.request
import tarfile
import pprint

In [87]:
class parseRheaReactions(object):
    def __init__(self):
        self.endpoint = 'ftp://ftp.ebi.ac.uk/pub/databases/rhea/'
        self.reaction_data = self.get_rhea_reaction_rd_data() 
        self.r2c = self.rhea2chebi()
        self.ecData = self.get_rhea_ec_tsv_data()
        self.rhea_chebi_ec = self.rhea_chebi2ec()

    def get_rhea_reaction_rd_data(self):
        #         retrieve rhea id with linked chebi ids
        #         returns unarchived collection of rd files for each rhea id
        rd_path = 'ctfiles/rhea-rd.tar.gz'
        reactions_tar = urllib.request.urlretrieve(self.endpoint + rd_path, 'rd.tar.gz')
        reactions = tarfile.open(reactions_tar[0], 'r')
        reactions.extractall()
        reactions.close()
        
    def get_rhea_ec_tsv_data(self):
        #         retrieve ec rhea mapping
        #         return tempfile of ec 2 rhea mapping 
        ec_path = 'tsv/ec-rhea-dir.tsv'    
        ec2rhea = urllib.request.urlretrieve(self.endpoint + ec_path)
        return ec2rhea[0]
                
    def get_chebi_names_list(self):
        #        retrieve tsv of chebi name id pairs
        chebi_list_path = 'tsv/chebiId_name.tsv'
        chebiid2name = urllib.request.urlretrieve(self.endpoint + chebi_list_path)
        cheb_dict = {}
        with open(chebiid2name[0]) as cnl:
            for line in cnl: 
                line = line.strip()
                cnl_list = line.split("\t")
                cheb_dict[cnl_list[0]] = cnl_list[-1]
        return cheb_dict
                       
    def rhea2chebi(self):
        #       get chebi ids for reaction from each rhea rd file
        rxn_all = []
        for filename in glob.glob(os.path.join('rd/*.rd')):
            rxn = {}
            rheaid1 = filename.split('.')[0]
            rheaid = rheaid1.split('/')[1]
            with open(filename) as fp:
                rxn['rhea_id'] = rheaid
                reaction = []
                for line in fp:
                    line = line.strip()
                    if 'CHEBI:' in line:
                        reaction.append(line)
                rxn['chebi_id'] = reaction
                rxn_all.append(rxn)
        return rxn_all
    
    def rhea_chebi2ec(self):
        #      map ec to rxn with rhea ids
        rxn_all = []
        for rxn in self.r2c:
            with open(self.ecData) as ec:
                for line in ec:
                    line = line.strip()
                    eclist = line.split("\t")
                    ec = eclist[0]
                    rhea = eclist[1]
                    if rxn['rhea_id'] == rhea:
                        rxn['ecnumber'] = ec
                        rxn_all.append(rxn)
        return rxn_all
    

In [85]:
rhea = parseRheaReactions()

In [86]:
pprint.pprint(rhea.rhea_chebi_ec2uniprot())

{'rhea_id': '10001', 'chebi_id': ['CHEBI:15377', 'CHEBI:16459', 'CHEBI:28938', 'CHEBI:31011'], 'ecnumber': '3.5.1.50'}
{'rhea_id': '10005', 'chebi_id': ['CHEBI:17484', 'CHEBI:16017'], 'ecnumber': '5.99.1.1'}
{'rhea_id': '10009', 'chebi_id': ['CHEBI:29950', 'CHEBI:29950', 'CHEBI:35924', 'CHEBI:50058', 'CHEBI:30879', 'CHEBI:15377'], 'ecnumber': '1.11.1.15'}
{'rhea_id': '10013', 'chebi_id': ['CHEBI:58413', 'CHEBI:15377', 'CHEBI:15379', 'CHEBI:58682', 'CHEBI:16240'], 'ecnumber': '1.5.3.6'}
{'rhea_id': '10017', 'chebi_id': ['CHEBI:16353', 'CHEBI:15377', 'CHEBI:15354', 'CHEBI:15378', 'CHEBI:30023'], 'ecnumber': '3.1.1.49'}
{'rhea_id': '10021', 'chebi_id': ['CHEBI:57951', 'CHEBI:15377', 'CHEBI:58349', 'CHEBI:58321', 'CHEBI:29985', 'CHEBI:15378', 'CHEBI:57783'], 'ecnumber': '1.5.1.10'}
{'rhea_id': '10025', 'chebi_id': ['CHEBI:59789', 'CHEBI:29969', 'CHEBI:57856', 'CHEBI:61929', 'CHEBI:15378'], 'ecnumber': '2.1.1.43'}
{'rhea_id': '10029', 'chebi_id': ['CHEBI:29986', 'CHEBI:15377', 'CHEBI:15379'

In [2]:
def rhea2chebi(path):
    rxn_all = []
    for filename in glob.glob(os.path.join(path,'*.rd')):
        rxn = {}
        rheaid1 = filename.split('.')[0]
        rheaid = rheaid1.split('/')[1]
        with open(filename) as fp:
            rxn['rhea_id'] = rheaid
            reaction = []
            for line in fp:
                line = line.strip()
                if 'CHEBI:' in line:
                    chebi = line.split(":")[-1]
                    reaction.append(chebi)
            rxn['chebi_id'] = reaction
            rxn_all.append(rxn)
    return rxn_all
            

In [22]:
def addECPd2(file_path, reaction_list):
    joined_reactions = []
    with open(file_path) as fp:
        for line in fp:
            line = line.strip()
            cols = line.split("\t")
            ec = cols[0]
            rhea = cols[1]
            for rxn in reaction_list:
                if rxn['rhea_id'] == rhea:
                    rxn['ec_number'] = ec
                    joined_reactions.append(rxn)
    return joined_reactions
    

In [65]:
def addChebiNames(file_path, reaction_list):
    with open(file_path) as fp:
        named_chems = []
        count = 0 
        for line in fp:
            
            line = line.strip()
            cols = line.split("\t")
            chebiID = cols[0].split(":")[-1]
            chebiName = cols[1]
            for rxn in reaction_list:
                named = {}
                for cid in rxn['chebi_id']:
                    if chebiID == cid:
                        named[chebiName] = cid
                named_chems.append(named)
        return named_chems
                    
                        
            
            
            
            

In [66]:
rhea_chebi_ec_names = addChebiNames('tsv/tsv/chebiId_name.tsv', rhea_chebi_ec)

In [30]:
rhea_chebi = rhea2chebi('rd/')

In [31]:
rhea_chebi_ec = addECPd2('tsv/tsv/ec-rhea-dir.tsv', rhea_chebi)

In [62]:
rhea_chebi_ec

[{'chebi_id': ['16236', '57540', '15343', '15378', '57945'],
  'ec_number': '1.1.1.1',
  'rhea_id': '25291'},
 {'chebi_id': ['35681', '57540', '17087', '15378', '57945'],
  'ec_number': '1.1.1.1',
  'rhea_id': '10741'},
 {'chebi_id': ['15734', '57540', '17478', '15378', '57945'],
  'ec_number': '1.1.1.71',
  'rhea_id': '10737'},
 {'chebi_id': ['15734', '58349', '17478', '15378', '57783'],
  'ec_number': '1.1.1.71',
  'rhea_id': '15938'},
 {'chebi_id': ['57476', '58349', '537519', '15378', '57783'],
  'ec_number': '1.1.1.3',
  'rhea_id': '15762'},
 {'chebi_id': ['57476', '57540', '537519', '15378', '57945'],
  'ec_number': '1.1.1.3',
  'rhea_id': '15758'},
 {'chebi_id': ['16982', '57540', '15686', '15378', '57945'],
  'ec_number': '1.1.1.4',
  'rhea_id': '24341'},
 {'chebi_id': ['17754', '57540', '16016', '15378', '57945'],
  'ec_number': '1.1.1.6',
  'rhea_id': '13770'},
 {'chebi_id': ['57540', '58352', '15378', '57290', '57945'],
  'ec_number': '1.1.1.7',
  'rhea_id': '21585'},
 {'che

In [50]:
rhea_chebi_ec_names = addChebiNames('tsv/tsv/chebiId_name.tsv', rhea_chebi_ec)

7  (+)-car-3-ene
20  (1R,4S)-camphene
28  (R)-linalool
40  (+)-pinoresinol
89  (1S,4R)-camphene
98  (S)-linalool
100  (-)-medicarpin
128  (S)-alpha-terpineol
129  (-)-beta-phellandrene
149  (1R,2R,4R)-dihydrocarveol
150  (1R,2R,4S)-isodihydrocarveol
152  (1R,2S,4R)-neodihydrocarveol
153  (1R,2S,4S)-neoisodihydrocarveol
154  (1R,4R)-dihydrocarvone
155  (1R,4S)-isodihydrocarvone
158  (1S,2R,4S)-neodihydrocarveol
165  (1S,4R)-fenchone
166  (1S,4R)-isodihydrocarvone
168  (1S,4S)-dihydrocarvone
228  (4R,7R)-4-isopropenyl-7-methyloxepan-2-one
232  (1S,5S)-carveol
233  (4S,7R)-4-isopropenyl-7-methyloxepan-2-one
300  (R)-alpha-terpineol
685  1-organyl-2-acetyl-sn-glycero-3-phospholipid
732  5beta-dihydrocortisol
776  16alpha-hydroxyestrone
798  19-hydroxytestosterone
799  19-oxo-androst-4-ene-3,17-dione
920  2,4-dinitrotoluene
1107  3-(all-trans-hexaprenyl)benzene-1,2-diol
1109  2-methoxy-6-(all-trans-hexaprenyl)phenol
1110  2-all-trans-hexaprenylphenol
1156  2-hydroxyestrone
1178  (2S)-2-isop

In [None]:
print(rhea_chebi_ec_names)
