# Porting genome scale metabolic models for metabolomics
- from gapSeq

Minghao Gong, 2022-08-02

In [246]:
# !pip install cobra --user --ignore-installed ruamel.yaml
# !pip install --upgrade metDataModel # https://github.com/shuzhao-li/metDataModel/ 
# !pip install --upgrade numpy pandas

In [247]:
import cobra # https://cobrapy.readthedocs.io/en/latest/io.html#SBML
from metDataModel.core import Compound, Reaction, Pathway, MetabolicModel
import requests
import sys
import re

sys.path.append("/Users/gongm/Documents/projects/mass2chem/")
sys.path.append("/Users/gongm/Documents/projects/JMS/JMS/JMS")
from mass2chem.formula import *
from jms.formula import *
from jms.utils.gems import *
from jms.utils.git_download import *

In [248]:
# Somehow the function was not corrected in jms.utils.gems
# I use this function cover the bugged one.
def neutral_formula2mass(neutral_formula):
    '''
    Convert neutral formula to mass but removing characters (e.g., X, R) typical in GEM but not ready for metabolomics application
    '''
    formula_dict = parse_chemformula_dict(neutral_formula)
    if ("R" not in formula_dict) & ("X" not in formula_dict) & ("Z" not in formula_dict) & (len(formula_dict) != 0) :
        mono_mass = calculate_mass(formula_dict,6)
    else:
        mono_mass = float(0)
    return(mono_mass)

In [249]:
# download the most updated Rat-GEM.xml
model_name = 'epi_CA7-draft-gapseq'
file_name = 'epi_CA7-draft.xml'
local_path = output_fdr = '../epi_benchmark/draft/'

In [250]:
# Read the model via cobra
model = cobra.io.read_sbml_model(os.path.join(local_path,file_name))

In [251]:
model

0,1
Name,epi_CA7
Memory address,0x07fb66dc4c670
Number of metabolites,1776
Number of reactions,2042
Number of groups,1099
Objective expression,1.0*bio1 - 1.0*bio1_reverse_b18f7
Compartments,"c0, e0, p0"


In [252]:
# reaction entries, Readily convert to list of reactions
model.reactions[33].__dict__

{'_id': 'rxn00120_c0',
 'name': 'UTP phosphohydrolase',
 'notes': {},
 '_annotation': {'sbo': ['SBO:0000167', 'SBO:0000176'],
  'ec-code': '3.6.1.15',
  'seed.reaction': ['rxn00120'],
  'kegg.reaction': 'R00159',
  'bigg.reaction': ['NDP8ex', 'NTP7'],
  'biocyc': 'META:RXN-12196',
  'metanetx.reaction': 'MNXR101933'},
 '_gene_reaction_rule': '',
 'subsystem': '',
 '_genes': set(),
 '_metabolites': {<Metabolite cpd00001_c0 at 0x7fb663b9b6d0>: -1.0,
  <Metabolite cpd00062_c0 at 0x7fb66d89cd00>: -1.0,
  <Metabolite cpd00009_c0 at 0x7fb665136d60>: 1.0,
  <Metabolite cpd00067_c0 at 0x7fb665136d00>: 1.0,
  <Metabolite cpd00014_c0 at 0x7fb663b199d0>: 1.0},
 '_model': <Model epi_CA7 at 0x7fb66dc4c670>,
 '_lower_bound': 0.0,
 '_upper_bound': 1000.0}

In [253]:
model.groups[0]

<Group subsys_12DICHLORETHDEG_PWY at 0x7fb67104c9a0>

In [254]:
model.metabolites[100].__dict__

{'_id': 'cpd00041_c0',
 'name': 'L-Aspartate-c0',
 'notes': {},
 '_annotation': {'sbo': 'SBO:0000247',
  'metanetx.chemical': 'MNXM42',
  'inchikey': 'CKLJMWTZIZZHCS-REOHCLBHSA-M',
  'seed.compound': 'cpd00041',
  'hmdb': ['HMDB00191', 'HMDB62186', 'HMDB62501'],
  'reactome': ['R-ALL-113553', 'R-ALL-29448'],
  'kegg.compound': ['C00049', 'C16433'],
  'chebi': ['CHEBI:132943',
   'CHEBI:17053',
   'CHEBI:21247',
   'CHEBI:40853',
   'CHEBI:40900',
   'CHEBI:40913',
   'CHEBI:40942',
   'CHEBI:6193',
   'CHEBI:22660',
   'CHEBI:29991',
   'CHEBI:13085',
   'CHEBI:21244',
   'CHEBI:29993',
   'CHEBI:29995',
   'CHEBI:35391',
   'CHEBI:22659',
   'CHEBI:29992'],
  'bigg.metabolite': 'asp__L',
  'biocyc': 'META:L-ASPARTATE'},
 '_model': <Model epi_CA7 at 0x7fb66dc4c670>,
 '_reaction': {<Reaction bio1 at 0x7fb670def700>,
  <Reaction rxn00260_c0 at 0x7fb66593f9a0>,
  <Reaction rxn00337_c0 at 0x7fb66595cca0>,
  <Reaction rxn00346_c0 at 0x7fb665961a30>,
  <Reaction rxn00347_c0 at 0x7fb66594f910

## Port metabolite

In [255]:
def port_metabolite(M, annotation_as_dict = True):
    '''
    convert cobra Metabolite to metDataModel Compound
    Annotation needs to be converted to dictionary structure rather than list of tuples for mummichog
    '''
    Cpd = Compound()
    Cpd.src_id = remove_compartment_by_split(M.id,'_')
    Cpd.id = remove_compartment_by_split(M.id,'_')              # temporarily the same with the source id
    Cpd.name = M.name.rsplit('-',1)[0]
    Cpd.charge = M.charge
    Cpd.neutral_formula = adjust_charge_in_formula(M.formula,M.charge)
    Cpd.neutral_mono_mass = neutral_formula2mass(Cpd.neutral_formula)
    Cpd.charged_formula = M.formula
    
    if annotation_as_dict == True:
        Cpd.db_ids = {}
        Cpd.db_ids[model_name] = Cpd.src_id
    else:
        Cpd.db_ids = [[model_name,Cpd.src_id]] # using src_id to also reference Rat-GEM ID in db_ids field
    for k,v in M.annotation.items():
        if k not in ['sbo','hmdb','kegg.compound']:
            if annotation_as_dict == True:
                Cpd.db_ids[k] = v
            else:
                if isinstance(v,list):
                    Cpd.db_ids.append([[k,x] for x in v])
                else: 
                    if ":" in v:
                        Cpd.db_ids.append([k,v.split(":")[1]])
                    else:
                        Cpd.db_ids.append([k,v])
                
    return Cpd
    Cpd.db_ids = listOfTuple2dict(tuple([tuple(x) for x in Cpd.db_ids]))
    
    inchi_list = [x[1].split('=')[1] for x in Cpd.db_ids if x[0] == 'inchi']
    if len(inchi_list) ==1:
        Cpd.inchi = inchi_list[0]
    elif len(inchi_list) >1:
        Cpd.inchi = inchi_list
        
    return Cpd

In [256]:
myCpds = []
for i in range(len(model.metabolites)):
    myCpds.append(port_metabolite(model.metabolites[i]))

In [257]:
myCpds[0].db_ids

{'epi_CA7-draft-gapseq': 'cpd00001',
 'metanetx.chemical': 'MNXM2',
 'inchikey': 'XLYOFNOQVPJJNP-UHFFFAOYSA-N',
 'seed.compound': 'cpd00001',
 'reactome': ['R-ALL-109276',
  'R-ALL-113518',
  'R-ALL-113519',
  'R-ALL-113521',
  'R-ALL-141343',
  'R-ALL-1605715',
  'R-ALL-189422',
  'R-ALL-2022884',
  'R-ALL-29356',
  'R-ALL-351603',
  'R-ALL-5278291',
  'R-ALL-5668574',
  'R-ALL-5693747',
  'R-ALL-8851517'],
 'chebi': 'CHEBI:15377',
 'bigg.metabolite': 'h2o',
 'biocyc': 'META:WATER'}

In [258]:
len(myCpds)

1776

In [259]:
# remove duplicated compounds
myCpds = remove_duplicate_cpd(myCpds)

In [260]:
len(myCpds)

1596

In [261]:
myCpds[100].__dict__

{'internal_id': '',
 'id': 'cpd00041',
 'name': 'L-Aspartate',
 'db_ids': {'epi_CA7-draft-gapseq': 'cpd00041',
  'metanetx.chemical': 'MNXM42',
  'inchikey': 'CKLJMWTZIZZHCS-REOHCLBHSA-M',
  'seed.compound': 'cpd00041',
  'reactome': ['R-ALL-113553', 'R-ALL-29448'],
  'chebi': ['CHEBI:132943',
   'CHEBI:17053',
   'CHEBI:21247',
   'CHEBI:40853',
   'CHEBI:40900',
   'CHEBI:40913',
   'CHEBI:40942',
   'CHEBI:6193',
   'CHEBI:22660',
   'CHEBI:29991',
   'CHEBI:13085',
   'CHEBI:21244',
   'CHEBI:29993',
   'CHEBI:29995',
   'CHEBI:35391',
   'CHEBI:22659',
   'CHEBI:29992'],
  'bigg.metabolite': 'asp__L',
  'biocyc': 'META:L-ASPARTATE'},
 'neutral_formula': 'C4H7NO4',
 'neutral_mono_mass': 133.037507,
 'charge': -1,
 'charged_formula': 'C4H6NO4',
 'SMILES': '',
 'inchi': '',
 'src_id': 'cpd00041'}

## Port reactions

In [262]:
[x.id for x in model.reactions[190].genes]

['gp_epi_CA7orNODE_2_length_232376_cov_418_830916_14225_12885',
 'gp_epi_CA7orNODE_5_length_168302_cov_485_831577_1217_6',
 'gp_epi_CA7orNODE_10_length_76182_cov_521_448065_23711_20556',
 'gp_epi_CA7orNODE_3_length_217361_cov_479_217393_60644_62014']

In [263]:
# port reactions, to include genes and enzymes
def port_reaction(R):
    new = Reaction()
    new.id = remove_compartment_by_split(R.id,'_')
    new.reactants = [remove_compartment_by_split(x.id,'_') for x in R.reactants]
    new.products = [remove_compartment_by_split(x.id,'_') for x in R.products]
    new.genes = [g.id for g in R.genes]
    new.enzymes = R.annotation.get('ec-code', [])
    if 'EX_' not in new.id:
        return new
test99 = port_reaction(model.reactions[250])

In [264]:
model.reactions[250].__dict__

{'_id': 'rxn00780_c0',
 'name': 'ATP:D-glyceraldehyde 3-phosphotransferase',
 'notes': {},
 '_annotation': {'sbo': ['SBO:0000167', 'SBO:0000176'],
  'ec-code': '2.7.1.28',
  'seed.reaction': ['rxn00780'],
  'kegg.reaction': 'R01059',
  'bigg.reaction': 'TRIOK',
  'biocyc': 'META:TRIOKINASE-RXN',
  'metanetx.reaction': 'MNXR104940'},
 '_gene_reaction_rule': 'gp_epi_CA7orNODE_1_length_636563_cov_463_302643_562348_560801',
 'subsystem': '',
 '_genes': {<Gene gp_epi_CA7orNODE_1_length_636563_cov_463_302643_562348_560801 at 0x7fb6658a91f0>},
 '_metabolites': {<Metabolite cpd00002_c0 at 0x7fb670225970>: -1.0,
  <Metabolite cpd00448_c0 at 0x7fb663b1fd60>: -1.0,
  <Metabolite cpd00067_c0 at 0x7fb665136d00>: 1.0,
  <Metabolite cpd00008_c0 at 0x7fb66a4b7f40>: 1.0,
  <Metabolite cpd00102_c0 at 0x7fb672055250>: 1.0},
 '_model': <Model epi_CA7 at 0x7fb66dc4c670>,
 '_lower_bound': 0.0,
 '_upper_bound': 1000.0}

In [265]:
## Reactions to port
myRxns = []
for R in model.reactions:
    temp_rxn = port_reaction(R) 
    if temp_rxn:
        myRxns.append(temp_rxn)

print(len(myRxns))

1878


In [266]:
myRxns = remove_duplicate_rxn(myRxns)

In [267]:
print(len(myRxns))

1742


In [268]:
myRxns[2].__dict__

{'azimuth_id': '',
 'id': 'rxn00006',
 'source': [],
 'version': '',
 'status': '',
 'reactants': ['cpd00025'],
 'products': ['cpd00001', 'cpd00007'],
 'enzymes': ['1.11.1.6', '1.11.1.21'],
 'genes': ['gp_epi_CA7orNODE_11_length_75003_cov_361_376185_10292_11803'],
 'pathways': [],
 'ontologies': [],
 'species': '',
 'compartments': [],
 'cell_types': [],
 'tissues': []}

# Port pathway

In [269]:
# pathways, using group as pathway. Other models may use subsystem etc.

def port_pathway(P):
    new = Pathway()
    new.id = P.id
    new.source = ['gapseq',]
    new.name = P.name
    new.list_of_reactions = [remove_compartment_by_split(x.id,'_') for x in P.members]
    return new

p = port_pathway(model.groups[12])

[p.id, p.name, p.list_of_reactions[:5]]

['subsys_ALKANEMONOX_PWY', 'ALKANEMONOX-PWY', ['rxn03975']]

In [270]:
## Pathways to port
myPathways = []
for P in model.groups:
    myPathways.append(port_pathway(P))

len(myPathways)

1099

## Collected data; now output

In [271]:
from datetime import datetime
today =  str(datetime.today()).split(" ")[0]

In [272]:
today

'2022-08-04'

In [273]:
note = """gapseq decompartmentalized, with genes and ECs."""

## metabolicModel to export
MM = MetabolicModel()
MM.id = f'az_{model_name}_{today}' #
MM.meta_data = {
            'species': model_name.split('-')[0],
            'version': '',
            'sources': [f'gapseq, retrieved {today}'], #
            'status': '',
            'last_update': today,  #
            'note': note,
        }

MM.list_of_reactions = [R.__dict__ for R in  myRxns] # since metDatamodel Don't have reaction name yet. I add here.
MM.list_of_compounds = [C.serialize() for C in myCpds]
MM.list_of_pathways = [C.serialize() for C in myPathways]

In [274]:
# check output
[
MM.list_of_pathways[2],
MM.list_of_reactions[:2],
MM.list_of_compounds[100:102],
]

[{'id': 'subsys_1CMET2_PWY',
  'name': '1CMET2-PWY',
  'list_of_reactions': ['rxn00684',
   'rxn00686',
   'rxn00692',
   'rxn00693',
   'rxn00907',
   'rxn00910',
   'rxn01211',
   'rxn01520',
   'rxn01601',
   'rxn01602',
   'rxn01653',
   'rxn04954',
   'rxn11007',
   'rxn12649',
   'rxn14120',
   'rxn15964',
   'rxn16206']},
 [{'azimuth_id': '',
   'id': 'rxn00001',
   'source': [],
   'version': '',
   'status': '',
   'reactants': ['cpd00001', 'cpd00012'],
   'products': ['cpd00009', 'cpd00067'],
   'enzymes': '3.6.1.1',
   'genes': ['gp_epi_CA7orNODE_12_length_74194_cov_378_988963_58423_59349'],
   'pathways': [],
   'ontologies': [],
   'species': '',
   'compartments': [],
   'cell_types': [],
   'tissues': []},
  {'azimuth_id': '',
   'id': 'rxn00003',
   'source': [],
   'version': '',
   'status': '',
   'reactants': ['cpd00011', 'cpd00668'],
   'products': ['cpd00067', 'cpd00020'],
   'enzymes': ['2.2.1.6', '4.1.3.18'],
   'genes': ['gp_epi_CA7orNODE_1_length_636563_cov_46

In [275]:
import pickle
import os

# Write pickle file
export_pickle(os.path.join(output_fdr,f'{MM.id}.pickle'), MM)

In [276]:
# Write json file
export_json(os.path.join(output_fdr,f'{MM.id}.json'), MM)

In [277]:
# Write dataframe 
import pandas as pd
export_table(os.path.join(output_fdr,f'{MM.id}_list_of_compounds.csv'),MM, 'list_of_compounds')
export_table(os.path.join(output_fdr,f'{MM.id}_list_of_reactions.csv'),MM, 'list_of_reactions')
export_table(os.path.join(output_fdr,f'{MM.id}_list_of_pathways.csv'),MM, 'list_of_pathways')

## Summary

This ports reactions, pathways and compounds. Gene and enzyme information is now included. 

The exported pickle can be re-imported and uploaded to Database easily.

This notebook, the pickle file and the JSON file go to GitHub repo (https://github.com/shuzhao-li/Azimuth).