# Curation of BiGG ids

Notebook for correcting misc. issues with BiGG dictionaries provided by @snmendoz in PR #188.

## 1. Splitting list into separate lists

We will start by splitting the original metabolite/reaction files into two separate files, depending on whether or not each id already belongs to BiGG:

In [1]:
# Function for splitting the file in two:
import csv
import os
def split_in_two(old_file_name, bigg_file_name):
    """Split a two-column id dictionary by membership in a list of BiGG ids.

    Every row of ``old_file_name`` whose second column matches a line of
    ``bigg_file_name`` is kept in ``old_file_name`` (which is overwritten in
    place). All other rows are written to a sibling file whose name is
    ``old_file_name`` with ``.csv`` replaced by ``_newIDs.csv``. Only the
    first two columns of each row are written; blank lines are skipped.

    Args:
        old_file_name: path of the comma-separated dictionary to split.
        bigg_file_name: path of a text file holding one BiGG id per line.
    """
    # Create set with BiGG ids (constant-time lookups in the row loop below):
    with open(bigg_file_name) as bigg_file:
        bigg_ids = set(bigg_file.read().splitlines())

    # Split file in two:
    in_file_name = old_file_name.replace(".csv", "_in.csv")
    out_file_name = old_file_name.replace(".csv", "_newIDs.csv")
    with open(old_file_name, newline='') as old_file, \
            open(in_file_name, 'w', newline='') as in_file, \
            open(out_file_name, 'w', newline='') as out_file:
        in_writer = csv.writer(in_file, delimiter=',')
        out_writer = csv.writer(out_file, delimiter=',')
        for row in csv.reader(old_file, delimiter=','):
            if not row:
                continue  # csv.reader yields [] for a blank line
            writer = in_writer if row[1] in bigg_ids else out_writer
            writer.writerow([row[0], row[1]])

    # Replace file (single step, overwrites the original):
    os.replace(in_file_name, old_file_name)

# Split both dictionaries (metabolites first, then reactions), each against
# the list of ids that are already in BiGG:
for dictionary_path, known_ids_path in [
    ('../../data/databases/BiGGmetDictionary.csv', '../../data/databases/mets_already_in_bigg.txt'),
    ('../../data/databases/BiGGrxnDictionary.csv', '../../data/databases/rxns_already_in_bigg.txt'),
]:
    split_in_two(dictionary_path, known_ids_path)

## 2. Removing compartment info in met ids

BiGG ids for metabolites don't include any compartment info:

In [1]:
# Function for removing compartment info:
import csv
import os
def remove_comp_info(file_name):
    """Strip the bracketed compartment suffix from the ids of a dictionary.

    ``file_name`` is rewritten in place. For every row, the second column is
    cut at its ``[`` when it contains exactly one; the first column is kept
    unchanged. Rows whose second column does not contain exactly one ``[``
    are reported with a printed warning and are NOT written to the new file.
    Blank lines are skipped.

    Args:
        file_name: path of the comma-separated, two-column dictionary.
    """
    new_file_name = file_name.replace(".csv", "_new.csv")
    with open(file_name, newline='') as old_file, \
            open(new_file_name, 'w', newline='') as new_file:
        bigg_writer = csv.writer(new_file, delimiter=',')
        for row in csv.reader(old_file, delimiter=','):
            if not row:
                continue  # csv.reader yields [] for a blank line
            met_id, bracket, remainder = row[1].partition("[")
            # Exactly one "[": one was found and none is left after it.
            if bracket and "[" not in remainder:
                bigg_writer.writerow([row[0], met_id])
            else:
                print("Warning: " + row[1])

    # Replace file (single step, overwrites the original):
    os.replace(new_file_name, file_name)

# Remove compartment from both BiGG met files:
remove_comp_info('../../data/databases/BiGGmetDictionary.csv')
# split_in_two writes this file with the suffix "_newIDs.csv" (capital "ID");
# the spelling must match exactly on case-sensitive filesystems.
remove_comp_info('../../data/databases/BiGGmetDictionary_newIDs.csv')

## 3. Lipid ids

We'll redefine these ids to be more consistent with the traditional way of annotating lipids:

In [1]:
# Load dictionary with traditional abbreviations:
import csv
# Build the lookup from the semicolon-separated file: the second column of
# each row becomes the key and the first column the value.
lip_id_dict = {}
with open('../../data/databases/lipidAbbreviations.csv') as lip_id_file:
    lip_id_dict.update(
        (fields[1], fields[0])
        for fields in csv.reader(lip_id_file, delimiter=';')
    )
print(lip_id_dict)

# Create new BiGG ids compliant with lipid standards:
# Each row of the new-ids dictionary is (model metabolite id, current BiGG id).
# The metabolite is looked up in the model; if its name starts with one of the
# keys of lip_id_dict, a new id is derived from the name, otherwise the id
# already in the file is written back unchanged.
import cobra
model = cobra.io.read_sbml_model("../../model/yeast-GEM.xml")
file_name = '../../data/databases/BiGGmetDictionary_newIDs.csv'
new_file_name = file_name.replace(".csv","_new.csv")
with open(file_name) as old_file:
    with open(new_file_name, 'w', newline='') as new_file:
        bigg_reader = csv.reader(old_file, delimiter=',')
        bigg_writer = csv.writer(new_file, delimiter=',')
        for row in bigg_reader:
            # Raises KeyError if row[0] is not a metabolite id of the model.
            met = model.metabolites.get_by_id(row[0])
            new_met_id = row[1]  # default: keep the id found in the file
            for key, proper_id in lip_id_dict.items():
                if met.name.startswith(key):
                    # NOTE: the replacements below are order-dependent; longer patterns
                    # ("-bisphosphate", "-phosphate") must run before the shorter
                    # "phosphate" that is a substring of both.
                    new_met_id = met.name.split("[")[0]  # remove compartment info
                    new_met_id = new_met_id.replace(key, proper_id)  # replace name with compliant id
                    new_met_id = new_met_id.replace("backbone", "")  # remove "backbone" in generic species
                    new_met_id = new_met_id.replace("-bisphosphate", "bp")  # for pail species
                    new_met_id = new_met_id.replace("-phosphate", "p")  # for pail species
                    new_met_id = new_met_id.replace("phosphate", "p")  # for lcb species
                    if proper_id in ["cer", "ipc", "mipc", "mip2c"]:  # sphingolipids
                        new_met_id = new_met_id.replace("'", "a")  # "a" stands for "apostrophe"
                        new_met_id = new_met_id.replace("(C", "")  # redundant info
                        # Every upper-case A/B/C/D left in the string becomes 1/2/3/4.
                        for index, let in enumerate("ABCD"):
                            new_met_id = new_met_id.replace(let, str(index+1))  # use the same standard for all
                    # Drop position prefixes such as "1-" before the "-" itself is stripped below.
                    for num in "1234":
                        new_met_id = new_met_id.replace(num + "-", "")  # the order of chains is implied from the string
                    for bad_char in " ():,-":
                        new_met_id = new_met_id.replace(bad_char, "")  # to stay BiGG compliant
                    break  # only the first matching key (in dictionary order) is applied
            bigg_writer.writerow([row[0], new_met_id])

# Rename file:
# The original dictionary is deleted and the rewritten copy takes its name.
import os
os.remove(file_name)
os.rename(new_file_name, file_name)

{'1-monoglyceride': 'mag', 'diglyceride': 'dag', '1,2-diacylglycerol 3-diphosphate': 'dag2p', 'CDP-diacylglycerol': 'cdpdag', 'triglyceride': 'tag', 'phosphatidate': 'pa', 'phosphatidyl-L-serine': 'ps', '1-acylglycerophosphoserine': 'agps', 'phosphatidylethanolamine': 'pe', 'phosphatidyl-N-methylethanolamine': 'mmpe', 'phosphatidyl-N,N-dimethylethanolamine': 'dmpe', '1-acylglycerophosphoethanolamine': 'agpe', 'phosphatidylcholine': 'pc', '1-acylglycerophosphocholine': 'agpc', 'long-chain base': 'lcb', '1-phosphatidyl-1D-myo-inositol': 'pail', 'sn-2-acyl-1-lysophosphatidylinositol': 'lpi', 'ceramide': 'cer', 'inositol-P-ceramide': 'ipc', 'mannosylinositol phosphorylceramide': 'mipc', 'inositol phosphomannosylinositol phosphoceramide': 'mip2c', '3-(3-sn-phosphatidyl)-sn-glycerol 1-phosphate': 'pg1p', 'acylglycerone phosphate': 'agp', 'phosphatidylglycerol': 'pg', 'cardiolipin': 'cl', 'monolysocardiolipin': 'mlcl'}


## 4. SLIME rxn ids

To be consistent with the newly defined lipid ids:

In [1]:
# Load dictionary with ALL BiGG met ids:
import csv
def add_to_bigg_dict(file_name, met_bigg_dict):
    """Merge a two-column csv dictionary into ``met_bigg_dict``.

    Every row of ``file_name`` contributes ``first column -> second column``;
    keys that already exist are overwritten. The dictionary passed in is
    updated in place and the same object is returned.
    """
    with open(file_name) as met_file:
        pairs = ((fields[0], fields[1]) for fields in csv.reader(met_file, delimiter=','))
        met_bigg_dict.update(pairs)
    return met_bigg_dict
# Start from an empty dictionary and merge in both metabolite id files, in order:
met_bigg_dict = {}
for met_dictionary_path in (
    '../../data/databases/BiGGmetDictionary.csv',
    '../../data/databases/BiGGmetDictionary_newIDs.csv',
):
    met_bigg_dict = add_to_bigg_dict(met_dictionary_path, met_bigg_dict)

In [2]:
# Create new BiGG ids for SLIME rxns compliant with lipid standards:
import cobra
# Rewrite the reaction dictionary: reactions whose name ends with "SLIME rxn"
# get the id "<BiGG id of first reactant>SLIME<compartment>"; all others keep
# the id found in the file. Rows whose reaction is not in the model, or whose
# reactant has no entry in met_bigg_dict, are left out of the new file and are
# reported instead of being dropped silently.
model = cobra.io.read_sbml_model("../../model/yeast-GEM.xml")
file_name = '../../data/databases/BiGGrxnDictionary_newIDs.csv'
new_file_name = file_name.replace(".csv", "_new.csv")
with open(file_name, newline='') as old_file, \
        open(new_file_name, 'w', newline='') as new_file:
    bigg_writer = csv.writer(new_file, delimiter=',')
    for row in csv.reader(old_file, delimiter=','):
        try:
            rxn = model.reactions.get_by_id(row[0])
            new_rxn_id = row[1]
            if rxn.name.endswith("SLIME rxn"):
                met = rxn.reactants[0]
                new_rxn_id = f"{met_bigg_dict[met.id]}SLIME{met.compartment}"
        except KeyError as missing:
            print(f"Warning: dropping {row[0]} (no entry for {missing})")
            continue
        bigg_writer.writerow([row[0], new_rxn_id])

# Rename file (single step, overwrites the original):
import os
os.replace(new_file_name, file_name)

## 5. Exchange rxns

The standard in BiGG is `EX_id_e`:

In [2]:
# Create new BiGG ids for exchange rxns:
import cobra
# Rewrite the reaction dictionary: a reaction with a single metabolite located
# in compartment 'e' gets the id "EX_<BiGG met id>_e"; all others keep the id
# found in the file. Rows whose reaction is not in the model, or whose
# metabolite has no entry in met_bigg_dict, are left out of the new file and
# are reported instead of being dropped silently.
model = cobra.io.read_sbml_model("../../model/yeast-GEM.xml")
file_name = '../../data/databases/BiGGrxnDictionary_newIDs.csv'
new_file_name = file_name.replace(".csv", "_new.csv")
with open(file_name, newline='') as old_file, \
        open(new_file_name, 'w', newline='') as new_file:
    bigg_writer = csv.writer(new_file, delimiter=',')
    for row in csv.reader(old_file, delimiter=','):
        try:
            rxn = model.reactions.get_by_id(row[0])
            new_rxn_id = row[1]
            if len(rxn.metabolites) == 1:
                met = next(iter(rxn.metabolites))
                if met.compartment == 'e':
                    new_rxn_id = f"EX_{met_bigg_dict[met.id]}_{met.compartment}"
        except KeyError as missing:
            print(f"Warning: dropping {row[0]} (no entry for {missing})")
            continue
        bigg_writer.writerow([row[0], new_rxn_id])

# Rename file (single step, overwrites the original):
import os
os.replace(new_file_name, file_name)

## 6. Transport rxns

For all transports of the type `met_a -> met_b` we will use `METtab` as the standard (upper-case metabolite id, then `t`, then the source and destination compartments):

In [2]:
# Translator for compartments:
def translate_compartment(compartment):
    """Return the compartment tag to use inside a BiGG transport id.

    "p" -> "x", "er" -> "r" and "erm" -> "rm"; "c" and "e" map to the empty
    string because cytosol/extracellular are typically not indicated in the
    BiGG id. Any other value is returned unchanged.
    """
    renamed = {"p": "x", "er": "r", "erm": "rm", "c": "", "e": ""}
    return renamed.get(compartment, compartment)

# Create new BiGG ids for transport rxns:
import cobra
# Rewrite the reaction dictionary: a reaction with exactly one reactant and one
# product that share the same BiGG metabolite id gets the id
# "<MET>t<from><to>" (upper-case met id, translated compartments); all others
# keep the id found in the file. Rows whose reaction is not in the model, or
# whose metabolites have no entry in met_bigg_dict, are left out of the new
# file and are reported instead of being dropped silently.
model = cobra.io.read_sbml_model("../../model/yeast-GEM.xml")
file_name = '../../data/databases/BiGGrxnDictionary_newIDs.csv'
new_file_name = file_name.replace(".csv", "_new.csv")
with open(file_name, newline='') as old_file, \
        open(new_file_name, 'w', newline='') as new_file:
    bigg_writer = csv.writer(new_file, delimiter=',')
    for row in csv.reader(old_file, delimiter=','):
        try:
            rxn = model.reactions.get_by_id(row[0])
            new_rxn_id = row[1]
            reactants = rxn.reactants
            products = rxn.products
            # Require one metabolite on each side: with both metabolites on the
            # same side the other list is empty and indexing [0] would raise
            # an IndexError.
            if len(rxn.metabolites) == 2 and len(reactants) == 1 and len(products) == 1:
                sub = reactants[0]
                pro = products[0]
                sub_id = met_bigg_dict[sub.id]
                pro_id = met_bigg_dict[pro.id]
                if sub_id == pro_id:
                    sub_c = translate_compartment(sub.compartment)
                    pro_c = translate_compartment(pro.compartment)
                    new_rxn_id = f"{sub_id.upper()}t{sub_c}{pro_c}"
        except KeyError as missing:
            print(f"Warning: dropping {row[0]} (no entry for {missing})")
            continue
        bigg_writer.writerow([row[0], new_rxn_id])

# Rename file (single step, overwrites the original):
import os
os.replace(new_file_name, file_name)

## 7. Lipid rxn ids

The position of the chain is redundant information, e.g. `KIN11812181gm` can just be `KIN181181gm`

In [1]:
# Create new BiGG ids compliant with lipid standards:
import cobra
import csv
from copy import copy
from re import findall
# Rewrite the reaction dictionary: for every parenthesised group in the
# reaction name, build the digit string with chain positions ("1-", "2-", ...)
# and the same string without them, then replace the former by the latter in
# the BiGG id. Ids of reactions without parenthesised groups are unchanged.
model = cobra.io.read_sbml_model("../../model/yeast-GEM.xml")
file_name = '../../data/databases/BiGGrxnDictionary_newIDs.csv'
new_file_name = file_name.replace(".csv", "_new.csv")
with open(file_name, newline='') as old_file, \
        open(new_file_name, 'w', newline='') as new_file:
    bigg_writer = csv.writer(new_file, delimiter=',')
    for row in csv.reader(old_file, delimiter=','):
        # Raises KeyError if row[0] is not a reaction id of the model.
        rxn = model.reactions.get_by_id(row[0])
        new_rxn_id = row[1]
        # Raw string: "\(" is not a valid escape in a normal string literal.
        chains = findall(r'\(.*?\)', rxn.name)
        if chains:
            bad_number = ""
            good_number = ""
            for chain in chains:
                good_chain = chain  # str is immutable, no copy needed
                for num in "1234":
                    good_chain = good_chain.replace(num + "-", "")  # the order of chains is implied from the string
                for bad_char in " ():,-":
                    chain = chain.replace(bad_char, "")  # to stay BiGG compliant
                    good_chain = good_chain.replace(bad_char, "")  # to stay BiGG compliant
                bad_number += chain
                good_number += good_chain
            new_rxn_id = new_rxn_id.replace(bad_number, good_number)
        bigg_writer.writerow([row[0], new_rxn_id])

# Rename file (single step, overwrites the original):
import os
os.replace(new_file_name, file_name)