# Parse SMDB for metabolite concentrations in blood for healthy adults

The Serum Metabolome Database (SMDB) was downloaded on 3 May 2024 from https://serummetabolome.ca/downloads (release date 2021-10-24).

Database was parsed to retrieve normal metabolite concentration values in adult blood

Database can be cited as follows

Psychogios N, Hau DD, Peng J, Guo AC, Mandal R, Bouatra S, Sinelnikov I, Krishnamurthy R, Eisner R, Gautam B, Young N, Xia J, Knox C, Dong E, Huang P, Hollander Z, Pedersen TL, Smith SR, Bamforth F, Greiner R, McManus B, Newman JW, Goodfriend T, Wishart DS. The human serum metabolome. PLoS One. 2011 Feb 16;6(2):e16957.


In [1]:
from pathlib import Path
from lxml import etree
import pandas as pd
import re
import numpy as np

In [2]:
# Working folder (inside the current directory) holding the SMDB download and the outputs
p = Path.cwd().joinpath('concentrations_from_serum')
# Input: XML file downloaded from SMDB
xml_path = p.joinpath('serum_metabolites_may2024.xml')
# Output: CSV with the extracted normal blood concentrations
out_path = p.joinpath('smdb_normal_blood_concentrations.csv')

## Run this only once to modify original xml file

In [3]:
# # Second line in downloaded xml is <hmdb xmlns="http://www.hmdb.ca"> but should be changed to <hmdb> otherwise parsing does not work

# with open(xml_path, 'r', encoding='utf-8') as xml_file:
#     lines = xml_file.readlines()
#     lines[1] = '<hmdb>\n'  # Change the second line to <hmdb>

# # Overwrite original file
# with open(xml_path, 'w', encoding='utf-8') as new_file:
#     new_file.writelines(lines)

## Get normal blood concentration values for all metabolites in the database


In [4]:
# Columns of the result table, in output order. Passing them explicitly keeps
# the DataFrame well-formed (all columns present) even when no entry matches.
result_columns = [
    'accession',
    'name',
    'concentration_value',
    'concentration_units',
    'pubmed_id',
]


def _normal_adult_blood_rows(metabolite):
    """Yield one row dict per normal adult blood concentration of a metabolite element.

    A row is produced only when the entry's biospecimen is 'Blood', its
    subject age is 'Adult (>18 years old)', its subject condition is 'Normal'
    and its concentration value is non-empty.
    """
    normal_concentrations = metabolite.find('normal_concentrations')
    if normal_concentrations is None:
        return

    accession = metabolite.findtext('accession')
    name = metabolite.findtext('name')

    for concentration in normal_concentrations.findall('concentration'):
        is_normal_adult_blood = (
            concentration.findtext('biospecimen') == 'Blood'
            and concentration.findtext('subject_age') == 'Adult (>18 years old)'
            and concentration.findtext('subject_condition') == 'Normal'
        )
        if not is_normal_adult_blood:
            continue

        # Skip entries whose concentration value is missing or empty
        concentration_value = concentration.findtext('concentration_value')
        if not concentration_value:
            continue

        yield {
            'accession': accession,
            'name': name,
            'concentration_value': concentration_value,
            'concentration_units': concentration.findtext('concentration_units'),
            # findtext returns the text of the first matching reference only
            'pubmed_id': concentration.findtext('references/reference/pubmed_id'),
        }


# Initialize an incremental parser for the XML file
context = etree.iterparse(xml_path, events=('end',), tag='metabolite')

# Iterate over metabolite elements
data = []
for _, metabolite in context:
    data.extend(_normal_adult_blood_rows(metabolite))

    # Release resources: clear() empties the element itself, but the emptied
    # elements stay attached to the root, so already-processed siblings are
    # removed as well to keep memory flat while streaming the file.
    metabolite.clear()
    while metabolite.getprevious() is not None:
        del metabolite.getparent()[0]
context = None

# Convert result to pandas dataframe
df = pd.DataFrame(data, columns=result_columns)

## Concentrations are entered in the database in different formats

- Measured value with normal range in parentheses: 50.0 (0.0-100.0)
- Measured value with standard deviation: 46.5 +/- 1.1
- Normal range only: 8.0-80.0
- Measured value only: 8.19
- Measured value followed by a parenthesised expression whose meaning is unknown: 0.02 (0.03 +/- 0.01)

Here the values are extracted into three new columns to facilitate data sorting: measured_value, normal_range_min and normal_range_max

In [5]:
# Separators between a mean and its spread ('\u00b1' is the plus-minus sign)
_PLUS_MINUS_MARKERS = ('+/-', '\u00b1')


def _to_float(text):
    """Return ``float(text)``, or None when ``text`` is not a valid number."""
    try:
        return float(text)
    except (TypeError, ValueError):
        return None


def parse_concentration_value(value: str):
    """Split a raw SMDB concentration string into its numeric parts.

    Recognised formats, tried in this order:

    - ``50.0 (0.0-100.0)``     -> measured value with normal range
    - ``46.5 +/- 1.1``         -> measured value (the spread is dropped)
    - ``8.0-80.0``             -> normal range only
    - ``<0.1``                 -> the number after ``<`` is used as measured value
    - ``43.57(9.97)``          -> measured value (number in parentheses dropped)
    - ``43.57``                -> measured value
    - ``0.02 (0.03 +/- 0.01)`` -> measured value (parenthesised part dropped)

    Args:
        value: Raw concentration string; surrounding whitespace is ignored.

    Returns:
        A tuple ``(measured_value, normal_range_min, normal_range_max)`` of
        floats, with None for every part that is not available. Input that
        matches no format (including non-string input) gives
        ``(None, None, None)``; the function never raises on malformed text.
    """
    if not isinstance(value, str):
        return None, None, None

    value = value.strip()
    has_parentheses = '(' in value or ')' in value

    # Format: 50.0 (0.0-100.0)
    match = re.match(r'([0-9.]+)\s*\(\s*([0-9.]+)\s*-\s*([0-9.]+)\s*\)', value)
    if match:
        measured_value, normal_range_min, normal_range_max = map(_to_float, match.groups())
        # [0-9.]+ also matches non-numbers such as '1.2.3'; fall through then
        if None not in (measured_value, normal_range_min, normal_range_max):
            return measured_value, normal_range_min, normal_range_max

    # Format: 46.5 +/- 1.1 (also written with the plus-minus sign)
    if not has_parentheses:
        for marker in _PLUS_MINUS_MARKERS:
            if marker in value:
                measured_value = _to_float(value.split(marker)[0])
                if measured_value is not None:
                    return measured_value, None, None

    # Format: 8.00-80.0
    if '-' in value and not has_parentheses:
        bounds = value.split('-')
        # Exactly two numeric parts are required; anything else (negative
        # numbers, exponents such as 1e-5, several dashes) is left to the
        # later branches instead of raising.
        if len(bounds) == 2:
            normal_range_min, normal_range_max = map(_to_float, bounds)
            if normal_range_min is not None and normal_range_max is not None:
                return None, normal_range_min, normal_range_max

    # Format: <0.1
    if value.startswith('<'):
        measured_value = _to_float(value[1:])
        if measured_value is not None:
            return measured_value, None, None

    # Format: 43.57(9.97)
    match = re.match(r'([0-9.]+)\s*\(\s*([0-9.]+)\s*\)', value)
    if match:
        measured_value = _to_float(match.group(1))
        if measured_value is not None:
            return measured_value, None, None

    # Format: 43.57
    measured_value = _to_float(value)
    if measured_value is not None:
        return measured_value, None, None

    # Format - 0.02 (0.03 +/- 0.01)
    match = re.match(r'^([0-9.]+)\s*\([^)]*\)$', value)
    if match:
        measured_value = _to_float(match.group(1))
        if measured_value is not None:
            return measured_value, None, None

    return None, None, None

# Apply the parsing function and split results into three columns.
# The tuples are turned into a DataFrame in one step (instead of building a
# pd.Series per row), aligned on df's index and forced to float so that
# missing parts become NaN and the columns stay numeric, also for an empty df.
parsed_columns = ['measured_value', 'normal_range_min', 'normal_range_max']
df[parsed_columns] = pd.DataFrame(
    df['concentration_value'].map(parse_concentration_value).tolist(),
    index=df.index,
    columns=parsed_columns,
    dtype=float,
)

## Save result

In [6]:
# Write the full table to out_path as CSV, without the row index
df.to_csv(path_or_buf=out_path, index=False)

## Paper revision: find concentrations of reference metabolites of interest

In [7]:
# Compounds used in the study
p_compounds = p.joinpath('compounds_with_hmdb_id.csv')

# Subset for metabolites of interest: keep only the short name and the
# primary HMDB ID of each compound
compound_table = pd.read_csv(p_compounds)
compounds = compound_table[['name_short', 'hmdb_primary']]

# Inner join on the HMDB ID, so only compounds with at least one extracted
# concentration remain; the database's own name/accession columns are dropped
matched = compounds.merge(df, how='inner', left_on='hmdb_primary', right_on='accession')
merged_df = matched.drop(columns=['name', 'accession'])
merged_df.to_csv(p / "all_concentrations_in_blood_revision.csv", index=False)

In [8]:
# Calculate mean of reported values per metabolite (micromolar) and save.
# NOTE(review): rows are averaged without looking at concentration_units;
# this assumes every row is reported in micromolar - confirm.
grouped = merged_df.groupby(['name_short', 'hmdb_primary'])

# Named aggregation via dict unpacking, because the output column name
# contains a space and cannot be written as a keyword argument
result = grouped.agg(**{'Human serum': ('measured_value', 'mean')})

result.to_csv(p / "revision_mean_concentrations_in_blood.csv")