In [None]:
import re
from pathlib import Path

import pandas as pd
import pronto
import yaml

from pronto import LiteralPropertyValue, Xref
from pronto import Synonym, SynonymData

# Load the dataset efficiently

Requirements:
- Tabular data in CSV or TSV format.
- A YAML file with the loading options (encoding, delimiter, and column data types).

In [None]:
# YAML file with the loading options. Kept relative to the notebook's working
# directory, like the "../data/..." paths used by the loading cells, so the notebook
# is not tied to one user's home directory.
PATH_CONFIG_LOADING = Path("swisslipids_L.yaml")

In [None]:
def read_loading_configuration(filepath_configuration):
    """Read the YAML file that describes how a tabular dataset should be loaded.

    Args:
        filepath_configuration: Path of the YAML configuration file.

    Returns:
        The parsed YAML content. An empty (or comment-only) file yields ``{}``
        instead of ``None`` so callers can use ``.get`` on the result.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default encoding.
    with open(filepath_configuration, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
    return {} if config is None else config

def load_tabular_data(filepath, config_loading:dict=None):
    """Load a delimited text file into a DataFrame using optional loading options.

    Args:
        filepath: Path (or buffer) passed to ``pandas.read_table``.
        config_loading: Optional mapping with the keys ``encoding``
            (default ``"utf-8"``), ``delimiter`` (default tab) and ``data_types``
            (column -> dtype mapping, default ``None``). ``None`` or an empty
            mapping means "use the defaults".

    Returns:
        pandas.DataFrame: The loaded table.
    """
    # The signature advertises None as a valid default, so it must not be
    # dereferenced with .get(); fall back to an empty option set instead.
    options = config_loading or {}

    return pd.read_table(
        filepath_or_buffer=filepath,
        encoding=options.get("encoding", "utf-8"),
        sep=options.get("delimiter", "\t"),
        dtype=options.get("data_types"),
    )

In [None]:
# Read the loading options from the YAML configuration file
config_load = read_loading_configuration(filepath_configuration=PATH_CONFIG_LOADING)
# Show the parsed configuration as the cell output
config_load



In [None]:
# Pass parameters from the configuration file into the loader
# NOTE(review): `df` is not referenced by any later cell visible here; the cells
# below work on `df_lipids` — confirm whether this load is still needed.
df = load_tabular_data(
    filepath="../data/in/lipids.tsv.gz",
    config_loading=config_load,
)


In [None]:
# Load the lipids table with pandas defaults (inferred dtypes), decoding it as Latin-1.
# NOTE(review): this reads the same file as the configuration-driven loader call
# but ignores the YAML options — confirm that both loads are intended.
df_lipids = pd.read_table(
    filepath_or_buffer="../data/in/lipids.tsv.gz",
    encoding="latin-1",
)

In [None]:
# Summary of the loaded table (columns, dtypes, non-null counts)
df_lipids.info()

In [None]:
# Preview only a handful of rows; rendering hundreds of rows bloats the saved
# notebook output without adding information.
df_lipids.head(10)

In [None]:
# Column labels exactly as read from the file
df_lipids.columns

In [None]:
# Show the first 50 rows of the table
df_lipids.head(50)

In [None]:
# Rows whose "Lipid ID" equals the given identifier
df_lipids.loc[df_lipids["Lipid ID"] == "SLM:000000339"]

In [None]:
# Distinct values of the level column. At this point of a top-to-bottom run the
# table still carries the raw file headers, so the column is "Level"; the
# lower-case name only exists after the renaming step further down.
df_lipids["Level"].unique()


## Generate Ontology object

In [None]:
# Source column header -> snake_case column name used after preprocessing.
# Keys are matched literally by DataFrame.rename, so the trailing spaces and '*'
# characters in some headers are significant.
MAPPING_COLUMN_NAMES = {
    # Identifiers and hierarchy
    "Lipid ID": "lipid_id",
    "Level": "level",
    "Name": "name",
    "Abbreviation*": "abbreviation",
    "Synonyms*": "synonyms",
    "Lipid class*": "lipid_classes_id",
    "Parent": "parent_id",
    "Components*": "components",
    # Structure descriptors
    "SMILES (pH7.3)": "smiles_ph_7_3",
    "InChI (pH7.3)": "inchi_ph_7_3",
    "InChI key (pH7.3)": "inchi_key_ph_7_3",
    "Formula (pH7.3)": "formula_ph_7_3",
    # Charge and masses
    "Charge (pH7.3)": "charge_ph_7_3",
    "Mass (pH7.3)": "mass_ph_7_3",
    "Exact Mass (neutral form)": "exact_mass_neutral_form",
    # Exact m/z per adduct
    "Exact m/z of [M.]+": "exact_mz_m_radical_cation",
    "Exact m/z of [M+H]+": "exact_mz_m_h_pos",
    "Exact m/z of [M+K]+ ": "exact_mz_m_k_pos",
    "Exact m/z of [M+Na]+": "exact_mz_m_na_pos",
    "Exact m/z of [M+Li]+": "exact_mz_m_li_pos",
    "Exact m/z of [M+NH4]+": "exact_mz_m_nh4_pos",
    "Exact m/z of [M-H]-": "exact_mz_m_h_neg",
    "Exact m/z of [M+Cl]-": "exact_mz_m_cl_neg",
    "Exact m/z of [M+OAc]- ": "exact_mz_m_oac_neg",
    # Cross-references
    "CHEBI": "chebi_id",
    "LIPID MAPS": "lipid_maps_id",
    "HMDB": "hmdb_id",
    "MetaNetX": "metanetx_id",
    "PMID": "pubmed_id",
}

# Ontology-level metadata. `add_ontology_metadata` sets each key as an attribute
# on `ontology.metadata`.
# NOTE(review): confirm which of these keys the ontology serializer actually
# writes out; only "ontology" is annotated below as having a known effect.
METADATA_ONTOLOGY = {
    "ontology": "swisslipids",  # This sets the ontology name
    "title": "SwissLipids Ontology",
    "description": "Ontology representing SwissLipids data, including lipid IDs, classes, and parent relationships.",
    "version": "1.0.0",
    "creators": ["SIB Swiss Institute of Bioinformatics."],
    "license": "CC-BY 4.0",
    "created": "2025-08-29",
}


# Role -> renamed column holding the identifiers for that role. Read by
# `generate_ontology_from_table` through the keys "term_id", "parent_id" and "class_id".
COLUMNS_FOR_RELATIONSHIPS = {
    "term_id": "lipid_id",
    "parent_id": "parent_id",
    "class_id": "lipid_classes_id"
}

# One 4-tuple per column:
#   (source column header, renamed column, role, XSD datatype or None)
# Roles used below: 'id', 'name', 'synonym', 'parent', 'class', 'annotation', 'xref'.
# Only 'annotation' entries carry a datatype.
# NOTE(review): no code visible in this notebook reads ALL_PROPERTIES yet.
ALL_PROPERTIES = [
    # ---- Core Identifiers and Relationships
    ('Lipid ID', 'lipid_id', 'id', None),
    ('Name', 'name', 'name', None),
    ('Synonyms*', 'synonyms', 'synonym', None),
    ('Parent', 'parent_id', 'parent', None),
    ('Lipid class*', 'lipid_classes_id', 'class', None),

    # ---- General Annotations
    ('Level', 'level', 'annotation', 'xsd:string'),
    ('Abbreviation*', 'abbreviation', 'annotation', 'xsd:string'),
    ('Components*', 'components', 'annotation', 'xsd:string'),

    # ---- Chemical Identifier Annotations
    ('SMILES (pH7.3)', 'smiles_ph_7_3', 'annotation', 'xsd:string'),
    ('InChI (pH7.3)', 'inchi_ph_7_3', 'annotation', 'xsd:string'),
    ('InChI key (pH7.3)', 'inchi_key_ph_7_3', 'annotation', 'xsd:string'),
    ('Formula (pH7.3)', 'formula_ph_7_3', 'annotation', 'xsd:string'),

    # ---- Physical Property Annotations
    ('Charge (pH7.3)', 'charge_ph_7_3', 'annotation', 'xsd:integer'),
    ('Mass (pH7.3)', 'mass_ph_7_3', 'annotation', 'xsd:float'),
    ('Exact Mass (neutral form)', 'exact_mass_neutral_form', 'annotation', 'xsd:float'),

    # ---- Mass Spectrometry Annotations
    ('Exact m/z of [M.]+', 'exact_mz_m_radical_cation', 'annotation', 'xsd:float'),
    ('Exact m/z of [M+H]+', 'exact_mz_m_h_pos', 'annotation', 'xsd:float'),
    ('Exact m/z of [M+K]+ ', 'exact_mz_m_k_pos', 'annotation', 'xsd:float'),
    ('Exact m/z of [M+Na]+', 'exact_mz_m_na_pos', 'annotation', 'xsd:float'),
    ('Exact m/z of [M+Li]+', 'exact_mz_m_li_pos', 'annotation', 'xsd:float'),
    ('Exact m/z of [M+NH4]+', 'exact_mz_m_nh4_pos', 'annotation', 'xsd:float'),
    ('Exact m/z of [M-H]-', 'exact_mz_m_h_neg', 'annotation', 'xsd:float'),
    ('Exact m/z of [M+Cl]-', 'exact_mz_m_cl_neg', 'annotation', 'xsd:float'),
    ('Exact m/z of [M+OAc]- ', 'exact_mz_m_oac_neg', 'annotation', 'xsd:float'),

    # ---- Cross-References (Xrefs)
    ('CHEBI', 'chebi_id', 'xref', None),
    ('LIPID MAPS', 'lipid_maps_id', 'xref', None),
    ('HMDB', 'hmdb_id', 'xref', None),
    ('MetaNetX', 'metanetx_id', 'xref', None),
    ('PMID', 'pubmed_id', 'xref', None),
]


def export_ontology(file_path, ontology, ontology_serializer:str="obo"):
    """Serialize ``ontology`` into ``file_path``.

    Args:
        file_path: Destination file. When given as ``str`` or ``Path``, missing
            parent directories are created first.
        ontology: Object exposing ``dump(file, format=...)``.
        ontology_serializer: Format name forwarded to ``ontology.dump``
            (default ``"obo"``).

    Raises:
        OSError: If the destination cannot be created or written.
    """
    # A fresh checkout may not contain the output directory yet; without this
    # the open() below fails with FileNotFoundError.
    if isinstance(file_path, (str, Path)):
        Path(file_path).parent.mkdir(parents=True, exist_ok=True)

    # The handle is opened in binary mode and handed to the ontology's own writer.
    with open(file_path, "wb") as f:
        ontology.dump(f, format=ontology_serializer)


def add_ontology_metadata(ontology, metadata):
    """Copy every entry of ``metadata`` onto ``ontology.metadata`` as an attribute.

    Args:
        ontology: Object exposing a ``metadata`` attribute.
        metadata: Mapping of attribute name -> value.
    """
    for attribute_name, attribute_value in metadata.items():
        setattr(ontology.metadata, attribute_name, attribute_value)


def preprocess_dataset(dataset, column_mapping=None):
    """Return a renamed, whitespace-stripped copy of ``dataset``.

    The input frame is left untouched, so the cell calling this function can be
    re-run and other cells keep seeing the original column headers.

    Args:
        dataset: DataFrame to preprocess.
        column_mapping: Optional mapping of current column label -> new label.
            Defaults to the module-level ``MAPPING_COLUMN_NAMES``.

    Returns:
        pandas.DataFrame: New frame with renamed columns; string values in
        object-dtype columns have leading/trailing whitespace removed.
    """
    if column_mapping is None:
        column_mapping = MAPPING_COLUMN_NAMES

    # rename() without inplace returns a new frame instead of editing the caller's.
    renamed = dataset.rename(columns=column_mapping)
    print(renamed.columns)

    # Strip only real strings: `.str.strip()` would replace every non-string
    # value of a mixed object column (e.g. a stray number) with NaN.
    obj_cols = renamed.select_dtypes(include="object").columns
    if len(obj_cols) > 0:
        renamed[obj_cols] = renamed[obj_cols].apply(
            lambda col: col.map(
                lambda value: value.strip() if isinstance(value, str) else value
            )
        )
    return renamed

def generate_ontology_from_table(dataset: pd.DataFrame, id_columns: dict, metadata_ontology:dict):
    """Build an ontology that contains one bare term per identifier found in ``dataset``.

    Args:
        dataset: Table to convert; it is passed through ``preprocess_dataset`` first.
        id_columns: Mapping with the keys ``"term_id"``, ``"class_id"`` and
            ``"parent_id"``, each naming the (preprocessed) column holding those IDs.
            Class cells may list several IDs separated by ``|``.
        metadata_ontology: Mapping forwarded to ``add_ontology_metadata``.

    Returns:
        The created ontology. Terms carry no properties or relationships yet
        (step 3 below is not implemented).
    """
    # Preprocess the dataset
    dataset = preprocess_dataset(dataset)

    # Create the ontology and attach its metadata
    ontology = pronto.Ontology()
    add_ontology_metadata(ontology=ontology, metadata=metadata_ontology)

    # --- 1. Collect all unique IDs from all relevant columns ---
    term_col = dataset[id_columns["term_id"]].dropna().astype(str)
    class_col = dataset[id_columns["class_id"]].dropna().astype(str)
    parent_col = dataset[id_columns["parent_id"]].dropna().astype(str)

    # Every ID source gets the same treatment: trimmed, blank entries dropped.
    all_terms_id = {term.strip() for term in term_col if term.strip()}
    all_classes_id = {
        term.strip()
        for classes in class_col.str.split("|")
        for term in classes
        if term.strip()
    }
    all_parents_id = {term.strip() for term in parent_col if term.strip()}

    all_unique_ids = all_terms_id | all_classes_id | all_parents_id

    print(f"Total unique terms to create: {len(all_unique_ids)}")

    # --- 2. Create all terms ONCE without any properties ---
    # Sorted so the creation order is the same on every run (set iteration
    # order of strings changes between interpreter sessions).
    for term_id in sorted(all_unique_ids):
        ontology.create_term(term_id)

    # --- 3. Add properties and relationships
    # TODO: not implemented yet.

    return ontology

In [None]:
# Build the ontology from the lipids table, using the ID columns and metadata
# declared in the constants cell.
swissontology = generate_ontology_from_table(dataset=df_lipids,
                                             id_columns=COLUMNS_FOR_RELATIONSHIPS,
                                             metadata_ontology=METADATA_ONTOLOGY)

In [None]:
# Number of terms in the ontology; the trailing comment records the value from an earlier run
len(list(swissontology.terms())) # 779249

In [None]:
# Write the ontology to disk in OBO format
export_ontology(file_path="../data/out/lipids.obo", ontology=swissontology, ontology_serializer="obo")

In [None]:
# Inspect the column labels of `df_lipids` at this point of the notebook
df_lipids.columns

## Generate Mapping File Object

In [None]:
def fix_chebi(val):
    """Normalize a ChEBI cross-reference to the form ``CHEBI:<digits>``.

    Missing values are returned unchanged. A value already in that form is
    returned stripped; otherwise the first run of digits is prefixed with
    ``CHEBI:``. Text without digits is returned stripped.
    """
    if pd.isna(val):
        return val

    text = str(val).strip()
    if re.match(r"^CHEBI:\d+$", text):
        return text

    first_number = re.search(r"\d+", text)
    if first_number is None:
        return text
    return f"CHEBI:{first_number.group(0)}"

def fix_lipid_maps(val):
    """Extract a LIPID MAPS identifier (``LM`` followed by capitals/digits).

    Missing values are returned unchanged. A value that is exactly such an
    identifier is returned stripped; otherwise the first identifier found inside
    the text is returned, or the stripped text when none is found.
    """
    if pd.isna(val):
        return val

    text = str(val).strip()
    if re.match(r"^LM[A-Z0-9]+$", text):
        return text

    embedded = re.search(r"(LM[A-Z0-9]+)", text)
    if embedded is None:
        return text
    return embedded.group(1)

def fix_hmdb(val):
    """Normalize an HMDB cross-reference to the form ``HMDB<digits>``.

    Missing values are returned unchanged. A value already in that form is
    returned stripped; otherwise the first run of digits is prefixed with
    ``HMDB``. Text without digits is returned stripped.
    """
    if pd.isna(val):
        return val

    text = str(val).strip()
    if re.match(r"^HMDB\d+$", text):
        return text

    first_number = re.search(r"\d+", text)
    if first_number is None:
        return text
    return f"HMDB{first_number.group(0)}"

def fix_metanetx(val):
    """Normalize a MetaNetX cross-reference to the form ``MNXM<digits>``.

    Missing values are returned unchanged. A value already in that form is
    returned stripped; otherwise the first run of digits is prefixed with
    ``MNXM``. Text without digits is returned stripped.
    """
    if pd.isna(val):
        return val

    text = str(val).strip()
    if re.match(r"^MNXM\d+$", text):
        return text

    first_number = re.search(r"\d+", text)
    if first_number is None:
        return text
    return f"MNXM{first_number.group(0)}"

def fix_crossref_columns(df):
    """Return a copy of ``df`` whose cross-reference columns are normalized.

    Each database column is recognised under the spellings used in this
    notebook: the raw file header (e.g. ``"LIPID MAPS"``), the underscore variant
    (``"LIPID_MAPS"``) and the renamed snake_case name (``"lipid_maps_id"``).
    Columns that are absent are skipped; all other columns are left as they are.

    Args:
        df: DataFrame that may contain cross-reference columns.

    Returns:
        pandas.DataFrame: A new frame; ``df`` itself is not modified.
    """
    fixers_by_column = {
        "CHEBI": fix_chebi,
        "chebi_id": fix_chebi,
        "LIPID_MAPS": fix_lipid_maps,
        "LIPID MAPS": fix_lipid_maps,
        "lipid_maps_id": fix_lipid_maps,
        "HMDB": fix_hmdb,
        "hmdb_id": fix_hmdb,
        "MetaNetX": fix_metanetx,
        "metanetx_id": fix_metanetx,
    }

    fixed = df.copy()  # Ensure we work on a copy
    for column, fixer in fixers_by_column.items():
        if column in fixed.columns:
            # Replace the column as a whole: writing strings through
            # `.loc[:, column]` into a numeric column is an incompatible-dtype
            # assignment.
            fixed[column] = fixed[column].apply(fixer)
    return fixed

def generate_mapping_file_from_table(dataset, columns: list, new_name_columns:list):
    """Build an identifier mapping table from selected columns of ``dataset``.

    Args:
        dataset: Source DataFrame.
        columns: Column labels to select, in output order.
        new_name_columns: Output labels; ``new_name_columns[i]`` names the column
            selected by ``columns[i]``.

    Returns:
        pandas.DataFrame: The selected columns after ``fix_crossref_columns``,
        relabelled with ``new_name_columns``.
    """
    mapping_table = fix_crossref_columns(dataset[columns])
    # Relabel positionally on the frame returned by the normalizer
    mapping_table.columns = new_name_columns
    return mapping_table

In [None]:
# `list_columns` uses underscore-style headers that neither the raw file headers
# ("Lipid ID", "LIPID MAPS") nor the renamed snake_case columns ("lipid_id",
# "lipid_maps_id", ...) provide. Map whichever spelling is present onto them first;
# DataFrame.rename ignores keys that are absent, so this works in both cases and
# leaves `df_lipids` itself untouched.
COLUMNS_FOR_MAPPING = {
    "Lipid ID": "Lipid_ID",
    "lipid_id": "Lipid_ID",
    "chebi_id": "CHEBI",
    "LIPID MAPS": "LIPID_MAPS",
    "lipid_maps_id": "LIPID_MAPS",
    "hmdb_id": "HMDB",
    "metanetx_id": "MetaNetX",
}

list_columns = ["Lipid_ID", "CHEBI", "LIPID_MAPS", "HMDB", "MetaNetX"]
new_name_columns = [
    "swiss_lipid_id",
    "chebi_id",
    "lipid_maps_id",
    "hmdb_id",
    "metanetx_id"
]

mapping_file = generate_mapping_file_from_table(
    df_lipids.rename(columns=COLUMNS_FOR_MAPPING),
    list_columns,
    new_name_columns,
)

In [None]:
# Preview the first rows of the mapping table
mapping_file.head()

In [None]:
# The class column is "Lipid class*" in the raw file and "lipid_classes_id" once
# the table has been renamed; use whichever is present so the cell does not depend
# on which earlier cells were run.
lipid_class_column = (
    "lipid_classes_id" if "lipid_classes_id" in df_lipids.columns else "Lipid class*"
)
df_lipids[df_lipids[lipid_class_column] == "SLM:000001080"]

In [None]:
# Rows of the mapping table whose ChEBI identifier equals the given value
mapping_file[mapping_file["chebi_id"] == "CHEBI:33234"]

In [None]:
def _first_match_lookup(mapping_df, source_format, target_format, source_ids):
    """Map each requested source ID to the target value of its first matching row."""
    hashable_ids = []
    for source_id in source_ids:
        try:
            hash(source_id)
        except TypeError:
            continue  # reported as an error by the caller when it is looked up
        hashable_ids.append(source_id)

    # One vectorized pass over the table, restricted to the requested IDs.
    hits = mapping_df[source_format].isin(hashable_ids)
    lookup = {}
    pairs = zip(mapping_df.loc[hits, source_format], mapping_df.loc[hits, target_format])
    for source_value, target_value in pairs:
        if pd.isna(source_value):
            continue  # a missing source ID never counts as a match
        lookup.setdefault(source_value, target_value)  # keep the first row only
    return lookup


def translate_ids(mapping_df, source_format: str, target_format: str, source_ids: list) -> list:
    """
    Translate a list of identifiers from source_format to target_format using mapping_df.
    Prints IDs that fail and shows a summary at the end.

    The mapping table is scanned once for all requested IDs instead of once per
    ID. When an ID occurs in several rows, the first row wins. The function never
    raises for bad input: problems are printed and the affected IDs yield None.

    Args:
        mapping_df (pd.DataFrame): The mapping table.
        source_format (str): The column name for the source format.
        target_format (str): The column name for the target format.
        source_ids (list): List of identifiers to translate.

    Returns:
        list: List of translated identifiers (None for failed), aligned with source_ids.
    """
    source_ids = list(source_ids)

    try:
        lookup = _first_match_lookup(mapping_df, source_format, target_format, source_ids)
        lookup_error = None
    except Exception as e:  # e.g. KeyError for an unknown column name
        # Best effort, as before: report the problem per ID instead of raising.
        lookup, lookup_error = {}, e

    results = []
    failed = []
    for source_id in source_ids:
        target_id = None
        problem = None
        if lookup_error is not None:
            problem = f"Error processing {source_id}: {lookup_error}"
        else:
            try:
                if source_id not in lookup:
                    problem = f"ID not found: {source_id}"
                else:
                    candidate = lookup[source_id]
                    # Only non-blank strings count as a usable target identifier.
                    if isinstance(candidate, str) and candidate.strip():
                        target_id = candidate
                    else:
                        problem = f"Target format missing for: {source_id}"
            except TypeError as e:  # unhashable source ID
                problem = f"Error processing {source_id}: {e}"

        if problem is not None:
            print(problem)
            failed.append(source_id)
        results.append(target_id)

    print(f"\nSummary: {len(source_ids) - len(failed)} IDs converted, {len(failed)} failed.")
    if failed:
        print("Failed IDs:", failed)
    return results

In [None]:
# Example SwissLipids identifiers to translate
swiss_lipid_ids = [
    "SLM:000000042",
    "SLM:000001080",
    "SLM:000000421",
    "SLM:000000651",
]

# NOTE(review): the result is stored as `chebi_ids`, but the active target format
# is "hmdb_id" (the ChEBI target is commented out) — confirm which one is intended.
chebi_ids = translate_ids(mapping_df=mapping_file,
                          source_format="swiss_lipid_id",
                          #target_format="chebi_id",
                          target_format="hmdb_id",
                          source_ids=swiss_lipid_ids)

In [None]:
# Show the translated identifiers (None marks IDs that could not be translated)
chebi_ids

In [None]:
import pronto

In [None]:
# Reload the ontology from the exported OBO file
onto = pronto.Ontology("../data/out/lipids.obo")

In [None]:
# Print every relationship defined in the reloaded ontology
for rel in onto.relationships():
    print(rel)

In [None]:
# Fetch one term by ID and display its annotations
term = onto.get_term("SLM:000755122")  # replace with your term ID
term.annotations
