In [None]:
import re

import pandas as pd
import pronto

from pronto import LiteralPropertyValue, Xref
from pronto import Synonym, SynonymData

In [None]:
# Load the gzipped, tab-separated SwissLipids export (decoded as latin-1).
df_lipids = pd.read_csv(
    "../data/in/lipids.tsv.gz",
    sep="\t",
    encoding="latin-1",
)

In [None]:
# Summarise the loaded table: columns, dtypes and non-null counts.
df_lipids.info()

In [None]:
# Preview the first 50 rows of the table.
df_lipids.head(50)

In [None]:
def _sanitize_column_names(columns):
    """Turn raw column headers into unique, identifier-friendly names.

    Every character that is neither alphanumeric nor ``_`` is replaced by
    ``_``. That replacement alone is not injective: headers that differ only
    in punctuation (for example a ``+`` versus a ``-``) collapse to the same
    name, which leaves the frame with duplicate column labels. The first
    header that maps to a given name keeps it unchanged; later collisions get
    a numeric suffix (``_2``, ``_3``, ...), so no column is shadowed.

    Args:
        columns: Iterable of original column labels.

    Returns:
        list[str]: Sanitised labels, in the original order, all distinct.
    """
    used = set()
    cleaned = []
    for raw in columns:
        base = ''.join(c if c.isalnum() or c == '_' else '_' for c in str(raw))
        name, count = base, 1
        # Bump the suffix until the name is free (also guards against a raw
        # header that already looks like "<base>_2").
        while name in used:
            count += 1
            name = f"{base}_{count}"
        used.add(name)
        cleaned.append(name)
    return cleaned


df_lipids.columns = _sanitize_column_names(df_lipids.columns)

In [None]:
# Show the current column labels of the table.
df_lipids.columns

In [None]:
# Preview the first 50 rows again to check the column labels.
df_lipids.head(50)

In [None]:
# Spot-check: rows whose Lipid_ID equals one specific identifier.
df_lipids[df_lipids["Lipid_ID"] == 'SLM:000000339']

In [None]:
# Distinct values found in the "Level" column.
df_lipids["Level"].unique()

## Generate Ontology object

In [None]:
# Header values for the generated ontology; add_ontology_metadata() copies each
# key onto ``ontology.metadata`` as an attribute.
METADATA_ONTOLOGY = {
    "ontology": "swisslipids",  # This sets the ontology name
    "title": "SwissLipids Ontology",
    "description": "Ontology representing SwissLipids data, including lipid IDs, classes, and parent relationships.",
    "version": "1.0.0",
    "creators": ["SIB Swiss Institute of Bioinformatics."],
    "license": "CC-BY 4.0",
    "created": "2025-08-29",
}


# Logical role -> column name (after column-name sanitisation) in the lipids table.
ID_COLUMNS = {
    "term_id": "Lipid_ID",
    "parent_id": "Parent",
    "class_id": "Lipid_class_"
}


def write_to_obo(file_path, ontology):
    """Write ``ontology`` to ``file_path`` in OBO format.

    Args:
        file_path: Destination path; the file is created or overwritten.
        ontology: Object exposing ``dump(file, format=...)``; it is handed a
            handle opened in binary mode.
    """
    with open(file_path, "wb") as f:
        ontology.dump(f, format="obo")


def add_ontology_metadata(ontology, metadata):
    """Copy every ``metadata`` item onto ``ontology.metadata`` as an attribute.

    Args:
        ontology: Object with a ``metadata`` attribute.
        metadata (dict): Attribute name -> value.

    NOTE(review): attributes are set blindly with ``setattr``; confirm that
    each key is a field the ontology library actually serialises.
    """
    for key, value in metadata.items():
        setattr(ontology.metadata, key, value)


def _strip_if_str(value):
    """Return ``value.strip()`` for strings; leave any other value untouched."""
    return value.strip() if isinstance(value, str) else value


def preprocess_dataset(dataset):
    """Strip surrounding whitespace from string cells of object-dtype columns.

    The frame is modified in place and also returned, so callers that keep a
    reference to the input see the stripped values.

    Only real ``str`` cells are touched. The ``.str.strip()`` accessor would
    turn every non-string cell of a mixed-type object column (e.g. numbers
    sitting next to pipe-separated text) into NaN, silently dropping data.

    Args:
        dataset (pd.DataFrame): Table to clean.

    Returns:
        pd.DataFrame: The same ``dataset`` object.
    """
    obj_cols = dataset.select_dtypes(include="object").columns
    if len(obj_cols):
        dataset[obj_cols] = dataset[obj_cols].apply(lambda col: col.map(_strip_if_str))
    return dataset

def _integral_text(value):
    """Return ``str(value)``, rendering integral floats without a trailing ``.0``.

    A numeric column that contains missing values is held by pandas as floats,
    so an integer such as ``-1`` reaches this code as ``-1.0``.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def _split_pipe(text):
    """Split a ``|``-separated string into stripped, non-empty tokens."""
    return [token.strip() for token in text.split("|") if token.strip()]


def generate_ontology_from_table(dataset: pd.DataFrame, id_columns: dict, metadata:dict):
    """Build a pronto ontology from the lipids table.

    One term is created for every identifier found in the term, class and
    parent columns. Each row then contributes to its own term: the name,
    RELATED synonyms, literal annotations, xrefs, and subclass links to the
    parent term and to every class term.

    Args:
        dataset: Lipids table with sanitised column names. It is passed through
            ``preprocess_dataset`` first, which strips string cells in place.
        id_columns: Mapping with the keys ``"term_id"``, ``"parent_id"`` and
            ``"class_id"`` naming the corresponding columns.
        metadata: Attribute name -> value, applied to ``ontology.metadata``.

    Returns:
        pronto.Ontology: The populated ontology.

    Raises:
        KeyError: If ``id_columns`` lacks one of the three required keys.
    """
    # Preprocess the dataset
    dataset = preprocess_dataset(dataset)

    # Create Ontology
    ontology = pronto.Ontology()

    # Add ontology metadata
    add_ontology_metadata(ontology=ontology, metadata=metadata)

    term_field = id_columns["term_id"]
    parent_field = id_columns["parent_id"]
    class_field = id_columns["class_id"]

    # --- 1. Collect all unique IDs from all relevant columns ---
    term_col = dataset[term_field].dropna().astype(str)
    class_col = dataset[class_field].dropna().astype(str)
    parent_col = dataset[parent_field].dropna().astype(str)

    all_terms_id = {tid.strip() for tid in term_col.unique() if tid.strip()}
    all_classes_id = {token for cell in class_col for token in _split_pipe(cell)}
    all_parents_id = {pid.strip() for pid in parent_col if pid.strip()}

    all_unique_ids = all_terms_id | all_classes_id | all_parents_id

    print(f"Total unique terms to create: {len(all_unique_ids)}")

    # --- 2. Create all terms ONCE without any properties ---
    for term_id in all_unique_ids:
        ontology.create_term(term_id)

    # (property name, source column, xsd datatype or None for the default)
    annotation_columns = [
        ("Level", "Level", "xsd:string"),
        ("Components", "Components_", None),
        ("SMILES__pH7_3", "SMILES__pH7_3_", None),
        ("InChI__pH7_3", "InChI__pH7_3_", None),
        ("InChI_key__pH7_3", "InChI_key__pH7_3_", None),
        ("Formula__pH7_3", "Formula__pH7_3_", None),
        ("Charge__pH7_3", "Charge__pH7_3_", "xsd:integer"),
        ("Mass__pH7_3", "Mass__pH7_3_", "xsd:float"),
        ("Exact_Mass__neutral_form", "Exact_Mass__neutral_form_", "xsd:float"),
        ("Exact_m_z_of__M", "Exact_m_z_of__M___", "xsd:float"),
        ("Exact_m_z_of__M_H__", "Exact_m_z_of__M_H__", "xsd:float"),
        ("Exact_m_z_of__M_K___", "Exact_m_z_of__M_K___", "xsd:float"),
        ("Exact_m_z_of__M_Na__", "Exact_m_z_of__M_Na__", "xsd:float"),
        ("Exact_m_z_of__M_Li__", "Exact_m_z_of__M_Li__", "xsd:float"),
        ("Exact_m_z_of__M_NH4__", "Exact_m_z_of__M_NH4__", "xsd:float"),
        ("Exact_m_z_of__M_Cl__", "Exact_m_z_of__M_Cl__", "xsd:float"),
        ("Exact_m_z_of__M_OAc___", "Exact_m_z_of__M_OAc___", "xsd:float"),
        ("Abbreviation", "Abbreviation_", None),
    ]

    xref_columns = ["CHEBI", "LIPID_MAPS", "HMDB", "MetaNetX"]

    # --- 3. Iterate through the data to add properties and relationships ---
    for row in dataset.itertuples(index=False):
        raw_term_id = getattr(row, term_field)
        if pd.isna(raw_term_id):
            continue
        # Same normalisation as in step 1, so the lookup cannot miss.
        term_id = str(raw_term_id).strip()
        if not term_id:
            continue

        term = ontology.get_term(term_id)

        name = getattr(row, "Name")
        if pd.notna(name):
            term.name = name

        synonyms = getattr(row, "Synonyms_")
        if pd.notna(synonyms):
            for syn_text in _split_pipe(str(synonyms)):
                term.add_synonym(syn_text, scope="RELATED")

        for prop_name, col_name, xsd_type in annotation_columns:
            value = getattr(row, col_name)
            if pd.isna(value):
                continue
            if xsd_type is None:
                literal = LiteralPropertyValue(prop_name, str(value))
            elif xsd_type == "xsd:integer":
                # A float such as -1.0 is not a valid integer lexical form.
                literal = LiteralPropertyValue(prop_name, _integral_text(value), datatype=xsd_type)
            else:
                literal = LiteralPropertyValue(prop_name, str(value), datatype=xsd_type)
            term.annotations.add(literal)

        # NOTE(review): xref values are used as found in the table; if a column
        # holds bare accession numbers the xref has no database prefix — confirm
        # whether a prefix (e.g. "CHEBI:") should be added here.
        new_xrefs = set(term.xrefs)
        for col_name in xref_columns:
            value = getattr(row, col_name)
            if pd.notna(value):
                # Xrefs can also be pipe-separated; numeric cells must not be
                # rendered with a spurious ".0".
                for xref_id in _split_pipe(_integral_text(value)):
                    new_xrefs.add(Xref(xref_id))
        if new_xrefs:
            term.xrefs = frozenset(new_xrefs)

        # Add parent relationship (an empty cell means "no parent").
        parent_id = getattr(row, parent_field)
        if pd.notna(parent_id):
            parent_key = str(parent_id).strip()
            if parent_key:
                ontology.get_term(parent_key).subclasses().add(term)

        # Add class relationship(s); the cell may list several classes.
        classes_id = getattr(row, class_field)
        if pd.notna(classes_id):
            for class_term_id in _split_pipe(str(classes_id)):
                ontology.get_term(class_term_id).subclasses().add(term)

    return ontology

In [None]:
# Scratch checklist of the sanitised column names. The uncommented string lines
# form a tuple expression, so running this cell just displays that tuple.
# The commented-out names are presumably the columns already dealt with
# individually — TODO confirm.
# Note: 'Exact_m_z_of__M_H__' is listed twice below, and 'PMID' is not
# referenced by generate_ontology_from_table.
# 'Lipid_ID',
#'Level',
#'Name',
#'Abbreviation_',
#'Synonyms_',
#'Lipid_class_', 
#'Parent',
'Components_',
'SMILES__pH7_3_',
'InChI__pH7_3_',
'InChI_key__pH7_3_',
'Formula__pH7_3_',
'Charge__pH7_3_',
'Mass__pH7_3_',
'Exact_Mass__neutral_form_',
'Exact_m_z_of__M___',
'Exact_m_z_of__M_H__',
'Exact_m_z_of__M_K___',
'Exact_m_z_of__M_Na__',
'Exact_m_z_of__M_Li__',
'Exact_m_z_of__M_NH4__',
'Exact_m_z_of__M_H__',
'Exact_m_z_of__M_Cl__',
'Exact_m_z_of__M_OAc___',
'CHEBI',
'LIPID_MAPS',
'HMDB',
'MetaNetX',
'PMID'

In [None]:
# Build the ontology from the full table.
# Side effect: the function runs preprocess_dataset on the frame it receives,
# so string cells of df_lipids are whitespace-stripped in place by this call.
swissontology = generate_ontology_from_table(dataset=df_lipids,
                                             id_columns=ID_COLUMNS,
                                             metadata=METADATA_ONTOLOGY)

In [None]:
# Number of terms in the generated ontology.
len(list(swissontology.terms())) # 779249 (value noted by the author; presumably from an earlier run)

In [None]:
# Persist the ontology as an OBO file.
write_to_obo(file_path="../data/out/lipids.obo", ontology=swissontology)

In [None]:
# Column labels available for building the mapping table.
df_lipids.columns

## Generate Mapping File Object

In [None]:
def fix_chebi(val):
    """Normalise a ChEBI reference to the ``CHEBI:<digits>`` form.

    Missing values are returned unchanged. A value that already has the
    canonical form (after stripping whitespace) is returned stripped.
    Otherwise the first run of digits found in the text is used; when there
    are no digits the stripped text is returned as is.
    """
    if pd.isna(val):
        return val

    text = str(val).strip()
    already_canonical = re.match(r"^CHEBI:\d+$", text) is not None
    if already_canonical:
        return text

    first_number = re.search(r"\d+", text)
    if first_number is None:
        return text
    return f"CHEBI:{first_number.group(0)}"

def fix_lipid_maps(val):
    """Normalise a LIPID MAPS reference to a bare ``LM...`` identifier.

    Missing values are returned unchanged. If the stripped text is already an
    ``LM`` identifier made of upper-case letters and digits it is returned;
    otherwise the first such identifier embedded in the text is extracted.
    Text without any identifier is returned stripped.
    """
    if pd.isna(val):
        return val

    text = str(val).strip()
    if re.match(r"^LM[A-Z0-9]+$", text) is not None:
        return text

    embedded = re.search(r"(LM[A-Z0-9]+)", text)
    if embedded is None:
        return text
    return embedded.group(1)

def fix_hmdb(val):
    """Normalise an HMDB reference to the ``HMDB<digits>`` form.

    Missing values are returned unchanged. Canonical values are returned
    stripped; otherwise the first run of digits in the text is prefixed with
    ``HMDB``. Text without digits is returned stripped.
    """
    if pd.isna(val):
        return val

    text = str(val).strip()
    if re.match(r"^HMDB\d+$", text) is not None:
        return text

    first_number = re.search(r"\d+", text)
    if first_number is None:
        return text
    return f"HMDB{first_number.group(0)}"

def fix_metanetx(val):
    """Normalise a MetaNetX reference to the ``MNXM<digits>`` form.

    Missing values are returned unchanged. Canonical values are returned
    stripped; otherwise the first run of digits in the text is prefixed with
    ``MNXM``. Text without digits is returned stripped.
    """
    if pd.isna(val):
        return val

    text = str(val).strip()
    if re.match(r"^MNXM\d+$", text) is not None:
        return text

    first_number = re.search(r"\d+", text)
    if first_number is None:
        return text
    return f"MNXM{first_number.group(0)}"

def fix_crossref_columns(df):
    """Return a copy of ``df`` with its cross-reference columns normalised.

    Each of the columns ``CHEBI``, ``LIPID_MAPS``, ``HMDB`` and ``MetaNetX``
    that is present is passed, value by value, through its ``fix_*`` function.
    The input frame is left untouched.

    The fixed values replace the whole column (``df[col] = ...``) rather than
    being written into the existing one with ``df.loc[:, col] = ...``: the
    fixers return strings, and writing strings in place into a numeric column
    is an incompatible-dtype assignment that pandas warns about and newer
    versions reject.

    Args:
        df (pd.DataFrame): Table that may contain the cross-reference columns.

    Returns:
        pd.DataFrame: New frame with normalised identifiers.
    """
    df = df.copy()  # Ensure we work on a copy
    fixers = {
        "CHEBI": fix_chebi,
        "LIPID_MAPS": fix_lipid_maps,
        "HMDB": fix_hmdb,
        "MetaNetX": fix_metanetx,
    }
    for column, fixer in fixers.items():
        if column in df.columns:
            df[column] = df[column].apply(fixer)
    return df

def generate_mapping_file_from_table(dataset, columns: list, new_name_columns:list):
    """Extract an identifier mapping table from ``dataset``.

    Selects ``columns``, normalises the cross-reference columns through
    ``fix_crossref_columns`` (which works on a copy), and relabels the result
    positionally with ``new_name_columns``.

    Args:
        dataset: Source table.
        columns: Column names to keep, in output order.
        new_name_columns: New labels, one per entry of ``columns``.

    Returns:
        The relabelled mapping frame.
    """
    mapping = fix_crossref_columns(dataset[columns])
    mapping.columns = new_name_columns
    return mapping

In [None]:
# Source columns to export; the new names below are matched to them by position.
list_columns=["Lipid_ID", "CHEBI", "LIPID_MAPS", "HMDB", "MetaNetX"]
new_name_columns = [
    "swiss_lipid_id",
    "chebi_id",
    "lipid_maps_id",
    "hmdb_id",
    "metanetx_id"
]

# Build the identifier mapping table from the lipids table.
mapping_file = generate_mapping_file_from_table(df_lipids, list_columns, new_name_columns)

In [None]:
# Preview the first rows of the mapping table.
mapping_file.head()

In [None]:
# Rows whose Lipid_class_ cell equals this ID exactly; cells listing several
# pipe-separated classes are not matched by this comparison.
df_lipids[df_lipids["Lipid_class_"] == "SLM:000001080"]

In [None]:
# Mapping rows whose normalised ChEBI ID equals CHEBI:33234.
mapping_file[mapping_file["chebi_id"] == "CHEBI:33234"]

In [None]:
def translate_ids(mapping_df, source_format: str, target_format: str, source_ids: list) -> list:
    """
    Translate a list of identifiers from source_format to target_format using mapping_df.
    Prints IDs that fail and shows a summary at the end.

    A lookup table is built once from the two columns, so the cost is one pass
    over ``mapping_df`` plus a dictionary access per identifier (instead of a
    full-table comparison for every identifier). When a source ID occurs in
    several rows, the first row wins.

    Args:
        mapping_df (pd.DataFrame): The mapping table.
        source_format (str): The column name for the source format.
        target_format (str): The column name for the target format.
        source_ids (list): List of identifiers to translate.

    Returns:
        list: List of translated identifiers (None for failed), aligned with
        ``source_ids``. A translation fails when the ID is absent or when its
        target value is missing, not a string, or blank.

    Raises:
        KeyError: If ``source_format`` or ``target_format`` is not a column of
            ``mapping_df`` (a misspelled column name is a caller error, not a
            per-ID failure).
    """
    sources = mapping_df[source_format]
    targets = mapping_df[target_format]

    # Keep the first row per non-missing source ID.
    keep = sources.notna() & ~sources.duplicated(keep="first")
    lookup = dict(zip(sources[keep], targets[keep]))

    results = []
    failed = []
    for source_id in source_ids:
        try:
            found = source_id in lookup
        except TypeError as e:
            # Unhashable input (e.g. a list) cannot be an identifier.
            print(f"Error processing {source_id}: {e}")
            results.append(None)
            failed.append(source_id)
            continue

        if not found:
            print(f"ID not found: {source_id}")
            results.append(None)
            failed.append(source_id)
            continue

        target_id = lookup[source_id]
        if pd.isna(target_id) or not isinstance(target_id, str) or not target_id.strip():
            print(f"Target format missing for: {source_id}")
            results.append(None)
            failed.append(source_id)
        else:
            results.append(target_id)

    print(f"\nSummary: {len(source_ids) - len(failed)} IDs converted, {len(failed)} failed.")
    if failed:
        print("Failed IDs:", failed)
    return results

In [None]:
# Example SwissLipids identifiers to translate through the mapping table.
swiss_lipid_ids = [
    "SLM:000000042",
    "SLM:000001080",
    "SLM:000000421",
    "SLM:000000651",
]

# NOTE: despite its name, `chebi_ids` receives HMDB identifiers here, because
# target_format is "hmdb_id"; the ChEBI variant is the commented-out argument.
chebi_ids = translate_ids(mapping_df=mapping_file,
                          source_format="swiss_lipid_id",
                          #target_format="chebi_id",
                          target_format="hmdb_id",
                          source_ids=swiss_lipid_ids)

In [None]:
# Show the translated identifiers (None marks a failed translation).
chebi_ids

In [None]:
import pronto

In [None]:
# Load the OBO file from disk (the same path the ontology was written to).
onto = pronto.Ontology("../data/out/lipids.obo")

In [None]:
# Print every relationship object the loaded ontology reports.
for rel in onto.relationships():
    print(rel)

In [None]:
# Fetch one term from the loaded ontology and display its annotations.
term = onto.get_term("SLM:000755122")  # replace with your term ID
term.annotations
