In [None]:
import re
import yaml
from pathlib import Path

import pandas as pd
import pronto
from pronto import LiteralPropertyValue, Xref
from pronto import Synonym, SynonymData

# Load the dataset efficiently

Requirements:
- Tabular data in CSV or TSV format.
- A YAML configuration file that includes the column data types.

In [None]:
# Location of the input table passed to load_tabular_data().
PATH_DATASET= "../data/in/lipids.tsv.gz"
# Alternative input path kept for reference:
#PATH_DATASET= "../data/in/lipids_dummy.tsv"
# YAML file from which the "ingestion", "transformation" and "serving" sections are read.
PATH_CONFIG_LOADING = Path("./swisslipids_L.yaml")

## INGESTION STAGE

In [None]:
def read_global_settings(filepath_configuration_file):
    """
    Load the pipeline settings from a YAML file.

    Args:
        filepath_configuration_file: Path of the YAML file to read.

    Returns:
        The parsed YAML document, or an empty dict when the file holds no document.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(filepath_configuration_file, 'r', encoding='utf-8') as f:
        global_settings = yaml.safe_load(f)
    # safe_load() gives None for an empty file; return a dict so callers can keep using .get().
    return {} if global_settings is None else global_settings

def load_tabular_data(filepath, config_loading:dict=None):
    """
    Read a delimited text file with ``pandas.read_table``.

    Args:
        filepath: Path or buffer accepted by ``pandas.read_table``.
        config_loading: Optional settings. Recognised keys are ``encoding``
            (default ``'utf-8'``), ``delimiter`` (default tab), ``schema``
            (passed as ``dtype``) and ``chunksize``. ``None`` means "use the
            defaults for everything".

    Returns:
        Whatever ``pandas.read_table`` returns: a DataFrame when no
        ``chunksize`` is configured, otherwise an iterator of DataFrame chunks.
    """
    # The signature advertises None as the default, so it must not be dereferenced directly.
    settings = config_loading or {}

    return pd.read_table(
        filepath_or_buffer=filepath,
        encoding=settings.get('encoding', 'utf-8'),
        sep=settings.get('delimiter', '\t'),
        dtype=settings.get('schema'),
        chunksize=settings.get('chunksize'),
    )

In [None]:
# Read the configuration file
config_load = read_global_settings(filepath_configuration_file=PATH_CONFIG_LOADING)

# One settings object per pipeline stage. With a dict, .get() returns None
# (rather than raising) when a section is absent from the file.
INGESTION_PARAMS = config_load.get("ingestion")
TRANSFORMATION_PARAMS = config_load.get("transformation")
SERVING_PARAMS = config_load.get("serving")

In [None]:
# Load the dataset using the "ingestion" settings read from the configuration file.
# NOTE(review): the cells below treat the result as a DataFrame; if the settings
# define a chunksize, pandas returns a chunk iterator instead — confirm the config.
df_lipids = load_tabular_data(
    filepath=PATH_DATASET,
    config_loading=INGESTION_PARAMS
)

In [None]:
# Preview the first 50 rows of the loaded table.
df_lipids.head(50)

In [None]:
# Print the pandas summary of the loaded table.
df_lipids.info()

### EDA

In [None]:
def find_special_characters(dataset_column):
    """
    Collect every character outside ``[a-zA-Z0-9]`` that occurs in a column.

    Missing values are ignored and every remaining value is inspected through
    ``str(value)``. The character class is ASCII-only, so accented letters are
    reported as special characters too.

    Args:
        dataset_column: pandas Series to inspect.

    Returns:
        set[str]: The distinct non-alphanumeric characters found.
    """
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')

    # Scan each distinct value once; repeated cells cannot contribute new characters.
    unique_values = dataset_column.dropna().unique()

    special_chars = set()
    for val in unique_values:
        special_chars.update(non_alphanumeric.findall(str(val)))

    return special_chars

**Column:** `Lipid ID`

- Show the special characters in this column
- Number of unique elements

In [None]:
# Characters outside [a-zA-Z0-9] that occur in the `Lipid ID` column.
special_chars = find_special_characters(df_lipids["Lipid ID"])
print(special_chars)

In [None]:
# Number of distinct non-null values in `Lipid ID`.
print(f"Unique elements: {df_lipids['Lipid ID'].nunique(dropna=True)}")

**Column:** `Level`

- Count the distinct values (categories) in this column
- Display the rows without an assigned category
- Show the special characters in this column

In [None]:
# Distinct non-null values found in `Level`.
df_lipids["Level"].dropna().unique()

In [None]:
# Rows whose `Level` value is missing.
df_lipids[df_lipids["Level"].isna()]

In [None]:
# Characters outside [a-zA-Z0-9] that occur in the `Level` column.
special_chars = find_special_characters(df_lipids["Level"])
print(special_chars)

**Column:** `Lipid class*`

- Show the special characters in this column
- Show some examples of rows containing the suspected separator character (for instance, pipes)
- Count the number of unique classes
- List the IDs that appear in `Lipid class*` but are not present in `Lipid ID`

In [None]:
# Characters outside [a-zA-Z0-9] that occur in the `Lipid class*` column.
special_chars = find_special_characters(df_lipids["Lipid class*"])
print(special_chars)

In [None]:
# Rows whose `Lipid class*` value contains a literal pipe; missing values count as "no match" (na=False).
df_lipids[df_lipids["Lipid class*"].str.contains(r'\|', na=False)]

In conclusion:
- `:` : this character is part of the SwissLipids identifiers, e.g. `SLM:000389698`.
- `|` : this character separates the elements of a list, e.g. `SLM:000389698 | SLM:000399707`.
- ` ` : the space character is only there for human readability; it should be stripped when processing individual cells.

In [None]:
# Split every non-null cell on '|', trim each piece, discard empty pieces and
# keep the distinct values.
elements = (
    df_lipids["Lipid class*"]
    .dropna()
    .str.split('|')
    .explode()
    .str.strip()
    .loc[lambda x: x != '']
    .unique()
)

# Held as a set so that the next cell can run membership checks against it.
unique_classes = set(elements)

print(f"Unique classes in the column: {len(unique_classes)}")

In [None]:
# Distinct, whitespace-trimmed `Lipid ID` values used as the reference set.
lipid_ids = set(df_lipids["Lipid ID"].dropna().astype(str).str.strip().unique())
# Report every class identifier that never appears as a `Lipid ID`.
for item in unique_classes:
    if item.strip() not in lipid_ids:
        print(f"{item} not in LIPID ID")

**Column:** `Parent`

- Show the special characters in this column
- Count the number of unique parents
- Count the number of IDs that belong to this column and are not present in `Lipid ID`

In [None]:
# Characters outside [a-zA-Z0-9] that occur in the `Parent` column.
special_chars = find_special_characters(df_lipids["Parent"])
print(special_chars)

In [None]:
# Distinct, whitespace-trimmed, non-empty values of `Parent`.
elements = (
    df_lipids["Parent"]
    .dropna()
    .str.strip()
    .loc[lambda x: x != '']
    .unique()
)

# Note: this rebinds `unique_classes`, replacing the set built from `Lipid class*`.
unique_classes = set(elements)

print(f"Unique Parents in the column: {len(unique_classes)}")

In [None]:
# Trim both ID sets and ignore empty strings, as the other EDA cells do, so that
# padding alone cannot make a parent look absent from `Lipid ID`.
lipid_ids = set(df_lipids["Lipid ID"].dropna().astype(str).str.strip().unique())
parent_ids = set(df_lipids["Parent"].dropna().astype(str).str.strip().unique()) - {''}
unique_parents_not_in_lipid_ids = parent_ids - lipid_ids
print(f"Unique parents not in Lipid id: {len(unique_parents_not_in_lipid_ids)}")

## TRANSFORMATION STAGE

### Generate Ontology

In [None]:
def rename_columns(dataset, dict_columns):
    """
    Normalise the column labels of ``dataset`` in place.

    Surrounding whitespace is removed from every label first, then the
    ``dict_columns`` mapping (old label -> new label) is applied. The resulting
    labels are printed. Nothing is returned.
    """
    trimmed_labels = dataset.columns.str.strip()
    dataset.columns = trimmed_labels
    dataset.rename(mapper=dict_columns, axis="columns", inplace=True)
    print(dataset.columns)

def strip_columns(dataset):
    """
    Trim leading/trailing whitespace from the text columns of ``dataset`` in place.

    Columns with the pandas ``string`` dtype and plain ``object`` columns are
    both processed, so text loaded without an explicit schema is cleaned as
    well. Non-string cells inside ``object`` columns (numbers, None, NaN) are
    left untouched. Nothing is returned.
    """
    text_columns = dataset.select_dtypes(include=["object", "string"]).columns
    for column in text_columns:
        values = dataset[column]
        if values.dtype == object:
            # Element-wise: the .str accessor would turn non-string cells into NaN.
            dataset[column] = values.map(
                lambda cell: cell.strip() if isinstance(cell, str) else cell
            )
        else:
            dataset[column] = values.str.strip()

def add_prefixes(dataset: pd.DataFrame, dict_prefixes: dict) -> pd.DataFrame:
    """
    Adds prefixes to specified columns in a DataFrame if they are not already present.

    The DataFrame is modified in place and the same object is returned.
    Missing values are never prefixed, and columns named in ``dict_prefixes``
    that do not exist in ``dataset`` are skipped.

    Args:
        dataset (pd.DataFrame): The input DataFrame.
        dict_prefixes (dict): A dictionary mapping column names to their prefixes.
            ``None`` or an empty mapping leaves the DataFrame unchanged.

    Returns:
        pd.DataFrame: The DataFrame with prefixes applied where needed.
    """
    # An absent/empty "add_prefixes" section means there is nothing to do.
    if not dict_prefixes:
        return dataset

    for column, prefix in dict_prefixes.items():
        if column not in dataset.columns:
            continue

        values = dataset[column]
        # Compare on the textual form so non-text columns can be checked as well
        as_text = values.astype(str)

        # Non-null values that do not already start with the prefix
        mask = values.notna() & ~as_text.str.startswith(prefix, na=False)
        if not mask.any():
            continue

        # A numeric/categorical column cannot hold the prefixed strings; widen it first.
        is_text_column = values.dtype == object or isinstance(values.dtype, pd.StringDtype)
        if not is_text_column:
            dataset[column] = values.astype(object)

        # Apply the prefix only to the selected rows
        dataset.loc[mask, column] = prefix + as_text[mask]

    return dataset

def create_ontology():
    """Return a new ``pronto.Ontology`` created with no arguments."""
    return pronto.Ontology()

def add_ontology_metadata(ontology, metadata):
    """
    Copy each key/value pair of ``metadata`` onto ``ontology.metadata``.

    Every key becomes an attribute name and its value the attribute value.
    Nothing is returned.
    """
    for entry in metadata.items():
        field_name, field_value = entry
        setattr(ontology.metadata, field_name, field_value)

def preprocess_dataset(dataset):
    """
    Run the three cleaning steps on ``dataset`` and return it.

    Steps, in order: rename the columns with ``MAPPING_COLUMN_NAMES``, strip
    whitespace from the text columns, and add the identifier prefixes from
    ``PREFIXES_MAPPING_IDS``. Both mappings are module-level names that must be
    defined before this function is called. The helpers modify ``dataset`` in
    place.
    """
    rename_columns(dataset, MAPPING_COLUMN_NAMES)
    strip_columns(dataset)
    return add_prefixes(dataset=dataset, dict_prefixes=PREFIXES_MAPPING_IDS)


def extract_all_ids(dataset, mapping_column_ids):
    """
    Collect the distinct identifiers found in the configured columns.

    Args:
        dataset: DataFrame holding the identifier columns.
        mapping_column_ids: Mapping whose values are dicts with a ``"name"``
            key (column label) and an optional ``"separator"`` key. When a
            separator is given, each cell is split on it, the pieces are
            trimmed and empty pieces are dropped. Without a separator the
            non-null cell values are used as they are.

    Returns:
        set: Every identifier found across the configured columns.
    """
    superset_unique_ids = set()
    print(mapping_column_ids)

    for column_spec in mapping_column_ids.values():
        column_name = column_spec["name"]
        # A spec without a "separator" entry is treated like separator=None.
        separator = column_spec.get("separator")

        column_ids = dataset[column_name].dropna()
        if separator is not None:
            # regex=False: split on the literal separator, exactly like the
            # Python str.split() used in add_class_relationships(). pandas would
            # otherwise interpret a multi-character separator as a regex.
            pieces = column_ids.str.split(separator, regex=False).explode().str.strip()
            column_ids = pieces[pieces != '']

        superset_unique_ids.update(column_ids)

    print(f"Total unique terms to create: {len(superset_unique_ids)}")
    return superset_unique_ids
    
def create_all_terms(ontology, set_of_ids):
    """
    Create one term per identifier via ``ontology.create_term``.

    Returns:
        dict: The created terms keyed by their identifier.
    """
    created = {}
    for identifier in set_of_ids:
        created[identifier] = ontology.create_term(identifier)
    return created

def add_properties(dataset, map_properties, terms_dict, column_id):
    """
    Set the name and the annotation properties of every term described in ``dataset``.

    Args:
        dataset: DataFrame with one row per term.
        map_properties: Property configuration with the keys ``"name"``
            (column holding the term name), ``"annotation"`` (mapping of column
            label -> dict with a ``"datatype"`` entry) and ``"references"``
            (list of column labels, stored as ``xsd:string`` literal annotations).
        terms_dict: Mapping of term identifier -> term object.
        column_id: Label of the column that holds the term identifier.

    Rows without a term identifier are skipped, and missing names/values are
    not written.

    Raises:
        KeyError: If a configured column is absent from ``dataset`` or a row's
            identifier is not present in ``terms_dict``.
    """
    print(f"TRACK: {column_id}")

    # Rows are read by position: attribute access on itertuples() rows fails for
    # labels that are not valid Python identifiers (pandas renames those fields).
    position = {label: index for index, label in enumerate(dataset.columns)}
    id_position = position[column_id]
    name_position = position[map_properties["name"]]
    annotation_specs = [
        (label, position[label], spec["datatype"])
        for label, spec in (map_properties.get("annotation") or {}).items()
    ]
    reference_specs = [
        (label, position[label])
        for label in (map_properties.get("references") or [])
    ]

    for row in dataset.itertuples(index=False, name=None):
        term_id = row[id_position]
        # A row without an identifier does not define a term.
        if pd.isna(term_id):
            continue
        term = terms_dict[term_id]

        # Add name property (only when the row actually provides one)
        term_name = row[name_position]
        if pd.notna(term_name):
            term.name = term_name

        # Add annotations properties
        for label, column_position, datatype in annotation_specs:
            value = row[column_position]
            if pd.notna(value):
                term.annotations.add(
                    LiteralPropertyValue(label, str(value), datatype=datatype)
                )

        # Add references
        for label, column_position in reference_specs:
            value = row[column_position]
            if pd.notna(value):
                term.annotations.add(
                    LiteralPropertyValue(label, str(value), datatype="xsd:string")
                )

def add_parent_relationships(term_definitions, id_columns, terms_dict):
    """
    Link every term to its parent term.

    ``term_definitions`` is expected to be indexed by the child identifier; the
    parent identifier is read from the column named by
    ``id_columns["parent_id"]["name"]``. Rows with a missing parent are skipped,
    as are pairs where either identifier has no usable entry in ``terms_dict``.
    """
    parents = term_definitions[id_columns["parent_id"]["name"]].dropna()
    lookup = terms_dict.get
    for child_key, parent_key in zip(parents.index, parents):
        child, parent = lookup(child_key), lookup(parent_key)
        if not (child and parent):
            continue
        child.superclasses().add(parent)

def add_class_relationships(term_definitions, id_columns, terms_dict):
    """
    Link every term to each of the classes listed in its class column.

    ``term_definitions`` is expected to be indexed by the term identifier. The
    class column and its separator come from ``id_columns["class_id"]``. Each
    cell is split on the separator and every piece is trimmed before it is
    looked up in ``terms_dict``. Terms or classes without a usable entry in
    ``terms_dict`` are skipped.
    """
    class_spec = id_columns["class_id"]
    column_label, delimiter = class_spec["name"], class_spec["separator"]
    listed_classes = term_definitions[column_label].dropna()

    for term_key, raw_classes in listed_classes.items():
        owner = terms_dict.get(term_key)
        if owner:
            for token in raw_classes.split(delimiter):
                target = terms_dict.get(token.strip())
                if target:
                    owner.superclasses().add(target)

def generate_ontology_from_table(dataset: pd.DataFrame, id_columns: dict, metadata_ontology:dict):
    """
    Build an ontology from a tabular dataset.

    Args:
        dataset: Table with one row per term; it is preprocessed (in place) first.
        id_columns: Mapping describing the identifier columns. It must contain
            the entries ``"term_id"``, ``"parent_id"`` and ``"class_id"``, each
            a dict with a ``"name"`` and, where relevant, a ``"separator"``.
        metadata_ontology: Key/value pairs copied onto the ontology metadata.

    Returns:
        The populated ontology.

    Note:
        The property configuration is read from the module-level
        ``ALL_PROPERTIES``, which must be defined before the call.
    """
    # Preprocess the dataset
    dataset = preprocess_dataset(dataset)

    # Create the ontology and attach its metadata
    ontology = create_ontology()
    add_ontology_metadata(ontology=ontology, metadata=metadata_ontology)

    # --- 1. Collect all unique IDs from all relevant columns ---
    # Use the caller-supplied column description rather than re-reading the global config.
    all_unique_ids = extract_all_ids(dataset, id_columns)

    # --- 2. Create all terms ONCE and store them in a dictionary for fast access ---
    terms_dict = create_all_terms(ontology=ontology, set_of_ids=all_unique_ids)

    # --- 3. Add properties and relationships ---
    # Only rows with a term identifier define a term.
    column_terms = id_columns["term_id"]["name"]
    print(column_terms)
    defined_rows = dataset.dropna(subset=[column_terms])
    term_definitions = defined_rows.set_index(column_terms)

    # 3.1 Add properties (rows without an identifier cannot be matched to a term)
    add_properties(dataset=defined_rows,
                   map_properties=ALL_PROPERTIES,
                   terms_dict=terms_dict,
                   column_id=column_terms
    )

    # 3.2 Add Parent relationships (is_a)
    add_parent_relationships(term_definitions=term_definitions,
                             id_columns=id_columns,
                             terms_dict=terms_dict
    )

    # 3.3 Add Class relationships (is_a)
    add_class_relationships(term_definitions=term_definitions,
                            id_columns=id_columns,
                            terms_dict=terms_dict
    )

    return ontology



In [None]:
# Settings of the transformation stage, taken from the loaded configuration.

# Old column label -> new column label, used by rename_columns().
MAPPING_COLUMN_NAMES = config_load["transformation"]["rename_columns"]

# Key/value pairs copied onto the ontology metadata.
METADATA_ONTOLOGY = config_load["transformation"]["generate_ontology"]["ontology_metadata"]

# Description of the identifier columns (term, parent, class) and their separators.
COLUMNS_FOR_RELATIONSHIPS = config_load["transformation"]["generate_ontology"]["columns_for_terms"]

# Property configuration read by add_properties() ("name", "annotation", "references").
ALL_PROPERTIES = config_load["transformation"]["generate_ontology"]["properties"]

# Column label -> identifier prefix, used by add_prefixes().
PREFIXES_MAPPING_IDS = config_load["transformation"]["add_prefixes"]

In [None]:
# Clean the loaded table (rename columns, strip text, add prefixes) and rebind
# `df_lipids` to the result. The helpers modify the frame in place, so earlier
# cells that rely on the original column labels may fail if re-run after this one.
# NOTE(review): generate_ontology_from_table() calls preprocess_dataset() again on its input.
df_lipids = preprocess_dataset(df_lipids)
df_lipids.head()

In [None]:
# Build the ontology from the preprocessed table, using the identifier-column
# description and the ontology metadata read from the configuration.
swissontology = generate_ontology_from_table(dataset=df_lipids,
                                             id_columns=COLUMNS_FOR_RELATIONSHIPS,
                                             metadata_ontology=METADATA_ONTOLOGY)

In [None]:
# Number of terms in the generated ontology.
# NOTE(review): the trailing figure is presumably the count from an earlier run — confirm.
len(list(swissontology.terms())) # 779260

## SERVING STAGE

### Export Ontology file

In [None]:
def export_ontology(file_path, ontology, ontology_serializer:str="obo"):
    """
    Write ``ontology`` to ``file_path``.

    The file is opened in binary write mode and handed to ``ontology.dump``
    together with ``ontology_serializer`` as the ``format`` argument
    (``"obo"`` by default). Nothing is returned.
    """
    with open(file_path, mode="wb") as sink:
        ontology.dump(sink, format=ontology_serializer)

In [None]:
# Output path and serializer for the ontology file, from the "serving" settings.
PARAMS_ONTOLOGY_FILE = SERVING_PARAMS["ontology"]

# Write the generated ontology to disk.
export_ontology(file_path=PARAMS_ONTOLOGY_FILE["path_file"],
                ontology=swissontology,
                ontology_serializer=PARAMS_ONTOLOGY_FILE["serializer"])

### Export Mapping file

In [None]:
def generate_mapping_file(filepath, format, delimiter, dataset, list_columns):
    """
    Write a subset of the dataset columns to a delimited text file.

    Args:
        filepath: Output path without extension (``str`` or ``pathlib.Path``).
        format: File extension appended to ``filepath`` after a dot.
        delimiter: Field separator used in the output file.
        dataset: DataFrame to export.
        list_columns: Columns of ``dataset`` to write, in output order.

    Returns:
        None. The file ``<filepath>.<format>`` is written without the index.
    """
    # Formatting (rather than "+") also accepts path-like objects.
    target_path = f"{filepath}.{format}"

    # Select dataset columns and store them as delimited text
    dataset[list_columns].to_csv(target_path, sep=delimiter, index=False)

    return None

In [None]:
# Output path, extension and delimiter for the mapping file, from the "serving" settings.
PARAMS_MAPPING_FILE = SERVING_PARAMS["mapping_file"]

# Export the configured columns of the preprocessed table.
generate_mapping_file(filepath=PARAMS_MAPPING_FILE["path_file"],
                      format=PARAMS_MAPPING_FILE["format"],
                      delimiter=PARAMS_MAPPING_FILE["delimiter"],
                      dataset=df_lipids,
                      list_columns=TRANSFORMATION_PARAMS["generate_mapping_file"]["columns"])