In [38]:
from pathlib import Path
from typing import Optional, Dict
import pandas as pd
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDFS, DCTERMS, OWL


In [39]:

# --- Resolve paths relative to a notebook in notebooks/ (sibling to src/) ---
NB_DIR  = Path.cwd().resolve()
SRC_DIR = NB_DIR.parent / "src"
if not SRC_DIR.exists():
    alt = NB_DIR / "src"
    if alt.exists():
        SRC_DIR = alt
if not SRC_DIR.exists():
    raise FileNotFoundError(f"Could not find src/ near {NB_DIR}")

DATA_DIR = SRC_DIR / "data"
if not DATA_DIR.exists():
    raise FileNotFoundError(f"Could not find src/data/ at {DATA_DIR}")

print("SRC_DIR :", SRC_DIR)
print("DATA_DIR:", DATA_DIR)

# --- Namespaces ---
QUDT = Namespace("http://qudt.org/schema/qudt/")
SKOS = Namespace("http://www.w3.org/2004/02/skos/core#")

# --- Properties we’ll look in for textual definitions ---
DEF_PROPS = [
    SKOS.definition,
    URIRef("http://purl.obolibrary.org/obo/IAO_0000115"),  # IAO:definition
    RDFS.comment,
    DCTERMS.description,
    URIRef("http://schema.org/description"),
    QUDT.definition,
    QUDT.description,
    RDFS.label
]

SRC_DIR : /home/sanne/Projects/Ontology-Tradecraft/projects/project-3/assignment/src
DATA_DIR: /home/sanne/Projects/Ontology-Tradecraft/projects/project-3/assignment/src/data


In [40]:
def find_ttl_for(key: str) -> Optional[Path]:
    # First try exact common filenames
    candidates = [
        SRC_DIR / f"{key}.ttl",
        SRC_DIR / f"{key}-core.ttl",
    ]
    for c in candidates:
        if c.exists():
            return c
    # Fallback: fuzzy search in SRC_DIR *.ttl
    key_lower = key.lower()
    for p in SRC_DIR.glob("*.ttl"):
        if key_lower in p.stem.lower():
            return p
    return None

# Define the ontology file map, auto-discovered
ONTOLOGY_FILES: Dict[str, Path] = {}
for k in ["bfo-core", "ies", "ccom", "qudt", "ccot", "time"]:
    p = find_ttl_for(k)
    if p:
        ONTOLOGY_FILES[k] = p

print("ONTOLOGY_FILES:")
for k, v in ONTOLOGY_FILES.items():
    print(f"  {k:>4} -> {v.name}")

ONTOLOGY_FILES:
  bfo-core -> bfo-core.ttl
   ies -> ies.ttl
  ccom -> ccom.ttl
  qudt -> qudt.ttl
  ccot -> ccot.ttl
  time -> time.ttl


In [41]:
def load_graph(path: Path) -> Graph:
    # g = Graph()
    # g.parse(path)  # rdflib guesses turtle from extension
    # return g
    return parse_graph(path, follow_imports=True, depth=1)


def parse_graph(path: Path, follow_imports: bool, depth: int) -> Graph:
    """
    Parses an RDF graph from the specified file path and optionally follows
    and parses imported RDF graphs up to a specified depth. The function
    produces a combined graph containing the original RDF content and any
    imported content up to the specified depth.

    :param path: Path to the RDF file to be parsed
    :type path: Path
    :param follow_imports: Indicates whether to follow imports and parse
        imported RDF documents
    :type follow_imports: bool
    :param depth: Depth up to which imported documents should be parsed.
        A depth of 0 indicates that only the given RDF file will be parsed,
        without following any imports
    :type depth: int
    :return: Parsed RDF graph with content from the specified file and
        optionally its imports up to the specified depth
    :rtype: Graph
    """
    g = Graph(); g.parse(path.as_posix())
    if follow_imports and depth > 0:
        from rdflib import URIRef
        frontier = [o for o in g.objects(None, OWL.imports) if isinstance(o, URIRef)]
        seen = set(); d = 0
        while frontier and d < depth:
            nxt = []
            for iri in frontier:
                if iri in seen: continue
                seen.add(iri)
                try:
                    g.parse(str(iri))
                    nxt += [o for o in g.objects(None, OWL.imports) if isinstance(o, URIRef)]
                except Exception:
                    pass
            frontier = nxt; d += 1
    return g
def _pick_best_literal_text(values) -> Optional[str]:
    """
    From a set/list of RDF literals, choose:
      1) first with @en (any case), else
      2) first with no language, else
      3) the first value.
    """
    vals = [v for v in values if isinstance(v, Literal)]
    if not vals:
        return None

    def rank(lit: Literal) -> int:
        if lit.language and str(lit.language).lower() == "en":
            return 0
        if lit.language is None:
            return 1
        return 2

    vals.sort(key=rank)
    return str(vals[0]).strip()

def get_definition(g: Graph, iri: str) -> Optional[str]:
    """Return one textual definition for an IRI using DEF_PROPS, with language preference."""
    if not iri or not isinstance(iri, str):
        return None
    s = URIRef(iri)
    hits = []
    for prop in DEF_PROPS:
        for o in g.objects(s, prop):
            print(s, prop)
            if isinstance(o, Literal):
                hits.append(o)
    return _pick_best_literal_text(hits)

def discover_mapping_file(a: str, b: str) -> Optional[Path]:
    """
    Return the path to 'a-b-structural-matches.xlsx' or 'b-a-structural-matches.xlsx'
    if present in DATA_DIR; prefer a-b if both exist.
    """
    first  = DATA_DIR / f"{a}-{b}-structural-matches.xlsx"
    second = DATA_DIR / f"{b}-{a}-structural-matches.xlsx"
    if first.exists():
        return first
    if second.exists():
        return second
    return None

def find_iri_columns(df: pd.DataFrame):
    """
    Heuristic: pick the first two columns whose names contain 'iri' (case-insensitive).
    Falls back to ('left_iri', 'right_iri') or ('iri_left','iri_right') if present.
    """
    cols = [c for c in df.columns if "iri" in c.lower()]
    if len(cols) >= 2:
        return cols[0], cols[1]
    for pair in (("left_iri", "right_iri"), ("iri_left", "iri_right")):
        if all(c in df.columns for c in pair):
            return pair
    raise ValueError(
        "Could not detect two IRI columns in the mapping file. "
        f"Columns present: {list(df.columns)}"
    )

def augment_with_definitions(mapping_path: Path, left_key: str, right_key: str) -> Path:
    """
    Load a mapping Excel, look up definitions for left/right IRIs using the
    ontology TTLs for left_key/right_key (from ONTOLOGY_FILES), then write
    '*-with-defs.xlsx' alongside it with columns:
      - left_definition
      - right_definition
    """
    # Ensure the TTLs exist in the ONTOLOGY_FILES map
    left_ttl  = ONTOLOGY_FILES.get(left_key)
    right_ttl = ONTOLOGY_FILES.get(right_key)
    if not left_ttl or not right_ttl:
        raise FileNotFoundError(
            f"Missing TTL path for '{left_key}' or '{right_key}'. "
            f"Known: {ONTOLOGY_FILES}"
        )
    if not left_ttl.exists() or not right_ttl.exists():
        raise FileNotFoundError(f"Missing TTL: {left_ttl} or {right_ttl}")

    print(f"Reading {mapping_path.name} ...")
    df = pd.read_excel(mapping_path)
    left_col, right_col = find_iri_columns(df)

    # Load the ontologies
    g_left  = load_graph(left_ttl)
    g_right = load_graph(right_ttl)

    #print(df[left_col], df[right_col])

    # Add BOTH definition columns
    df["left_definition"]  = df[left_col].apply(lambda iri: get_definition(g_left, iri)  if pd.notna(iri) else None)
    df["right_definition"] = df[right_col].apply(lambda iri: get_definition(g_right, iri) if pd.notna(iri) else None)

    out_path = mapping_path.with_name(mapping_path.stem + "-with-defs.xlsx")
    df.to_excel(out_path, index=False)
    print(f"✅ Wrote: {out_path}")
    return out_path




In [42]:
pairs = [("bfo-core", "ies"), ("ccom", "qudt"), ("ccot", "time")]
pairs = [("ccom", "qudt")]

for (a, b) in pairs:
    mp = discover_mapping_file(a, b)
    if mp is None:
        print(f"⚠️  No mapping file found for {a}-{b} (or {b}-{a}) in {DATA_DIR}")
        continue
    try:
        augment_with_definitions(mp, a, b)
    except Exception as e:
        print(f"✗ Failed on {mp.name}: {e}")


Reading ccom-qudt-structural-matches.xlsx ...


http://www.linkedmodel.org/schema/?xml version="1.0"? does not look like a valid URI, trying to serialize this will break.
rdf:RDF
    xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    xmlns:dcterms="http://purl.org/dc/terms/"
    xmlns:voag="http://voag.linkedmodel.org/voag#"
    xmlns:vaem="http://www.linkedmodel.org/schema/vaem#"
    xmlns:dtype="http://www.linkedmodel.org/schema/dtype#"
    xmlns:owl="http://www.w3.org/2002/07/owl#"
    xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
    xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
  xml:base="http://www.linkedmodel.org/schema/dtype"  does not look like a valid URI, trying to serialize this will break.
rdf:Description rdf:about="#dateUnion" does not look like a valid URI, trying to serialize this will break.
http://www.linkedmodel.org/schema/?xml version="1.0"? does not look like a valid URI, trying to serialize this will break.
rdf:RDF
    xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    xmlns:voag="http:/

https://www.commoncoreontologies.org/ont00000853 http://www.w3.org/2004/02/skos/core#definition
https://www.commoncoreontologies.org/ont00000853 http://www.w3.org/2000/01/rdf-schema#label
https://www.commoncoreontologies.org/ont00000853 http://www.w3.org/2004/02/skos/core#definition
https://www.commoncoreontologies.org/ont00000853 http://www.w3.org/2000/01/rdf-schema#label
https://www.commoncoreontologies.org/ont00000853 http://www.w3.org/2004/02/skos/core#definition
https://www.commoncoreontologies.org/ont00000853 http://www.w3.org/2000/01/rdf-schema#label
https://www.commoncoreontologies.org/ont00000853 http://www.w3.org/2004/02/skos/core#definition
https://www.commoncoreontologies.org/ont00000853 http://www.w3.org/2000/01/rdf-schema#label
https://www.commoncoreontologies.org/ont00000853 http://www.w3.org/2004/02/skos/core#definition
https://www.commoncoreontologies.org/ont00000853 http://www.w3.org/2000/01/rdf-schema#label
https://www.commoncoreontologies.org/ont00000853 http://www.