In [14]:
import json
import glob
import os

def fetch_all_keys(json_path):
    """Collect every key that occurs anywhere inside a JSON file.

    Keys of nested dictionaries are reported in dotted form
    (``parent.child``). Lists are transparent: keys of dictionaries found
    inside a list are reported under the prefix of the list itself, with no
    index component.

    Args:
        json_path: Path of a UTF-8 encoded JSON file.

    Returns:
        A set with one dotted key path per distinct key location.
    """
    with open(json_path, "r", encoding="utf-8") as handle:
        document = json.load(handle)

    found = set()
    # Explicit work list of (node, dotted prefix) pairs instead of recursion.
    pending = [(document, "")]
    while pending:
        node, prefix = pending.pop()
        if isinstance(node, dict):
            for name, child in node.items():
                dotted = f"{prefix}.{name}" if prefix else name
                found.add(dotted)
                pending.append((child, dotted))
        elif isinstance(node, list):
            # List items inherit the prefix of the list unchanged.
            pending.extend((element, prefix) for element in node)

    return found

# Discover every run summary under MODEL_PROVENANCE. The list is sorted so
# that files are always processed in the same order.
all_json_files = sorted(glob.glob("MODEL_PROVENANCE/*/*_run_summary.json"))

# Union of the dotted key paths found in all run summaries.
collected_keys = set()
for json_file in all_json_files:
    keys = fetch_all_keys(json_file)
    collected_keys.update(keys)

# Build the mapping: by default every key maps to itself.
# The keys are sorted because set iteration order for strings changes between
# interpreter runs; without sorting, the saved file would be reordered on
# every execution even when its content is unchanged.
mapping = {key: {"@id": key} for key in sorted(collected_keys)}

# Special case for timestamps: map them to the PROV timing properties and
# declare them as xsd:dateTime values.
if "start_time" in mapping:
    mapping["start_time"]["@id"] = "prov:startedAtTime"
    mapping["start_time"]["@type"] = "xsd:dateTime"
if "end_time" in mapping:
    mapping["end_time"]["@id"] = "prov:endedAtTime"
    mapping["end_time"]["@type"] = "xsd:dateTime"

# Persist the mapping, creating the output folder on first use.
os.makedirs("mappings", exist_ok=True)
with open("mappings/full_mapping.json", "w", encoding="utf-8") as f:
    json.dump(mapping, f, indent=2)

print(f"✅ Full dynamic mapping file created: mappings/full_mapping.json with {len(mapping)} fields!")


✅ Full dynamic mapping file created: mappings/full_mapping.json with 149 fields!


In [24]:
# import os
# import glob
# import json
# from datetime import datetime, timezone
# from rdflib import Graph

# import os
# def iso8601(ms):
#     """Convert milliseconds since epoch to ISO8601 UTC."""
#     return datetime.fromtimestamp(ms / 1000, tz=timezone.utc).isoformat()
# # Load the context mapping
# with open("mappings/full_mapping.json", "r", encoding="utf-8") as f:
#     ctx = json.load(f)

# # Loop through your run_summary files
# for json_path in glob.glob("MODEL_PROVENANCE/*/*_run_summary.json"):
#     basename   = os.path.basename(json_path)
#     model_name = basename.rsplit("_run_summary.json", 1)[0]

#     with open(json_path, "r", encoding="utf-8") as f:
#         summary = json.load(f)

#     doc = {
#         "@context": ctx,
#         "run_id": summary.get("run_id", ""),
#         "run_name": summary.get("run_name", ""),
#         "experiment_id": summary.get("experiment_id", ""),
#         "params": summary.get("params", {}),
#         "metrics": summary.get("metrics", {}),
#         "artifacts": summary.get("artifacts", []),
#         "tags": summary.get("tags", {}),
#         "start_time": iso8601(summary["start_time"])
#     }

#     if summary.get("end_time") is not None:
#         doc["end_time"] = iso8601(summary["end_time"])

#     doc["used"] = summary.get("tags", {}).get("dataset_uri") or []
#     doc["generated"] = [
#         art.get("uri") or art.get("path")
#         for art in summary.get("artifacts", [])
#     ]

#     # Save .jsonld
#     out_jsonld = os.path.join("MODEL_PROVENANCE", model_name, f"{model_name}.jsonld")
#     with open(out_jsonld, "w", encoding="utf-8") as f:
#         json.dump(doc, f, indent=2)

#     # # Save .ttl
#     # g = Graph().parse(data=json.dumps(doc), format="json-ld")
#     # out_ttl = os.path.join("MODEL_PROVENANCE", model_name, f"{model_name}.ttl")
#     # g.serialize(destination=out_ttl, format="turtle")

#     print(f"✅ Converted {basename} → {os.path.basename(out_jsonld)}")
import os
import glob
import json
from datetime import datetime, timezone

# Helper to convert milliseconds to ISO8601
def iso8601(ms):
    """Render an epoch timestamp given in milliseconds as an ISO 8601 string.

    Args:
        ms: Milliseconds since the Unix epoch.

    Returns:
        The same instant in UTC, formatted by ``datetime.isoformat()``
        (``"1970-01-01T00:00:00+00:00"`` for ``0``).
    """
    seconds = ms / 1000
    moment = datetime.fromtimestamp(seconds, tz=timezone.utc)
    return moment.isoformat()

# Helper to clean and build safe IDs
def safe_id(prefix, key):
    """Build an ``ex:``-prefixed identifier from a free-form key.

    The key is lower-cased, then every space, slash, backslash and plus sign
    is replaced by an underscore. All other characters are kept unchanged.

    Args:
        prefix: Category label placed between ``ex:`` and the cleaned key.
        key: Free-form name (string) to turn into an identifier.

    Returns:
        A string of the form ``ex:<prefix>_<cleaned key>``.
    """
    separators = str.maketrans({" ": "_", "/": "_", "\\": "_", "+": "_"})
    cleaned = key.lower().translate(separators)
    return "ex:{}_{}".format(prefix, cleaned)

# JSON-LD context shared by every generated document: the three namespace
# prefixes first, then one term definition for each literal-valued property
# used on the entity nodes (each term simply maps to itself).
context = {
    "prov": "http://www.w3.org/ns/prov#",
    "ex": "http://example.org/mlprovenance#",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
}
context.update({term: {"@id": term} for term in ("prov:value", "prov:location")})

# Convert every run summary into a JSON-LD document that sits next to it.
# Each document holds one prov:Activity node for the run, followed by one
# prov:Entity node per parameter, metric, artifact and (optionally) dataset.
for json_path in glob.glob("MODEL_PROVENANCE/*/*_run_summary.json"):
    basename   = os.path.basename(json_path)
    model_name = basename.rsplit("_run_summary.json", 1)[0]

    with open(json_path, "r", encoding="utf-8") as f:
        summary = json.load(f)

    graph = []

    # Main Run node (Activity). Falls back to the model name when the summary
    # carries no run_id.
    run_id = f"ex:run_{summary.get('run_id', model_name)}"
    run_node = {
        "@id": run_id,
        "@type": "prov:Activity",
    }

    # Both timestamps are optional: a missing or null value is skipped rather
    # than crashing the whole conversion. They are written as typed value
    # objects so that consumers see xsd:dateTime literals, not plain strings.
    for time_key, prov_term in (
        ("start_time", "prov:startedAtTime"),
        ("end_time", "prov:endedAtTime"),
    ):
        if summary.get(time_key) is not None:
            run_node[prov_term] = {
                "@value": iso8601(summary[time_key]),
                "@type": "xsd:dateTime",
            }

    # References from the run node to the entity nodes created below.
    had_parameters = []
    had_quality = []
    used_entities = []
    generated_entities = []

    # NOTE(review): entity ids below are built from the name only, not from
    # the run id, so the same id is reused in every per-run document.

    # Params -> prov:Entity ex:Parameter
    # (`or {}` also covers an explicit null in the summary.)
    for param_name, param_value in (summary.get("params") or {}).items():
        param_id = safe_id("param", param_name)
        param_node = {
            "@id": param_id,
            "@type": ["prov:Entity", "ex:Parameter"],
            "prov:value": param_value
        }
        graph.append(param_node)
        had_parameters.append({"@id": param_id})

    # Metrics -> prov:Entity ex:Metric
    for metric_name, metric_value in (summary.get("metrics") or {}).items():
        metric_id = safe_id("metric", metric_name)
        metric_node = {
            "@id": metric_id,
            "@type": ["prov:Entity", "ex:Metric"],
            "prov:value": metric_value
        }
        graph.append(metric_node)
        had_quality.append({"@id": metric_id})

    # Artifacts -> prov:Entity ex:Artifact
    for artifact in (summary.get("artifacts") or []):
        art_path = artifact.get("path", "artifact_unknown")
        artifact_id = safe_id("artifact", art_path)
        artifact_node = {
            "@id": artifact_id,
            "@type": ["prov:Entity", "ex:Artifact"],
        }
        if "uri" in artifact:
            artifact_node["prov:location"] = artifact["uri"]
        graph.append(artifact_node)
        generated_entities.append({"@id": artifact_id})

    # Used dataset (optional), taken from the "dataset_uri" tag.
    dataset_uri = (summary.get("tags") or {}).get("dataset_uri")
    if dataset_uri:
        dataset_id = safe_id("dataset", dataset_uri)
        dataset_node = {
            "@id": dataset_id,
            "@type": ["prov:Entity", "ex:Dataset"],
            "prov:location": dataset_uri
        }
        graph.append(dataset_node)
        used_entities.append({"@id": dataset_id})

    # Attach linked entities to the run node; empty lists are left out.
    # NOTE(review): prov:hadParameter and prov:hadQuality do not appear to be
    # PROV-O terms - confirm, and consider moving them to the ex: namespace.
    if had_parameters:
        run_node["prov:hadParameter"] = had_parameters
    if had_quality:
        run_node["prov:hadQuality"] = had_quality
    if used_entities:
        run_node["prov:used"] = used_entities
    if generated_entities:
        run_node["prov:generated"] = generated_entities

    # The run node goes first so it is the first entry of @graph.
    graph.insert(0, run_node)

    # Final JSON-LD document
    doc = {
        "@context": context,
        "@graph": graph
    }

    # Save .jsonld
    out_dir = os.path.join("MODEL_PROVENANCE", model_name)
    os.makedirs(out_dir, exist_ok=True)
    out_jsonld = os.path.join(out_dir, f"{model_name}.jsonld")
    with open(out_jsonld, "w", encoding="utf-8") as f:
        json.dump(doc, f, indent=2)

    print(f"✅ Full semantic JSON-LD created for {basename} -> {os.path.basename(out_jsonld)}")


✅ Full semantic JSON-LD created for RandomForest_Iris_v20250425_121328_run_summary.json -> RandomForest_Iris_v20250425_121328.jsonld
✅ Full semantic JSON-LD created for RandomForest_Iris_v20250425_125653_run_summary.json -> RandomForest_Iris_v20250425_125653.jsonld
✅ Full semantic JSON-LD created for RandomForest_Iris_v20250425_131407_run_summary.json -> RandomForest_Iris_v20250425_131407.jsonld
✅ Full semantic JSON-LD created for RandomForest_Iris_v20250425_132526_run_summary.json -> RandomForest_Iris_v20250425_132526.jsonld
✅ Full semantic JSON-LD created for RandomForest_Iris_v20250425_135553_run_summary.json -> RandomForest_Iris_v20250425_135553.jsonld
✅ Full semantic JSON-LD created for RandomForest_Iris_v20250425_135900_run_summary.json -> RandomForest_Iris_v20250425_135900.jsonld


In [5]:

from rdflib import Graph
from graphviz import Digraph
from IPython.display import display
import glob
import hashlib
import os

# Helper functions
def safe_id(text):
    """Return the MD5 hex digest of ``str(text)`` encoded as UTF-8.

    The digest is used as a graphviz node name, so that arbitrary RDF terms
    map to identifiers made only of hexadecimal characters.
    """
    digest = hashlib.md5()
    digest.update(str(text).encode("utf-8"))
    return digest.hexdigest()

def pretty_label(uri):
    """Return a short, human-readable label for an RDF term.

    The label is the text after the last ``#`` when the term contains one,
    otherwise the text after the last ``/``, otherwise the whole string.

    If that rule leaves nothing because the term ends in a separator (for
    example a namespace IRI or a directory path), the trailing separators are
    dropped and the rule is applied again, so such nodes are not drawn with a
    blank label.

    Args:
        uri: Any object; it is converted with ``str()`` first.

    Returns:
        The shortened label. A string that is empty or consists only of
        separators is returned unchanged.
    """
    text = str(uri)
    if "#" in text:
        label = text.rsplit("#", 1)[-1]
    elif "/" in text:
        label = text.rsplit("/", 1)[-1]
    else:
        label = text
    if label:
        return label

    # Only reached when the term ends in '#' or '/': retry without them.
    trimmed = text.rstrip("#/")
    return pretty_label(trimmed) if trimmed else text

# Collect every JSON-LD document produced for the runs.
jsonld_files = glob.glob('MODEL_PROVENANCE/*/*.jsonld')

print(f"✅ Found {len(jsonld_files)} JSON-LD files.")

# Draw one PNG per document. A failure on one file is reported and the loop
# moves on to the next file.
for file_path in jsonld_files:
    print(f"\n📄 Visualizing and Saving for: {file_path}")

    try:
        # Load the document as an RDF graph.
        g = Graph()
        g.parse(file_path, format="json-ld")

        dot = Digraph(comment=f'Graph for {os.path.basename(file_path)}')
        dot.attr(rankdir='LR')  # left to right

        # Node names already declared, so each term is drawn only once.
        nodes = set()

        # One edge per triple; subject and object become nodes on first sight.
        for subj, pred, obj in g:
            subj_id, obj_id = safe_id(subj), safe_id(obj)

            for node_id, term in ((subj_id, subj), (obj_id, obj)):
                if node_id in nodes:
                    continue
                dot.node(node_id, label=pretty_label(term))
                nodes.add(node_id)

            dot.edge(subj_id, obj_id, label=pretty_label(pred))

        # The PNG is written next to the JSON-LD file, sharing its base name.
        file_base = os.path.splitext(file_path)[0]  # removes ".jsonld"
        render_target = file_base + "JSONLD_viz"
        output_png_path = render_target + ".png"

        dot.render(render_target, format='png', cleanup=True)
        print(f"✅ PNG graph saved at: {output_png_path}")

    except Exception as e:
        print(f"❌ Error processing {file_path}: {e}")

print("\n🏁 Finished generating all graph visualizations!")



✅ Found 6 JSON-LD files.

📄 Visualizing and Saving for: MODEL_PROVENANCE\RandomForest_Iris_v20250425_121328\RandomForest_Iris_v20250425_121328.jsonld
✅ PNG graph saved at: MODEL_PROVENANCE\RandomForest_Iris_v20250425_121328\RandomForest_Iris_v20250425_121328JSONLD_viz.png

📄 Visualizing and Saving for: MODEL_PROVENANCE\RandomForest_Iris_v20250425_125653\RandomForest_Iris_v20250425_125653.jsonld
✅ PNG graph saved at: MODEL_PROVENANCE\RandomForest_Iris_v20250425_125653\RandomForest_Iris_v20250425_125653JSONLD_viz.png

📄 Visualizing and Saving for: MODEL_PROVENANCE\RandomForest_Iris_v20250425_131407\RandomForest_Iris_v20250425_131407.jsonld
✅ PNG graph saved at: MODEL_PROVENANCE\RandomForest_Iris_v20250425_131407\RandomForest_Iris_v20250425_131407JSONLD_viz.png

📄 Visualizing and Saving for: MODEL_PROVENANCE\RandomForest_Iris_v20250425_132526\RandomForest_Iris_v20250425_132526.jsonld
✅ PNG graph saved at: MODEL_PROVENANCE\RandomForest_Iris_v20250425_132526\RandomForest_Iris_v20250425_1325

In [19]:
# import os
# import json
# import glob
# import pandas as pd
# from rdflib import Graph

# # ---------- Helper functions -------------

# def load_as_dict(path):
#     """Load a JSON or JSON-LD/Turtle file as dictionary."""
#     if path.endswith((".ttl", ".turtle")):
#         g = Graph()
#         g.parse(path, format="turtle")
#         return json.loads(g.serialize(format="json-ld", indent=2))
#     else:
#         with open(path, encoding="utf-8") as f:
#             return json.load(f)

# def compare_json(a, b, path=""):
#     """Recursively compare two JSON structures."""
#     diffs = []
#     if isinstance(a, dict) and isinstance(b, dict):
#         a = {k: v for k, v in a.items() if k != "@context"}
#         b = {k: v for k, v in b.items() if k != "@context"}
#         all_keys = set(a) | set(b)
#         for k in all_keys:
#             new_path = f"{path}/{k}" if path else k
#             if k not in a:
#                 diffs.append({"path": new_path, "type": "added", "a": None, "b": b[k]})
#             elif k not in b:
#                 diffs.append({"path": new_path, "type": "removed", "a": a[k], "b": None})
#             else:
#                 diffs.extend(compare_json(a[k], b[k], new_path))
#     elif isinstance(a, list) and isinstance(b, list):
#         for i, (ia, ib) in enumerate(zip(a, b)):
#             diffs.extend(compare_json(ia, ib, f"{path}[{i}]"))
#         if len(a) < len(b):
#             for i in range(len(a), len(b)):
#                 diffs.append({"path": f"{path}[{i}]", "type": "added", "a": None, "b": b[i]})
#         elif len(a) > len(b):
#             for i in range(len(b), len(a)):
#                 diffs.append({"path": f"{path}[{i}]", "type": "removed", "a": a[i], "b": None})
#     else:
#         if a != b:
#             diffs.append({"path": path, "type": "changed", "a": a, "b": b})
#     return diffs

# # ---------- Main comparison -------------

# # 1. Scan for all run folders
# base_dir = "MODEL_PROVENANCE"
# runs = [d for d in glob.glob(os.path.join(base_dir, "*")) if os.path.isdir(d)]

# # 2. Compare only JSON vs JSON-LD
# all_diffs = []

# for run_dir in runs:
#     model_name = os.path.basename(run_dir)
    
#     json_path = os.path.join(run_dir, f"{model_name}_run_summary.json")
#     jsonld_path = os.path.join(run_dir, f"{model_name}.jsonld")

#     if os.path.exists(json_path) and os.path.exists(jsonld_path):
#         try:
#             json_obj = load_as_dict(json_path)
#             jsonld_obj = load_as_dict(jsonld_path)

#             diffs = compare_json(json_obj, jsonld_obj)
#             if diffs:
#                 print(f"\n🔎 Differences for {model_name}: {len(diffs)} differences found")
#                 all_diffs.extend(diffs)
#             else:
#                 print(f"✅ {model_name}: No differences detected")
        
#         except Exception as e:
#             print(f"❌ Error comparing {model_name}: {e}")

#     else:
#         print(f"⚠️ Missing files in {model_name}: Skipping.")

# # 3. Summarize if needed
# if all_diffs:
#     df_diffs = pd.DataFrame(all_diffs)
#     print("\nSummary of all differences:")
#     print(df_diffs['type'].value_counts())
# else:
#     print("\n🎉 All JSON and JSON-LD files match perfectly!")




🔎 Differences for RandomForest_Iris_v20250425_121328: 4 differences found

🔎 Differences for RandomForest_Iris_v20250425_125653: 5 differences found

🔎 Differences for RandomForest_Iris_v20250425_131407: 4 differences found

🔎 Differences for RandomForest_Iris_v20250425_132526: 4 differences found

🔎 Differences for RandomForest_Iris_v20250425_135553: 5 differences found

🔎 Differences for RandomForest_Iris_v20250425_135900: 5 differences found

Summary of all differences:
type
added      12
removed     9
changed     6
Name: count, dtype: int64


In [21]:
# import json
# import glob
# import os

# def fetch_all_keys(json_path):
#     """Recursively fetch all keys from a JSON file."""
#     keys = set()

#     def _recursive_extract(obj, prefix=""):
#         if isinstance(obj, dict):
#             for k, v in obj.items():
#                 full_key = f"{prefix}.{k}" if prefix else k
#                 keys.add(full_key)
#                 _recursive_extract(v, prefix=full_key)
#         elif isinstance(obj, list):
#             for item in obj:
#                 _recursive_extract(item, prefix=prefix)

#     with open(json_path, "r", encoding="utf-8") as f:
#         data = json.load(f)
#         _recursive_extract(data)

#     return keys

# def create_mapping_from_keys(keys):
#     """Create a simple mapping where each key maps to itself, with special rules for timestamps."""
#     mapping = {}

#     for key in sorted(keys):
#         if "." not in key:
#             # Top-level fields
#             if key in ["start_time", "end_time"]:
#                 mapping[key] = {
#                     "@id": f"prov:{'startedAtTime' if key == 'start_time' else 'endedAtTime'}",
#                     "@type": "xsd:dateTime"
#                 }
#             elif key in ["run_id", "run_name", "experiment_id"]:
#                 mapping[key] = {"@id": key}
#             else:
#                 mapping[key] = {"@id": key}
#         else:
#             # Nested fields
#             mapping[key] = {"@id": key}

#     # Attach namespaces
#     mapping["@context"] = {
#         "prov": "http://www.w3.org/ns/prov#",
#         "xsd":  "http://www.w3.org/2001/XMLSchema#"
#     }

#     return mapping

# # --- Main execution ---

# # 1. Fetch keys from all JSONs
# all_keys = set()
# for json_path in glob.glob("MODEL_PROVENANCE/*/*_run_summary.json"):
#     keys = fetch_all_keys(json_path)
#     all_keys.update(keys)

# # 2. Create mapping
# mapping_dict = create_mapping_from_keys(all_keys)

# # 3. Save mapping
# output_dir = "mapping_files"
# os.makedirs(output_dir, exist_ok=True)
# output_path = os.path.join(output_dir, "dynamic_mapping.json")

# with open(output_path, "w", encoding="utf-8") as f:
#     json.dump(mapping_dict, f, indent=2)

# print(f"✅ Mapping file created: {output_path}")


✅ Mapping file created: mapping_files\dynamic_mapping.json


In [22]:
# import json
# import os
# import glob
# from rdflib import Graph
# from datetime import datetime, timezone

# # === Utility functions ===

# def iso8601(ms):
#     """Convert milliseconds since epoch to ISO8601 UTC."""
#     return datetime.fromtimestamp(ms / 1000, tz=timezone.utc).isoformat()

# def load_mapping(mapping_path="mapping_files/dynamic_mapping.json"):
#     """Load dynamic mapping file."""
#     with open(mapping_path, "r", encoding="utf-8") as f:
#         return json.load(f)

# def map_json_fields(summary, mapping):
#     """Apply dynamic field mapping to summary dict."""
#     doc = {"@context": mapping["@context"]}

#     for key, map_info in mapping.items():
#         if key == "@context":
#             continue

#         value = get_nested(summary, key)
#         if value is not None:
#             mapped_key = map_info["@id"]
#             # Apply ISO8601 if type is datetime
#             if map_info.get("@type") == "xsd:dateTime":
#                 value = iso8601(value)
#             doc[mapped_key] = value

#     return doc

# def get_nested(data, dotted_key):
#     """Safely get nested keys like artifacts.uri."""
#     parts = dotted_key.split(".")
#     for part in parts:
#         if isinstance(data, dict):
#             data = data.get(part)
#         else:
#             return None
#     return data

# # === Main execution ===

# # Load the dynamic mapping
# mapping = load_mapping()

# # Process all summaries
# all_json_files = glob.glob("MODEL_PROVENANCE/*/*_run_summary.json")

# for json_path in all_json_files:
#     base_dir = os.path.dirname(json_path)
#     basename = os.path.basename(json_path)
#     model_name = basename.replace("_run_summary.json", "")

#     # Load JSON
#     with open(json_path, "r", encoding="utf-8") as f:
#         summary = json.load(f)

#     # Map using dynamic mapping
#     jsonld_doc = map_json_fields(summary, mapping)

#     # Save as .jsonld
#     jsonld_path = os.path.join(base_dir, f"{model_name}.jsonld")
#     with open(jsonld_path, "w", encoding="utf-8") as f:
#         json.dump(jsonld_doc, f, indent=2)

#     # Convert to .ttl
#     g = Graph()
#     g.parse(data=json.dumps(jsonld_doc), format="json-ld")
#     ttl_path = os.path.join(base_dir, f"{model_name}.ttl")
#     g.serialize(destination=ttl_path, format="turtle")

#     print(f"✅ {model_name}: JSON-LD and TTL generated")

# print("🚀 All model runs converted successfully!")


✅ RandomForest_Iris_v20250425_121328: JSON-LD and TTL generated
✅ RandomForest_Iris_v20250425_125653: JSON-LD and TTL generated
✅ RandomForest_Iris_v20250425_131407: JSON-LD and TTL generated
✅ RandomForest_Iris_v20250425_132526: JSON-LD and TTL generated
✅ RandomForest_Iris_v20250425_135553: JSON-LD and TTL generated
✅ RandomForest_Iris_v20250425_135900: JSON-LD and TTL generated
🚀 All model runs converted successfully!


In [23]:
# import json
# from rdflib import Graph

# def convert_jsonld_to_rdfxml(jsonld_path, rdfxml_out_path):
#     # 1. Load your JSON-LD
#     with open(jsonld_path, "r", encoding="utf-8") as f:
#         jsonld_data = json.load(f)

#     # 2. Parse it into an RDF Graph
#     g = Graph()
#     g.parse(data=json.dumps(jsonld_data), format="json-ld")

#     # 3. Serialize it into RDF/XML
#     g.serialize(destination=rdfxml_out_path, format="xml")

#     print(f"✅ RDF/XML written to {rdfxml_out_path}")

# # Example usage
# convert_jsonld_to_rdfxml(
#     "MODEL_PROVENANCE/RandomForest_Iris_v20250425_135900/RandomForest_Iris_v20250425_135900.jsonld",
#     "MODEL_PROVENANCE/RandomForest_Iris_v20250425_135900/RandomForest_Iris_v20250425_135900.rdf"
# )


✅ RDF/XML written to MODEL_PROVENANCE/RandomForest_Iris_v20250425_135900/RandomForest_Iris_v20250425_135900.rdf


In [28]:
# import os
# import glob
# import json

# def fetch_all_keys(json_path):
#     """Recursively fetch all JSON keys in dot notation."""
#     keys = set()

#     def _recursive_extract(obj, prefix=""):
#         if isinstance(obj, dict):
#             for k, v in obj.items():
#                 full_key = f"{prefix}.{k}" if prefix else k
#                 keys.add(full_key)
#                 _recursive_extract(v, prefix=full_key)
#         elif isinstance(obj, list):
#             for item in obj:
#                 _recursive_extract(item, prefix=prefix)

#     with open(json_path, "r", encoding="utf-8") as f:
#         data = json.load(f)
#         _recursive_extract(data)

#     return keys

# # 🔥 Fetch all keys from MODEL_PROVENANCE
# all_json_files = glob.glob("MODEL_PROVENANCE/*/*_run_summary.json")
# collected_keys = set()

# for json_file in all_json_files:
#     keys = fetch_all_keys(json_file)
#     collected_keys.update(keys)

# print(f"🔎 Found {len(collected_keys)} unique fields across {len(all_json_files)} run summaries.")

# # ✨ Auto-build the mapping: JSON key ➔ RDF property
# mapping = {}

# for key in sorted(collected_keys):
#     rdf_key = key.replace(".", "_")  # replace dot with underscore
#     mapping[key] = f"prov:{rdf_key}"

# # 📂 Save to file
# os.makedirs("mappings", exist_ok=True)
# mapping_file = os.path.join("mappings", "json_to_rdf_mapping.json")

# with open(mapping_file, "w", encoding="utf-8") as f:
#     json.dump(mapping, f, indent=2)

# print(f"✅ Mapping created and saved at: {mapping_file}")


🔎 Found 149 unique fields across 6 run summaries.
✅ Mapping created and saved at: mappings\json_to_rdf_mapping.json


In [29]:
# import json
# import glob
# import os
# from rdflib import Graph, URIRef, Literal, Namespace, RDF
# from rdflib.namespace import XSD

# # Load your dynamic JSON ➔ RDF mapping
# with open("mappings/json_to_rdf_mapping.json", "r", encoding="utf-8") as f:
#     field_mapping = json.load(f)

# prov = Namespace("http://www.w3.org/ns/prov#")

# def flatten_json(obj, parent_key=''):
#     """Flatten nested JSON with dot notation."""
#     items = []
#     if isinstance(obj, dict):
#         for k, v in obj.items():
#             new_key = f"{parent_key}.{k}" if parent_key else k
#             items.extend(flatten_json(v, new_key))
#     elif isinstance(obj, list):
#         for i, v in enumerate(obj):
#             new_key = f"{parent_key}[{i}]"
#             items.extend(flatten_json(v, new_key))
#     else:
#         items.append((parent_key, obj))
#     return items

# def create_rdf_from_json(json_path):
#     """Given a run_summary JSON file, create RDF/XML."""
#     with open(json_path, "r", encoding="utf-8") as f:
#         data = json.load(f)

#     # Flatten the JSON
#     flat_data = dict(flatten_json(data))

#     # Build RDF graph
#     g = Graph()
#     g.bind("prov", prov)

#     # Create a blank subject (could also use the run_id if you want)
#     subj = URIRef(f"urn:uuid:{data.get('run_id', 'unknown-run')}")

#     for key, value in flat_data.items():
#         if key in field_mapping:
#             pred = URIRef(field_mapping[key].replace("prov:", str(prov)))
#             if isinstance(value, (int, float)):
#                 obj = Literal(value)
#             else:
#                 obj = Literal(str(value))
#             g.add((subj, pred, obj))
#         else:
#             # Keys that don't have mapping: (skip or warn)
#             pass

#     return g

# # 🔥 Process all run summaries
# os.makedirs("rdf_exports", exist_ok=True)

# for json_file in glob.glob("MODEL_PROVENANCE/*/*_run_summary.json"):
#     model_name = os.path.basename(json_file).replace("_run_summary.json", "")

#     rdf_graph = create_rdf_from_json(json_file)

#     # Save as RDF/XML
#     out_path = os.path.join("rdf_exports", f"{model_name}.rdf")
#     rdf_graph.serialize(destination=out_path, format="xml")

#     print(f"✅ RDF/XML created: {out_path}")


✅ RDF/XML created: rdf_exports\RandomForest_Iris_v20250425_121328.rdf
✅ RDF/XML created: rdf_exports\RandomForest_Iris_v20250425_125653.rdf
✅ RDF/XML created: rdf_exports\RandomForest_Iris_v20250425_131407.rdf
✅ RDF/XML created: rdf_exports\RandomForest_Iris_v20250425_132526.rdf
✅ RDF/XML created: rdf_exports\RandomForest_Iris_v20250425_135553.rdf
✅ RDF/XML created: rdf_exports\RandomForest_Iris_v20250425_135900.rdf


RDF/XML conversion from JSON

In [3]:
import glob
import os
import json
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, XSD
from datetime import datetime, timezone

from urllib.parse import quote

# Define namespaces
PROV = Namespace("http://www.w3.org/ns/prov#")
EX = Namespace("http://example.org/mlprovenance#")


def convert_millis_to_iso(ms):
    """Convert an epoch-milliseconds value to an ISO 8601 UTC string.

    Accepts a non-negative int or float, or a string made only of digits.
    Anything else (None, negative or non-finite numbers, booleans, other
    strings) is returned unchanged.

    Args:
        ms: Candidate timestamp in milliseconds since the Unix epoch.

    Returns:
        The ISO 8601 string for convertible input, otherwise ``ms`` itself.
    """
    if isinstance(ms, str) and ms.isdigit():
        ms = int(ms)
    is_number = isinstance(ms, (int, float)) and not isinstance(ms, bool)
    # The range check rejects negatives, NaN (every comparison with NaN is
    # false) and infinity, which fromtimestamp() cannot handle.
    if is_number and 0 <= ms < float("inf"):
        return datetime.fromtimestamp(ms / 1000, tz=timezone.utc).isoformat()
    return ms


def _iri_segment(name):
    """Percent-encode ``name`` so it can be appended to a run IRI.

    Letters, digits, ``_.-~``, ``/`` and ``:`` are kept; every other
    character (spaces, quotes, angle brackets, ...) is escaped, so the
    resulting IRI stays syntactically valid.
    """
    return quote(str(name), safe="/:")


# 🔥 Fetch all run summaries
all_json_files = glob.glob("MODEL_PROVENANCE/*/*_run_summary.json")

for json_path in all_json_files:
    print(f"\n🔍 Processing: {json_path}")

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Create a graph for each run
    g = Graph()
    g.bind("prov", PROV)
    g.bind("ex", EX)

    # The run itself is the prov:Activity every other node hangs off.
    run_uri = EX[f"run_{data['run_id']}"]
    g.add((run_uri, RDF.type, PROV.Activity))

    # Start / end time; a missing or falsy value is skipped.
    if data.get("start_time"):
        safe_start_time = convert_millis_to_iso(data["start_time"])
        g.add((run_uri, PROV.startedAtTime, Literal(safe_start_time, datatype=XSD.dateTime)))

    if data.get("end_time"):
        safe_end_time = convert_millis_to_iso(data["end_time"])
        g.add((run_uri, PROV.endedAtTime, Literal(safe_end_time, datatype=XSD.dateTime)))

    # Parameters: one entity per parameter, value stored as a string literal.
    for param, value in data.get('params', {}).items():
        param_entity = URIRef(run_uri + f"/param/{_iri_segment(param)}")
        g.add((param_entity, RDF.type, PROV.Entity))
        g.add((param_entity, PROV.value, Literal(str(value))))
        g.add((run_uri, PROV.hadParameter, param_entity))

    # Metrics: stored as xsd:float when the value parses as a number,
    # otherwise as a plain string literal.
    for metric, value in data.get('metrics', {}).items():
        metric_entity = URIRef(run_uri + f"/metric/{_iri_segment(metric)}")
        g.add((metric_entity, RDF.type, PROV.Entity))
        try:
            val = float(value)
            g.add((metric_entity, PROV.value, Literal(val, datatype=XSD.float)))
        except (ValueError, TypeError):
            g.add((metric_entity, PROV.value, Literal(str(value))))
        g.add((run_uri, PROV.hadQuality, metric_entity))

    # Tags: linked to the run through prov:used.
    for tag, value in data.get('tags', {}).items():
        tag_entity = URIRef(run_uri + f"/tag/{_iri_segment(tag)}")
        g.add((tag_entity, RDF.type, PROV.Entity))
        g.add((tag_entity, PROV.value, Literal(str(value))))
        g.add((run_uri, PROV.used, tag_entity))

    # Artifacts: path separators are flattened to underscores before the
    # path is used as the last IRI segment.
    for artifact in data.get('artifacts', []):
        artifact_id = artifact.get('path', '').replace("/", "_").replace("\\", "_")
        artifact_entity = URIRef(run_uri + f"/artifact/{_iri_segment(artifact_id)}")
        g.add((artifact_entity, RDF.type, PROV.Entity))
        g.add((artifact_entity, PROV.location, Literal(artifact.get('uri', ''))))
        g.add((run_uri, PROV.generated, artifact_entity))

    # 🌟 Save RDF/XML directly into same folder as JSON
    json_dir = os.path.dirname(json_path)  # where the JSON lives
    run_id_safe = data['run_id']
    out_rdfxml = os.path.join(json_dir, f"{run_id_safe}.xml")

    g.serialize(destination=out_rdfxml, format='xml')
    print(f"✅ RDF/XML created for {run_id_safe}: {out_rdfxml}")



🔍 Processing: MODEL_PROVENANCE\RandomForest_Iris_v20250425_121328\RandomForest_Iris_v20250425_121328_run_summary.json
✅ RDF/XML created for 28f01e38b7f04d2f948fe21f57f41d0c: MODEL_PROVENANCE\RandomForest_Iris_v20250425_121328\28f01e38b7f04d2f948fe21f57f41d0c.xml

🔍 Processing: MODEL_PROVENANCE\RandomForest_Iris_v20250425_125653\RandomForest_Iris_v20250425_125653_run_summary.json
✅ RDF/XML created for 68d5dd35a5354061bf02395d2243b624: MODEL_PROVENANCE\RandomForest_Iris_v20250425_125653\68d5dd35a5354061bf02395d2243b624.xml

🔍 Processing: MODEL_PROVENANCE\RandomForest_Iris_v20250425_131407\RandomForest_Iris_v20250425_131407_run_summary.json
✅ RDF/XML created for 8f7521eaa562415d9a450f4167a127ab: MODEL_PROVENANCE\RandomForest_Iris_v20250425_131407\8f7521eaa562415d9a450f4167a127ab.xml

🔍 Processing: MODEL_PROVENANCE\RandomForest_Iris_v20250425_132526\RandomForest_Iris_v20250425_132526_run_summary.json
✅ RDF/XML created for 78e6e34ac94a460a893791a3e02f6da7: MODEL_PROVENANCE\RandomForest_Iri

In [4]:
from rdflib import Graph
from graphviz import Digraph
import glob
import os
import re
import hashlib

# Helper functions
def sanitize(text):
    """Return ``str(text)`` as a double-quoted, backslash-escaped string.

    Backslashes are doubled first, then every double quote is prefixed with
    a backslash, and the result is wrapped in double quotes.
    """
    escaped = str(text).replace("\\", "\\\\").replace('"', '\\"')
    return f'"{escaped}"'

def simple_id(text):
    """Return the MD5 hex digest of the string ``text`` encoded as UTF-8.

    Used as a graphviz node name. ``text`` must already be a string, because
    ``encode`` is called on it directly.
    """
    payload = text.encode("utf-8")
    return hashlib.new("md5", payload).hexdigest()

def short_uri(uri):
    """Return a short, human-readable label for an RDF term.

    The label is the text after the last ``#`` when the term contains one,
    otherwise the text after the last ``/``, otherwise the whole string.

    If that rule leaves nothing because the term ends in a separator (for
    example a namespace IRI or a directory path), the trailing separators are
    dropped and the rule is applied again, so such nodes are not drawn with a
    blank label.

    Args:
        uri: Any object; it is converted with ``str()`` first.

    Returns:
        The shortened label. A string that is empty or consists only of
        separators is returned unchanged.
    """
    text = str(uri)
    if "#" in text:
        label = text.rsplit("#", 1)[-1]
    elif "/" in text:
        label = text.rsplit("/", 1)[-1]
    else:
        label = text
    if label:
        return label

    # Only reached when the term ends in '#' or '/': retry without them.
    trimmed = text.rstrip("#/")
    return short_uri(trimmed) if trimmed else text

# Collect every RDF/XML export produced for the runs.
rdf_files = glob.glob('MODEL_PROVENANCE/*/*.xml')

print(f"✅ Found {len(rdf_files)} RDF/XML files to visualize.")

# Draw one PNG per RDF/XML file.
for rdf_path in rdf_files:
    print(f"🎨 Visualizing: {rdf_path}")

    g = Graph()
    g.parse(rdf_path, format='xml')

    dot = Digraph(comment=f'Graph for {os.path.basename(rdf_path)}')
    dot.attr(rankdir='LR')  # Left-to-right layout

    # Node names already declared, so each term is drawn only once.
    nodes = set()

    # One edge per triple; subject and object become nodes on first sight.
    for subj, pred, obj in g:
        subj_id, obj_id = simple_id(str(subj)), simple_id(str(obj))
        pred_label = short_uri(pred)

        for node_id, term in ((subj_id, subj), (obj_id, obj)):
            if node_id in nodes:
                continue
            dot.node(node_id, label=short_uri(term))
            nodes.add(node_id)

        dot.edge(subj_id, obj_id, label=pred_label)

    # The PNG lands next to the .xml; the suffix keeps the rendered file from
    # reusing the XML's own base name.
    output_base = os.path.splitext(rdf_path)[0]
    output_path = output_base + "RDFXML_viz"

    dot.render(output_path, format='png', cleanup=True)
    print(f"✅ PNG saved: {output_path}.png")


✅ Found 6 RDF/XML files to visualize.
🎨 Visualizing: MODEL_PROVENANCE\RandomForest_Iris_v20250425_121328\28f01e38b7f04d2f948fe21f57f41d0c.xml
✅ PNG saved: MODEL_PROVENANCE\RandomForest_Iris_v20250425_121328\28f01e38b7f04d2f948fe21f57f41d0cRDFXML_viz.png
🎨 Visualizing: MODEL_PROVENANCE\RandomForest_Iris_v20250425_125653\68d5dd35a5354061bf02395d2243b624.xml
✅ PNG saved: MODEL_PROVENANCE\RandomForest_Iris_v20250425_125653\68d5dd35a5354061bf02395d2243b624RDFXML_viz.png
🎨 Visualizing: MODEL_PROVENANCE\RandomForest_Iris_v20250425_131407\8f7521eaa562415d9a450f4167a127ab.xml
✅ PNG saved: MODEL_PROVENANCE\RandomForest_Iris_v20250425_131407\8f7521eaa562415d9a450f4167a127abRDFXML_viz.png
🎨 Visualizing: MODEL_PROVENANCE\RandomForest_Iris_v20250425_132526\78e6e34ac94a460a893791a3e02f6da7.xml
✅ PNG saved: MODEL_PROVENANCE\RandomForest_Iris_v20250425_132526\78e6e34ac94a460a893791a3e02f6da7RDFXML_viz.png
🎨 Visualizing: MODEL_PROVENANCE\RandomForest_Iris_v20250425_135553\3ec1102377b049589537b68a9494fbf