In [1]:
!pip install pyld

Collecting pyld
  Obtaining dependency information for pyld from https://files.pythonhosted.org/packages/44/cd/80760be197a4bd08e7c136ef4bcb4a2c63fc799d8d91f4c177b21183135e/PyLD-2.0.4-py3-none-any.whl.metadata
  Downloading PyLD-2.0.4-py3-none-any.whl.metadata (10.0 kB)
Collecting frozendict (from pyld)
  Obtaining dependency information for frozendict from https://files.pythonhosted.org/packages/04/13/d9839089b900fa7b479cce495d62110cddc4bd5630a04d8469916c0e79c5/frozendict-2.4.6-py311-none-any.whl.metadata
  Downloading frozendict-2.4.6-py311-none-any.whl.metadata (23 kB)
Downloading PyLD-2.0.4-py3-none-any.whl (70 kB)
   ---------------------------------------- 0.0/70.9 kB ? eta -:--:--
   ---------------------------------------- 70.9/70.9 kB 4.0 MB/s eta 0:00:00
Downloading frozendict-2.4.6-py311-none-any.whl (16 kB)
Installing collected packages: frozendict, pyld
Successfully installed frozendict-2.4.6 pyld-2.0.4




In [25]:
import json

# Step 1: Load the JSON file
with open('MODEL_PROVENANCE/RandomForest_Iris_v20250508_150811/RandomForest_Iris_v20250508_150811_run_summary.json', 'r') as f:

    run_data = json.load(f)

# Step 2: Define the extended relevant fields for each RQ
relevant_fields = {
    "RQ1.1_Data_Provenance": [
        "Internal_DBRepo_feature_names",
        "Internal_DBRepo_dropped_columns",
        "Internal_DBRepo_n_records",
        "FAIR_dataset_title",
        "FAIR_dataset_identifier",
        "FAIR_dataset_creator",
        "FAIR_dataset_license",
        "FAIR_dataset_access_url",
        "FAIR_dataset_documentation",
        "FAIR_dataset_keywords",
        "FAIR_dataset_publication_date",
        "FAIR_dataset_publisher",
        "MLSEA_dataPreprocessing"
    ],
    "RQ1.2_Model_Provenance": [
        "MLSEA_hyperparameters",
        "MLSEA_modelArchitecture",
        "MLSEA_trainingProcedure",
        "MLSEA_trainingCodeSnapshot",
        "MLSEA_evaluationMetrics",
        "ML_EXP_params",
        "ML_EXP_metrics",
        "mlflow.log-model.history",
        "ML_EXP_dataset_name",
        "ML_EXP_dataset_version",
        "ML_EXP_model_name",
        "ML_EXP_notebook_name"
    ],
    "RQ2_Metadata_Audit_Tracing": [
        "GIT_code_version",
        "GIT_current_commit_hash",
        "GIT_user",
        "GIT_user_email",
        "MLSEA_modelPath",
        "Internal_DBRepo_target_name",
        "MLSEA_performanceInterpretation",
        "ML_EXP_tags",
        "ML_EXP_artifacts"
    ],
    "RQ4_Schema_Mapping_Interoperability": [
        "PROV-O_prov_Activity",
        "PROV-O_prov_used",
        "PROV-O_prov_Entity",
        "PROV-O_prov_location",
        "PROV-O_prov_wasAssociatedWith",
        "PROV-O_prov_wasGeneratedBy",
        "FAIR4ML_target_variable",
        "FAIR4ML_ml_task",
        "FAIR4ML_serializationFormat",
        "FAIR4ML_dataset_dataset_type",
        "FAIR4ML_hasCO2eEmissions"
    ]
}

# Step 3: Group by section using both direct fields and tags
grouped_output = {}
for section, fields in relevant_fields.items():
    grouped_output[section] = {
        field: run_data.get(field) or run_data.get("ML_EXP_tags", {}).get(field, "Not available")
        for field in fields
    }

# Step 4: Save to JSON file
grouped_output_path = "grouped_run_metadata_extended.json"
with open(grouped_output_path, "w") as out:
    json.dump(grouped_output, out, indent=2)

import pprint
pprint.pprint(grouped_output)
grouped_output_path


{'RQ1.1_Data_Provenance': {'FAIR_dataset_access_url': 'https://archive.ics.uci.edu/dataset/53',
                           'FAIR_dataset_creator': 'R. A. Fisher',
                           'FAIR_dataset_documentation': 'https://archive.ics.uci.edu/dataset/53',
                           'FAIR_dataset_identifier': '10.24432/C56C76',
                           'FAIR_dataset_keywords': 'info not available',
                           'FAIR_dataset_license': '[]',
                           'FAIR_dataset_publication_date': '1936',
                           'FAIR_dataset_publisher': 'UCI Machine Learning '
                                                     'Repository',
                           'FAIR_dataset_title': 'Iris',
                           'Internal_DBRepo_dropped_columns': 'Not available',
                           'Internal_DBRepo_feature_names': 'Not available',
                           'Internal_DBRepo_n_records': 'Not available',
                           'MLSEA_da

'grouped_run_metadata_extended.json'

✅ Step 1: Create a PROV-O-aligned JSON-LD
Here's a Python script using rdflib to convert selected metadata into JSON-LD format under PROV-O terms.

In [26]:
from rdflib import Graph, Namespace, URIRef, BNode, Literal
from rdflib.namespace import RDF, DCTERMS, FOAF, XSD
import json

def export_full_provenance_rdf(grouped_metadata_path, output_basename="full_provenance"):
    with open(grouped_metadata_path, "r") as f:
        grouped = json.load(f)

    g = Graph()
    PROV = Namespace("http://www.w3.org/ns/prov#")
    SCHEMA = Namespace("http://schema.org/")
    MLS = Namespace("http://www.w3.org/ns/mls#")
    EX = Namespace("http://example.org/")
    g.bind("prov", PROV)
    g.bind("schema", SCHEMA)
    g.bind("mls", MLS)
    g.bind("dcterms", DCTERMS)
    g.bind("foaf", FOAF)
    g.bind("ex", EX)

    rq1 = grouped["RQ1.1_Data_Provenance"]
    rq2 = grouped["RQ2_Metadata_Audit_Tracing"]
    rq3 = grouped["RQ1.2_Model_Provenance"]
    rq4 = grouped["RQ4_Schema_Mapping_Interoperability"]

    model_name = rq3.get("ML_EXP_model_name", "unknown_model")
    dataset_uri = URIRef(EX[f"{model_name}_dataset"])
    activity_uri = URIRef(EX[f"{model_name}_training"])
    agent_uri = URIRef(EX["Reema_George_Dass"])

    def safe_literal(value):
        if value and value != "Not available":
            return Literal(value)
        return None

    def add_dict_as_nodes(parent_uri, predicate, data_dict):
        for k, v in data_dict.items():
            if v and v != "Not available":
                node = BNode()
                g.add((parent_uri, predicate, node))
                g.add((node, SCHEMA.name, Literal(k)))
                g.add((node, SCHEMA.value, Literal(str(v))))

    # Dataset
    g.add((dataset_uri, RDF.type, PROV.Entity))
    for field in [
        ("FAIR_dataset_title", DCTERMS.title),
        ("FAIR_dataset_identifier", DCTERMS.identifier),
        ("FAIR_dataset_creator", DCTERMS.creator),
        ("FAIR_dataset_license", DCTERMS.license),
        ("FAIR_dataset_documentation", DCTERMS.description),
        ("FAIR_dataset_access_url", SCHEMA.url),
        ("FAIR_dataset_keywords", SCHEMA.keywords),
        ("FAIR_dataset_publication_date", DCTERMS.issued),
        ("FAIR_dataset_publisher", DCTERMS.publisher),
    ]:
        val = safe_literal(rq1.get(field[0]))
        if val:
            g.add((dataset_uri, field[1], val))
    g.add((dataset_uri, PROV.wasGeneratedBy, activity_uri))
    g.add((dataset_uri, PROV.wasAttributedTo, agent_uri))

    # Agent
    g.add((agent_uri, RDF.type, PROV.Agent))
    g.add((agent_uri, FOAF.name, safe_literal(rq2.get("GIT_user", "Unknown"))))
    g.add((agent_uri, FOAF.mbox, safe_literal(rq2.get("GIT_user_email", ""))))

    # Activity
    g.add((activity_uri, RDF.type, PROV.Activity))
    g.add((activity_uri, PROV.wasAssociatedWith, agent_uri))
    if rq4.get("PROV-O_prov_used"):
        g.add((activity_uri, PROV.used, URIRef(rq4["PROV-O_prov_used"])))
    if rq4.get("FAIR4ML_ml_task"):
        g.add((activity_uri, MLS.taskType, Literal(rq4["FAIR4ML_ml_task"])))
    if rq4.get("PROV-O_prov_startedAtTime") and rq4["PROV-O_prov_startedAtTime"] != "info not available":
        g.add((activity_uri, PROV.startedAtTime, Literal(rq4["PROV-O_prov_startedAtTime"], datatype=XSD.dateTime)))
    if rq4.get("PROV-O_prov_endedAtTime"):
        g.add((activity_uri, PROV.endedAtTime, Literal(rq4["PROV-O_prov_endedAtTime"], datatype=XSD.dateTime)))
    if rq4.get("PROV-O_prov_location"):
        g.add((activity_uri, PROV.atLocation, URIRef(rq4["PROV-O_prov_location"])))
    if rq2.get("GIT_current_commit_hash"):
        g.add((activity_uri, PROV.value, Literal(f"Git commit: {rq2['GIT_current_commit_hash']}")))

    # Model architecture, training procedure, snapshot
    for field, pred in [
        ("MLSEA_modelArchitecture", MLS.modelArchitecture),
        ("MLSEA_trainingProcedure", MLS.trainingProcedure),
        ("MLSEA_trainingCodeSnapshot", SCHEMA.codeRepository)
    ]:
        val = safe_literal(rq3.get(field))
        if val:
            g.add((activity_uri, pred, val))

    # Evaluation Metrics
    try:
        metrics = rq3.get("ML_EXP_metrics", {})
        if isinstance(metrics, str):
            metrics = json.loads(metrics)
        add_dict_as_nodes(activity_uri, MLS.hasEvaluationMeasure, metrics)
    except Exception:
        pass

    # Hyperparameters
    try:
        params = rq3.get("ML_EXP_params", {})
        if isinstance(params, str):
            params = json.loads(params)
        add_dict_as_nodes(activity_uri, MLS.hasHyperParameter, params)
    except Exception:
        pass

    # Data preprocessing steps
    try:
        preprocessing = rq1.get("MLSEA_dataPreprocessing", {})
        if isinstance(preprocessing, str):
            preprocessing = json.loads(preprocessing)
        add_dict_as_nodes(activity_uri, MLS.dataPreparation, preprocessing)
    except Exception:
        pass

    # Justifications
    for k, v in rq2.get("ML_EXP_tags", {}).items():
        if k.startswith("justification_") or k.startswith("MLSEA_justification"):
            node = BNode()
            g.add((activity_uri, PROV.wasInfluencedBy, node))
            g.add((node, SCHEMA.name, Literal(k)))
            g.add((node, SCHEMA.description, Literal(v)))

    # wasDerivedFrom if retrained
    if "MLSEA_improvedFrom" in rq2.get("ML_EXP_tags", {}):
        previous = rq2["ML_EXP_tags"]["MLSEA_improvedFrom"]
        if previous and previous != "None":
            g.add((activity_uri, PROV.wasDerivedFrom, URIRef(EX[previous])))

    # Output files
    jsonld_path = f"Export_data/{output_basename}.jsonld"
    rdfxml_path = f"Export_data/{output_basename}.rdf"
    g.serialize(destination=jsonld_path, format="json-ld", indent=2)
    g.serialize(destination=rdfxml_path, format="xml")
    return jsonld_path, rdfxml_path

# Execute with all data
export_full_provenance_rdf("grouped_run_metadata_extended.json")


('Export_data/full_provenance.jsonld', 'Export_data/full_provenance.rdf')

In [29]:
!pip install --upgrade jinja2
!pip uninstall pyvis -y
!pip install pyvis==0.3.1







Found existing installation: pyvis 0.3.2
Uninstalling pyvis-0.3.2:
  Successfully uninstalled pyvis-0.3.2
Collecting pyvis==0.3.1
  Downloading pyvis-0.3.1.tar.gz (748 kB)
     ---------------------------------------- 0.0/748.9 kB ? eta -:--:--
      --------------------------------------- 10.2/748.9 kB ? eta -:--:--
     -------- ----------------------------- 174.1/748.9 kB 2.1 MB/s eta 0:00:01
     -------------------------------------- 748.9/748.9 kB 6.8 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'


  error: subprocess-exited-with-error
  
  python setup.py egg_info did not run successfully.
  exit code: 1
  
  [1 lines of output]
  ERROR: Can not execute `setup.py` since setuptools is not available in the build environment.
  [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
error: metadata-generation-failed

Encountered error while generating package metadata.

See above for output.

note: This is an issue with the package mentioned above, not pip.
hint: See above for details.


In [30]:
!pip install --upgrade setuptools wheel


Collecting setuptools
  Obtaining dependency information for setuptools from https://files.pythonhosted.org/packages/b1/93/dba5ed08c2e31ec7cdc2ce75705a484ef0be1a2fecac8a58272489349de8/setuptools-80.4.0-py3-none-any.whl.metadata
  Downloading setuptools-80.4.0-py3-none-any.whl.metadata (6.5 kB)
Collecting wheel
  Obtaining dependency information for wheel from https://files.pythonhosted.org/packages/0b/2c/87f3254fd8ffd29e4c02732eee68a83a1d3c346ae39bc6822dcbcb697f2b/wheel-0.45.1-py3-none-any.whl.metadata
  Downloading wheel-0.45.1-py3-none-any.whl.metadata (2.3 kB)
Downloading setuptools-80.4.0-py3-none-any.whl (1.2 MB)
   ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
   ---------------- ----------------------- 0.5/1.2 MB 7.9 MB/s eta 0:00:01
   ---------------------------------------  1.2/1.2 MB 10.9 MB/s eta 0:00:01
   ---------------------------------------- 1.2/1.2 MB 10.9 MB/s eta 0:00:00
Downlo

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
conda-repo-cli 1.0.75 requires requests_mock, which is not installed.
conda-repo-cli 1.0.75 requires clyent==1.2.1, but you have clyent 1.2.2 which is incompatible.
conda-repo-cli 1.0.75 requires python-dateutil==2.8.2, but you have python-dateutil 2.9.0.post0 which is incompatible.
conda-repo-cli 1.0.75 requires PyYAML==6.0.1, but you have pyyaml 6.0 which is incompatible.
mlprovlab 0.1.0 requires jupyterlab~=3.0, but you have jupyterlab 4.3.2 which is incompatible.


In [35]:
!pip install pyvis.network

Collecting pyvis.network




  Obtaining dependency information for pyvis.network from https://files.pythonhosted.org/packages/3e/b7/9ed53162e01d69ba5b3465f896c10e41e98ff71b2ee016b28a5e654e120e/pyvis_network-0.0.6-py3-none-any.whl.metadata
  Downloading pyvis_network-0.0.6-py3-none-any.whl.metadata (2.5 kB)
Downloading pyvis_network-0.0.6-py3-none-any.whl (49 kB)
   ---------------------------------------- 0.0/49.6 kB ? eta -:--:--
   --------------------------------- ------ 41.0/49.6 kB 991.0 kB/s eta 0:00:01
   ---------------------------------------- 49.6/49.6 kB 836.6 kB/s eta 0:00:00
Installing collected packages: pyvis.network
Successfully installed pyvis.network-0.0.6


In [None]:
!pip uninstall pyvis
!pip install setuptools==67.7.2  # ensure proper setuptools
!pip install pyvis==0.3.1


In [12]:
import json
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, XSD, DCTERMS, FOAF

# Load the original metadata file
with open('MODEL_PROVENANCE/RandomForest_Iris_v20250508_153648/RandomForest_Iris_v20250508_153648_run_summary.json', 'r') as f:
    run_data = json.load(f)

tags = run_data.get("ML_EXP_tags", {})

# Setup RDF graph
g = Graph()
PROV = Namespace("http://www.w3.org/ns/prov#")
EX = Namespace("http://example.org/")
g.bind("prov", PROV)
g.bind("ex", EX)
g.bind("dcterms", DCTERMS)
g.bind("foaf", FOAF)

# Core URIs
model_name = run_data.get("ML_EXP_model_name", "unknown_model")
activity_uri = URIRef(EX[f"{model_name}_training"])
dataset_uri = URIRef(EX[f"{model_name}_dataset"])
agent_uri = URIRef(EX["Reema_George_Dass"])

# Entity: Dataset
g.add((dataset_uri, RDF.type, PROV.Entity))
g.add((dataset_uri, DCTERMS.title, Literal(tags.get("FAIR_dataset_title", "Unknown Dataset"))))
g.add((dataset_uri, DCTERMS.identifier, Literal(tags.get("FAIR_dataset_identifier", ""))))
g.add((dataset_uri, DCTERMS.creator, Literal(tags.get("FAIR_dataset_creator", ""))))
g.add((dataset_uri, DCTERMS.license, Literal(str(tags.get("FAIR_dataset_license", "")))))
g.add((dataset_uri, DCTERMS.description, Literal(str(tags.get("FAIR_dataset_documentation", "")))))
g.add((dataset_uri, DCTERMS.source, URIRef(tags.get("FAIR_dataset_access_url", "http://example.org/source"))))
g.add((dataset_uri, PROV.wasGeneratedBy, activity_uri))
g.add((dataset_uri, PROV.wasAttributedTo, agent_uri))

# Agent
g.add((agent_uri, RDF.type, PROV.Agent))
g.add((agent_uri, FOAF.name, Literal(run_data.get("GIT_user", "Unknown Agent"))))

# Activity
g.add((activity_uri, RDF.type, PROV.Activity))
g.add((activity_uri, PROV.wasAssociatedWith, agent_uri))

# Optional: dataset used
if tags.get("PROV-O_prov_used"):
    g.add((activity_uri, PROV.used, URIRef(tags["PROV-O_prov_used"])))

# Optional: location of the dataset (prov:location)
if tags.get("PROV-O_prov_location"):
    g.add((dataset_uri, PROV.atLocation, URIRef(tags["PROV-O_prov_location"])))

# Optional: start and end times
if tags.get("PROV_startedAtTime"):
    g.add((activity_uri, PROV.startedAtTime, Literal(tags["PROV_startedAtTime"], datatype=XSD.dateTime)))
if tags.get("PROV-O_prov_endedAtTime"):
    g.add((activity_uri, PROV.endedAtTime, Literal(tags["PROV-O_prov_endedAtTime"], datatype=XSD.dateTime)))

# Serialize as JSON-LD
with open("prov_metadata.jsonld", "w") as f:
    f.write(g.serialize(format="json-ld", indent=2))

# And RDF/XML
with open("prov_metadata.rdf", "w") as f:
    f.write(g.serialize(format="xml"))

print("✅ Now enriched JSON-LD and RDF/XML include actual values and full provenance.")


✅ Now enriched JSON-LD and RDF/XML include actual values and full provenance.


✅ Step 2: Create RDF/XML (PROV-O)
Same structure, now exporting as RDF/XML:

In [7]:

# Serialize to RDF/XML
rdf_output = g.serialize(format="xml")
with open("prov_metadata.rdf", "w") as f:
    f.write(rdf_output)

print("✅ RDF/XML file generated: prov_metadata.rdf")


✅ RDF/XML file generated: prov_metadata.rdf
