########################################################
# EXPERIMENT CODE
########################################################

In [1]:
import os
import getpass
import platform
import sys
import uuid
from datetime import datetime

def prompt_if_none(env_key, prompt_text, default_value="unknown"):
    """Return a setting from the environment, asking the user when it is missing.

    Args:
        env_key: Name of the environment variable consulted first.
        prompt_text: Question shown to the user when the variable is unset
            or empty.
        default_value: Value used when the user enters nothing, or when
            prompting raises any exception.

    Returns:
        The non-empty environment value if present; otherwise the stripped
        user input; otherwise ``default_value``.
    """
    from_env = os.getenv(env_key)
    if from_env:
        return from_env

    try:
        answer = input(f"{prompt_text} (default: {default_value}): ").strip()
    except Exception:
        # Prompting failed (e.g. no usable stdin): fall back silently.
        return default_value

    return answer if answer else default_value

def collect_session_metadata(
    prompt_fields=True,
    fixed_role=None,
    fixed_project_id=None
):
    """Collect, print and return metadata describing the current session.

    Args:
        prompt_fields: When true, ``role`` and ``project_id`` are resolved via
            ``prompt_if_none`` (environment variable first, then an
            interactive prompt). When false, only the environment variables
            ``RESEARCHER_ROLE`` / ``PROJECT_ID`` are read, with fixed defaults.
        fixed_role: If truthy, used as the role and no lookup/prompt happens.
        fixed_project_id: If truthy, used as the project ID and no
            lookup/prompt happens.

    Returns:
        dict with the keys ``session_id``, ``username``, ``timestamp_utc``,
        ``hostname``, ``platform``, ``os_version``, ``python_version``,
        ``role`` and ``project_id``.
    """
    # Function-scope import: the enclosing cell only imports `datetime`.
    from datetime import timezone

    session_id = str(uuid.uuid4())

    # Resolve the user lazily. Passing getpass.getuser() as the os.getenv
    # default would evaluate it even when JUPYTERHUB_USER is set, and it
    # raises when the OS cannot determine a login name.
    username = os.getenv("JUPYTERHUB_USER")
    if username is None:
        try:
            username = getpass.getuser()
        except (KeyError, OSError, ImportError):
            username = "unknown"

    # datetime.utcnow() is deprecated; take an aware UTC "now" and drop the
    # tzinfo so the serialized value keeps the same naive-ISO format as before.
    timestamp_utc = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    session_metadata = {
        "session_id": session_id,
        "username": username,
        "timestamp_utc": timestamp_utc,
        "hostname": platform.node(),
        "platform": platform.system(),
        "os_version": platform.version(),
        "python_version": sys.version.split()[0],
    }

    # Prompt or use defaults
    session_metadata["role"] = fixed_role or (
        prompt_if_none("RESEARCHER_ROLE", "Enter your role", "collaborator") if prompt_fields
        else os.getenv("RESEARCHER_ROLE", "researcher")
    )
    session_metadata["project_id"] = fixed_project_id or (
        prompt_if_none("PROJECT_ID", "Enter project ID", "default_project") if prompt_fields
        else os.getenv("PROJECT_ID", "default_project")
    )

    print("\n📌 Session Metadata:")
    for k, v in session_metadata.items():
        print(f"  {k}: {v}")

    return session_metadata


LIBRARY IMPORTS:

In [3]:
# ============================
# 📦 Standard Library Imports
# ============================
import os
import glob
import io
import json
import time
import ast
import pickle
import platform
import subprocess
from datetime import datetime, timezone
from pprint import pprint
from typing import List, Dict, Any
import xml.etree.ElementTree as ET
import urllib.parse
import yaml

# ============================
# 📊 Data and Visualization
# ============================
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

# ============================
# 🤖 Machine Learning
# ============================
import sklearn
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    RocCurveDisplay,
    PrecisionRecallDisplay
)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
# ============================
# 🔬 Experiment Tracking
# ============================
import mlflow
import mlflow.sklearn
from mlflow import MlflowClient

# ============================
# 🌐 Web / API / Networking
# ============================
import requests
from dotenv import load_dotenv

# ============================
# 🧪 Git & Version Control
# ============================
import git
from git import Repo, GitCommandError
import hashlib


# ============================
# 🧠 SHAP for Explainability
# ============================
import shap

# ============================
# 🧬 RDF & Provenance (rdflib)
# ============================
from rdflib import Graph, URIRef, Literal
from rdflib.namespace import PROV, XSD

# ============================
# ⚙️ System Monitoring
# ============================
import psutil


#Dataset metadata!!!

#Metadata from ZENODO

In [5]:
import requests

def extract_dataset_metadata_from_doi(doi: str, timeout: float = 30.0) -> dict:
    """Fetch dataset metadata for a DOI from the DataCite REST API.

    Args:
        doi: The DOI to look up (lower-cased for the request URL).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A flat dict of dataset descriptors plus PROV-O style traceability
        fields. Fields missing from the response are set to
        ``"info not available"``.

    Raises:
        requests.exceptions.RequestException: On connection errors, timeouts
            or a non-2xx response (via ``raise_for_status``).
    """
    not_available = "info not available"

    base_url = f"https://api.datacite.org/dois/{doi.lower()}"
    # Without a timeout a stalled connection would block the notebook forever.
    r = requests.get(base_url, timeout=timeout)
    r.raise_for_status()
    # `or {}` also covers keys that are present but null.
    meta = (r.json().get("data") or {}).get("attributes") or {}

    # Extract fields. `titles` may be present but empty, so do not index
    # into it before substituting a placeholder entry.
    titles = meta.get("titles") or [{}]
    title = titles[0].get("title", not_available)
    creators = [c.get("name", "") for c in (meta.get("creators") or [])]
    creator_names = ", ".join(creators) if creators else not_available
    publisher = meta.get("publisher", not_available)
    pub_year = meta.get("publicationYear", not_available)
    url = meta.get("url", f"https://doi.org/{doi}")
    resource_type = (meta.get("types") or {}).get("resourceTypeGeneral", not_available)

    dataset_metadata = {
        "dataset_id": doi,
        "dataset_title": title,
        "dataset_description": meta.get("descriptions", not_available),
        "dataset_creator": creator_names,
        "dataset_publisher": publisher,
        "dataset_publication_date": pub_year,
        "dataset_version": meta.get("version", not_available),
        "dataset_license": meta.get("rightsList", not_available),
        "dataset_keywords": not_available,  # not always exposed
        "dataset_access_url": url,
        "dataset_documentation": url,
        "metadata_standard": resource_type,
        "related_resources": url,

        # PROV-O traceability fields
        "prov_entity": title,
        "prov_activity": "Ingestion and Publication",
        "prov_agent_dataset_creator": creator_names,
        "prov_used": url,
        "prov_wasDerivedFrom": doi,
        "prov_wasAttributedTo": creator_names,
        "prov_startedAtTime": pub_year,
        "prov_role_dataset_creator": "Original Data Author",
        "prov_role_database_creator": "Database Ingestor and Maintainer"
    }

    return dataset_metadata


In [6]:
# extract_dataset_metadata_from_doi("10.24432/C56C76") #dataset related metadata logging 


In [8]:
import requests

# URL templates for the local database REST API; filled in via str.format().
DB_API = "http://localhost/api/database/{db_id}"
HISTORY_API = "http://localhost/api/database/{db_id}/table/{table_id}/history"

def fetch_db_dataset_metadata(
    db_id: str,
    table_id: str,
    selected_version: str,
    target_variable: str,
    num_samples: int,
    timeout: float = 30.0
) -> dict:
    """Build a flat dataset-metadata record from the database REST API.

    Two endpoints are queried: the database description (``DB_API``) and the
    table history (``HISTORY_API``). The history call is optional: a non-200
    response simply leaves the timestamp as ``"info not available"``.

    Args:
        db_id: Identifier of the database.
        table_id: Identifier of the table holding the selected dataset version.
        selected_version: Version tag stored as ``dataset_version``.
        target_variable: Name of the label column, stored as-is.
        num_samples: Number of rows, stored as-is.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        The metadata dict, or an empty dict when a request fails or a
        response body is not valid JSON (the error is printed).
    """
    not_available = "info not available"
    try:
        # Fetch main DB metadata
        db_url = DB_API.format(db_id=db_id)
        db_response = requests.get(db_url, timeout=timeout)
        db_response.raise_for_status()
        db_data = db_response.json()
        print(db_data)

        # Fetch table history metadata
        history_url = HISTORY_API.format(db_id=db_id, table_id=table_id)
        history_response = requests.get(history_url, timeout=timeout)
        timestamp = not_available
        if history_response.status_code == 200:
            history_data = history_response.json()
            print(history_data)
            if isinstance(history_data, list) and len(history_data) > 0:
                timestamp = history_data[0].get("timestamp", timestamp)

        # `or {}` / `or []` also cover fields that are present but null,
        # which would otherwise raise outside the handled exception types.
        owner_name = (db_data.get("owner") or {}).get("name", not_available)
        tables = db_data.get("tables") or []
        dataset_name = next(
            (t.get("name") for t in tables if t.get("id") == table_id),
            "table name not available"
        )

        # Build flat metadata structure for DB storage
        dataset_metadata = {
            # Basic identity
            "dataset_id": table_id,
            "dataset_name": dataset_name,
            "dataset_version": selected_version,
            "dataset_title": db_data.get("name", not_available),
            "dataset_description": db_data.get("description", not_available),

            # Ownership and access
            "dataset_creator": not_available,
            "dataset_publisher": owner_name,
            "dataset_access_url": db_url,
            "dataset_publication_date": timestamp,
            "dataset_license": not_available,

            # Structure
            "columns": db_data.get("columns", not_available),
            "dataset_dataset_type": "tabular",
            "target_variable": target_variable,
            "ml_task": "classification",
            "num_samples": num_samples,

            # FAIR4ML placeholders
            "data_distribution": not_available,
            "known_issues": not_available,
            "trainedOn": not_available,
            "testedOn": not_available,
            "validatedOn": not_available,
            "modelRisks": not_available,
            "usageInstructions": not_available,
            "ethicalLegalSocial": not_available,

            # PROV-style fields
            "prov_entity": db_data.get("name", not_available),
            "prov_activity": "Ingestion and Publication",
            "prov_agent_dataset_creator": not_available,
            "prov_agent_database_creator": owner_name,
            "prov_wasGeneratedBy": owner_name,
            "prov_used": db_url,
            "prov_wasDerivedFrom": not_available,
            "prov_wasAttributedTo": not_available,
            "prov_wasAssociatedWith": owner_name,
            "prov_startedAtTime": not_available,
            "prov_endedAtTime": timestamp,
            "prov_location": db_url,
            "prov_role_dataset_creator": "",
            "prov_role_database_creator": "Database Ingestor and Maintainer"
        }

        return dataset_metadata

    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decode failures on requests versions whose
        # decode error is not a RequestException subclass.
        print(f"[⚠️ Error] Failed to fetch DB metadata for {db_id}: {e}")
        return {}


Select the dataset version and table ID needed to fetch metadata:

In [14]:
# Mapping of version tags to table UUIDs
# Mapping of version tags to table UUIDs.
# The keys are the only inputs accepted by select_dataset_version(); the
# trailing comments name the variant of the dataset each table holds.
version_to_table_id = {
    "v0": "519eb3fc-687c-4791-aa13-96d5bee8cbad",  # Original
    "v1": "3fd0f36e-572e-4f99-841b-a8381a052a97",  # Duplicated
    "v2": "2a8083fa-8270-49c1-80ea-86ce6bf39977",  # First 100
    "v3": "14cc6f38-b5c6-4225-83ce-3dc92b7c045a",  # Shuffled
    "v4": "3cb219b2-8cc6-4698-b69f-213deacc763c"   # Normalized
}

# Static DB ID: every table above is addressed under this one database.
db_id = "4bd4ddc7-378c-4ffa-8bdb-0bf8969c80a1"  # Static DB ID

def select_dataset_version():
    """Ask the user which dataset version to use and resolve its table ID.

    The answer is stripped and lower-cased before it is looked up in
    ``version_to_table_id``.

    Returns:
        tuple: ``(selected_version, selected_table_id)``.

    Raises:
        ValueError: If the entered version is not a key of
            ``version_to_table_id``.
    """
    menu_lines = (
        "Select dataset version:",
        "  v0 - Original",
        "  v1 - Duplicated",
        "  v2 - First 100",
        "  v3 - Shuffled",
        "  v4 - Normalized",
    )
    for menu_line in menu_lines:
        print(menu_line)

    choice = input("Enter version (v0–v4): ").strip().lower()

    table_id = version_to_table_id.get(choice)
    if table_id is None:
        raise ValueError(f"❌ Invalid version selected: {choice}")

    print(f"\n✅ You selected version '{choice}' → Table ID: {table_id}\n")

    return choice, table_id

# Usage: #TODO CALL
selected_version, selected_table_id = select_dataset_version()


Select dataset version:
  v0 - Original
  v1 - Duplicated
  v2 - First 100
  v3 - Shuffled
  v4 - Normalized


Enter version (v0–v4):  v4



✅ You selected version 'v4' → Table ID: 3cb219b2-8cc6-4698-b69f-213deacc763c



In [10]:
import os
import json
import mlflow
# 
def log_metadata_dict_to_mlflow(metadata: dict, prefix: str = "", snapshot_name: str = "metadata_snapshot.json"):
    """
    Logs a flat metadata dictionary to MLflow:
    - Adds prefix to each key if provided (e.g., "session_")
    - Skips values that are None or an empty string
    - Logs a full JSON artifact for traceability

    Args:
        metadata: Mapping of metadata keys to values. dict/list values are
            serialized as JSON for the tag; everything else goes through str().
        prefix: Optional string prepended to every tag key.
        snapshot_name: File name of the JSON snapshot written under
            ``metadata/`` and uploaded to the run's ``metadata`` artifact path.

    Raises:
        RuntimeError: If a tag has to be written while no MLflow run is active.
    """
    max_tag_value_len = 5000
    truncation_marker = "...[TRUNCATED]"

    def safe_tag(key, value):
        if not mlflow.active_run():
            raise RuntimeError("❌ No active MLflow run.")

        key_clean = key.replace(":", "_").replace("/", "_").replace(" ", "_")
        try:
            if isinstance(value, (dict, list)):
                # default=str: best-effort logging of values json cannot encode
                # natively (e.g. datetimes, NumPy scalars) instead of dropping the tag.
                val_str = json.dumps(value, default=str)
            else:
                val_str = str(value)
            if len(val_str) > max_tag_value_len:
                # Leave room for the marker so the final value still fits the cap;
                # slicing to the full cap and then appending would overshoot it.
                keep = max_tag_value_len - len(truncation_marker)
                val_str = val_str[:keep] + truncation_marker
            if len(key_clean) > 255:
                print(f"⚠️ Skipped tag (key too long): {key_clean}")
                return
            mlflow.set_tag(key_clean, val_str)
            print(f"✅ Logged tag: {key_clean}")
        except Exception as e:
            print(f"[⚠️ Error logging tag] {key_clean}: {e}")

    for key, value in metadata.items():
        # Explicit checks instead of `value in [None, ""]`: the membership test
        # compares with ==, which raises for array-like values.
        if value is None or (isinstance(value, str) and value == ""):
            continue
        full_key = f"{prefix}{key}" if prefix else key
        safe_tag(full_key, value)

    # Save full metadata snapshot as JSON artifact
    os.makedirs("metadata", exist_ok=True)
    full_path = os.path.join("metadata", snapshot_name)
    with open(full_path, "w", encoding="utf-8") as f:
        # default=str keeps one non-JSON value from aborting the snapshot
        # after the tags above have already been written.
        json.dump(metadata, f, indent=2, default=str)

    mlflow.log_artifact(full_path, artifact_path="metadata")
    print(f"📁 Full metadata snapshot logged as: {snapshot_name}")


DBREPO INTEGRATION: API call to fetch the dataset for training

In [15]:
# API endpoint URL
# API endpoint URL (first page, up to 100000 rows of the selected table)
API_URL = f"http://localhost/api/database/{db_id}/table/{selected_table_id}/data?size=100000&page=0"

# Define the headers
headers = {
    "Accept": "application/json"  # Specify the expected response format
}

try:
    # Send a GET request to the API with the Accept header.
    # The timeout keeps the cell from hanging forever if the service stalls;
    # a timeout surfaces as a RequestException and is reported below.
    response = requests.get(API_URL, headers=headers, timeout=60)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the JSON response
        dataset = response.json()

        print( dataset)
    else:
        print(f"Error: Received status code {response.status_code}")
        print("Response content:", response.text)

except requests.exceptions.RequestException as e:
    print(f"Request failed: {e}")


[{'id': '1', 'sepallengthcm': '0.222222222222222100', 'sepalwidthcm': '0.625000000000000000', 'petallengthcm': '0.067796610169491510', 'petalwidthcm': '0.041666666666666670', 'species': 'Iris-setosa'}, {'id': '2', 'sepallengthcm': '0.166666666666666740', 'sepalwidthcm': '0.416666666666666740', 'petallengthcm': '0.067796610169491510', 'petalwidthcm': '0.041666666666666670', 'species': 'Iris-setosa'}, {'id': '3', 'sepallengthcm': '0.111111111111111160', 'sepalwidthcm': '0.500000000000000000', 'petallengthcm': '0.050847457627118650', 'petalwidthcm': '0.041666666666666670', 'species': 'Iris-setosa'}, {'id': '4', 'sepallengthcm': '0.083333333333333260', 'sepalwidthcm': '0.458333333333333260', 'petallengthcm': '0.084745762711864400', 'petalwidthcm': '0.041666666666666670', 'species': 'Iris-setosa'}, {'id': '5', 'sepallengthcm': '0.194444444444444420', 'sepalwidthcm': '0.666666666666666700', 'petallengthcm': '0.067796610169491510', 'petalwidthcm': '0.041666666666666670', 'species': 'Iris-seto

Backup: replaces the dynamic data fetch when DBRepo isn't running

In [12]:
# # 1. Read the JSON file if the API isn't available; this data is saved locally but originates from the API endpoint
# with open("iris_data.json", "r") as f:
#     dataset = json.load(f)


FileNotFoundError: [Errno 2] No such file or directory: 'iris_data.json'

# ============================
# 📂 Setup MLflow
# ============================

In [13]:
import os
import mlflow

# Ensure tracking directory exists
# Ensure tracking directory exists
project_dir = os.getcwd()
mlrunlogs_dir = os.path.join(project_dir, "mlrunlogs")
os.makedirs(mlrunlogs_dir, exist_ok=True)

# Set MLflow tracking URI (local SQLite backend).
# A URI without the "sqlite:///" scheme makes MLflow treat the path as a
# file-store *directory* (named mlflow.db), not as a SQLite database file.
mlflow_tracking_path = os.path.join(mlrunlogs_dir, "mlflow.db")
if os.path.isdir(mlflow_tracking_path):
    # A file store created by an earlier run already lives at this path;
    # keep using it so existing run history stays reachable.
    mlflow.set_tracking_uri("mlrunlogs/mlflow.db")
else:
    # Forward slashes keep the URI valid on Windows as well.
    # NOTE(review): with a database backend, artifacts go to MLflow's default
    # artifact root rather than under mlrunlogs/ — confirm this is acceptable.
    mlflow.set_tracking_uri("sqlite:///" + mlflow_tracking_path.replace(os.sep, "/"))

# Prompt for experiment name
experiment_name = input("Enter experiment name for MLflow: ").strip()
if not experiment_name:
    experiment_name = "default_experiment"
    print("⚠️ No name entered. Using fallback:", experiment_name)

mlflow.set_experiment(experiment_name)


Enter experiment name for MLflow:  efrgtr


2025/05/18 16:59:10 INFO mlflow.tracking.fluent: Experiment with name 'efrgtr' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///C:/Users/reema/REPO/notebooks/RQ_notebooks/mlrunlogs/mlflow.db/225271718569336729', creation_time=1747580350310, experiment_id='225271718569336729', last_update_time=1747580350310, lifecycle_stage='active', name='efrgtr', tags={}>

# ============================
# 🔄 Git Commit Hash for previous commit for metadata
# ============================

In [14]:
import git
import os

def get_latest_git_commit(repo_path: str = "C:/Users/reema/REPO") -> dict:
    """Describe the commit currently checked out in a Git repository.

    Args:
        repo_path: Path of the repository to inspect.

    Returns:
        dict with the keys ``git_commit``, ``git_author``, ``git_email``,
        ``git_commit_time``, ``git_message`` and ``git_branch``. The branch is
        ``"detached"`` when HEAD is detached. If anything goes wrong while
        reading the repository, the error is printed and every field is
        ``"not available"``.
    """
    field_names = (
        "git_commit",
        "git_author",
        "git_email",
        "git_commit_time",
        "git_message",
        "git_branch",
    )
    try:
        repository = git.Repo(repo_path)
        head_commit = repository.head.commit

        details = {}
        details["git_commit"] = head_commit.hexsha
        details["git_author"] = head_commit.author.name
        details["git_email"] = head_commit.author.email
        details["git_commit_time"] = str(head_commit.committed_datetime)
        details["git_message"] = head_commit.message.strip()
        if repository.head.is_detached:
            details["git_branch"] = "detached"
        else:
            details["git_branch"] = repository.active_branch.name
        return details

    except Exception as error:
        print(f"[⚠️ Git Error] Could not read Git repo at {repo_path}: {error}")
        # Same keys, in the same order, all marked as unavailable.
        return dict.fromkeys(field_names, "not available")

# Usage
repo_dir = "C:/Users/reema/REPO"
git_metadata = get_latest_git_commit(repo_dir)


# ============================
# Make threadpoolctl safe so MLflow’s autologger won’t crash ───
# ============================

In [15]:
# ─── Patch threadpoolctl if needed to avoid autolog crashes ───
# ─── Patch threadpoolctl if needed to avoid autolog crashes ───
try:
    import threadpoolctl
except ImportError:
    # threadpoolctl isn't installed, so there is nothing to patch
    pass
else:
    _original_threadpool_info = threadpoolctl.threadpool_info

    def _safe_threadpool_info(*args, **kwargs):
        # Delegate to the real implementation; report "no thread pools"
        # instead of propagating any error it raises.
        try:
            info = _original_threadpool_info(*args, **kwargs)
        except Exception:
            return []
        return info

    threadpoolctl.threadpool_info = _safe_threadpool_info

# ─── Enable MLflow autologging (generic, works with sklearn and more) ───
import mlflow

mlflow.autolog(log_input_examples=True, log_model_signatures=True)


2025/05/18 16:59:17 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/05/18 16:59:18 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.


In [16]:
import hashlib
import json
import platform
import psutil
import numpy as np
import pandas as pd
from datetime import datetime
from subprocess import check_output, CalledProcessError

def _to_native(value):
    """Recursively replace NumPy scalars/arrays in *value* with plain Python objects."""
    if isinstance(value, dict):
        return {
            (k.item() if isinstance(k, np.generic) else k): _to_native(v)
            for k, v in value.items()
        }
    if isinstance(value, (list, tuple)):
        return [_to_native(v) for v in value]
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, np.generic):
        return value.item()
    return value


def log_standard_metadata(
    model_name: str,
    model,
    hyperparams: dict,
    acc: float,
    prec: float,
    rec: float,
    f1: float,
    auc: float,
    label_map: dict,
    run_id: str,
    test_size: float,
    random_state: int,
    id_cols: list,
    target_col: str,
    X,
    y,
    run_data=None
):
    """Attach the standard set of experiment tags to the active MLflow run.

    Everything is written with ``mlflow.set_tag``: run/model identity, the
    evaluation scores, hyperparameters and label map (as JSON), a
    preprocessing snapshot plus its hash, the current Git commit, and a
    description of the compute environment.

    Args:
        model_name: Name used for the ``model_name`` and ``model_path`` tags.
        model: Fitted estimator; only its class name is recorded.
        hyperparams: Hyperparameters, serialized as JSON.
        acc, prec, rec, f1, auc: Evaluation scores stored as tags.
        label_map: Label encoding map, serialized as JSON.
        run_id: Run identifier stored as the ``run_id`` tag.
        test_size, random_state: Train/test split settings.
        id_cols: Columns dropped before training.
        target_col: Name of the target column.
        X: Feature table; only ``X.columns`` is read.
        y: Target values; used to count samples per class.
        run_data: Optional object exposing ``.tags``; tags whose key starts
            with ``justification_`` are written to the run again.
    """
    # === Experiment Metadata ===
    mlflow.set_tag("run_id", run_id)  # [MLflow / DB anchor]
    mlflow.set_tag("model_name", model_name)  # [ML Metadata, FAIR]
    mlflow.set_tag("model_architecture", model.__class__.__name__)  # [MLSEA]
    mlflow.set_tag("test_size", test_size)  # [MLSEA, Reproducibility]
    mlflow.set_tag("random_state", random_state)  # [MLSEA, Reproducibility]

    # === Evaluation Metrics ===
    mlflow.set_tag("accuracy", acc)
    mlflow.set_tag("precision_macro", prec)
    mlflow.set_tag("recall_macro", rec)
    mlflow.set_tag("f1_macro", f1)
    mlflow.set_tag("roc_auc", auc)

    # === Hyperparameters and Label Encoding ===
    # NumPy scalars are converted first; json.dumps rejects them otherwise.
    mlflow.set_tag("hyperparameters", json.dumps(_to_native(hyperparams)))  # [FAIR, MLSEA]
    mlflow.set_tag("label_map", json.dumps(_to_native(label_map)))  # [ML Preprocessing]

    # === Preprocessing Snapshot ===
    class_labels, class_counts = np.unique(y, return_counts=True)
    # Native types keep NumPy reprs such as "np.int64(50)" out of the tag.
    class_distribution = {
        _to_native(label): int(count)
        for label, count in zip(class_labels, class_counts)
    }
    preprocessing_info = _to_native({
        "dropped_columns": id_cols,
        "numeric_columns": list(X.columns),
        "target_column": target_col,
        "stratified": False,
        "coercion_strategy": "Numeric cast (auto)",
        "feature_engineering": "None",
        "missing_value_strategy": "None",
        "outlier_detection": "None",
        "encoding_strategy": "LabelEncoder (target only)",
        "scaling": "None",
        "sampling": "None",
        "feature_selection": "None",
        "train_test_split": {"test_size": test_size, "random_state": random_state},
        "imbalance_ratio": str(class_distribution),
        "preprocessing_timestamp": datetime.now().isoformat()
    })
    # Hash only the deterministic settings. Including the wall-clock timestamp
    # would make two runs with identical preprocessing hash differently.
    hash_basis = {
        k: v for k, v in preprocessing_info.items() if k != "preprocessing_timestamp"
    }
    preprocessing_hash = hashlib.sha256(
        json.dumps(hash_basis, sort_keys=True).encode()
    ).hexdigest()
    mlflow.set_tag("preprocessing_info", json.dumps(preprocessing_info))  # [MLSEA]
    mlflow.set_tag("preprocessing_hash", preprocessing_hash)

    # === Reproducibility ===
    mlflow.set_tag("model_serialization", "pickle")  # [FAIR, MLSEA]
    mlflow.set_tag("model_path", f"{model_name}.pkl")

    try:
        sha = check_output(["git", "rev-parse", "HEAD"], text=True).strip()
    except (CalledProcessError, OSError):
        # OSError covers a missing `git` executable (FileNotFoundError).
        sha = "unknown"
    mlflow.set_tag("git_commit", sha)

    # === Compute Environment ===
    compute_env = {
        "os": f"{platform.system()} {platform.release()}",
        "cpu": platform.processor(),
        "ram_gb": round(psutil.virtual_memory().total / (1024 ** 3), 2),
        "python_version": platform.python_version(),
        "sklearn_version": sklearn.__version__,
        "pandas_version": pd.__version__,
        "numpy_version": np.__version__,
    }
    mlflow.set_tag("compute_environment", json.dumps(compute_env))  # [Reproducibility]

    # === Optional: Tag MLflow Justifications (previously logged manually) ===
    if run_data:
        for key, val in run_data.tags.items():
            if key.startswith("justification_"):
                mlflow.set_tag(key, val)


In [17]:
import os
import yaml
import numpy as np
from datetime import datetime

def generate_reproducibility_txt_log(
    model_name: str,
    dataset_name: str,
    dataset_version: str,
    hyperparams: dict,
    metrics: dict,
    git_commit: str,
    run_id: str,
    architecture_file_path: str = "provenance_architecture_description.txt"
) -> str:
    """
    Generate a reproducibility log (YAML + architecture) and return the saved path.
    This log combines:
    - Model and dataset details
    - Hyperparameters and evaluation metrics
    - Git provenance info
    - Reproduction steps
    - Provenance architecture description

    The file is written to
    ``MODEL_PROVENANCE/<model_name>/<model_name>_reproducibility.txt``. If
    ``architecture_file_path`` does not exist, a placeholder line is written
    in place of the architecture description.

    Args:
        model_name: Model name; also used for the output directory and file.
        dataset_name: Name of the dataset the model was trained on.
        dataset_version: Version tag of that dataset.
        hyperparams: Flat dict of hyperparameters.
        metrics: Flat dict of evaluation metrics.
        git_commit: Commit hash embedded in the reproduction instructions.
        run_id: MLflow run ID embedded in the model-loading instruction.
        architecture_file_path: Text file appended verbatim after the YAML.

    Returns:
        Path of the written text file.
    """
    # Function-scope import: the enclosing cell only imports `datetime`.
    from datetime import timezone

    def clean_values(d):
        """Convert NumPy scalars (floats, ints, bools, ...) to native Python values."""
        # np.generic covers every NumPy scalar type; any left unconverted
        # would be dumped by yaml as python-object tags, not plain values.
        return {k: v.item() if isinstance(v, np.generic) else v for k, v in d.items()}

    # datetime.utcnow() is deprecated; the formatted string is unchanged.
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")

    repro_data = {
        "📌 Model Details": {
            "Model Name": model_name,
            "Dataset Name": dataset_name,
            "Dataset Version": dataset_version,
            "Run ID": run_id,
            "Timestamp": timestamp
        },
        "🛠️ Hyperparameters": clean_values(hyperparams),
        "📈 Metrics": clean_values(metrics),
        "🔗 Git Info": {
            "Commit Hash": git_commit,
            "Reproduce With": f"git checkout {git_commit}"
        },
        "🚀 Reproduction Guide": [
            "1. Clone the repo and checkout the commit:",
            f"   git checkout {git_commit}",
            "2. Load and preprocess the dataset exactly as during training.",
            "3. Load the model using MLflow:",
            f"   mlflow.sklearn.load_model('runs:/{run_id}/model')",
            "4. Run inference or evaluation using the same pipeline/script."
        ]
    }

    # 🔐 Create and write to output file
    save_dir = os.path.join("MODEL_PROVENANCE", model_name)
    os.makedirs(save_dir, exist_ok=True)
    txt_path = os.path.join(save_dir, f"{model_name}_reproducibility.txt")

    with open(txt_path, "w", encoding="utf-8") as repro_file:
        yaml.dump(repro_data, repro_file, allow_unicode=True, sort_keys=False, width=100)
        repro_file.write("\n\n")

        if os.path.exists(architecture_file_path):
            with open(architecture_file_path, "r", encoding="utf-8") as arch_file:
                architecture_description = arch_file.read()
                repro_file.write(architecture_description)
        else:
            repro_file.write("[⚠️ Missing architecture description file]\n")

    return txt_path


In [18]:
def log_with_justification(log_func, key: str, value, context: str = ""):
    """
    Log a value using the specified MLflow log function (e.g., mlflow.log_param),
    then prompt the user for a justification and log it as a tag.

    Args:
        log_func: Callable invoked as ``log_func(key, value)``.
        key: Name of the logged value; the tag is named ``justification_<key>``.
        value: Value passed to ``log_func``.
        context: Short hint shown in parentheses in the prompt header.
    """
    log_func(key, value)
    print(f"\n📝 Justification for `{key}` ({context})")
    # Strip the answer so a whitespace-only reply counts as "no justification"
    # instead of being stored as a blank tag.
    user_reason = input("→ Why did you choose this value? ").strip()
    mlflow.set_tag(f"justification_{key}", user_reason or "No justification provided")

def log_justification(key: str, question: str):
    """
    Prompt for a justification only (without logging a value), and log it as a tag.

    Args:
        key: Suffix of the tag name; the tag is named ``justification_<key>``.
        question: Question shown to the user in the input prompt.
    """
    print(f"\n📝 Justification for `{key}`")
    # Strip the answer so a whitespace-only reply counts as "no justification"
    # instead of being stored as a blank tag.
    user_reason = input(f"→ {question} ").strip()
    mlflow.set_tag(f"justification_{key}", user_reason or "No justification provided")


In [19]:
# mlflow.end_run()

# ============================
# 🚀 Start MLflow Run 
# ============================

In [21]:
import hashlib
from datetime import datetime
import mlflow
import pandas as pd

# Everything below runs inside a single MLflow run: session/dataset metadata is
# logged first, then the raw dataset is turned into a feature matrix X and a
# label vector y, with each preprocessing decision recorded as a param or tag.
with mlflow.start_run() as run:
    client = MlflowClient()
    # Snapshot of the run's data taken right after the run starts, i.e. before
    # any of the params/metrics/tags below have been logged.
    run_data = client.get_run(run.info.run_id).data
    # ─────────────── Session Metadata ─────────────────────────────────────
    # prompt_fields=True: role and project id are prompted for unless the
    # corresponding environment variables are set (see collect_session_metadata).
    session_metadata = collect_session_metadata(prompt_fields=True)
    mlflow.log_params(session_metadata)  # [PROV, Internal] Session and environment context

    # ─────────────── Dataset Metadata ─────────────────────────────────────
    doi_metadata = extract_dataset_metadata_from_doi("10.24432/C56C76")  # [FAIR, PROV, FAIR4ML]

    # ─────────────── Experiment Start Time ────────────────────────────────
    # Local (naive) timestamp in ISO-8601 format.
    start_time = datetime.now().isoformat()
    mlflow.set_tag("startedAtTime", start_time)  # [PROV] Activity start time

    #######################################################################
    ### Preprocessing #####################################################

    # ── Load into a DataFrame ────────────────────────────────────────────
    # `dataset` is defined outside this cell.
    df = pd.DataFrame(dataset)
    original_row_count = df.shape[0]
    mlflow.log_param("input_row_count", original_row_count)  # [MLSEA] Input data size

    # Log column names before transformation
    mlflow.set_tag("raw_columns", ','.join(df.columns))  # [FAIR4ML, Internal]

    # ── Generate row hashes ───────────────────────────────────────────────
    # One hash per row of the stringified frame; collected into a set so it can
    # be compared with a second pass later in the cell.
    # NOTE(review): built-in hash() of strings is randomized per interpreter
    # process by default, so these values are only comparable within this session.
    before_hashes = set(df.astype(str).apply(lambda row: hash(tuple(row)), axis=1))
    mlflow.set_tag("row_hash_tracking", "enabled")  # [Internal] Used for provenance/repeatability

    # ── Extract target variable ───────────────────────────────────────────
    # The last column of the frame is treated as the prediction target.
    target_col = df.columns[-1]
    mlflow.set_tag("target_variable", target_col)  # [FAIR4ML, MLSEA]

    # ── Separate features and labels ──────────────────────────────────────
    y = df[target_col]
    X = df.drop(columns=[target_col])
    # This tag is written before ID columns are dropped below, so it still
    # lists any "id" column.
    mlflow.set_tag("feature_columns", ','.join(X.columns))  # [FAIR4ML, MLSEA]

    # ── Drop ID columns (case-insensitive) ────────────────────────────────
    # Only columns whose name is exactly "id" (ignoring case) are matched.
    id_cols = [c for c in X.columns if c.lower() == "id"]
    if id_cols:
        X = X.drop(columns=id_cols)
        mlflow.set_tag("dropped_id_columns", ','.join(id_cols))  # [Internal]

    # ── Convert columns to numeric where possible ─────────────────────────
    # Columns that pd.to_numeric cannot convert are left unchanged and are not
    # counted; the counter therefore also includes columns that were already numeric.
    numeric_conversion_count = 0
    for c in X.columns:
        try:
            X[c] = pd.to_numeric(X[c])
            numeric_conversion_count += 1
        except Exception:
            continue
    mlflow.log_param("numeric_columns_converted", numeric_conversion_count)  # [Internal, FAIR4ML]

    # ── Print diagnostic info ─────────────────────────────────────────────
    print("ML_EXP_Shapes:", X.shape, y.shape)
    mlflow.log_param("feature_matrix_shape", str(X.shape))  # [MLSEA]
    mlflow.log_param("label_vector_shape", str(y.shape))    # [MLSEA]
#######################################################################################################
### 8) Label Encoding and Metadata Logging ############################################################

# ── Encode class labels numerically ────────────────────────────────────────────────────────
    # Replace the raw class labels in y with integer codes; le.classes_ keeps
    # the original label for each code.
    le = LabelEncoder()
    y = le.fit_transform(y)
    print("ML_EXP_Classes:", le.classes_)
    
    # NOTE(review): ','.join requires the class labels to be strings — confirm
    # for datasets with numeric labels.
    mlflow.set_tag("class_names", ','.join(le.classes_))  # [FAIR4ML, MLSEA]
    
    # ── Count rows and hash comparison before vs after preprocessing ────────────────────────────
    # Second hashing pass over `df`, compared against before_hashes.
    # NOTE(review): this cell never modifies `df` itself (X and y are derived
    # copies), so the insert/delete counts look like they will always be 0 — confirm intent.
    count_end = df.shape[0]
    after_hashes = set(df.astype(str).apply(lambda row: hash(tuple(row)), axis=1))
    
    n_insert = len(after_hashes - before_hashes)
    n_delete = len(before_hashes - after_hashes)
    
    #######################################################################################################
    ### Metadata Logging (Standardized Format) ############################################################
    
    # ── Extended DB Metadata ────────────────────────────────────────────────────────────────────
    # db_id, selected_table_id and selected_version are defined outside this cell.
    db_meta = fetch_db_dataset_metadata(db_id, selected_table_id, selected_version, target_col, df.shape[0])  # [Internal]
    
    # [PROV] The tag is named "...table_last_modified" but is filled from the
    # "dataset_publication_date" key, falling back to "unknown".
    mlflow.set_tag("Internal_DBRepo_table_last_modified", db_meta.get("dataset_publication_date", "unknown"))
    
    # ── Row Count Metrics ────────────────────────────────────────────────────────────────────────
    mlflow.log_metric("row_count_start", original_row_count)              # [MLSEA, FAIR4ML]
    mlflow.log_metric("row_count_end", count_end)                  # [MLSEA, FAIR4ML]
    mlflow.log_metric("num_inserted_rows", n_insert)               # [PROV]
    mlflow.log_metric("num_deleted_rows", n_delete)                # [PROV]
    
    # ── Raw Data Source Metadata ─────────────────────────────────────────────────────────────────
    # API_URL is defined outside this cell.
    mlflow.set_tag("data_source", API_URL)                         # [FAIR]
    # The timestamp is taken when this line runs, not when the data was fetched.
    mlflow.log_param("retrieval_time_utc", datetime.utcnow().isoformat())  # [PROV]
    mlflow.log_param("raw_row_count", len(df))                     # [MLSEA]
    mlflow.log_param("raw_columns", df.columns.tolist())           # [FAIR4ML]
    mlflow.log_param("dropped_columns", id_cols)                   # [Internal]
    
    # ── Post-Processing Metadata ─────────────────────────────────────────────────────────────────
    mlflow.log_param("final_num_features", X.shape[1])             # [MLSEA]
    mlflow.log_param("final_feature_names", X.columns.tolist())    # [FAIR4ML]
    mlflow.set_tag("target_variable_encoded", target_col)          # [FAIR4ML]
    
    # ── Label Mapping as Artifact ───────────────────────────────────────────────────────────────
    # Maps each integer code produced by the encoder back to its original label
    # and stores the mapping as a JSON text artifact.
    label_map = {int(idx): cls for idx, cls in enumerate(le.classes_)}
    buffer = io.StringIO()
    json.dump(label_map, buffer, indent=2)
    buffer.seek(0)
    mlflow.log_text(buffer.getvalue(), artifact_file="label_mapping.json")  # [FAIR4ML]
    
    # ── Training Metadata ────────────────────────────────────────────────────────────────────────
    # The model name is timestamped; the "RandomForest_Iris" prefix is fixed
    # here, before the model type is chosen later in the cell.
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_name = f"RandomForest_Iris_v{ts}"
    mlflow.set_tag("model_name", model_name)                       # [MLSEA]
    
    # Recorded before the split/model prompts below, so it precedes the actual fit.
    train_start_ts = datetime.now().isoformat()
    mlflow.set_tag("training_start_time", train_start_ts)          # [PROV]
########################################################################################################
### Model Parameters & Split Metadata ##################################################################

# ── Prompt test size and seed ─────────────────────────────────────────────────────────────────────────
    # Ask for the held-out fraction. Anything that is not a number strictly
    # between 0 and 1 falls back to 0.2: train_test_split rejects float values
    # outside that interval, which would otherwise abort the run after the
    # user has already answered the earlier prompts.
    try:
        test_size = float(input("Enter test size (e.g., 0.2 for 20% test set): "))
        # The chained comparison is also False for NaN.
        if not 0.0 < test_size < 1.0:
            raise ValueError(f"test size out of range: {test_size}")
    except ValueError:
        print("Invalid input. Defaulting to 0.2")
        test_size = 0.2
    
    # Ask for the random seed. Seeds must be integers in [0, 2**32 - 1] to be
    # usable as a NumPy/scikit-learn random_state; anything else falls back to 42.
    try:
        random_state = int(input("Enter random seed (e.g., 42): "))
        if not 0 <= random_state <= 2**32 - 1:
            raise ValueError(f"random seed out of range: {random_state}")
    except ValueError:
        print("Invalid input. Defaulting to 42")
        random_state = 42
    
    # ── Train/test split ────────────────────────────────────────────────────────────────────────────────
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    # ── Log split config ────────────────────────────────────────────────────────────────────────────────
    mlflow.log_param("test_size", test_size)                     # [MLSEA]
    mlflow.log_param("random_seed", random_state)               # [PROV]
    mlflow.log_param("n_train_samples", X_train.shape[0])       # [FAIR4ML]
    mlflow.log_param("n_test_samples",  X_test.shape[0])        # [FAIR4ML]
    mlflow.log_param("n_features",      X_train.shape[1])       # [MLSEA]
    
    ########################################################################################################
    ### Model Selection & Hyperparameters ##################################################################
    
    # ── Define hyperparameters ───────────────────────────────────────────────────────────────────────────
    # Candidate hyperparameters. They are written for a random forest; entries
    # the selected estimator does not accept are dropped further down.
    ML_EXP_hyperparams = {
        "n_estimators":       100,
        "criterion":          "entropy",
        "max_depth":          10,
        "min_samples_split":  3,
        "min_samples_leaf":   1,
        "max_features":       "sqrt",
        "bootstrap":          True,
        "oob_score":          True,
        "class_weight":       None,
        "verbose":            1,
        "n_jobs":             -1
    }
    
    # ── Model selection ───────────────────────────────────────────────────────────────────────────────────
    available_models = {
        "random_forest": RandomForestClassifier,
        "decision_tree": DecisionTreeClassifier,
        "logistic_regression": LogisticRegression,
        "knn": KNeighborsClassifier,
        "svm": SVC,
        "gradient_boosting": GradientBoostingClassifier
    }
    model_keys = list(available_models)
    
    # User prompt (1-based menu)
    print("Choose a model to train:")
    for position, name in enumerate(model_keys, start=1):
        print(f"{position}. {name}")
    
    # Empty input selects the default. Non-numeric or out-of-range input also
    # falls back to the default instead of raising (or, for 0, silently
    # wrapping around to the last menu entry via index -1).
    choice = input("Enter model number (default 1 for random_forest): ").strip()
    try:
        choice = int(choice) if choice else 1
        if not 1 <= choice <= len(model_keys):
            raise ValueError(f"model number out of range: {choice}")
    except ValueError:
        print("Invalid input. Defaulting to 1 (random_forest)")
        choice = 1
    selected_key = model_keys[choice - 1]
    selected_model_class = available_models[selected_key]
    mlflow.set_tag("selected_model", selected_key)  # [FAIR4ML, MLSEA]
    
    # ── Keep only hyperparameters the chosen estimator accepts ──────────────────────────────────────────
    # Passing e.g. n_estimators to KNeighborsClassifier raises TypeError, so the
    # dict is narrowed to the constructor parameters reported by a
    # default-constructed instance. The name is rebound so that everything
    # logged afterwards reflects what the model was actually built with.
    # NOTE(review): a kept value can still be invalid for a particular
    # estimator (e.g. the "criterion" choice) — confirm per model.
    accepted_params = selected_model_class().get_params()
    ignored_params = [param for param in ML_EXP_hyperparams if param not in accepted_params]
    if ignored_params:
        print(f"⚠️ Ignoring hyperparameters not supported by {selected_key}: {', '.join(ignored_params)}")
        ML_EXP_hyperparams = {
            param: value for param, value in ML_EXP_hyperparams.items() if param in accepted_params
        }
    
    # ── Initialize model ────────────────────────────────────────────────────────────────────────────────
    model = selected_model_class(**ML_EXP_hyperparams)
    
    # ── Log hyperparameters with justification ───────────────────────────────────────────────────────────
    # One interactive prompt per hyperparameter that is actually used.
    for key, val in ML_EXP_hyperparams.items():
        log_with_justification(mlflow.log_param, key, val, context="Hyperparameter configuration")  # [FAIR4ML, MLSEA]
    
    ########################################################################################################
    ### Model Training & Evaluation ########################################################################
    
    # ── Fit the model ────────────────────────────────────────────────────────────────────────────────────
    # Train on the training split and record when training finished.
    model.fit(X_train, y_train)
    train_end_ts = datetime.now().isoformat()
    mlflow.set_tag("training_end_time", train_end_ts)  # [PROV]
    
    # Hard predictions and per-class probabilities on the held-out split.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)
    
    # Accuracy, one-vs-rest ROC-AUC from the probabilities, and macro-averaged
    # precision / recall / F1 from the hard predictions.
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba, multi_class="ovr")
    prec = precision_score(y_test, y_pred, average="macro")
    rec = recall_score(y_test, y_pred, average="macro")
    f1 = f1_score(y_test, y_pred, average="macro")
    
    # [MLSEA] Log every score under its metric name.
    for metric_name, metric_value in (
        ("accuracy", acc),
        ("roc_auc", auc),
        ("precision_macro", prec),
        ("recall_macro", rec),
        ("f1_macro", f1),
    ):
        mlflow.log_metric(metric_name, metric_value)


########################################################################################################
### Final Logging: Justifications, Metrics, Environment, Dataset Metadata #############################

# ── Prompt for and log justifications ────────────────────────────────────────────────────────────────
    # Each entry is (tag suffix, question shown to the user). log_justification
    # stores the answer as the MLflow tag justification_<suffix>.
    justification_prompts = [
        ("model_choice", "Why did you choose this model (e.g., RandomForestClassifier) for this task?"),
        ("target_variable", "Why did you choose this column as the prediction target?"),
        ("test_split", "Why this train/test ratio (e.g., 80/20)?"),
        ("metric_choice", "Why did you use accuracy/f1/ROC-AUC as your evaluation metric?"),
        ("threshold_accuracy", "Was there a threshold for accuracy? Why?"),
        ("dataset_version", "Why did you use this specific dataset version?"),
        ("drop_column_X", "Why did you drop any specific columns from the dataset?"),
        ("experiment_name", "Any context behind this experiment name or setup?"),
        ("model_limitations", "Any known model limitations?"),
        ("ethical_considerations", "Any known model ethical considerations?"),
        ("intended_use", "Known model intended use?"),
        ("not_intended_for", "Model not_intended_for?"),
    ]
    for justification_key, question in justification_prompts:
        log_justification(justification_key, question)

    # The evaluation metrics (accuracy, roc_auc, precision/recall/f1 macro) are
    # logged once, right after they are computed earlier in this cell. They are
    # deliberately not logged again here: a second log_metric call would only
    # add a duplicate point to each metric's history.
    
    # ── Log environment info ─────────────────────────────────────────────────────────────────────────────
    # Interpreter, OS and library versions used for this run.
    mlflow.log_params({
        "python_version":       platform.python_version(),
        "os_platform":          f"{platform.system()} {platform.release()}",
        "sklearn_version":      sklearn.__version__,
        "pandas_version":       pd.__version__,
        "numpy_version":        np.__version__,
        "matplotlib_version":   matplotlib.__version__,
        "seaborn_version":      sns.__version__,
        "shap_version":         shap.__version__,
    })  # [PROV, Internal]
    
    # ── Tag notebook name ────────────────────────────────────────────────────────────────────────────────
    mlflow.set_tag("notebook_name", "RQ1_2.ipynb")  # [Internal]
    
    # ── Dataset metadata tags ────────────────────────────────────────────────────────────────────────────
    mlflow.set_tag("dataset_name",    db_meta.get("dataset_name", "unknown"))    # [FAIR4ML, PROV]
    mlflow.set_tag("dataset_version", selected_version)                          # [FAIR4ML, Internal]
    mlflow.set_tag("dataset_id",      selected_table_id)                         # [FAIR4ML, Internal]

########################################################################################################
### Plots: Feature Importance, ROC, PR, Confusion Matrix, SHAP #########################################

# ── Create plot output directory ─────────────────────────────────────────────────────────────────────
    # All plots for this run are written to ML_EXP_plots/<run_id>/ and then
    # logged to MLflow as artifacts.
    plot_dir = os.path.join("ML_EXP_plots", run.info.run_id) ##TODO test this path change
    os.makedirs(plot_dir, exist_ok=True)
    
    # ── 1) Feature Importance Bar Chart ──────────────────────────────────────────────────────────────────
    # Only drawn for models that expose feature_importances_.
    if hasattr(model, "feature_importances_"):
        importances = model.feature_importances_
        # Fall back to generated names f0, f1, ... when X_train has no column labels.
        feature_names = getattr(X_train, "columns", [f"f{i}" for i in range(X_train.shape[1])])
        
        fi_path = os.path.join(plot_dir, "feature_importances.png")
        plt.figure(figsize=(8, 6))
        sns.barplot(x=importances, y=feature_names)
        plt.title("Feature Importances")
        plt.xlabel("Importance")
        plt.ylabel("Feature")
        plt.tight_layout()
        plt.savefig(fi_path)
        mlflow.log_artifact(fi_path)  # [MLSEA]
        plt.close()
    
    # ── 2) Multi-class ROC Curves ───────────────────────────────────────────────────────────────────────
    # One-vs-rest: binarize the test labels so that column idx marks membership
    # of classes[idx], then plot one curve (and one file) per class.
    # NOTE(review): assumes y_proba columns are in the same order as `classes`
    # and that every class occurs in y_test — confirm.
    classes = np.unique(y_test)
    y_test_bin = label_binarize(y_test, classes=classes)
    
    for idx, cls in enumerate(classes):
        disp = RocCurveDisplay.from_predictions(y_test_bin[:, idx], y_proba[:, idx], name=f"ROC for class {cls}")
        roc_path = os.path.join(plot_dir, f"roc_curve_cls_{cls}.png")
        disp.figure_.savefig(roc_path)
        mlflow.log_artifact(roc_path)  # [MLSEA]
        plt.close(disp.figure_)
    
    # ── 3) Multi-class Precision-Recall Curves ───────────────────────────────────────────────────────────
    # Same one-vs-rest layout as the ROC curves above.
    for idx, cls in enumerate(classes):
        disp = PrecisionRecallDisplay.from_predictions(y_test_bin[:, idx], y_proba[:, idx], name=f"PR curve for class {cls}")
        pr_path = os.path.join(plot_dir, f"pr_curve_cls_{cls}.png")
        disp.figure_.savefig(pr_path)
        mlflow.log_artifact(pr_path)  # [MLSEA]
        plt.close(disp.figure_)
    
    # ── 4) Confusion Matrix Plot ─────────────────────────────────────────────────────────────────────────
    cm_path = os.path.join(plot_dir, "confusion_matrix.png")
    cm = confusion_matrix(y_test, y_pred)
    
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")  # fmt="d": integer cell counts
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.savefig(cm_path)
    mlflow.log_artifact(cm_path)  # [MLSEA]
    plt.close()
    
    # ── 5) SHAP Summary Plot ─────────────────────────────────────────────────────────────────────────────
    # NOTE(review): TreeExplainer is used unconditionally, unlike the
    # feature-importance plot above which is guarded by hasattr — confirm this
    # works for the non-tree models offered in the model menu.
    shap_path = os.path.join(plot_dir, "shap_summary.png")
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)
    
    # show=False keeps the figure open so it can be saved below.
    shap.summary_plot(shap_values, X_test, show=False)
    plt.tight_layout()
    plt.savefig(shap_path)
    mlflow.log_artifact(shap_path)  # [FAIR4ML, MLSEA]
    plt.close()
    
    ########################################################################################################
    ### Final: Metadata Summary Logging ####################################################################
    
    
    
    # log_standard_metadata(
    #     model_name=model_name,
    #     model=model,
    #     hyperparams=ML_EXP_hyperparams,
    #     acc=acc,
    #     prec=prec,
    #     rec=rec,
    #     f1=f1,
    #     auc=auc,
    #     label_map=label_map,
    #     run_id=run.info.run_id,
    #     test_size=test_size,
    #     random_state=random_state,
    #     run_data=run_data
    # )
    # Hand the trained model, its configuration, the evaluation scores and the
    # preprocessing context to log_standard_metadata (defined outside this cell).
    log_standard_metadata(
        model_name=model_name,
        model=model,
        hyperparams=ML_EXP_hyperparams,
        acc=acc,
        prec=prec,
        rec=rec,
        f1=f1,
        auc=auc,
        label_map=label_map,
        run_id=run.info.run_id,
        test_size=test_size,
        random_state=random_state,
        id_cols=id_cols,         # list of dropped ID columns
        target_col=target_col,   # name of the target column
        X=X,                     # features DataFrame
        y=y,                     # encoded labels
        # Fetch the run data now rather than reusing the snapshot taken when
        # the run started: that snapshot predates every param, metric and tag
        # logged in this cell.
        run_data=client.get_run(run.info.run_id).data,
    )

########################################################################################################
### Export Model (.pkl) and Log as Artifact ############################################################

# ── Define output path ───────────────────────────────────────────────────────────────────────────────
    # Make sure the output folder exists, then build the pickle path from the
    # timestamped model name.
    os.makedirs("Trained_models", exist_ok=True)
    pkl_path = f"Trained_models/{model_name}.pkl"
    
    # Write the trained model to disk in pickle format.
    with open(pkl_path, "wb") as model_file:
        pickle.dump(model, model_file)
    
    # Attach the pickle to the run, in an artifact folder named after the model.
    mlflow.log_artifact(pkl_path, artifact_path=model_name)  # [FAIR4ML, MLSEA]

########################################################################################################
### COMMIT: Git Integration + Provenance Logging #######################################################

    def get_latest_commit_hash(repo_path="."):
        """Return the commit hash that HEAD points to in ``repo_path``.

        Runs ``git rev-parse HEAD``; raises ``subprocess.CalledProcessError``
        when git exits with a non-zero status.
        """
        command = ["git", "-C", repo_path, "rev-parse", "HEAD"]
        completed = subprocess.run(command, capture_output=True, text=True, check=True)
        return completed.stdout.strip()
    
    def get_remote_url(repo_path=".", remote="origin"):
        """Return the configured URL of ``remote`` for the repository at ``repo_path``.

        Reads ``remote.<remote>.url`` via ``git config --get``; raises
        ``subprocess.CalledProcessError`` when git exits with a non-zero status.
        """
        config_key = f"remote.{remote}.url"
        command = ["git", "-C", repo_path, "config", "--get", config_key]
        completed = subprocess.run(command, capture_output=True, text=True, check=True)
        return completed.stdout.strip()
    
    def make_commit_link(remote_url, commit_hash):
        """Build a ``<repo>/commit/<hash>`` URL from a git remote URL.

        A trailing ``.git`` suffix is removed, and an SSH-style remote
        (``git@host:owner/repo``) is rewritten to ``https://host/owner/repo``.
        Other URLs are used as they are.
        """
        base = remote_url
        # Remove the literal ".git" suffix only. str.rstrip(".git") would strip
        # any run of the characters '.', 'g', 'i', 't' and so mangle repository
        # names that end in those letters (e.g. "digit.git" -> "d").
        if base.endswith(".git"):
            base = base[: -len(".git")]
        if base.startswith("git@"):
            base = base.replace(":", "/").replace("git@", "https://")
        return f"{base}/commit/{commit_hash}"
    
    def simple_commit_and_push_and_log(repo_path=".", message="Auto commit", remote="origin", branch="main"):
        """Stage everything, commit, push, and return ``(sha, commit_link)``.

        Returns ``(None, None)`` when there is nothing to commit or when
        ``git status``, ``git add`` or ``git commit`` fails. A failed push is
        only reported: the local commit's hash and link are still returned, so
        the link may point at a commit the remote does not have yet.
        """
        status = subprocess.run(["git", "-C", repo_path, "status", "--porcelain"], capture_output=True, text=True)
        # A failing `git status` (e.g. not a repository) also produces empty
        # stdout; report it as an error instead of "no changes".
        if status.returncode:
            print("❌ git status failed:\n", status.stderr)
            return None, None
        if not status.stdout.strip():
            print("🟡 No changes to commit.")
            return None, None
    
        staged = subprocess.run(["git", "-C", repo_path, "add", "--all"], capture_output=True, text=True)
        if staged.returncode:
            print("❌ git add failed:\n", staged.stderr)
            return None, None

        commit = subprocess.run(["git", "-C", repo_path, "commit", "-m", message], capture_output=True, text=True)
        if commit.returncode:
            print("❌ git commit failed:\n", commit.stderr)
            return None, None
        print("✅ Commit successful.")
    
        push = subprocess.run(["git", "-C", repo_path, "push", "-u", remote, branch], capture_output=True, text=True)
        if push.returncode:
            print("❌ git push failed:\n", push.stderr)
        else:
            print("🚀 Push successful.")
    
        sha = get_latest_commit_hash(repo_path)
        url = get_remote_url(repo_path, remote)
        link = make_commit_link(url, sha)
        return sha, link
    
    # ── Perform commit and get commit SHA and link ───────────────────────────────────────────────────────
    # Commit and push the working tree of the current directory. Both values
    # are None when the helper had nothing to commit or the commit failed.
    sha, link = simple_commit_and_push_and_log(
        repo_path=".",
        message="Auto commit after successful training"
    )
    
    # ── Ask for version tag and log it ───────────────────────────────────────────────────────────────────
    def get_version_tag_for_commit(commit_hash, known_tags=None):
        """Return ``(commit_hash, version_tag)`` for a commit.

        The tag is looked up in ``known_tags`` (a mapping of commit hash to
        tag). If none is found, the user is prompted for one; an empty answer
        yields ``"untagged"``. When ``commit_hash`` is empty or ``None`` (no
        commit was made) there is nothing to tag, so ``"untagged"`` is
        returned without prompting.
        """
        known_tags = known_tags or {}
        version_tag = known_tags.get(commit_hash, "untagged")
        if version_tag != "untagged":
            return commit_hash, version_tag

        # Without a commit hash the message below cannot be built (slicing None
        # raises TypeError), and there is no commit to attach a tag to.
        if not commit_hash:
            print("⚠️ No commit hash available; version tag left as 'untagged'.")
            return commit_hash, "untagged"

        print(f"⚠️ Commit {commit_hash[:8]} is not tagged with a version.")
        user_input = input("🔖 Enter version tag for this commit (or press Enter to skip): ").strip()
        return commit_hash, (user_input if user_input else "untagged")
    
    # No known_tags mapping is passed, so the tag comes from the helper's own
    # lookup/prompt logic. The same value is stored under both tag names.
    commit, version_tag = get_version_tag_for_commit(sha)
    mlflow.set_tag("GIT_code_version", version_tag)  # [PROV]
    mlflow.set_tag("model_version", version_tag)  # [PROV]

    
    
    # ── Log author info ──────────────────────────────────────────────────────────────────────────────────
    def get_git_author():
        """Return ``(user.name, user.email)`` as reported by ``git config``.

        Raises ``subprocess.CalledProcessError`` when git exits with a
        non-zero status for either lookup.
        """
        def read_setting(setting):
            raw = subprocess.check_output(["git", "config", setting])
            return raw.decode().strip()

        return read_setting("user.name"), read_setting("user.email")
    
    # Record who is configured as the git author for this run.
    name, email = get_git_author()
    mlflow.set_tag("GIT_user", name)               # [PROV]
    mlflow.set_tag("GIT_user_email", email)        # [PROV]
    
    # ── Log Git diff between this and previous commit ────────────────────────────────────────────────────
    if sha and link:
        # The current commit is recorded whenever a commit was made; it does
        # not depend on a previous commit being known.
        mlflow.set_tag("GIT_current_commit_hash", sha)
        mlflow.set_tag("GIT_current_commit_url", link)

        # The commit to diff against comes from the dataset metadata; without
        # it no diff artifact is produced.
        previous_commit_hash = db_meta.get("code_commit_hash", "")  # Fallback for comparison
        if previous_commit_hash:
            # Raises CalledProcessError if git cannot diff the two commits.
            diff_text = subprocess.check_output(
                ["git", "-C", ".", "diff", previous_commit_hash, sha],
                encoding="utf-8", errors="ignore"
            )
    
            remote_url = get_remote_url(".")
            # Remove the literal ".git" suffix only; str.rstrip(".git") strips
            # any trailing '.', 'g', 'i', 't' characters and can eat into the
            # repository name.
            if remote_url.endswith(".git"):
                remote_url = remote_url[: -len(".git")]
            # SSH-style remote (git@host:owner/repo) -> https://host/owner/repo
            if remote_url.startswith("git@"):
                remote_url = remote_url.replace(":", "/").replace("git@", "https://")
    
            previous_commit_url = f"{remote_url}/commit/{previous_commit_hash}"
            current_commit_url  = f"{remote_url}/commit/{sha}"
    
            diff_data = {
                "GIT_previous_commit":        previous_commit_hash,
                "GIT_previous_commit_url":    previous_commit_url,
                "GIT_current_commit":         sha,
                "GIT_current_commit_url":     current_commit_url,
                "GIT_diff":                   diff_text
            }
    
            mlflow.log_dict(diff_data, artifact_file="GIT_commit_diff.json")  # [PROV]
            mlflow.set_tag("GIT_previous_commit_hash", previous_commit_hash)
########################################################################################################
### Reproducibility Metadata Extraction + Text Log #####################################################

# ── Log all categorized metadata (FAIR, PROV, DBRepo, etc.) ───────────────────────────────────────────
    # log_metadata_dict_to_mlflow(categorized_fields)  # [FAIR4ML, PROV, Internal]

    # Log the DOI-derived dataset metadata (helper defined outside this cell).
    log_metadata_dict_to_mlflow(
        metadata=doi_metadata,
        prefix="DOI_",
        snapshot_name="doi_metadata_snapshot.json"
    )
    # ── Retrieve full run metadata ───────────────────────────────────────────────────────────────────────
    # Fetch the run once so that info and data come from the same snapshot and
    # the tracking server is queried a single time.
    run_id      = run.info.run_id
    fetched_run = client.get_run(run_id)
    run_info    = fetched_run.info
    run_data    = fetched_run.data
    
    # Plain-dict copies of everything logged up to this point.
    params  = dict(run_data.params)
    metrics = dict(run_data.metrics)
    tags    = dict(run_data.tags)
    
    # ── List all artifacts in the run ────────────────────────────────────────────────────────────────────
    artifact_uri  = run_info.artifact_uri
    artifact_meta = []  # filled with {"path", "type"} entries by _gather below
    
    def _gather(path=""):
        """Walk the run's artifact tree below ``path`` and record every file.

        Directories are descended into recursively. Each file is appended to
        ``artifact_meta`` as ``{"path": ..., "type": ...}``, where the type is
        "text", "image" or "other" depending on the lower-cased file extension.
        """
        text_suffixes = (".json", ".txt", ".patch")
        image_suffixes = (".png", ".jpg", ".jpeg", ".svg")

        for entry in client.list_artifacts(run_id, path):
            if entry.is_dir:
                _gather(entry.path)
                continue

            lowered = entry.path.lower()
            if lowered.endswith(text_suffixes):
                kind = "text"
            elif lowered.endswith(image_suffixes):
                kind = "image"
            else:
                kind = "other"
            artifact_meta.append({"path": entry.path, "type": kind})
    
    # Populate artifact_meta, starting from the root of the run's artifact tree.
    _gather()
    
    # ── (Optional) Store artifact meta if needed ─────────────────────────────────────────────────────────
    # The summary is logged after the listing, so artifact_summary.json does
    # not list itself.
    mlflow.log_dict({"artifacts": artifact_meta}, artifact_file="artifact_summary.json")  # [Internal]
    
    # ── Notebook directory (for trace/log location reference) ────────────────────────────────────────────
    notebook_dir = os.getcwd()
    
    ########################################################################################################
    ### Generate Reproducibility Instructions ##############################################################
    
    # ── Generate reproducibility .txt log with key details ───────────────────────────────────────────────
    # Write the reproducibility text file for this model and keep its path.
    # git_commit is None when no commit was made earlier in this cell.
    repro_txt_path = generate_reproducibility_txt_log(
        model_name=model_name,
        dataset_name=db_meta.get("dataset_name", "unknown"),
        dataset_version=selected_version,
        hyperparams=ML_EXP_hyperparams,
        metrics={
            "accuracy": acc,
            "f1_macro": f1,
            "precision_macro": prec,
            "recall_macro": rec,
            "roc_auc": auc
        },
        git_commit=sha,
        run_id=run_id
    )
    
    # ── Log the .txt path to MLflow for traceability ─────────────────────────────────────────────────────
    # Only the local path is logged as a param; the file itself is not
    # uploaded as an artifact here.
    mlflow.log_param("reproducibility_log_path", repro_txt_path)  # [Internal, FAIR4ML]
########################################################################################################
### COMBINE: Export Full Run Summary as JSON ###########################################################

# ── Create output directory ──────────────────────────────────────────────────────────────────────────
    # The summary is written to <cwd>/MODEL_PROVENANCE/<model_name>/.
    summary_dir = os.path.join(os.getcwd(), "MODEL_PROVENANCE", model_name)
    os.makedirs(summary_dir, exist_ok=True)
    
    # ── Prepare run summary dict ─────────────────────────────────────────────────────────────────────────
    # params / metrics / tags / artifact_meta were captured earlier in the
    # cell, so anything logged after that point (e.g. the endedAtTime tag
    # below) is not part of this summary.
    # NOTE(review): run_info was fetched while the run was still active, so
    # end_time is presumably unset here — confirm.
    summary = {
        "run_id":         run_id,
        "run_name":       run_info.run_name,
        "experiment_id":  run_info.experiment_id,
        "start_time":     run_info.start_time,
        "end_time":       run_info.end_time,
        "params":         params,
        "metrics":        metrics,
        "tags":           tags,
        "artifacts":      artifact_meta
    }
    
    # ── Write summary to JSON file ───────────────────────────────────────────────────────────────────────
    summary_filename    = f"{model_name}_run_summary.json"
    summary_local_path  = os.path.join(summary_dir, summary_filename)
    
    with open(summary_local_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)
    
    # ── Log summary JSON to MLflow ───────────────────────────────────────────────────────────────────────
    mlflow.log_artifact(summary_local_path, artifact_path="run_summaries")  # [FAIR4ML, Internal]
    print("📁 Run summary JSON logged at:", summary_local_path)
    
    # ── End MLflow run with PROV-O end timestamp ─────────────────────────────────────────────────────────
    end_time = datetime.now().isoformat()
    mlflow.set_tag("endedAtTime", end_time)  # [PROV]
    # NOTE(review): the enclosing `with mlflow.start_run()` block also ends the
    # run on exit, so this explicit call looks redundant — confirm.
    mlflow.end_run()


Enter your role (default: collaborator):  dsf
Enter project ID (default: default_project):  fsdf



📌 Session Metadata:
  session_id: a703d999-0a29-4cfb-8dab-48d1f14d1bff
  username: reema
  timestamp_utc: 2025-05-18T15:03:47.945054
  hostname: Purplish
  platform: Windows
  os_version: 10.0.26100
  python_version: 3.11.5
  role: dsf
  project_id: fsdf
ML_EXP_Shapes: (150, 4) (150,)
ML_EXP_Classes: ['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']
{'id': 'f6a329d0-0cd6-4308-8c89-ad5763b42324', 'name': 'Iris', 'description': None, 'tables': [{'id': 'e435fe7c-f889-40f5-9660-007597ad4a5b', 'name': 'iris_data_v2', 'alias': None, 'identifiers': [], 'owner': {'id': '55bad3e0-c815-103f-9c9e-43ec87b6b872', 'username': 'reema', 'name': 'reema dass', 'orcid': None, 'qualified_name': 'reema dass — @reema', 'given_name': 'reema', 'family_name': 'dass'}, 'description': 'iris_data_v2', 'columns': [{'id': '9208e04f-b09c-4d43-9ddc-a41cffc55840', 'name': 'Id', 'alias': None, 'size': None, 'd': None, 'mean': 51, 'median': 51, 'concept': None, 'unit': None, 'description': None, 'enums': [], 'sets': [

Enter test size (e.g., 0.2 for 20% test set):  fssf


Invalid input. Defaulting to 0.2


Enter random seed (e.g., 42):  fsvd


Invalid input. Defaulting to 42
Choose a model to train:
1. random_forest
2. decision_tree
3. logistic_regression
4. knn
5. svm
6. gradient_boosting


Enter model number (default 1 for random_forest):  



📝 Justification for `n_estimators` (Hyperparameter configuration)


→ Why did you choose this value?  bd



📝 Justification for `criterion` (Hyperparameter configuration)


→ Why did you choose this value?  fbd



📝 Justification for `max_depth` (Hyperparameter configuration)


→ Why did you choose this value?  sffffffffffff



📝 Justification for `min_samples_split` (Hyperparameter configuration)


→ Why did you choose this value?  fvdf



📝 Justification for `min_samples_leaf` (Hyperparameter configuration)


→ Why did you choose this value?  dbfd



📝 Justification for `max_features` (Hyperparameter configuration)


→ Why did you choose this value?  vcxbs



📝 Justification for `bootstrap` (Hyperparameter configuration)


→ Why did you choose this value?  dssf



📝 Justification for `oob_score` (Hyperparameter configuration)


→ Why did you choose this value?  fddfb



📝 Justification for `class_weight` (Hyperparameter configuration)


→ Why did you choose this value?  fbdbd



📝 Justification for `verbose` (Hyperparameter configuration)


→ Why did you choose this value?  fbbbd



📝 Justification for `n_jobs` (Hyperparameter configuration)


→ Why did you choose this value?  dfbd


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished



📝 Justification for `model_choice`


→ Why did you choose this model (e.g., RandomForestClassifier) for this task?  fdd



📝 Justification for `target_variable`


→ Why did you choose this column as the prediction target?  gdbvd



📝 Justification for `test_split`


→ Why this train/test ratio (e.g., 80/20)?  dfvdf



📝 Justification for `metric_choice`


→ Why did you use accuracy/f1/ROC-AUC as your evaluation metric?  dfbd



📝 Justification for `threshold_accuracy`


→ Was there a threshold for accuracy? Why?  dbdf



📝 Justification for `dataset_version`


→ Why did you use this specific dataset version?  dbdb



📝 Justification for `drop_column_X`


→ Why did you drop any specific columns from the dataset?  dbdb



📝 Justification for `experiment_name`


→ Any context behind this experiment name or setup?  ddb



📝 Justification for `model_limitations`


→ Any known model limitations?  dbdb



📝 Justification for `ethical_considerations`


→ Any known model ethical considerations?  dbdbd



📝 Justification for `intended_use`


→ Known model intended use?  dbdb



📝 Justification for `not_intended_for`


→ Model not_intended_for?  db


  pl.tight_layout()
  pl.tight_layout(pad=0, w_pad=0, h_pad=0.0)
  plt.tight_layout()


✅ Commit successful.
🚀 Push successful.
⚠️ Commit c53df165 is not tagged with a version.


🔖 Enter version tag for this commit (or press Enter to skip):  v99


✅ Logged tag: DOI_dataset_id
✅ Logged tag: DOI_dataset_title
✅ Logged tag: DOI_dataset_description
✅ Logged tag: DOI_dataset_creator
✅ Logged tag: DOI_dataset_publisher
✅ Logged tag: DOI_dataset_publication_date
✅ Logged tag: DOI_dataset_license
✅ Logged tag: DOI_dataset_keywords
✅ Logged tag: DOI_dataset_access_url
✅ Logged tag: DOI_dataset_documentation
✅ Logged tag: DOI_metadata_standard
✅ Logged tag: DOI_related_resources
✅ Logged tag: DOI_prov_entity
✅ Logged tag: DOI_prov_activity
✅ Logged tag: DOI_prov_agent_dataset_creator
✅ Logged tag: DOI_prov_used
✅ Logged tag: DOI_prov_wasDerivedFrom
✅ Logged tag: DOI_prov_wasAttributedTo
✅ Logged tag: DOI_prov_startedAtTime
✅ Logged tag: DOI_prov_role_dataset_creator
✅ Logged tag: DOI_prov_role_database_creator
📁 Full metadata snapshot logged as: doi_metadata_snapshot.json
📁 Run summary JSON logged at: C:\Users\reema\REPO\notebooks\RQ_notebooks\MODEL_PROVENANCE\RandomForest_Iris_v20250518_170352\RandomForest_Iris_v20250518_170352_run_summa

In [22]:
summary_local_path

'C:\\Users\\reema\\REPO\\notebooks\\RQ_notebooks\\MODEL_PROVENANCE\\RandomForest_Iris_v20250518_170352\\RandomForest_Iris_v20250518_170352_run_summary.json'

In [63]:

# Keywords identifying the packages this notebook relies on. A `pip freeze`
# line is kept when it contains any of them (case-insensitive substring match).
required_keywords = [
    "mlflow", "scikit-learn", "pandas", "numpy", "pyyaml", "seaborn",
    "matplotlib", "shap", "rdflib", "requests", "python-dotenv", "gitpython", "psutil", "pyld"
]

# Run pip through the interpreter executing this notebook so the listing
# reflects the active environment rather than whichever `pip` is first on PATH.
# check=True surfaces a failing pip instead of silently writing an empty file.
import sys

result = subprocess.run(
    [sys.executable, "-m", "pip", "freeze"],
    stdout=subprocess.PIPE,
    text=True,
    check=True,
)
all_packages = result.stdout.splitlines()

# Filter based on matching names
filtered = [pkg for pkg in all_packages if any(kw.lower() in pkg.lower() for kw in required_keywords)]

# Save filtered requirements to file, one requirement per newline-terminated line
filtered_requirements_path = "requirements.txt"
with open(filtered_requirements_path, "w", encoding="utf-8") as f:
    f.writelines(f"{pkg}\n" for pkg in filtered)

filtered_requirements_path


'requirements.txt'

In [None]:
# #WORKS!!
# database = client.create_database(
#     name="Provenance_MetaData",
#     container_id="6cfb3b8e-1792-4e46-871a-f3d103527203",
#     is_public=True
# )
# print(f"✅ Database created: {database.id}")
# # resp = requests.get("http://localhost/api/container", auth=("reema", "Toothless!26"))
# # print(resp.json())


Table creations

In [35]:
# from dbrepo.api.dto import ColumnType

# print(list(ColumnType))


[<ColumnType.CHAR: 'char'>, <ColumnType.VARCHAR: 'varchar'>, <ColumnType.BINARY: 'binary'>, <ColumnType.VARBINARY: 'varbinary'>, <ColumnType.TINYBLOB: 'tinyblob'>, <ColumnType.TINYTEXT: 'tinytext'>, <ColumnType.TEXT: 'text'>, <ColumnType.BLOB: 'blob'>, <ColumnType.MEDIUMTEXT: 'mediumtext'>, <ColumnType.MEDIUMBLOB: 'mediumblob'>, <ColumnType.LONGTEXT: 'longtext'>, <ColumnType.LONGBLOB: 'longblob'>, <ColumnType.ENUM: 'enum'>, <ColumnType.SERIAL: 'serial'>, <ColumnType.SET: 'set'>, <ColumnType.BIT: 'bit'>, <ColumnType.TINYINT: 'tinyint'>, <ColumnType.BOOL: 'bool'>, <ColumnType.SMALLINT: 'smallint'>, <ColumnType.MEDIUMINT: 'mediumint'>, <ColumnType.INT: 'int'>, <ColumnType.BIGINT: 'bigint'>, <ColumnType.FLOAT: 'float'>, <ColumnType.DOUBLE: 'double'>, <ColumnType.DECIMAL: 'decimal'>, <ColumnType.DATE: 'date'>, <ColumnType.DATETIME: 'datetime'>, <ColumnType.TIMESTAMP: 'timestamp'>, <ColumnType.TIME: 'time'>, <ColumnType.YEAR: 'year'>]


In [47]:
# import requests
# import json

# url = "http://localhost/api/database/ce4550bd-0fad-4a6b-894b-455a1decae5d/table/fbd07137-cc8f-42dc-b587-6be1ecad1001/data"
# auth = ("reema", "Toothless!26")
# headers = {"Content-Type": "application/json"}

# rows = [
#     {
#         "runID": "run006",
#         "sessionID": "sess006",
#         "modelId": "model006",
#         "datasetID": "data006",
#         "git_commit": "abc123",
#         "invenioID": "inv006",
#         "timestamp": "2025-05-18T17:00:00Z"
#     }
# ]

# response = requests.post(url, auth=auth, headers=headers, json=rows)

# if response.status_code == 200:
#     print("✅ Insert successful")
# else:
#     print("❌ Failed")
#     print(response.status_code)
#     print(response.text)


❌ Failed
400
{"type":"about:blank","title":"Bad Request","status":400,"detail":"Failed to read request","instance":"/api/database/ce4550bd-0fad-4a6b-894b-455a1decae5d/table/fbd07137-cc8f-42dc-b587-6be1ecad1001/data","properties":null}


In [49]:
# # API endpoint URL
# db_id="ce4550bd-0fad-4a6b-894b-455a1decae5d"
# selected_table_id="fbd07137-cc8f-42dc-b587-6be1ecad1001"
# API_URL = f"http://localhost/api/database/{db_id}/table/{selected_table_id}/data?size=100000&page=0"

# # Define the headers
# headers = {
#     "Accept": "application/json"  # Specify the expected response format
# }

# try:
#     # Send a GET request to the API with the Accept header
#     response = requests.get(API_URL, headers=headers)

#     # Check if the request was successful
#     if response.status_code == 200:
#         # Parse the JSON response
#         dataset = response.json()
        
        
#         print( dataset)
#     else:
#         print(f"Error: Received status code {response.status_code}")
#         print("Response content:", response.text)
       

# except requests.exceptions.RequestException as e:
#     print(f"Request failed: {e}")


[{'runid': 'run001', 'sessionid': 'sess001', 'modelid': 'model001', 'datasetid': 'data001', 'git_commit': 'abc1234', 'invenioid': 'inv001', 'timestamp': '2025-05-18 14:30:00.0'}, {'runid': 'run002', 'sessionid': 'sess002', 'modelid': 'model002', 'datasetid': 'data002', 'git_commit': 'def5678', 'invenioid': 'inv002', 'timestamp': '2025-05-18 15:00:00.0'}, {'runid': 'run003', 'sessionid': 'sess003', 'modelid': 'model003', 'datasetid': 'data003', 'git_commit': 'ghi9012', 'invenioid': 'inv003', 'timestamp': '2025-05-18 15:30:00.0'}, {'runid': 'run004', 'sessionid': 'sess004', 'modelid': 'model004', 'datasetid': 'data004', 'git_commit': 'jkl3456', 'invenioid': 'inv004', 'timestamp': '2025-05-18 16:00:00.0'}, {'runid': 'run005', 'sessionid': 'sess005', 'modelid': 'model005', 'datasetid': 'data005', 'git_commit': 'mno7890', 'invenioid': 'inv005', 'timestamp': '2025-05-18 16:30:00.0'}]


In [67]:
# import requests
# import json

# url = "http://localhost/api/database/ce4550bd-0fad-4a6b-894b-455a1decae5d/table/53ffac33-9391-4cba-bd7b-5759a1c98201/data"

# headers = {
#     "Content-Type": "application/json"
# }

# auth = ("reema", "Toothless!26")  # Replace with your actual credentials

# # Payload matching the JSON format that worked in Postman
# payload = {
#     "data": {
#         "runid": "run0010000",
#         "sessionid": "sess009",
#         "modelid": "model009",
#         "datasetid": "data009",
#         "git_commit": "xyz999",
#         "invenioid": "inv009",
#         "timestamp": "2025-05-18T11:31:31.914+00:00"
#     }
# }

# response = requests.post(url, headers=headers, auth=auth, json=payload)

# if response.status_code == 201:
#     print("✅ Row inserted successfully!")
# else:
#     print("❌ Failed to insert")
#     print("Status:", response.status_code)
#     print(response.text)


✅ Row inserted successfully!


In [64]:
formatted

'2025-05-18T11:32:27.292+00:00'

In [70]:
# import requests
# from datetime import datetime

# # Auth & headers
# headers = {"Content-Type": "application/json"}
# auth = ("reema", "Toothless!26")  # ⚠️ Never hardcode this in prod

# # Timestamp conversion
# def to_mysql_datetime(ts):
#     return datetime.strptime(ts, "%Y-%m-%d %H:%M:%S").isoformat() + "+00:00"

# # Define base URL (adjust table IDs per table)
# BASE = "http://localhost/api/database/ce4550bd-0fad-4a6b-894b-455a1decae5d/table"

# # Define table-specific endpoints
# TABLES = {
#     "session_metadata": "3af934ed-a467-46e5-bb0a-495b2ff0efbf",
#     "experiment_metadata": "53ffac33-9391-4cba-bd7b-5759a1c98201",
#     "git_metadata": "ad546581-467e-4570-94eb-45eeb4f3019b",
#     "dataset_metadata": "9119bded-19af-42b6-b27e-c9d229a9a7c2",
#     "model_metadata": "26294b1f-4f4d-4a7a-917e-141fa048fb39",
# }

# # 1. Session Metadata
# requests.post(
#     f"{BASE}/{TABLES['session_metadata']}/data",
#     headers=headers,
#     auth=auth,
#     json={
#         "data": {
#             "session_id": "sess_001pppp",
#             "username": "user123",
#             "timestamp": to_mysql_datetime("2025-05-18 09:30:00"),
#             "hostname": "host001",
#             "platform": "Linux"
#         }
#     }
# )

# # 2. Experiment Metadata
# requests.post(
#     f"{BASE}/{TABLES['experiment_metadata']}/data",
#     headers=headers,
#     auth=auth,
#     json={
#         "data": {
#             "runid": "run_001xppppp",
#             "sessionid": "sess_001",
#             "modelid": "model_001",
#             "datasetid": "ds_001",
#             "git_commit": "abc1234",
#             "invenioid": "inv_001",
#             "timestamp": to_mysql_datetime("2025-05-18 09:45:00")
#         }
#     }
# )

# # 3. Git Metadata
# requests.post(
#     f"{BASE}/{TABLES['git_metadata']}/data",
#     headers=headers,
#     auth=auth,
#     json={
#         "data": {
#             "commit_hash": "abc1234xppppp",
#             "repo_url": "https://github.com/example/repo",
#             "branch": "main",
#             "author": "dev_user",
#             "timestamp": to_mysql_datetime("2025-05-18 09:20:00"),
#             "version": "v2.3.1"
#         }
#     }
# )

# # 4. Dataset Metadata
# requests.post(
#     f"{BASE}/{TABLES['dataset_metadata']}/data",
#     headers=headers,
#     auth=auth,
#     json={
#         "data": {
#             "dataset_id": "ds_001xpppppp",
#             "table_name": "table_xyz",
#             "detailed_type": "CSV",
#             "classes": 3,
#             "features": 5,
#             "output_type": "categorical",
#             "version": "v1.0"
#         }
#     }
# )

# # 5. Model Metadata
# requests.post(
#     f"{BASE}/{TABLES['model_metadata']}/data",
#     headers=headers,
#     auth=auth,
#     json={
#         "data": {
#             "model_id": "model_001xppppppppp",
#             "name": "RandomForest",
#             "algo": "RandomForestClassifier",
#             "features": '["age", "income", "education", "gender", "marital_status"]',
#             "label_snap": "income_group",
#             "train_split": 0.8,
#             "test_split": 0.2,
#             "target_var": "income_group",
#             "label_map": '{"0":"Low","1":"Medium","2":"High"}',
#             "feature_select": '["age", "income", "education"]',
#             "imbalance_ratio": 1.5
#         }
#     }
# )



<Response [201]>

In [52]:
import os

from dbrepo.RestClient import RestClient

# Credentials are read from the environment so the password is never stored in
# the notebook. Set DBREPO_PASSWORD (and optionally DBREPO_USERNAME) before
# running; an unset password is sent as an empty string.
# NOTE(review): the password that used to be hard-coded here should be rotated.
client = RestClient(
    endpoint="http://localhost",
    username=os.environ.get("DBREPO_USERNAME", "reema"),
    password=os.environ.get("DBREPO_PASSWORD", ""),
)
print(dir(client))


['__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_upload', '_wrapper', 'analyse_datatypes', 'analyse_keys', 'analyse_table_statistics', 'create_container', 'create_database', 'create_database_access', 'create_identifier', 'create_subset', 'create_table', 'create_table_data', 'create_view', 'delete_container', 'delete_database_access', 'delete_table', 'delete_table_data', 'delete_view', 'endpoint', 'get_concepts', 'get_container', 'get_containers', 'get_database', 'get_database_access', 'get_databases', 'get_databases_count', 'get_identifier', 'get_identifier_data', 'get_identifiers', 'get_image', 'get_images', 'get_licenses', 'get_messages', 'get_ontologies', 

In [30]:
import json
import requests
from datetime import datetime

# --- Configuration ---
import os

headers = {"Content-Type": "application/json"}
# Basic-auth credentials come from the environment so the password is never
# stored in the notebook. Set DBREPO_PASSWORD (and optionally DBREPO_USERNAME)
# before running; an unset password is sent as an empty string.
# NOTE(review): the password that used to be hard-coded here should be rotated.
auth = (
    os.environ.get("DBREPO_USERNAME", "reema"),
    os.environ.get("DBREPO_PASSWORD", ""),
)
# Base URL of the target database; table endpoints are `{BASE}/{table_id}/data`.
BASE = "http://localhost/api/database/633f5987-d116-42e8-97fc-36b9c25ade24/table"
# Logical table name -> table id used in the endpoint URL.
TABLES = {
    "session_metadata": "a4b15637-b98c-41f1-910d-cedfe80ca53b",
    "experiment_metadata": "0bf9cd0c-f0c0-4236-a412-30265f5ee1a6",
    "git_metadata": "e3bb40a4-484c-4148-99a9-8189d02afacd",
    "dataset_metadata": "78bce105-fe1b-4d02-a7af-68885030a15d",
    "model_metadata": "99391154-5d55-4d28-9843-194508cd0c7e"
}

# --- Load metadata file ---
# Example of a fixed path that can be used instead of summary_local_path:
# "MODEL_PROVENANCE/b788db5d12174c28bc175589898f7f95/RandomForest_Iris_v20250516_193049_run_summary.json"
# The encoding is stated explicitly (as the other readers in this notebook do)
# so the result does not depend on the platform's default encoding.
with open(summary_local_path, "r", encoding="utf-8") as f:
    meta = json.load(f)

# --- Helper ---
def to_mysql_datetime(ts):
    """Reformat an ISO-like timestamp as ``YYYY-MM-DDTHH:MM:SS+00:00``.

    Everything from the first ``.`` onward (the fractional seconds) is
    dropped before parsing, and a fixed ``+00:00`` offset is appended.

    Raises:
        ValueError: if the remaining text is not ``%Y-%m-%dT%H:%M:%S``.
    """
    whole_seconds, _, _ = ts.partition(".")
    parsed = datetime.strptime(whole_seconds, "%Y-%m-%dT%H:%M:%S")
    return f"{parsed.isoformat()}+00:00"

# --- Extract shared values ---
# Values pulled from the run summary (`meta`) and reused by the payloads below.
run_id = meta["run_id"]
session_id = meta["params"]["session_id"]
dataset_id = meta["tags"]["dataset_id"]
# "model_" + the second underscore-separated token of the model name, lower-cased
model_id = "model_" + meta["tags"]["model_name"].split("_")[1].lower()
git_commit = meta["tags"]["git_commit"]
git_version = meta["tags"]["GIT_code_version"]
timestamp_utc = meta["params"]["timestamp_utc"]
username = meta["params"]["username"]
# Named platform_name rather than `platform`: binding `platform` here would
# overwrite the imported `platform` module that collect_session_metadata uses.
platform_name = meta["params"]["platform"]
hostname = meta["params"]["hostname"]
target_var = meta["tags"]["target_variable"]
label_map = meta["tags"]["label_map"]
feature_list = meta["params"]["final_feature_names"]
dataset_name = meta["tags"]["dataset_name"]
dataset_version = meta["tags"]["dataset_version"]
estimator = meta["tags"]["estimator_name"]
feature_select = meta["tags"]["feature_columns"]
label_snap = meta["tags"]["target_variable_encoded"]
model_name = meta["tags"]["model_name"]
# Falls back to 1.0 when the run did not record an imbalance ratio
imbalance_ratio = meta["tags"].get("imbalance_ratio", 1.0)

# # --- 1. Session Metadata ---
# session_payload = {
#     "session_id": session_id,
#     "username": username,
#     "timestamp": to_mysql_datetime(timestamp_utc),
#     "hostname": hostname,
#     "platform": platform_name
# }

# session_url = f"{BASE}/{TABLES['session_metadata']}/data"

# response = requests.post(
#     session_url,
#     headers=headers,
#     auth=auth,
#     json={"data": session_payload}
# )

# # --- Logging ---
# print("\n🔍 Session Metadata POST")
# print("➡️ URL:", session_url)
# print("📦 Payload:")
# print(json.dumps(session_payload, indent=2))
# print("📤 Status Code:", response.status_code)
# print("📝 Response Text:", response.text)

# # # --- 2. Experiment Metadata ---
# exp_payload = {
#     "runid": run_id,
#     "sessionid": session_id,
#     "modelid": model_id,
#     "datasetid": dataset_id,
#     "git_commit": git_commit,
#     "invenioid": meta["tags"].get("DOI_dataset_id"),
#     "timestamp": to_mysql_datetime(timestamp_utc)
# }

# exp_url = f"{BASE}/{TABLES['experiment_metadata']}/data"

# response = requests.post(
#     exp_url,
#     headers=headers,
#     auth=auth,
#     json={"data": exp_payload}
# )

# # --- Logging ---
# print("\n🔍 Experiment Metadata POST")
# print("➡️ URL:", exp_url)
# print("📦 Payload:")
# print(json.dumps(exp_payload, indent=2))
# print("📤 Status Code:", response.status_code)
# print("📝 Response Text:", response.text)


# --- 3. Git Metadata ---
# Row for the git_metadata table, built from the loaded run summary.
git_payload = {
    "commit_hash": git_commit,
    # NOTE(review): DOI_prov_used may hold the dataset source rather than the
    # code repository URL — confirm this is the right tag for repo_url.
    "repo_url": meta["tags"]["DOI_prov_used"],
    "branch": "main",  # assumed
    "author": meta["tags"]["GIT_user"],
    "timestamp": to_mysql_datetime(timestamp_utc),
    "version": git_version
}

git_url = f"{BASE}/{TABLES['git_metadata']}/data"

# The timeout keeps the notebook from hanging forever if the API does not answer.
response = requests.post(
    git_url,
    headers=headers,
    auth=auth,
    json={"data": git_payload},
    timeout=30,
)

# --- Logging ---
print("\n🔍 Git Metadata POST")
print("➡️ URL:", git_url)
print("📦 Payload:")
print(json.dumps(git_payload, indent=2))
print("📤 Status Code:", response.status_code)
print("📝 Response Text:", response.text)

# --- Build payload dynamically ---
# Row for the dataset_metadata table. "detailed_type", "classes" and
# "output_type" are fixed literals here; the rest comes from the run summary.
dataset_payload = {
    "dataset_id": dataset_id,
    "table_name": dataset_name,
    "detailed_type": "CSV",
    "classes": 3,
    "features": int(meta["params"]["final_num_features"]),
    "output_type": "categorical",
    "version": dataset_version
}

# --- Build URL ---
dataset_url = f"{BASE}/{TABLES['dataset_metadata']}/data"

# --- Send POST request ---
# The timeout keeps the notebook from hanging forever if the API does not answer.
response = requests.post(
    dataset_url,
    headers=headers,
    auth=auth,
    json={"data": dataset_payload},
    timeout=30,
)

# --- Log request/response ---
print("\n🔍 Dataset Metadata POST")
print("➡️ URL:", dataset_url)
print("📦 Payload:")
print(json.dumps(dataset_payload, indent=2))
print("📤 Status Code:", response.status_code)
print("📝 Response Text:", response.text)


# -- POST request for model metadata --
# Row for the model_metadata table. The split fractions are the train/test
# sample counts divided by the input row count recorded in the run summary.
model_payload = {
    "model_id": model_id,
    "name": model_name,
    "algo": estimator,
    "features": feature_list,
    "label_snap": label_snap,
    "train_split": float(meta["params"]["n_train_samples"]) / int(meta["params"]["input_row_count"]),
    "test_split": float(meta["params"]["n_test_samples"]) / int(meta["params"]["input_row_count"]),
    "target_var": target_var,
    "label_map": label_map,
    "feature_select": feature_select,
    "imbalance_ratio": imbalance_ratio
}

model_url = f"{BASE}/{TABLES['model_metadata']}/data"
# The timeout keeps the notebook from hanging forever if the API does not answer.
response = requests.post(
    model_url,
    headers=headers,
    auth=auth,
    json={"data": model_payload},
    timeout=30,
)

# --- Logging ---
print("\n🔍 MODEL METADATA POST")
print("➡️ URL:", model_url)
print("📦 Payload:")
print(json.dumps(model_payload, indent=2))
print("📤 Status Code:", response.status_code)
print("📝 Response Text:", response.text)




🔍 Git Metadata POST
➡️ URL: http://localhost/api/database/ce4550bd-0fad-4a6b-894b-455a1decae5d/table/ad546581-467e-4570-94eb-45eeb4f3019b/data
📦 Payload:
{
  "commit_hash": "edb72cb2613a3001b07b1ae6e0d18d4a1023fd1e",
  "repo_url": "https://archive.ics.uci.edu/dataset/53",
  "branch": "main",
  "author": "Reema George",
  "timestamp": "2025-05-18T15:03:47+00:00",
  "version": "v99"
}
📤 Status Code: 201
📝 Response Text: 

🔍 Dataset Metadata POST
➡️ URL: http://localhost/api/database/ce4550bd-0fad-4a6b-894b-455a1decae5d/table/9119bded-19af-42b6-b27e-c9d229a9a7c2/data
📦 Payload:
{
  "dataset_id": "0c672781-25c3-438e-8fba-18c2c7f16886",
  "table_name": "iris_data_v3",
  "detailed_type": "CSV",
  "classes": 3,
  "features": 4,
  "output_type": "categorical",
  "version": "v3"
}
📤 Status Code: 201
📝 Response Text: 

🔍 MODEL METADATA POST
➡️ URL: http://localhost/api/database/ce4550bd-0fad-4a6b-894b-455a1decae5d/table/26294b1f-4f4d-4a7a-917e-141fa048fb39/data
📦 Payload:
{
  "model_id": "model

1. Standards-compliant export (JSON-LD + Turtle)
Starting from the plain run_summary.json, wrap it in a JSON-LD context that maps its fields to PROV-O terms, then use rdflib to emit Turtle:

In [None]:
import json
import pandas as pd

# Load the JSON file
json_path = "/mnt/data/REPO/notebooks/RQ_notebooks/MODEL_PROVENANCE/RandomForest_Iris_v20250425_125653/RandomForest_Iris_v20250425_125653_run_summary.json"
with open(json_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Extract justification tags: every tag whose key starts with "justification_"
justification_prefix = "justification_"
justifications = {
    k: v for k, v in data.get("tags", {}).items()
    if k.startswith(justification_prefix)
}

# Create a DataFrame. The prefix is sliced off (only the leading occurrence),
# and the columns are named explicitly so an empty result still has them.
justification_df = pd.DataFrame(
    [
        {"Decision": k[len(justification_prefix):], "Justification": v}
        for k, v in justifications.items()
    ],
    columns=["Decision", "Justification"],
)

# ace_tools is not importable in every environment; fall back to a plain-text
# table so the cell still shows the result.
try:
    import ace_tools as tools
except ImportError:
    print(justification_df.to_string(index=False))
else:
    tools.display_dataframe_to_user(name="Researcher Justifications", dataframe=justification_df)


In [None]:

def iso8601(ms):
    """Render an epoch timestamp given in milliseconds as an ISO 8601 string in UTC."""
    seconds = ms / 1000
    moment = datetime.fromtimestamp(seconds, tz=timezone.utc)
    return moment.isoformat()

# Minimal override context: the flat summary fields keep their own names,
# and only the actual PROV terms are mapped to their IRIs. It does not depend
# on the individual run, so it is built once for all summaries.
ctx = {
    # keep these flat
    "run_id":       { "@id": "run_id" },
    "run_name":     { "@id": "run_name" },
    "experiment_id":{ "@id": "experiment_id" },
    "params":       { "@id": "params" },
    "metrics":      { "@id": "metrics" },
    "artifacts":    { "@id": "artifacts" },
    "tags":         { "@id": "tags" },

    # provenance namespace
    "prov": "http://www.w3.org/ns/prov#",
    "xsd":  "http://www.w3.org/2001/XMLSchema#",

    # map the timestamp fields into PROV
    "start_time": { "@id": "prov:startedAtTime", "@type": "xsd:dateTime" },
    "end_time":   { "@id": "prov:endedAtTime",   "@type": "xsd:dateTime" },

    # PROV-used/generated
    "used":      { "@id": "prov:used",      "@type": "@id" },
    "generated": { "@id": "prov:generated", "@type": "@id" },

    # JSON-LD boilerplate
    "@id":   "@id",
    "@type": "@type"
}

# sorted() makes the processing order (and the names left bound after the
# loop) reproducible; glob itself returns matches in arbitrary order.
for json_path in sorted(glob.glob("MODEL_PROVENANCE/*/*_run_summary.json")):
    basename   = os.path.basename(json_path)
    model_name = basename.rsplit("_run_summary.json", 1)[0]
    # Write next to the summary file: the run directory is not guaranteed to
    # be named after the file stem, so deriving it from model_name can point
    # at a directory that does not exist.
    out_dir = os.path.dirname(json_path)

    with open(json_path, "r", encoding="utf-8") as f:
        summary = json.load(f)

    # Build the JSON-LD document, re-using the original keys verbatim
    doc = {
        "@context":      ctx,
        "run_id":        summary["run_id"],
        "run_name":      summary.get("run_name"),
        "experiment_id": summary.get("experiment_id"),
        "params":        summary.get("params", {}),
        "metrics":       summary.get("metrics", {}),
        "artifacts":     summary.get("artifacts", []),
        "tags":          summary.get("tags", {}),

        # PROV fields:
        "start_time": iso8601(summary["start_time"])
    }

    if summary.get("end_time") is not None:
        doc["end_time"] = iso8601(summary["end_time"])

    # for used/generated, just point at the dataset/model URIs
    # (or blank-node them if richer structure is preferred)
    doc["used"] = summary.get("tags", {}).get("dataset_uri") or []
    # Artifacts with neither a uri nor a path are skipped instead of
    # contributing a null reference.
    artifact_refs = (
        art.get("uri") or art.get("path")
        for art in summary.get("artifacts", [])
    )
    doc["generated"] = [ref for ref in artifact_refs if ref]

    # write JSON-LD
    out_jsonld = os.path.join(out_dir, f"{model_name}.jsonld")
    with open(out_jsonld, "w", encoding="utf-8") as f:
        json.dump(doc, f, indent=2)

    # parse & serialize to Turtle
    g = Graph().parse(data=json.dumps(doc), format="json-ld")
    out_ttl = os.path.join(out_dir, f"{model_name}.ttl")
    g.serialize(destination=out_ttl, format="turtle")

    print(f"Converted {basename} → {os.path.basename(out_jsonld)}, {os.path.basename(out_ttl)}")



This code programmatically finds the differences between the generated JSON file and the derived JSON-LD and .ttl files, making any discrepancy easier to spot.

In [None]:


def load_as_dict(path):
    """Load *path* into plain Python data.

    Paths ending in ``.ttl`` or ``.turtle`` are parsed as Turtle into a graph
    and returned as the decoded JSON-LD serialization of that graph; any other
    path is read as a UTF-8 JSON file.
    """
    is_turtle = path.endswith((".ttl", ".turtle"))
    if not is_turtle:
        with open(path, encoding="utf-8") as handle:
            return json.load(handle)

    graph = Graph()
    graph.parse(path, format="turtle")
    # normalize to a JSON-LD structure so both branches return decoded JSON
    serialized = graph.serialize(format="json-ld", indent=2)
    return json.loads(serialized)

def compare_json(a, b, path=""):
    """Recursively diff two JSON-like values.

    Args:
        a: Left-hand value (dict, list or scalar).
        b: Right-hand value.
        path: Location prefix for reported entries; dict keys are joined
            with ``/`` and list positions appended as ``[i]``.

    Returns:
        A list of dicts with keys ``path``, ``type`` (``"added"``,
        ``"removed"`` or ``"changed"``), ``a`` and ``b``. Dict entries are
        reported in a deterministic order: keys of ``a`` in their own order,
        followed by keys that exist only in ``b``.
    """
    diffs = []
    if isinstance(a, dict) and isinstance(b, dict):
        # Iterating a set union of the keys would make the result order vary
        # between runs; keep the dicts' own insertion order instead.
        ordered_keys = list(a) + [k for k in b if k not in a]
        for k in ordered_keys:
            new_path = f"{path}/{k}" if path else k
            if k not in a:
                diffs.append({"path": new_path, "type": "added",   "a": None, "b": b[k]})
            elif k not in b:
                diffs.append({"path": new_path, "type": "removed", "a": a[k], "b": None})
            else:
                diffs.extend(compare_json(a[k], b[k], new_path))
    elif isinstance(a, list) and isinstance(b, list):
        # Positions present in both lists are compared element by element.
        for i, (ia, ib) in enumerate(zip(a, b)):
            diffs.extend(compare_json(ia, ib, f"{path}[{i}]"))
        # Surplus tail of the longer list: at most one of these loops runs.
        for i in range(len(a), len(b)):
            diffs.append({"path": f"{path}[{i}]", "type": "added",   "a": None, "b": b[i]})
        for i in range(len(b), len(a)):
            diffs.append({"path": f"{path}[{i}]", "type": "removed", "a": a[i], "b": None})
    else:
        # Scalars, or values whose container types differ
        if a != b:
            diffs.append({"path": path, "type": "changed", "a": a, "b": b})
    return diffs


# Define base directory: all three files are expected in MODEL_PROVENANCE/<model_name>/
base_dir = os.path.join("MODEL_PROVENANCE", model_name)

# Build full paths for the files to compare
summary_json    = os.path.join(base_dir, f"{model_name}_run_summary.json")
turtle_file     = os.path.join(base_dir, f"{model_name}.ttl")
jsonld_file     = os.path.join(base_dir, f"{model_name}.jsonld")

# Load files (`a` and `c` are two separate loads of the same run-summary JSON)
a = load_as_dict(summary_json)
b = load_as_dict(turtle_file)
c = load_as_dict(summary_json)
d = load_as_dict(jsonld_file)

# Perform comparisons
# NOTE(review): despite its name, diffs_jsonld_vs_ttl compares the run-summary
# JSON (`a`) against the Turtle file (`b`), not the JSON-LD file — confirm
# whether compare_json(d, b) was intended.
diffs_jsonld_vs_ttl = compare_json(a, b)
diffs_json_vs_jsonld = compare_json(c, d)

# Build DataFrames for interactive inspection (one row per reported difference)
df1 = pd.DataFrame(diffs_jsonld_vs_ttl)
df2 = pd.DataFrame(diffs_json_vs_jsonld)

# --- Summaries & Filtering ---------------------------------------

def summarize_and_preview(df, preview_n=10):
    """Print a short report for a DataFrame of diff entries.

    Args:
        df: DataFrame with at least the columns ``type`` and ``path``
            (one row per difference). May be empty.
        preview_n: Number quoted in the "changed" preview heading.

    Prints the count of entries per ``type`` and the added/removed entries
    whose path contains exactly one ``/``. An empty frame (no differences)
    is reported as such instead of failing on the missing columns.
    """
    if df.empty:
        # A frame built from an empty diff list has no 'type'/'path' columns.
        print("No differences found.")
        return

    print("Change summary:")
    print(df['type'].value_counts().to_string(), "\n")

    print(f"First {preview_n} ‘changed’ entries:")
    # The preview table itself is currently disabled; uncomment to show it.
    # print(df[df['type']=="changed"].head(preview_n).to_string(index=False), "\n")

    # Entries whose path contains exactly one slash
    top = df[df['path'].str.count("/") == 1]
    print("Top-level adds/removes:")
    print(top[top['type'].isin(['added','removed'])].to_string(index=False))

# NOTE(review): df1 is built from compare_json(a, b), where `a` is the
# run-summary JSON and `b` the Turtle file, so this section reports JSON vs
# TTL despite the heading — confirm the intended label.
print("== JSON-LD vs TTL ==")
summarize_and_preview(df1)

# df2 holds the differences between the run-summary JSON and the JSON-LD file.
print("\n== JSON vs JSON-LD ==")
summarize_and_preview(df2)



In [None]:
# Paths present in the run-summary JSON but missing from the JSON-LD file
print("Removed in JSON-LD comparison:")
print(df2.loc[df2["type"] == "removed", ["path"]].to_string(index=False))

# Paths present in the JSON-LD file but missing from the run-summary JSON
print("\nAdded in JSON-LD comparison:")
print(df2.loc[df2["type"] == "added", ["path"]].to_string(index=False))

In [None]:
# Paths reported as "removed" in df1, i.e. present on the left-hand side of
# the Turtle comparison but missing from the Turtle round-trip
print("Removed in .ttl comparison:")
print(df1.loc[df1["type"] == "removed", ["path"]].to_string(index=False))

# Paths reported as "added" in df1, i.e. present only in the Turtle round-trip
print("\nAdded in .ttl comparison:")
print(df1.loc[df1["type"] == "added", ["path"]].to_string(index=False))

Checks completeness, mapping, and time taken. TODO: needs work.

In [None]:

# ── User configuration ─────────────────────────────────────────────────────────

# Which keys must appear in every run_summary.json?
REQUIRED_TOPLEVEL = {
    "run_id", "start_time", "end_time",
    "params", "metrics", "tags", "artifacts"
}

# A couple of sub-fields we also want to spot-check:
REQUIRED_PARAMS  = {"random_state"}
REQUIRED_METRICS = {"accuracy"}

# Run summaries are looked up both directly under MODEL_PROVENANCE/ and one
# level down in per-run subdirectories (where the other cells of this notebook
# read and write them). sorted() makes the processing order reproducible.
JSON_SUMMARIES = sorted(
    glob.glob("MODEL_PROVENANCE/*_run_summary.json")
    + glob.glob("MODEL_PROVENANCE/*/*_run_summary.json")
)


# ── Helpers ────────────────────────────────────────────────────────────────────

def iso8601(ms):
    """Convert milliseconds since the epoch to an ISO 8601 string in UTC."""
    as_utc = datetime.fromtimestamp(ms / 1000, tz=timezone.utc)
    return as_utc.isoformat()


def load_json(path):
    """Read the UTF-8 JSON file at *path* and return the decoded object."""
    with open(path, mode="r", encoding="utf-8") as source:
        decoded = json.load(source)
    return decoded


def write_json(path, obj):
    """Serialize *obj* as JSON (2-space indent) into the UTF-8 file at *path*, overwriting it."""
    with open(path, mode="w", encoding="utf-8") as sink:
        json.dump(obj, fp=sink, indent=2)


def convert_to_jsonld_and_ttl(summary, basename):
    """Export one run summary as JSON-LD and Turtle files under MODEL_PROVENANCE/.

    Args:
        summary: Parsed run-summary dict. ``run_id``, ``start_time`` and
            ``tags`` are read directly; ``end_time``, ``params``, ``metrics``
            and ``artifacts`` are optional.
        basename: File stem used for both output files.

    Returns:
        Tuple ``(jsonld_path, ttl_path)`` of the two files written.

    Raises:
        KeyError: if ``run_id``, ``start_time`` or ``tags`` is missing.
    """
    # build @context: short aliases for the PROV terms used below
    ctx = {
        "prov":    "http://www.w3.org/ns/prov#",
        "xsd":     "http://www.w3.org/2001/XMLSchema#",
        "run":     "prov:Activity",
        "start":   "prov:startedAtTime",
        "end":     "prov:endedAtTime",
        "used":    "prov:used",
        "gen":     "prov:generated",
        "param":   "prov:hadParameter",
        "metric":  "prov:hadQuality",
        "entity":  "prov:Entity",
        "label":   "prov:label",
        "value":   "prov:value",
        "version": "prov:hadRevision",
        "id":      "@id",
        "type":    "@type"
    }

    jsonld = {
        "@context": ctx,
        "@id":      f"urn:run:{summary['run_id']}",
        "@type":    "run",
        "start": {
            "@type":  "xsd:dateTime",
            "@value": iso8601(summary["start_time"])
        }
    }
    if summary.get("end_time") is not None:
        jsonld["end"] = {
            "@type":  "xsd:dateTime",
            "@value": iso8601(summary["end_time"])
        }

    # params: one entity per parameter, value stored as a string
    jsonld["param"] = [
        {"@type": "entity", "label": k, "value": str(v)}
        for k, v in summary.get("params", {}).items()
    ]
    # metrics: one entity per metric, value typed as xsd:decimal
    jsonld["metric"] = [
        {"@type": "entity", "label": k,
         "value": {"@type": "xsd:decimal", "@value": v}}
        for k, v in summary.get("metrics", {}).items()
    ]
    # artifacts
    generated = []
    for art in summary.get("artifacts", []):
        content = art.get("content")
        # Fallback location: the first 30 characters of textual content.
        preview = content[:30] + "…" if isinstance(content, str) else ""
        generated.append({
            "@type": "entity",
            "label": art.get("path") or art.get("label"),
            # The uri wins whenever present. The previous single expression
            # `uri or preview if isinstance(...) else ""` parsed as
            # `(uri or preview) if isinstance(...) else ""`, which dropped the
            # uri for artifacts whose content is not a string.
            "prov:location": art.get("uri") or preview,
        })
    jsonld["gen"] = generated

    # dataset used
    jsonld["used"] = {
        "@type": "entity",
        "label": summary["tags"].get("dataset_name"),
        "version": summary["tags"].get("dataset_version")
    }

    # write JSON-LD
    out_jsonld = f"MODEL_PROVENANCE/{basename}.jsonld"
    write_json(out_jsonld, jsonld)

    # serialize TTL
    g = Graph().parse(data=json.dumps(jsonld), format="json-ld")
    out_ttl = f"MODEL_PROVENANCE/{basename}.ttl"
    g.serialize(destination=out_ttl, format="turtle")

    return out_jsonld, out_ttl


def normalize_jsonld(js):
    """Return a canonically ordered copy of a JSON-like structure.

    Dict keys are emitted in sorted order and list elements are ordered by
    their ``json.dumps(..., sort_keys=True)`` text, recursively, so that two
    documents differing only in ordering normalize to equal values.  Any
    value that is neither a dict nor a list is returned unchanged.
    """
    if isinstance(js, list):
        normalized_items = [normalize_jsonld(item) for item in js]
        # Order elements by their canonical JSON text (stable sort).
        normalized_items.sort(key=lambda item: json.dumps(item, sort_keys=True))
        return normalized_items

    if isinstance(js, dict):
        ordered = {}
        for key in sorted(js):
            ordered[key] = normalize_jsonld(js[key])
        return ordered

    return js


def diff_roundtrip(orig_json, jsonld_path, ttl_path):
    """Compare a run summary with its JSON-LD and Turtle round-trip forms.

    Args:
        orig_json: Path of the original run-summary JSON file.
        jsonld_path: Path of the JSON-LD file produced from that summary.
        ttl_path: Path of the Turtle file produced from the JSON-LD.

    Returns:
        A dict with two entries, each the result of ``compare_json``:
        ``"orig_vs_jsonld"`` (original summary vs. JSON-LD, compared as
        loaded) and ``"jsonld_vs_ttl_ld"`` (JSON-LD vs. the Turtle file
        re-serialized as JSON-LD, both passed through ``normalize_jsonld``).
    """
    source_doc = load_json(orig_json)
    jsonld_doc = load_json(jsonld_path)

    # Read the Turtle file back and express the graph as JSON-LD again.
    turtle_graph = Graph().parse(ttl_path, format="turtle")
    roundtrip_doc = json.loads(turtle_graph.serialize(format="json-ld"))

    # Order-insensitive forms for the JSON-LD <-> Turtle comparison.
    jsonld_canonical = normalize_jsonld(jsonld_doc)
    roundtrip_canonical = normalize_jsonld(roundtrip_doc)

    report = {}
    report["orig_vs_jsonld"] = compare_json(source_doc, jsonld_doc)
    report["jsonld_vs_ttl_ld"] = compare_json(jsonld_canonical, roundtrip_canonical)
    return report


# ── Main flow ─────────────────────────────────────────────────────────────────

def main():
    """Validate, convert and score every run summary in ``JSON_SUMMARIES``.

    For each summary file:
      1. check that the required top-level fields, params and metrics exist
         (a failing file is recorded and skipped),
      2. convert it with ``convert_to_jsonld_and_ttl``,
      3. diff the round trip with ``diff_roundtrip`` and print the counts.

    Afterwards it prints the completeness percentage, the share of converted
    runs with zero diffs, the mean of those two percentages, and finally
    times a stubbed ``train_and_log`` with and without the MLflow flag.
    """
    total = len(JSON_SUMMARIES)
    passed = 0
    failures = []   # (path, reason) for every summary that failed a check
    cases = {}      # diff results keyed by run base name

    for js_path in JSON_SUMMARIES:
        summary = load_json(js_path)
        base = os.path.basename(js_path).split("_run_summary.json")[0]

        # 1) completeness check — the first failing rule decides the reason
        reason = None
        if not REQUIRED_TOPLEVEL.issubset(summary):
            missing = REQUIRED_TOPLEVEL - set(summary)
            reason = f"missing fields {missing}"
        elif not (REQUIRED_PARAMS <= summary["params"].keys()):
            reason = f"params missing {REQUIRED_PARAMS - summary['params'].keys()}"
        elif not (REQUIRED_METRICS <= summary["metrics"].keys()):
            reason = f"metrics missing {REQUIRED_METRICS - summary['metrics'].keys()}"

        if reason is not None:
            failures.append((js_path, reason))
            continue

        passed += 1

        # 2) convert
        jsonld_path, ttl_path = convert_to_jsonld_and_ttl(summary, base)

        # 3) diff
        result = diff_roundtrip(js_path, jsonld_path, ttl_path)
        cases[base] = result
        print(f"\n── {base} diffs ──")
        print("  • JSON → JSON-LD:", len(result["orig_vs_jsonld"]), "differences")
        print("  • JSON-LD → TTL → JSON-LD:", len(result["jsonld_vs_ttl_ld"]), "differences")

    # 4) completeness summary
    if total:
        completeness_pct = 100 * passed / total
    else:
        completeness_pct = 0
    print(f"\n{passed}/{total} runs passed completeness checks ({completeness_pct:.1f}%).")
    if failures:
        print("\nFailures:")
        for path, why in failures:
            print(f" • {path}: {why}")

    # 5) integrity check — a run is clean when both diff lists are empty
    total_runs = len(cases)
    zero_diff_runs = 0
    for result in cases.values():
        if not (result["orig_vs_jsonld"] or result["jsonld_vs_ttl_ld"]):
            zero_diff_runs += 1
    if total_runs:
        integrity_pct = 100 * zero_diff_runs / total_runs
    else:
        integrity_pct = 0
    print(f"\nMapping integrity: {zero_diff_runs}/{total_runs} runs have zero diffs — {integrity_pct:.1f}%")

    # 6) overall quality score: plain mean of the two percentages
    overall_score = (completeness_pct + integrity_pct) / 2
    print(f"Overall quality score: {overall_score:.1f}%")

    # 7) benchmark the (stubbed) training function
    print("\nBenchmarking train_and_log() overhead:")

    def train_and_log(use_mlflow=False):
        # Stub: sleeps instead of running real instrumentation + fit logic.
        time.sleep(0.5 + (0.1 if use_mlflow else 0))
        return

    for flag, label in ((False, "No MLflow"), (True, "With MLflow")):
        started = time.time()
        train_and_log(use_mlflow=flag)
        elapsed = time.time() - started
        print(f"  • {label:10s}: {elapsed:.3f}s")


# Run the validation / conversion pipeline when this cell is executed as the
# main module.
if __name__ == "__main__":
    main()

RQ2  implementation

In [None]:

# Gather every per-run summary file (one sub-folder per run).
files = glob.glob("MODEL_PROVENANCE/*/*_run_summary.json")

# One flat record per run: the run id plus prefixed params, metrics and tags.
rows = []
for f in files:
    with open(f) as fh:
        summary = json.load(fh)
    row = {
        "run_id": summary["run_id"],
        **{f"param_{k}": v for k, v in summary.get("params", {}).items()},
        **{f"metric_{k}": v for k, v in summary.get("metrics", {}).items()},
        **{f"tag_{k}": v for k, v in summary.get("tags", {}).items()},
    }
    rows.append(row)

# Tabular view of all runs; missing fields become NaN columns per run.
df = pd.DataFrame(rows)

# Show which columns are available (notebook cell output).
df.columns


1) Tracing preprocessing steps:
Here are the top 4 Iris‐focused preprocessing‐tracing use cases I’d tackle first:

Reconstruct a run’s exact preprocessing
Fetch a run’s run_id, columns_raw, dropped_columns, feature_names and test_size so you can replay the exact data pull & split.

Feature‐drop impact analysis
Identify runs where one or more measurements (e.g. petalwidthcm) were dropped and compare their test accuracies.

Best feature subset discovery
Group runs by which features they used (sepals only vs petals only vs both) and rank them by test F1 or accuracy.

Common steps in high-accuracy runs
Filter for runs with accuracy_score_X_test ≥ 0.95 and list the shared preprocessing settings (dropped columns, test_size, etc.).

In [None]:


# # Helper to get the “official” feature_names from your summary DF
# def _get_all_features(df):
#     # assumes every row has the same param_feature_names
#     raw = df.loc[0, 'param_feature_names']
#     return ast.literal_eval(raw)

# # Train & eval RF on just these columns of Iris
# def evaluate_subset(features, test_size=0.2, random_state=42, n_estimators=200):
#     iris = load_iris()
#     X = pd.DataFrame(iris.data, columns=iris.feature_names)
#     # map sklearn’s names to your param names, e.g. "sepal length (cm)" → "sepallengthcm"
#     canon = _get_all_features(df)
#     mapping = dict(zip(iris.feature_names, canon))
#     X = X.rename(columns=mapping)
#     X_sub = X[features]
#     y = iris.target
#     Xtr, Xte, ytr, yte = train_test_split(X_sub, y, test_size=test_size, random_state=random_state)
#     m = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
#     m.fit(Xtr, ytr)
#     return accuracy_score(yte, m.predict(Xte))
# def trace_preprocessing(df, run_id=None):
#     cols = ['run_id',
#             'param_dataset.title',
#             'param_columns_raw',
#             'param_dropped_columns',
#             'param_feature_names',
#             'param_dataset.authors', 'param_dataset.doi', 'param_dataset.published',
#             'param_test_size',
#             'param_criterion',
#             'param_max_depth','param_max_leaf_nodes', 'param_max_samples',
#            'metric_accuracy','metric_f1_macro','metric_roc_auc']
#     if run_id is None:
#         subset = df.loc[:, cols]
#     else:
#         subset = df.loc[df['run_id'] == run_id, cols]
#     return subset.to_dict(orient='records')


# def drop_impact(df, feature, **_):
#     all_feats = _get_all_features(df)
#     baseline = evaluate_subset(all_feats)
#     without = [f for f in all_feats if f!=feature]
#     dropped = evaluate_subset(without)
#     return {
#       'dropped_feature': feature,
#       'baseline_acc': baseline,
#       'dropped_acc': dropped,
#       'impact': baseline - dropped
#     }

# def drop_impact_all(df: pd.DataFrame) -> List[Dict[str, Any]]:
#     """
#     Compute drop-impact for every feature in the dataset.
#     Returns list of dicts with dropped_feature, baseline_acc, dropped_acc, impact.
#     """
#     feats = _get_all_features(df)
#     baseline = evaluate_subset(feats)
#     summary = []
#     for feat in feats:
#         without = [f for f in feats if f != feat]
#         acc = evaluate_subset(without)
#         summary.append({
#             'dropped_feature': feat,
#             'baseline_acc': baseline,
#             'dropped_acc': acc,
#             'impact': round(baseline - acc, 4)
#         })
#     return summary

# def best_feature_subset(df, features, **_):
#     acc = evaluate_subset(features)
#     return {'features': features, 'accuracy': acc}

# def common_high_accuracy(df: pd.DataFrame, threshold: float = 0.95) -> List[Dict[str, Any]]:
#     """
#     Filter runs with test accuracy >= threshold and list unique shared preprocessing settings.
#     """
#     high = df[df['metric_accuracy_score_X_test'] >= threshold]
#     cols = ['param_dropped_columns', 'param_test_size', 'param_feature_names']
#     return high[cols].drop_duplicates().to_dict(orient='records')


# # --------------------------------------------
# # Use Case Registry with parameter order for minimal input
# # --------------------------------------------
# USE_CASES = {
#     'trace_preprocessing': {
#         'func': trace_preprocessing,
#         'required_params': [],            # none strictly required
#         'optional_params': ['run_id'],    # run_id can be supplied or not
#     },
#     'drop_impact': {
#         'func': drop_impact,
#         'required_params': ['feature'],
#         'optional_params': [],
#     },
#      'drop_impact_all': {
#         'func': drop_impact_all,
#         'required_params': [],
#         'optional_params': [],
#     },
#     'best_feature_subset': {
#         'func': best_feature_subset,
#         'required_params': ['features'],
#         'optional_params': [],
#     },
#     'common_high_accuracy': {
#         'func': common_high_accuracy,
#         'required_params': ['threshold'],
#         'optional_params': [],
#     },
# }


# def call_use_case(df, use_case_name, **kwargs):
#     if use_case_name not in USE_CASES:
#         raise ValueError(f"Unknown use case: {use_case_name}")
#     case = USE_CASES[use_case_name]
#     func = case['func']
#     # check required
#     missing = [p for p in case['required_params'] if p not in kwargs]
#     if missing:
#         raise ValueError(f"{use_case_name} missing required params: {missing}")
#     # build args
#     args = {p: kwargs[p] for p in case['required_params']}
#     for p in case['optional_params']:
#         args[p] = kwargs.get(p)
#     return func(df, **args)

# # --------------------------------------------
# # Example Usage
# # --------------------------------------------
# if __name__ == '__main__':
#    # # 1) trace_preprocessing for all runs
#     print(call_use_case(df, 'trace_preprocessing'))
    
#     # 2) trace_preprocessing for a single run_id
#     print(call_use_case(df, 'trace_preprocessing', run_id='361daa12f99f4129a06cd20b78dd6fa7'))

#     # 5) common_high_accuracy
#     print(call_use_case(df, 'common_high_accuracy', threshold=0.99))

#     # 4) Best‐subset on just sepals:
#     print(call_use_case(df, 'best_feature_subset', features=['sepallengthcm','sepalwidthcm']))

#     # 3) Drop‐impact for “petallengthcm”:
#     print(call_use_case(df, 'drop_impact', feature='petallengthcm'))

#     print(call_use_case(df, 'drop_impact_all'))  # summary for all features


 • Detecting models trained with deprecated code versions
 • Mapping models to specific datasets used during training

In [None]:

def detect_deprecated_code(df: pd.DataFrame, deprecated_commits: List[str], **_) -> List[Dict[str, Any]]:
    """List the runs whose recorded Git commit is in ``deprecated_commits``.

    Args:
        df: Run table; must contain ``tag_git_current_commit_hash``.
        deprecated_commits: Commit hashes considered deprecated.
        **_: Ignored extra keyword arguments.

    Returns:
        One dict per matching run with ``run_id``, the commit hash and, when
        present in ``df``, ``tag_notebook_name`` and ``tag_mlflow.runName``.

    Raises:
        KeyError: if the commit-hash column is missing from ``df``.
    """
    commit_col = 'tag_git_current_commit_hash'
    available = df.columns
    if commit_col not in available:
        raise KeyError(f"Missing {commit_col} in DataFrame")

    # Context columns to report, restricted to those the table actually has.
    wanted = ('run_id', commit_col, 'tag_notebook_name', 'tag_mlflow.runName')
    present = [name for name in wanted if name in available]

    is_deprecated = df[commit_col].isin(deprecated_commits)
    return df.loc[is_deprecated, present].to_dict(orient='records')


def map_model_dataset(df: pd.DataFrame, **_) -> List[Dict[str, Any]]:
    """Pair every run with the dataset it was trained on.

    Each record carries ``run_id``, the model-name column (``tag_model_name``
    if the table has it, otherwise ``param_model_name``) and the dataset
    title, DOI, publication date and publisher.  Columns that are absent from
    ``df`` are silently left out of the records.
    """
    if 'tag_model_name' in df.columns:
        model_col = 'tag_model_name'
    else:
        model_col = 'param_model_name'

    wanted = (
        'run_id',
        model_col,
        'param_dataset.title',
        'param_dataset.doi',
        'param_dataset.published',
        'param_dataset.publisher',
    )
    # Keep only the columns this table actually provides.
    present = [name for name in wanted if name in df.columns]
    return df[present].to_dict(orient='records')

# --------------------------------------------
# Extend Use-Case Registry
# --------------------------------------------
# NOTE(review): USE_CASES and call_use_case must already be defined by an
# earlier cell — the definitions directly above this cell are commented out.
USE_CASES.update(
    detect_deprecated_code={
        'func': detect_deprecated_code,
        'required_params': ['deprecated_commits'],
        'optional_params': [],
    },
    map_model_dataset={
        'func': map_model_dataset,
        'required_params': [],
        'optional_params': [],
    },
)

# 1) Commits regarded as deprecated (input for detect_deprecated_code).
deprecated = [
    "a07434af4f547af2daab044d6873eb7081162293",
    "d329c92495e196ec0f39fbb19dfdd367131a77d9",
]
# print(call_use_case(df, "detect_deprecated_code", deprecated_commits=deprecated))

# 2) Show the model → dataset mapping for every run.
pprint(call_use_case(df, 'map_model_dataset'))


Goal: Notify collaborators who have forked the GitHub repo if their fork is outdated (i.e., behind the current commit used to train a model).

🧠 What We Need
Current training run’s Git commit hash

GitHub API to fetch all forks of your repo

Compare each fork’s main or master branch head commit

Create an issue on their fork or on your repo tagging them if they’re behind

Chosen approach: notify via issues on your own repo

In [None]:
def notify_outdated_forks():
    """Open one issue on the upstream repo tagging owners of out-of-sync forks.

    The GitHub token is read from the ``THESIS_TOKEN`` environment variable
    (``load_dotenv()`` is called first).  The newest commit of the upstream
    repository is compared with the newest commit of every fork; owners whose
    fork head differs are @-mentioned in a single issue created on the
    upstream repository.  When no fork differs, nothing is posted.

    NOTE: the comparison is a plain SHA inequality, so a fork that is ahead
    of (or has diverged from) upstream is reported as well.

    Returns:
        None.  Progress and the issue-creation response are printed.

    Raises:
        requests.HTTPError: if fetching the upstream commit or the fork list
            fails.
    """
    load_dotenv()
    token = os.getenv("THESIS_TOKEN")
    owner = "reema-dass26"
    repo = "REPO"
    timeout = 30  # seconds; requests has no default timeout

    if not token:
        # Name the variable that is actually read above.
        print("⚠️ THESIS_TOKEN not set.")
        return

    headers = {
        "Authorization": f"token {token}",
        "Accept":        "application/vnd.github.v3+json"
    }
    repo_url = f"https://api.github.com/repos/{owner}/{repo}"

    # 1) Get latest upstream commit
    main_commits = requests.get(
        f"{repo_url}/commits",
        headers=headers,
        params={"per_page": 1},
        timeout=timeout,
    )
    main_commits.raise_for_status()
    new_commit_hash = main_commits.json()[0]["sha"]
    print(f"Latest upstream commit: {new_commit_hash}")

    # 2) List forks — the endpoint is paginated, so walk every page
    forks = []
    page = 1
    while True:
        forks_resp = requests.get(
            f"{repo_url}/forks",
            headers=headers,
            params={"per_page": 100, "page": page},
            timeout=timeout,
        )
        forks_resp.raise_for_status()
        batch = forks_resp.json()
        if not batch:
            break
        forks.extend(batch)
        page += 1

    # 3) Compare each fork
    outdated = []
    for fork in forks:
        fork_owner = fork["owner"]["login"]
        fork_comm = requests.get(
            fork["url"] + "/commits",
            headers=headers,
            params={"per_page": 1},
            timeout=timeout,
        )
        if fork_comm.status_code != 200:
            print(f"  – could not fetch commits for {fork_owner}, skipping.")
            continue

        fork_commits = fork_comm.json()
        if not fork_commits:
            # No commits to compare against; treat like an unreadable fork.
            print(f"  – could not fetch commits for {fork_owner}, skipping.")
            continue

        if fork_commits[0]["sha"] != new_commit_hash:
            outdated.append(f"@{fork_owner}")

    # 4) Open an issue only if any fork differs
    if not outdated:
        print("✅ No outdated forks found; no issue created.")
        return

    title = "🔔 Notification: Your fork is behind the latest commit"
    body = (
        f"Hi {' '.join(outdated)},\n\n"
        f"The main repository has been updated to commit `{new_commit_hash}`.\n"
        "Please consider pulling the latest changes to stay in sync.\n\n"
        "Thanks!"
    )
    issues_url = f"{repo_url}/issues"
    resp = requests.post(
        issues_url,
        headers=headers,
        json={"title": title, "body": body},
        timeout=timeout,
    )

    # DEBUGGING OUTPUT — kept inside the "issue was posted" path so that
    # `resp` / `issues_url` are always bound when they are printed.
    print(f"→ POST {issues_url}")
    print("→ Status code:", resp.status_code)
    print("→ Response headers:", resp.headers)
    try:
        data = resp.json()
        print("→ Response JSON:", data)
        print("→ html_url field:", data.get("html_url"))
    except ValueError:
        print("→ No JSON response body; raw text:", resp.text)

if __name__ == "__main__":
    # Posting an issue is visible to collaborators, so require an explicit
    # "y"/"yes" (case-insensitive); anything else — including Enter — is "no".
    answer = input("Do you want to notify collaborators whose forks are behind? (y/N): ").strip().lower()
    if answer not in ("y", "yes"):
        print("No action taken.")
    else:
        notify_outdated_forks()


INVENIO INTEGRATION: upload the necessary files and publish

In [None]:
############################################################################################
# TEST CODE FOR INVENIO INTEGRATION
#############################################################################################




# API_BASE = "https://127.0.0.1:5000"
# TOKEN    = "8LnqJuz3TsBHffnDJ3isPLHYHtRbWrC0M667Nb5haEbnXpWqGbFRyfDApymr"

# # 1) Test read‐scope by listing records (no size param or size=1)
# resp = requests.get(
#     f"{API_BASE}/api/records",
#     headers={"Authorization": f"Bearer {TOKEN}"},
#     verify=False
# )
# print(resp.status_code)
# # should be 200 and a JSON page of records

# # or explicitly:
# resp = requests.get(
#     f"{API_BASE}/api/records?size=1",
#     headers={"Authorization": f"Bearer {TOKEN}"},
#     verify=False
# )
# print(resp.status_code, resp.json())
# #################################################################################################
# API_BASE = "https://127.0.0.1:5000"
# TOKEN    = "8LnqJuz3TsBHffnDJ3isPLHYHtRbWrC0M667Nb5haEbnXpWqGbFRyfDApymr"

# resp = requests.options(
#     f"{API_BASE}/api/records",
#     headers={"Authorization": f"Bearer {TOKEN}"},
#     verify=False
# )
# print("Allowed methods:", resp.headers.get("Allow"))

In [None]:

# -----------------------------------------------------------------------------
# Configuration
# -----------------------------------------------------------------------------
# The endpoint and token can be overridden through the INVENIO_API_BASE and
# INVENIO_TOKEN environment variables so credentials need not live in the
# notebook.
# NOTE(security): the literal fallback token below is kept only for backward
# compatibility; it is exposed in source and should be rotated and removed.
API_BASE   = os.getenv("INVENIO_API_BASE", "https://127.0.0.1:5000")
TOKEN      = os.getenv("INVENIO_TOKEN", "8LnqJuz3TsBHffnDJ3isPLHYHtRbWrC0M667Nb5haEbnXpWqGbFRyfDApymr")
VERIFY_SSL = False  # only for self‐signed dev

# Headers for JSON API calls (draft creation, file registration, publish).
HEADERS_JSON = {
    "Accept":        "application/json",
    "Content-Type":  "application/json",
    "Authorization": f"Bearer {TOKEN}",
}

# Headers for raw file-content uploads.
HEADERS_OCTET = {
    "Content-Type":  "application/octet-stream",
    "Authorization": f"Bearer {TOKEN}",
}

# The folders you want to walk & upload:
TO_UPLOAD = ["Trained_models", "plots", "MODEL_PROVENANCE"]


# -----------------------------------------------------------------------------
# 1) Create draft with ALL required metadata
# -----------------------------------------------------------------------------
def create_draft():
    """Create a new draft record and return ``(draft_id, links)``.

    POSTs a fixed metadata payload (title, one personal creator, publication
    date, resource type and access settings) to ``{API_BASE}/api/records``.

    Returns:
        Tuple of the draft's ``"id"`` and its ``"links"`` mapping, both taken
        from the JSON response.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    creator = {
        "person_or_org": {
            "type":        "personal",
            "given_name":  "Reema",
            "family_name": "Dass",
        }
    }
    # NOTE(review): "access" is sent nested inside "metadata" — confirm this
    # is where the target API expects it.
    record_metadata = {
        "title":            "RandomForest Iris Model Artifacts",
        "creators":         [creator],
        "publication_date": "2025-04-24",
        "resource_type":    {"id": "software"},
        "access": {
            "record": "public",
            "files":  "public",
        },
    }

    response = requests.post(
        f"{API_BASE}/api/records",
        headers=HEADERS_JSON,
        json={"metadata": record_metadata},
        verify=VERIFY_SSL,
    )
    response.raise_for_status()

    draft = response.json()
    draft_id = draft["id"]
    print("✅ Draft created:", draft_id)
    return draft_id, draft["links"]


# -----------------------------------------------------------------------------
# 2) Register, upload and commit a single file
# -----------------------------------------------------------------------------
def upload_and_commit(links, key, path):
    """Attach one local file to a draft: register it, upload it, commit it.

    Args:
        links: The draft's links mapping; ``links["files"]`` is the file
            registration endpoint.
        key: File key (name) under which the file is stored in the draft.
        path: Local path of the file whose bytes are uploaded.

    Raises:
        requests.HTTPError: if any of the three requests fails.
        StopIteration: if the registration response lists no entry for
            ``key``.
    """
    # Step a: announce the file key to the draft.
    register_resp = requests.post(
        links["files"],
        headers=HEADERS_JSON,
        json=[{"key": key}],
        verify=VERIFY_SSL,
    )
    register_resp.raise_for_status()
    matching_entries = (
        item for item in register_resp.json()["entries"] if item["key"] == key
    )
    file_links = next(matching_entries)["links"]

    # Step b: stream the file content to the entry's content URL.
    with open(path, "rb") as stream:
        content_resp = requests.put(
            file_links["content"],
            headers=HEADERS_OCTET,
            data=stream,
            verify=VERIFY_SSL,
        )
    content_resp.raise_for_status()

    # Step c: finalize the upload.
    commit_resp = requests.post(
        file_links["commit"],
        headers=HEADERS_JSON,
        verify=VERIFY_SSL,
    )
    commit_resp.raise_for_status()
    print(f"  • Uploaded {key}")


# -----------------------------------------------------------------------------
# 3) Walk each folder and upload every file
# -----------------------------------------------------------------------------
def upload_folder(links):
    """Upload every file found under the folders listed in ``TO_UPLOAD``.

    Folders that do not exist are reported and skipped.  Each file is passed
    to ``upload_and_commit`` with a forward-slash key computed relative to
    the folder's parent directory, or relative to the folder itself when the
    folder name has no parent component.

    NOTE(review): for single-component folder names (as in ``TO_UPLOAD``) the
    key therefore omits the folder name, so equally named files from
    different folders map to the same key — confirm this is intended.
    """
    for folder in TO_UPLOAD:
        if os.path.isdir(folder):
            parent = os.path.dirname(folder)
            anchor = parent if parent else folder
            for current_dir, _subdirs, filenames in os.walk(folder):
                for filename in filenames:
                    local_path = os.path.join(current_dir, filename)
                    # POSIX-style key, independent of the local separator.
                    relative = os.path.relpath(local_path, start=anchor)
                    key = relative.replace(os.sep, "/")
                    upload_and_commit(links, key, local_path)
        else:
            print(f"⚠️ Skipping missing folder {folder}")


# -----------------------------------------------------------------------------
# 4) Publish the draft
# -----------------------------------------------------------------------------
def publish(links):
    """Publish the draft via ``links["publish"]`` and print the record id.

    On a non-OK response the status code, raw body and (if the body parses
    as JSON) the decoded body are printed before the error is raised.

    Args:
        links: The draft's links mapping as returned by ``create_draft``.

    Raises:
        requests.HTTPError: if the publish request returns an error status.
    """
    r = requests.post(links["publish"],
                      headers=HEADERS_JSON,
                      verify=VERIFY_SSL)
    if not r.ok:
        print("❌ Publish failed:", r.status_code, r.text)
        try:
            print(r.json())
        except ValueError:
            # Body is not JSON; the raw text was already printed above.
            # (Narrowed from a bare ``except`` so that KeyboardInterrupt and
            # unrelated errors are no longer swallowed.)
            pass
        r.raise_for_status()
    print("✅ Published:", r.json()["id"])


# -----------------------------------------------------------------------------
# Main
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    # End-to-end upload: create the draft, attach every file from the
    # configured folders, then publish.  `recid` and `links` are bound at
    # module level; `recid` is read again by the metadata-fetch cell below.
    recid, links = create_draft()
    upload_folder(links)
    publish(links)

  


########################################################################
# FETCH metadata from INVENIO
########################################################################

In [None]:
def fetch_metadata(record_id, model_name, api_base, headers, verify_ssl=True):
    """
    Fetch a record's Invenio metadata and save it under 'Invenio_metadata'.

    Args:
        record_id: Id of the record to fetch.
        model_name: Used to name the output file ``{model_name}_invenio.json``.
        api_base: Base URL of the Invenio instance (no trailing ``/api``).
        headers: HTTP headers (including authorization) for the request.
        verify_ssl: Passed to ``requests`` as ``verify``; set to ``False``
            only for self-signed development servers.

    Returns:
        Path of the JSON file that was written.

    Raises:
        requests.HTTPError: if the record request returns an error status.
    """
    # Use the arguments, not the module-level API_BASE / VERIFY_SSL, so the
    # function honours what the caller passes in.
    response = requests.get(f"{api_base}/api/records/{record_id}",
                            headers=headers,
                            verify=verify_ssl)
    response.raise_for_status()
    metadata = response.json()

    print("✅ Metadata fetched successfully")

    # Create the folder if it doesn't exist
    os.makedirs("Invenio_metadata", exist_ok=True)

    # Construct path and save
    file_path = os.path.join("Invenio_metadata", f"{model_name}_invenio.json")
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=4)

    print(f"✅ Metadata saved at {file_path}")
    return file_path


# verify_ssl is passed explicitly so the request keeps using the configured
# VERIFY_SSL setting now that the function honours its parameter.
path = fetch_metadata(recid, model_name, api_base=API_BASE, headers=HEADERS_JSON,
                      verify_ssl=VERIFY_SSL)
print(path)

METADATA EXTRACTION FROM INVENIO and ADDING it to the main Provenance FILE

In [None]:
# ----------------------------
# Function: Extract metadata
# ----------------------------
def extract_metadata(metadata):
    """Condense a full Invenio record into the fields stored with a run.

    Every field is read defensively: a missing key yields an empty default
    instead of raising, including a creator without ``person_or_org`` and a
    file entry without ``links``.

    Args:
        metadata: Record JSON as a dict (as saved by ``fetch_metadata``).

    Returns:
        ``{"invenio_metadata": {...}}`` with id, title, comma-joined creator
        names, publication date, a list of file details (key, url, size,
        mimetype, checksum, metadata), pids, version info, status and the
        view / download counters of this version.
    """
    print("✅ Metadata loaded successfully")
    print("ℹ️ ID:", metadata.get("id", "N/A"))
    print("🔍 Extracting required fields...")

    record_meta = metadata.get("metadata", {})
    version_stats = metadata.get("stats", {}).get("this_version", {})

    # A creator lacking "person_or_org" contributes an empty name rather than
    # aborting the whole extraction with a KeyError.
    creator_names = [
        creator.get("person_or_org", {}).get("name", "")
        for creator in record_meta.get("creators", [])
    ]

    # File entries are keyed by file name; "links" may be absent.
    files = [
        {
            "key": key,
            "url": file_info.get("links", {}).get("content", ""),
            "size": file_info.get("size", 0),
            "mimetype": file_info.get("mimetype", ""),
            "checksum": file_info.get("checksum", ""),
            "metadata": file_info.get("metadata", {}),
        }
        for key, file_info in metadata.get("files", {}).get("entries", {}).items()
    ]

    return {
        "invenio_metadata": {
            "id": metadata.get("id", ""),
            "title": record_meta.get("title", ""),
            "creator": ", ".join(creator_names),
            "publication_date": record_meta.get("publication_date", ""),
            "files": files,
            "pids": metadata.get("pids", {}),
            "version_info": metadata.get("versions", {}),
            "status": metadata.get("status", ""),
            "views": version_stats.get("views", 0),
            "downloads": version_stats.get("downloads", 0),
        }
    }


# Embed the extracted Invenio metadata into the run-summary JSON of one model.
# NOTE(review): `model_name` is not defined in this cell — it must be set by
# an earlier cell.
# Input: the record JSON stored under Invenio_metadata/ for this model.
invenio_path = f"Invenio_metadata/{model_name}_invenio.json"
# Output: the run summary that is updated in place.
run_summary_path = f"MODEL_PROVENANCE/{model_name}/{model_name}_run_summary.json"

# ----------------------------
# Step 1: Load Invenio metadata
# ----------------------------
with open(invenio_path, "r") as f:
    original_metadata = json.load(f)

# ----------------------------
# Step 2: Extract metadata
# ----------------------------
extracted_metadata = extract_metadata(original_metadata)
print("📤 Extracted Metadata Preview:")
# Only the first 1000 characters of the pretty-printed JSON are shown.
print(json.dumps(extracted_metadata, indent=4)[:1000])  # Preview

# ----------------------------
# Step 3: Load run summary
# ----------------------------
with open(run_summary_path, "r") as f:
    existing_metadata = json.load(f)

# ----------------------------
# Step 4: Merge metadata
# ----------------------------
# dict.update replaces any existing top-level "invenio_metadata" entry.
existing_metadata.update(extracted_metadata)

# ----------------------------
# Step 5: Save updated summary
# ----------------------------
# The run-summary file is overwritten with the merged content.
with open(run_summary_path, "w") as f:
    json.dump(existing_metadata, f, indent=4)

print(f"✅ Invenio metadata embedded successfully into: {run_summary_path}")

In [None]:
import subprocess

def get_git_version_info():
    """Return ``(commit, tag)`` describing the current Git checkout.

    ``commit`` is the hash of ``HEAD`` and ``tag`` is the tag that points
    exactly at ``HEAD``.  Each value is resolved independently, so a failure
    of one lookup no longer leaves the other unbound:

    * ``commit`` falls back to ``"unknown"`` when ``git rev-parse`` fails
      (for example outside a repository) or git is not installed;
    * ``tag`` falls back to ``"untagged"`` when no tag matches ``HEAD``.

    Returns:
        tuple[str, str]: ``(commit, tag)``.
    """
    def _git(*args):
        # Run a git sub-command and return its trimmed stdout.
        return subprocess.check_output(
            ["git", *args], stderr=subprocess.DEVNULL
        ).decode().strip()

    # OSError covers a missing git executable (FileNotFoundError).
    try:
        commit = _git("rev-parse", "HEAD")
    except (subprocess.CalledProcessError, OSError):
        commit = "unknown"

    try:
        tag = _git("describe", "--tags", "--exact-match")
    except (subprocess.CalledProcessError, OSError):
        tag = "untagged"

    return commit, tag

# Resolve the commit hash and tag of the current checkout once and echo them.
commit_hash, version_tag = get_git_version_info()
print("Commit:", commit_hash)
print("Version Tag:", version_tag)
