In [1]:
# RQ1.2: Model Provenance Tracking in Jupyter Notebook using MLflow
# Updated with automatic logging of environment, Git, model config, and FAIR-aligned metadata

# ============================
# ⚙️ Install Dependencies (if needed in Colab)
# ============================
!pip install mlflow scikit-learn pandas numpy matplotlib seaborn shap requests

# ============================
# 📦 Imports
# ============================





In [11]:
import os
import time
import json
import psutil
import platform
import git
import mlflow
import requests
import shap
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from dotenv import load_dotenv
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, roc_auc_score, confusion_matrix,
    precision_score, recall_score, f1_score, roc_curve
)
from mlflow import MlflowClient

In [12]:
# ============================
# 📂 Setup MLflow
# ============================
project_dir = os.getcwd()
mlflow.set_tracking_uri("mlrunlogs/mlflow.db")
mlflow.set_experiment("RandomForest-Iris-CSV")

<Experiment: artifact_location='file:///C:/Users/reema/OneDrive/Dokumente/Provenance_newREPO/Provenence-Tracking-Thesis-Research/notebooks/RQ_notebooks/mlrunlogs/mlflow.db/511200521763183500', creation_time=1745323635705, experiment_id='511200521763183500', last_update_time=1745323635705, lifecycle_stage='active', name='RandomForest-Iris-CSV', tags={}>

In [13]:
# ============================
# 🔄 Git Commit Hash
# ============================
repo_dir = "C:/Users/reema/OneDrive/Dokumente/Provenance_newREPO/Provenence-Tracking-Thesis-Research"
repo = git.Repo(repo_dir)
commit_hash = repo.head.object.hexsha

In [14]:
# ============================
# 📥 Load Dataset
# ============================
df = pd.read_csv("../../data/Iris.csv")
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [15]:
# ============================
# 🧠 MLflow Autologging
# ============================
mlflow.autolog(log_input_examples=True, log_model_signatures=True)

2025/04/22 14:13:46 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.


In [16]:
GITHUB_TOKEN

'REMOVED_SECRET'

In [17]:
# ============================
# 🚀 Start MLflow Run
# ============================
with mlflow.start_run() as run:
    model_name = f"RandomForest_Iris_v1.0.0"
    training_time_start = time.time()

    # 📈 Model Training
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba, multi_class="ovr")

    # ✅ Log Environment Automatically
    mlflow.log_params({
        "python_version": platform.python_version(),
        "os_platform": f"{platform.system()} {platform.release()}",
        "sklearn_version": sklearn.__version__,
        "pandas_version": pd.__version__,
        "numpy_version": np.__version__,
        "matplotlib_version": matplotlib.__version__,
        "seaborn_version": sns.__version__,
        "shap_version": shap.__version__,
    })

    # ✅ Git and Notebook Metadata
    mlflow.set_tag("git_commit_hash", commit_hash)
    mlflow.set_tag("notebook_name", "RQ1.ipynb")

    # ✅ Dataset Metadata Tags
    mlflow.set_tag("dataset_name", "Iris")
    mlflow.set_tag("dataset_version", "1.0.0")
    mlflow.set_tag("dataset_id", "iris_local")

    # ✅ Confusion Matrix Plot
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    cm_path = "confusion_matrix.png"
    plt.savefig(cm_path)
    mlflow.log_artifact(cm_path)

    # ✅ SHAP Summary
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)
    shap.summary_plot(shap_values, X_test, show=False)
    shap_path = "shap_summary.png"
    plt.savefig(shap_path)
    mlflow.log_artifact(shap_path)

    # ✅ FAIR4ML-style Metadata JSON
    fair4ml_metadata = {
        "@type": "MLModel",
        "name": model_name,
        "algorithm": "RandomForestClassifier",
        "hyperParameters": model.get_params(),
        "trainingDataset": {
            "name": "Iris",
            "version": "1.0.0",
            "identifier": "iris_local"
        },
        "trainingMetrics": {
            "accuracy": acc,
            "roc_auc": auc,
            "precision": precision_score(y_test, y_pred, average='macro'),
            "recall": recall_score(y_test, y_pred, average='macro'),
            "f1_score": f1_score(y_test, y_pred, average='macro')
        },
        "environment": {
            "python": platform.python_version(),
            "os": f"{platform.system()} {platform.release()}",
            "libraries": {
                "sklearn": sklearn.__version__,
                "pandas": pd.__version__,
                "numpy": np.__version__
            }
        },
        "source": {
            "git_commit": commit_hash,
            "notebook": "RQ1.ipynb"
        }
    }
    # ✅ Commit changes to Git after successful run
    repo.git.add(A=True)
    repo.index.commit("Auto commit after successful training")
    repo.remotes.origin.push()
    
    # 🔁 Get new commit hash
    new_commit_hash = repo.head.object.hexsha
    
    # 🔁 Log updated Git commit
    mlflow.set_tag("git_commit_hash", new_commit_hash)
    mlflow.log_param("git_commit_hash", new_commit_hash)

    with open("model_metadata_fair4ml.json", "w") as f:
        json.dump(fair4ml_metadata, f, indent=2)
    mlflow.log_artifact("model_metadata_fair4ml.json")
   
     # 🔔 OPTIONAL: Notify outdated forks if flag is True
    notify_collaborators = True
    load_dotenv()
    GITHUB_TOKEN = os.getenv("THESIS_TOKEN")  # or "GITHUB_TOKEN" if you named it that

    if not GITHUB_TOKEN:
        print("⚠️ GITHUB_TOKEN not set.")    
    GITHUB_OWNER="reema-dass26"
    GITHUB_REPO="https://github.com/reema-dass26/Provenence-Tracking-Thesis-Research.git"

    if notify_collaborators and GITHUB_TOKEN:
        headers = {
            "Authorization": f"token {GITHUB_TOKEN}",
            "Accept": "application/vnd.github.v3+json"
        }
        forks_url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO}/forks"
        response = requests.get(forks_url, headers=headers)

        if response.status_code == 200:
            forks = response.json()
            outdated_users = []
            for fork in forks:
                fork_owner = fork['owner']['login']
                fork_commit_url = fork['url'] + "/commits"
                fork_response = requests.get(fork_commit_url, headers=headers)

                if fork_response.status_code == 200:
                    fork_commits = fork_response.json()
                    fork_latest = fork_commits[0]['sha'] if fork_commits else None
                    if fork_latest and fork_latest != new_commit_hash:
                        outdated_users.append(f"@{fork_owner}")

            if outdated_users:
                issue_title = "🔔 Notification: Your fork is behind the latest commit"
                issue_body = f"Hi {' '.join(outdated_users)},\n\nThe main repository has been updated to commit `{new_commit_hash}`.\nPlease consider pulling the latest changes to stay in sync.\n\nThanks!"
                issues_url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO}/issues"
                issue_data = {"title": issue_title, "body": issue_body}
                requests.post(issues_url, headers=headers, json=issue_data)

        mlflow.end_run()


SyntaxError: invalid syntax (683287625.py, line 111)

Goal: Notify collaborators who have forked the GitHub repo if their fork is outdated (i.e., behind the current commit used to train a model).

🧠 What We Need
Current training run’s Git commit hash

GitHub API to fetch all forks of your repo

Compare each fork’s main or master branch head commit

Create an issue on their fork or on your repo tagging them if they’re behind

Option 1 (Practical): Notify via issues on your own repo