In [None]:
!pip uninstall mlflow

In [1]:
import mlflow
from mlflow.models import infer_signature
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



Fetching data from DBREPO using API ENDPOINT

In [1]:
import requests

# API endpoint URL
url = "http://localhost/api/database/c3a42d17-42b7-43c9-a504-2363fb4c9c8d/table/991f4e40-4524-4dd5-a94b-aa0bf4862f6f/data?size=100000&page=0"

# Define the headers
headers = {
    "Accept": "application/json"  # Specify the expected response format
}

try:
    # Send a GET request to the API with the Accept header
    response = requests.get(url, headers=headers)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the JSON response
        dataset = response.json()
        print("API Response:", dataset)
        print( dataset.count)
    else:
        print(f"Error: Received status code {response.status_code}")
        print("Response content:", response.text)
       

except requests.exceptions.RequestException as e:
    print(f"Request failed: {e}")


API Response: [{'id': '1', 'sepallengthcm': '5.200000000000000000', 'sepalwidthcm': '3.500000000000000000', 'petallengthcm': '1.400000000000000000', 'petalwidthcm': '0.200000000000000000', 'species': 'Iris-setosa'}, {'id': '2', 'sepallengthcm': '4.900000000000000000', 'sepalwidthcm': '3.000000000000000000', 'petallengthcm': '1.400000000000000000', 'petalwidthcm': '0.200000000000000000', 'species': 'Iris-setosa'}, {'id': '3', 'sepallengthcm': '4.700000000000000000', 'sepalwidthcm': '3.200000000000000000', 'petallengthcm': '1.300000000000000000', 'petalwidthcm': '0.200000000000000000', 'species': 'Iris-setosa'}, {'id': '4', 'sepallengthcm': '4.600000000000000000', 'sepalwidthcm': '3.100000000000000000', 'petallengthcm': '1.500000000000000000', 'petalwidthcm': '0.200000000000000000', 'species': 'Iris-setosa'}, {'id': '5', 'sepallengthcm': '5.000000000000000000', 'sepalwidthcm': '3.600000000000000000', 'petallengthcm': '1.400000000000000000', 'petalwidthcm': '0.200000000000000000', 'specie

In [1]:
import mlflow
import mlflow.sklearn
import pandas as pd
import json
import joblib
import requests
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix



# Convert JSON data into DataFrame
df = pd.DataFrame(dataset)

# Convert string numeric values to float
df[['sepallengthcm', 'sepalwidthcm', 'petallengthcm', 'petalwidthcm']] = df[
    ['sepallengthcm', 'sepalwidthcm', 'petallengthcm', 'petalwidthcm']].astype(float)

# Drop 'id' column if not needed
df.drop(columns=['id'], inplace=True)

# Define features (X) and target (y)
X = df[['sepallengthcm', 'sepalwidthcm', 'petallengthcm', 'petalwidthcm']]
y = df['species']

# MLflow setup
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("IRIS_PROVENANCE")
mlflow.autolog(log_models=False, exclusive=True)

with mlflow.start_run():
    # mlflow.log_param("dataset", "Iris")
    # mlflow.log_param("features", list(X.columns))
    
    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Impute missing values
    imputer = SimpleImputer(strategy="mean")
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)
    # mlflow.log_param("imputation_strategy", "mean")
    
    # Standardization
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    # mlflow.log_param("scaling_method", "StandardScaler")
    
    # Train RandomForest model
    model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
    model.fit(X_train, y_train)
    # mlflow.log_param("model_choice", "RandomForestClassifier")
    # mlflow.log_param("n_estimators", 100)
    # mlflow.log_param("max_depth", 10)
    
    # Predictions and evaluation
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    # mlflow.log_metric("accuracy", accuracy)
    
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=y.unique(), yticklabels=y.unique())
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.tight_layout()
    plt.savefig("confusion_matrix.png")
    # mlflow.log_artifact("confusion_matrix.png")
    
    # Feature Importance
    feature_importance = model.feature_importances_
    plt.figure(figsize=(10, 6))
    plt.barh(X.columns, feature_importance)
    plt.title("Feature Importance")
    plt.xlabel("Importance")
    plt.ylabel("Features")
    plt.tight_layout()
    plt.savefig("feature_importance.png")
    # mlflow.log_artifact("feature_importance.png")
    
    # Save model as .pkl file
    model_filename = "RF_iris.pkl"
    joblib.dump(model, model_filename)


    # mlflow.log_artifact(json_filename)
  

    mlflow.end_run()

NameError: name 'dataset' is not defined

In [5]:
# Specify the name of the experiment you're looking for
experiment_name = "IRIS_PROVENANCE"

# Filter experiments by name
for experiment in experiments.get('experiments', []):
    if experiment['name'] == experiment_name:
        print(f"Found Experiment ID: {experiment['experiment_id']}")


NameError: name 'experiments' is not defined

In [7]:
import requests

# Set the URL for the MLflow REST API
mlflow_url = "http://127.0.0.1:5000/api/2.0/mlflow/experiments/IRIS_PROVENANCE"

# Define the experiment name you want to query
experiment_name = "IRIS_PROVENANCE"  # Replace with the name of your experiment

# Make a GET request to fetch the experiment details by name
response = requests.get(mlflow_url)

# Check if the request was successful
if response.status_code == 200:
    # Parse and print the response
    experiment_data = response.json()
    print(experiment_data)
else:
    print(f"Error: {response.status_code}, {response.text}")


Error: 404, <!doctype html>
<html lang=en>
<title>404 Not Found</title>
<h1>Not Found</h1>
<p>The requested URL was not found on the server. If you entered the URL manually please check your spelling and try again.</p>



Logging metadat from the DBREPO from all the possible API end points

In [13]:
import requests
import os
import json

# API endpoint URL  Databse details
url = "http://localhost/api/database/c3a42d17-42b7-43c9-a504-2363fb4c9c8d"

try:
    # Send a GET request to the API
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the JSON response
        data = response.json()
        print("API Response:", data)
  
        data = response.json()
        folder_path = "Code_fetched_metadata_dbrep"
        os.makedirs(folder_path, exist_ok=True)
        
                # Define the filename for the JSON file
        json_filename = os.path.join(folder_path, "DB_REPO_database_details.json")
        
                # Save the entire response as a JSON file
        with open(json_filename, "w") as json_file:
                json.dump(data, json_file, indent=4)
        
        print(f"API response saved to {json_filename}")
   
    else:
        print(f"Error: Received status code {response.status_code}")
        print("Response content:", response.text)
except requests.exceptions.RequestException as e:
    print(f"Request failed: {e}")

API Response: {'id': 'c3a42d17-42b7-43c9-a504-2363fb4c9c8d', 'name': 'Iris', 'description': None, 'tables': [{'id': '991f4e40-4524-4dd5-a94b-aa0bf4862f6f', 'name': 'iris_table', 'alias': None, 'identifiers': [], 'owner': {'id': '82520286-97b2-103f-8282-b1a6f55cd3ef', 'username': 'reema', 'name': 'Reema George Dass', 'orcid': None, 'qualified_name': 'Reema George Dass — @reema', 'given_name': 'Reema George', 'family_name': 'Dass'}, 'description': 'table desc', 'columns': [{'id': '75811fa4-5d84-4e37-b256-09fafdb00f6a', 'name': 'Id', 'alias': None, 'size': None, 'd': None, 'mean': 76, 'median': 76, 'concept': None, 'unit': None, 'description': None, 'enums': [], 'sets': [], 'database_id': 'c3a42d17-42b7-43c9-a504-2363fb4c9c8d', 'table_id': '991f4e40-4524-4dd5-a94b-aa0bf4862f6f', 'ord': 0, 'internal_name': 'id', 'index_length': None, 'length': None, 'type': 'bigint', 'data_length': None, 'max_data_length': None, 'num_rows': None, 'val_min': None, 'val_max': None, 'std_dev': 43, 'is_null_al

Logging DBREPO VIEW

In [6]:
import requests

# API endpoint URL
url = "http://localhost/api/database/c3a42d17-42b7-43c9-a504-2363fb4c9c8d/view"

try:
    # Send a GET request to the API
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the JSON response
        data = response.json()
        folder_path = "Code_fetched_metadata_dbrep"
        os.makedirs(folder_path, exist_ok=True)
        
                # Define the filename for the JSON file
        json_filename = os.path.join(folder_path, "DB_REPO_view_details.json")
        
                # Save the entire response as a JSON file
        with open(json_filename, "w") as json_file:
                json.dump(data, json_file, indent=4)
        
        print(f"API response saved to {json_filename}")
   
    else:
        print(f"Error: Received status code {response.status_code}")
        print("Response content:", response.text)
except requests.exceptions.RequestException as e:
    print(f"Request failed: {e}")

API response saved to Code_fetched_metadata_dbrep\DB_REPO_view_details.json


In [7]:
# import requests

# # API endpoint URL
# url = "http://localhost/api/database/c3a42d17-42b7-43c9-a504-2363fb4c9c8d/info"

# try:
#     # Send a GET request to the API
#     response = requests.get(url)

#     # Check if the request was successful
#     if response.status_code == 200:
#         # Parse the JSON response
#         data = response.json()
#         folder_path = "Code_fetched_metadata_dbrep"
#         os.makedirs(folder_path, exist_ok=True)
        
#                 # Define the filename for the JSON file
#         json_filename = os.path.join(folder_path, "DB_REPO_info_details.json")
        
#                 # Save the entire response as a JSON file
#         with open(json_filename, "w") as json_file:
#                 json.dump(data, json_file, indent=4)
        
#         print(f"API response saved to {json_filename}")
   
#     else:
#         print(f"Error: Received status code {response.status_code}")
#         print("Response content:", response.text)
# except requests.exceptions.RequestException as e:
#     print(f"Request failed: {e}")

Logging DBREPO HISTORY

In [8]:
url = "http://localhost/api/database/c3a42d17-42b7-43c9-a504-2363fb4c9c8d/table/991f4e40-4524-4dd5-a94b-aa0bf4862f6f/history"

try:
    # Send a GET request to the API
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the JSON response
        data = response.json()
        folder_path = "Code_fetched_metadata_dbrep"
        os.makedirs(folder_path, exist_ok=True)
        
                # Define the filename for the JSON file
        json_filename = os.path.join(folder_path, "DB_REPO_history_details.json")
        
                # Save the entire response as a JSON file
        with open(json_filename, "w") as json_file:
                json.dump(data, json_file, indent=4)
        
        print(f"API response saved to {json_filename}")
   
    else:
        print(f"Error: Received status code {response.status_code}")
        print("Response content:", response.text)
except requests.exceptions.RequestException as e:
    print(f"Request failed: {e}")

API response saved to Code_fetched_metadata_dbrep\DB_REPO_history_details.json


DOI, NOT from DBREPO, but just directly...-----> remember this for suggestions

In [9]:
import requests

# Define the API URL
url = "https://doi.org/10.5281/zenodo.1404173"

# Define headers to request citation data in JSON format
headers = {
    "Accept": "application/vnd.citationstyles.csl+json"
}

try:
    # Send GET request
    response = requests.get(url, headers=headers)

    # Check if request was successful
    if response.status_code == 200:
        data = response.json()  # Parse JSON response
        print("Title:", data.get("title"))
        print("Author:", data.get("author", [{}])[0].get("family", "Unknown"), data.get("author", [{}])[0].get("given", ""))
        print("DOI:", data.get("DOI"))
        print("Published Date:", data.get("issued", {}).get("date-parts", [[]])[0])
        print("Publisher:", data.get("publisher"))
        print("URL:", data.get("URL"))
        print("Abstract:", data.get("abstract"))
    else:
        print(f"Error {response.status_code}: {response.text}")

except requests.exceptions.RequestException as e:
    print("Request failed:", e)


Title: Scikit-Learn Iris
Author: Marshall Michael
DOI: 10.5281/ZENODO.1404173
Published Date: [2018, 8, 27]
Publisher: Zenodo
URL: https://zenodo.org/record/1404173
Abstract: <strong>Data Set Characteristics:</strong>

Number of Instances:
150 (50 in each of three classes)
Number of Attributes:

4 numeric, predictive attributes and the class
Attribute Information:


	sepal length in cm
	sepal width in cm
	petal length in cm
	petal width in cm
	
	class:

	
		Iris-Setosa
		Iris-Versicolour
		Iris-Virginica


Note: from Martin abiut the current limitations

<!-- The DOI retrieval seems to be buggy at the moment, I opened a ticket to fix it. The problem is that this DOI seems not be minted by Crossref, but DataCite which is not supported at the moment. But you can circumvent this by using the DOI endpoint directly in your script (e.g. by using the requests library) by setting the "Accept" header and calling doi.org directly.



$ curl -sSL https://doi.org/10.5281/zenodo.1404173 -H "Accept: application/vnd.citationstyles.csl+json"

{
  "type": "dataset",
  "id": "https://doi.org/10.5281/zenodo.1404173",
  "author": [
    {
      "family": "Marshall",
      "given": "Michael"
    }
  ],
  "issued": {
    "date-parts": [
      [
        2018,
        8,
        27
      ]
    ]
  },
  "abstract": "<strong>Data Set Characteristics:</strong>\n\nNumber of Instances:\n150 (50 in each of three classes)\nNumber of Attributes:\n\n4 numeric, predictive attributes and the class\nAttribute Information:\n\n\n\tsepal length in cm\n\tsepal width in cm\n\tpetal length in cm\n\tpetal width in cm\n\t\n\tclass:\n\n\t\n\t\tIris-Setosa\n\t\tIris-Versicolour\n\t\tIris-Virginica",
  "DOI": "10.5281/ZENODO.1404173",
  "publisher": "Zenodo",
  "title": "Scikit-Learn Iris",
  "URL": "https://zenodo.org/record/1404173",
  "copyright": "Creative Commons Attribution 4.0"
}

url="http://localhost/api/oai?metadataPrefix=oai_dc&from=2025-03-01&until=2025-03-07&set=Databases%2F6c1c5dbd-d471-416f-997f-cf5bf72943e4%2FTables%2F1d8801b0-05b8-4066-b7d1-e5c1da11768f&resumptionToken=string&fromDate=2025-03-07T19%3A35%3A51.476Z&untilDate=2025-03-07T19%3A35%3A51.476Z&parametersString=string"

The OAI-PMH Protocol is not fully implemented, only the bare minimum verbs of verb=Identify, verb=ListIdentifiers, verb=GetRecord, verb=ListMetadataFormats, the endpoint is not easy to document in Swagger so here is the implementation: https://gitlab.phaidra.org/fair-data-austria-db-repository/fda-services/-/blob/master/dbrepo-metadata-service/rest-service/src/main/java/at/tuwien/endpoints/MetadataEndpoint.java?ref_type=heads it does not respect most of the parameters and is intended to support the identifier discovery for legacy systems. -->

Statistic, export, OAIschema isnt at the moment working some are not working for me and i have asked, some needs more implementation from their side.

Validating the dataset 

id               150
sepallengthcm    150
sepalwidthcm     150
petallengthcm    150
petalwidthcm     150
species          150
dtype: int64


In [7]:
print(y.value_counts())


species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64


Training the model for IRIS 

In [12]:
accuracy

1.0

Logging essential parameters in MLFLOW

2025/03/28 10:26:35 INFO mlflow.tracking.fluent: Experiment with name 'MLflow Quickstart' does not exist. Creating a new experiment.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Successfully registered model 'tracking-quickstart'.
2025/03/28 10:26:44 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-quickstart, version 1


🏃 View run secretive-moth-68 at: http://127.0.0.1:5000/#/experiments/452543657920654770/runs/70c02c37693e4901b3de078909a7b540
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/452543657920654770


Created version '1' of model 'tracking-quickstart'.


To be integreted from here

In [1]:
import requests
import json

# Define the URL and token
url = "https://127.0.0.1:5000/api/records"
token = "FGIxr9BpYjRNO5MFhxcRAnLeIqvlLmXxtoA9qNcrLKw9lkCrIj29CplQkFzk"  

# Define the headers
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {token}"
}

# Define the data to be sent in the request
data = {
    "title": "My trined ML model",
    "description": "This is a sample unstructed data of a simple trained ML model",
    "creator": "Reema Dass"
}

# Send the POST request
response = requests.post(url, headers=headers, data=json.dumps(data), verify=False)

# Print the response
print(response.status_code)
print(response.json())
# print(json.dumps(response, indent=4, ensure_ascii=False))



201
{'id': '3m20n-wwx06', 'created': '2025-03-01T14:48:25.197185+00:00', 'updated': '2025-03-01T14:48:25.231340+00:00', 'links': {'self': 'https://127.0.0.1:5000/api/records/3m20n-wwx06/draft', 'self_html': 'https://127.0.0.1:5000/uploads/3m20n-wwx06', 'self_iiif_manifest': 'https://127.0.0.1:5000/api/iiif/draft:3m20n-wwx06/manifest', 'self_iiif_sequence': 'https://127.0.0.1:5000/api/iiif/draft:3m20n-wwx06/sequence/default', 'files': 'https://127.0.0.1:5000/api/records/3m20n-wwx06/draft/files', 'record': 'https://127.0.0.1:5000/api/records/3m20n-wwx06', 'record_html': 'https://127.0.0.1:5000/records/3m20n-wwx06', 'publish': 'https://127.0.0.1:5000/api/records/3m20n-wwx06/draft/actions/publish', 'review': 'https://127.0.0.1:5000/api/records/3m20n-wwx06/draft/review', 'versions': 'https://127.0.0.1:5000/api/records/3m20n-wwx06/versions', 'access_links': 'https://127.0.0.1:5000/api/records/3m20n-wwx06/access/links', 'reserve_doi': 'https://127.0.0.1:5000/api/records/3m20n-wwx06/draft/pids

In [2]:
import requests
import json

# Define the URL and token
url = "https://127.0.0.1:5000/api/records/3m20n-wwx06/draft/files"
token = "FGIxr9BpYjRNO5MFhxcRAnLeIqvlLmXxtoA9qNcrLKw9lkCrIj29CplQkFzk"  

# Define the headers
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {token}"
}

# Define the data to be sent in the request
data = [
    {"key": "ML_model"}
]

# Send the POST request
response = requests.post(url, headers=headers, data=json.dumps(data), verify=False)

# Print the response
print(response.status_code)
print(response.json())


201
{'enabled': True, 'links': {'self': 'https://127.0.0.1:5000/api/records/3m20n-wwx06/draft/files'}, 'entries': [{'key': 'ML_model', 'created': '2025-03-01T14:49:49.575608+00:00', 'updated': '2025-03-01T14:49:49.580532+00:00', 'status': 'pending', 'metadata': None, 'links': {'self': 'https://127.0.0.1:5000/api/records/3m20n-wwx06/draft/files/ML_model', 'content': 'https://127.0.0.1:5000/api/records/3m20n-wwx06/draft/files/ML_model/content', 'commit': 'https://127.0.0.1:5000/api/records/3m20n-wwx06/draft/files/ML_model/commit'}}], 'default_preview': None, 'order': []}




In [3]:
import requests

# Define the URL and token
url = "https://127.0.0.1:5000/api/records/3m20n-wwx06/draft/files/ML_model/content"
token = "FGIxr9BpYjRNO5MFhxcRAnLeIqvlLmXxtoA9qNcrLKw9lkCrIj29CplQkFzk"  

# Define the headers
headers = {
    "Content-Type": "application/octet-stream",
    "Authorization": f"Bearer {token}"
}

# Open the file you want to upload in binary mode
with open('iris_classifier.pkl', 'rb') as file:  # Replace with the actual .pkl file name
    # Send the PUT request
    response = requests.put(url, headers=headers, data=file, verify=False)

# Print the response
print(response.status_code)
print(response.json())


200
{'key': 'ML_model', 'created': '2025-03-01T14:49:49.575608+00:00', 'updated': '2025-03-01T14:49:49.580532+00:00', 'status': 'pending', 'metadata': None, 'links': {'self': 'https://127.0.0.1:5000/api/records/3m20n-wwx06/draft/files/ML_model', 'content': 'https://127.0.0.1:5000/api/records/3m20n-wwx06/draft/files/ML_model/content', 'commit': 'https://127.0.0.1:5000/api/records/3m20n-wwx06/draft/files/ML_model/commit'}}




In [4]:
import requests

# Define the URL and token
url = "https://127.0.0.1:5000/api/records/3m20n-wwx06/draft/files/ML_model/commit"
token = "FGIxr9BpYjRNO5MFhxcRAnLeIqvlLmXxtoA9qNcrLKw9lkCrIj29CplQkFzk"  

# Define the headers
headers = {
    "Authorization": f"Bearer {token}"
}

# Send the POST request
response = requests.post(url, headers=headers, verify=False)

# Print the response
print(response.status_code)
print(response.json())


200
{'key': 'ML_model', 'created': '2025-03-01T14:49:49.575608+00:00', 'updated': '2025-03-01T14:55:32.327577+00:00', 'status': 'completed', 'metadata': None, 'checksum': 'md5:be46b8705399e18cfebe756edb30c567', 'storage_class': 'S', 'mimetype': 'application/octet-stream', 'size': 184273, 'version_id': 'dfd693b5-b532-498a-8523-b2c4856fd258', 'file_id': '7e51da7e-8842-43b5-8aef-79de584e93bd', 'bucket_id': 'e897e7ae-0e0e-4b3b-aea0-78cadbfe5226', 'links': {'self': 'https://127.0.0.1:5000/api/records/3m20n-wwx06/draft/files/ML_model', 'content': 'https://127.0.0.1:5000/api/records/3m20n-wwx06/draft/files/ML_model/content', 'commit': 'https://127.0.0.1:5000/api/records/3m20n-wwx06/draft/files/ML_model/commit'}}




In [5]:
import requests

# Define the URL to retrieve metadata
record_id = "3m20n-wwx06"  # Replace with your actual record ID
url = f"https://127.0.0.1:5000/api/records/{record_id}"

# Your authorization token
token = "FGIxr9BpYjRNO5MFhxcRAnLeIqvlLmXxtoA9qNcrLKw9lkCrIj29CplQkFzk"

# Define the headers
headers = {
    "Authorization": f"Bearer {token}"
}

# Send the GET request to retrieve metadata
response = requests.get(url, headers=headers, verify=False)

# Check if the request was successful
if response.status_code == 200:
    metadata = response.json()
    print("Metadata Retrieved Successfully!")
    print(metadata)
else:
    print("Error:", response.status_code)
    print(response.text)


Metadata Retrieved Successfully!
{'id': '3m20n-wwx06', 'created': '2025-03-01T15:02:49.883528+00:00', 'updated': '2025-03-01T15:02:49.996396+00:00', 'links': {'self': 'https://127.0.0.1:5000/api/records/3m20n-wwx06', 'self_html': 'https://127.0.0.1:5000/records/3m20n-wwx06', 'self_iiif_manifest': 'https://127.0.0.1:5000/api/iiif/record:3m20n-wwx06/manifest', 'self_iiif_sequence': 'https://127.0.0.1:5000/api/iiif/record:3m20n-wwx06/sequence/default', 'files': 'https://127.0.0.1:5000/api/records/3m20n-wwx06/files', 'latest': 'https://127.0.0.1:5000/api/records/3m20n-wwx06/versions/latest', 'latest_html': 'https://127.0.0.1:5000/records/3m20n-wwx06/latest', 'draft': 'https://127.0.0.1:5000/api/records/3m20n-wwx06/draft', 'versions': 'https://127.0.0.1:5000/api/records/3m20n-wwx06/versions', 'access_links': 'https://127.0.0.1:5000/api/records/3m20n-wwx06/access/links', 'reserve_doi': 'https://127.0.0.1:5000/api/records/3m20n-wwx06/draft/pids/doi'}, 'revision_id': 3, 'parent': {'id': 'q31fy



In [6]:
import requests

# Define the URL to retrieve metadata
record_id = "3m20n-wwx06"  
url = f"https://127.0.0.1:5000/api/records/{record_id}"

# Your authorization token
token = "FGIxr9BpYjRNO5MFhxcRAnLeIqvlLmXxtoA9qNcrLKw9lkCrIj29CplQkFzk"

# Define the headers
headers = {
    "Authorization": f"Bearer {token}"
}

# Send the GET request to retrieve metadata
response = requests.get(url, headers=headers, verify=False)

if response.status_code == 200:
    record = response.json()
    print("Detailed Metadata:")
    metadata = record.get('metadata', {})
    print(json.dumps(metadata, indent=4, ensure_ascii=False))
    print('metadata keys')
    for key in metadata.keys(): print (key)
else:
    print(f"Failed to fetch metadata. Status code: {response.status_code}")
    print(response.text)



Detailed Metadata:
{
    "resource_type": {
        "id": "publication-other",
        "title": {
            "de": "Sonstige",
            "en": "Other"
        }
    },
    "creators": [
        {
            "person_or_org": {
                "type": "personal",
                "name": "Dass, Reema George",
                "given_name": "Reema George",
                "family_name": "Dass"
            },
            "role": {
                "id": "other",
                "title": {
                    "en": "Other"
                }
            }
        }
    ],
    "title": "ML model",
    "publication_date": "2025",
    "dates": [
        {
            "date": "2025-02",
            "type": {
                "id": "created",
                "title": {
                    "de": "Erstellt",
                    "en": "Created"
                }
            }
        }
    ],
    "description": "<p>This a trained ml model upload</p>"
}
metadata keys
resource_type
creators
title
publ



Jupyter notebook metadata

In [27]:
import nbformat

# Load the notebook
with open('final_infra.ipynb', 'r', encoding='utf-8') as f:
    notebook = nbformat.read(f, as_version=4)

# Extract metadata
metadata = notebook.metadata
print("Metadata:", metadata)

Metadata: {'kernelspec': {'display_name': 'Python 3 (ipykernel)', 'language': 'python', 'name': 'python3'}, 'language_info': {'codemirror_mode': {'name': 'ipython', 'version': 3}, 'file_extension': '.py', 'mimetype': 'text/x-python', 'name': 'python', 'nbconvert_exporter': 'python', 'pygments_lexer': 'ipython3', 'version': '3.11.5'}}


In [28]:

#Input:https://api.github.com/repos/reema-dass26/Thesis_Infrastructure_setup_repo
import nbformat
import json
import os
from datetime import datetime

# Load the notebook file
notebook_path = "final_infra.ipynb"
with open(notebook_path, "r", encoding="utf-8") as f:
    notebook = nbformat.read(f, as_version=4)

# Extract notebook metadata
notebook_metadata = {
    "notebook_metadata": notebook.metadata,
    "cells": []
}

# Extract execution metadata from each cell
for idx, cell in enumerate(notebook.cells):
    if cell.cell_type == "code":
        execution_info = {
            "cell_number": idx,
            "execution_count": cell.execution_count,
            "outputs": [output for output in cell.outputs],  # Output data
            "source": cell.source.strip()  # Remove unnecessary whitespace
        }
        notebook_metadata["cells"].append(execution_info)

# Convert to valid JSON
metadata_json = json.dumps(notebook_metadata, indent=4)

# Create a directory if it doesn't exist
output_dir = "Code_fetched_Metadata_notebook_metadata"
os.makedirs(output_dir, exist_ok=True)

# Generate timestamp for filename
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_file = os.path.join(output_dir, f"notebook_metadata_{timestamp}.json")

# Save the metadata JSON file
with open(output_file, "w", encoding="utf-8") as f:
    f.write(metadata_json)

print(f"Metadata saved in: {output_file}")


Metadata saved in: Code_fetched_Metadata_notebook_metadata\notebook_metadata_2025-03-28_17-56-23.json


GITHUB METADATA 

In [31]:
import requests
import json
import os
from datetime import datetime

def get_github_repo_metadata(repo_url):
    # Extract owner and repo name from URL
    parts = repo_url.rstrip('/').split('/')
    if len(parts) < 2:
        print("Invalid GitHub repository URL.")
        return
    
    owner, repo = parts[-2], parts[-1]
    api_url = f"https://api.github.com/repos/{owner}/{repo}"

    # Fetch metadata from GitHub API
    response = requests.get(api_url)
    
    if response.status_code == 200:
        metadata = response.json()

        # Convert to JSON format
        metadata_json = json.dumps(metadata, indent=4)

        # Create directory if it doesn't exist
        output_dir = "Code_fetched_Metadata_github_repo_metadata"
        os.makedirs(output_dir, exist_ok=True)

        # Generate timestamped filename
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        output_file = os.path.join(output_dir, f"{repo}_metadata_{timestamp}.json")

        # Save JSON file
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(metadata_json)

        print(f"Metadata saved in: {output_file}")
    else:
        print(f"Failed to fetch repository metadata. HTTP Status: {response.status_code}")

# Example usage
repo_url = input("Enter GitHub repository URL: ")
get_github_repo_metadata(repo_url)
#  https://api.github.com/repos/reema-dass26/Thesis_Infrastructure_setup_repo

Enter GitHub repository URL:   https://api.github.com/repos/reema-dass26/Thesis_Infrastructure_setup_repo


Metadata saved in: Code_fetched_Metadata_github_repo_metadata\Thesis_Infrastructure_setup_repo_metadata_2025-03-28_17-59-30.json


Validation if all necessary github meta is received

In [32]:
# import requests
# import json
# import os
# from datetime import datetime

# # Expected structure based on "github_repository_metadata_longer_version"
# EXPECTED_FIELDS = {
#     "repo_id": None, "repo_name": None,
#     "owner": {"username": None, "profile_url": None},
#     "created_date": None, "modified_date": None,
#     "primary_language": None, "topics": [],
#     "description": None, "contributors": [],
#     "license": {"license_name": None, "license_url": None},
#     "visibility": None,
#     "fork_status": {"is_forked": None, "original_repo": None},
#     "source_code": {"repo_url": None, "default_branch": None},
#     "dependencies": [],
#     "commits": {
#         "total_commits": None, "latest_commit_hash": None,
#         "latest_commit_timestamp": None, "commit_authors": []
#     },
#     "pull_requests": {"total_prs": None, "merged_prs": None, "open_prs": None, "closed_prs": None},
#     "workflow_runs": {"ci_cd_provider": None, "total_runs": None, "last_run_status": None, "last_run_timestamp": None},
#     "issues": {"total_issues": None, "open_issues": None, "closed_issues": None},
#     "discussion_threads": {"total_discussions": None, "open_discussions": None, "resolved_discussions": None},
#     "forks": None, "stars": None, "watchers": None,
#     "readme": {"readme_url": None, "readme_last_updated": None},
#     "code_quality": {"linting_status": None, "test_coverage": None, "static_analysis_tools": []},
#     "security_scans": {
#         "vulnerability_scan_results": None,
#         "critical_vulnerabilities": None,
#         "high_vulnerabilities": None,
#         "medium_vulnerabilities": None
#     },
#     "documentation": {"docs_url": None, "doc_coverage": None, "api_documentation": None},
#     "releases": {"latest_release_version": None, "latest_release_date": None, "release_notes_url": None},
#     "modification_history": {"change_log_url": None, "release_history": []},
#     "derived_from": None,
#     "reproducibility_guidelines": {
#         "setup_steps": None, "dataset_references": [],
#         "hardware_requirements": None, "docker_setup": None, "virtual_environment": None
#     },
#     "collaboration_tools": {"discussion_enabled": None, "projects_enabled": None, "wiki_enabled": None}
# }

# def get_github_repo_metadata(repo_url):
#     # Extract owner and repo name from URL
#     parts = repo_url.rstrip('/').split('/')
#     if len(parts) < 2:
#         print("Invalid GitHub repository URL.")
#         return
    
#     owner, repo = parts[-2], parts[-1]
#     api_url = f"https://api.github.com/repos/{owner}/{repo}"

#     # Fetch metadata from GitHub API
#     response = requests.get(api_url)
    
#     if response.status_code == 200:
#         metadata = response.json()

#         # Extract relevant fields
#         repo_metadata = {
#             "repo_id": metadata.get("id"),
#             "repo_name": metadata.get("name"),
#             "owner": {
#                 "username": metadata["owner"]["login"],
#                 "profile_url": metadata["owner"]["html_url"]
#             },
#             "created_date": metadata.get("created_at", "").split("T")[0],
#             "modified_date": metadata.get("updated_at", "").split("T")[0],
#             "primary_language": metadata.get("language"),
#             "topics": metadata.get("topics", []),
#             "description": metadata.get("description"),
#             "license": {
#                 "license_name": metadata["license"]["name"] if metadata.get("license") else None,
#                 "license_url": metadata["license"]["url"] if metadata.get("license") else None
#             },
#             "visibility": metadata.get("visibility"),
#             "forks": metadata.get("forks_count"),
#             "stars": metadata.get("stargazers_count"),
#             "watchers": metadata.get("watchers_count"),
#             "source_code": {
#                 "repo_url": metadata.get("html_url"),
#                 "default_branch": metadata.get("default_branch")
#             }
#         }

#         # Fetch contributors
#         contributors_url = metadata["contributors_url"]
#         contributors_resp = requests.get(contributors_url)
#         if contributors_resp.status_code == 200:
#             repo_metadata["contributors"] = [
#                 {
#                     "username": user["login"],
#                     "profile_url": user["html_url"],
#                     "contributions": user["contributions"]
#                 } for user in contributors_resp.json()
#             ]

#         # Fetch commit details
#         commits_url = f"https://api.github.com/repos/{owner}/{repo}/commits"
#         commits_resp = requests.get(commits_url)
#         if commits_resp.status_code == 200 and len(commits_resp.json()) > 0:
#             latest_commit = commits_resp.json()[0]
#             repo_metadata["commits"] = {
#                 "total_commits": len(commits_resp.json()),
#                 "latest_commit_hash": latest_commit["sha"],
#                 "latest_commit_timestamp": latest_commit["commit"]["committer"]["date"].split("T")[0],
#                 "commit_authors": list(set(c["commit"]["author"]["name"] for c in commits_resp.json()))
#             }

#         # Identify missing fields
#         missing_fields = []
#         def compare_dicts(expected, actual, prefix=""):
#             for key, value in expected.items():
#                 current_path = f"{prefix}{key}"
#                 if isinstance(value, dict):
#                     compare_dicts(value, actual.get(key, {}), current_path + ".")
#                 elif key not in actual:
#                     missing_fields.append(current_path)

#         compare_dicts(EXPECTED_FIELDS, repo_metadata)

#         # Save metadata as JSON
#         output_dir = "github_repo_metadata"
#         os.makedirs(output_dir, exist_ok=True)

#         timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
#         output_file = os.path.join(output_dir, f"{repo}_metadata_{timestamp}.json")

#         with open(output_file, "w", encoding="utf-8") as f:
#             json.dump(repo_metadata, f, indent=4)

#         print(f"Metadata saved in: {output_file}")
#         print("\nMissing Fields:", missing_fields)

#     else:
#         print(f"Failed to fetch repository metadata. HTTP Status: {response.status_code}")

# # Example usage
# repo_url = input("Enter GitHub repository URL: ")
# get_github_repo_metadata(repo_url)
# # https://api.github.com/repos/reema-dass26/Thesis_Infrastructure_setup_repo

Enter GitHub repository URL:  https://api.github.com/repos/reema-dass26/Thesis_Infrastructure_setup_repo


Metadata saved in: github_repo_metadata\Thesis_Infrastructure_setup_repo_metadata_2025-03-28_18-01-16.json

Missing Fields: ['fork_status.is_forked', 'fork_status.original_repo', 'dependencies', 'pull_requests.total_prs', 'pull_requests.merged_prs', 'pull_requests.open_prs', 'pull_requests.closed_prs', 'workflow_runs.ci_cd_provider', 'workflow_runs.total_runs', 'workflow_runs.last_run_status', 'workflow_runs.last_run_timestamp', 'issues.total_issues', 'issues.open_issues', 'issues.closed_issues', 'discussion_threads.total_discussions', 'discussion_threads.open_discussions', 'discussion_threads.resolved_discussions', 'readme.readme_url', 'readme.readme_last_updated', 'code_quality.linting_status', 'code_quality.test_coverage', 'code_quality.static_analysis_tools', 'security_scans.vulnerability_scan_results', 'security_scans.critical_vulnerabilities', 'security_scans.high_vulnerabilities', 'security_scans.medium_vulnerabilities', 'documentation.docs_url', 'documentation.doc_coverage', 'd

all commit history

In [33]:
# import requests
# import json
# import os

# # GitHub API URL for getting the commits
# repo_api_url = "https://api.github.com/repos/reema-dass26/Thesis_Infrastructure_setup_repo/commits"

# # Create a new directory to store the output file (optional)
# output_dir = "Code_fetched_Metadata_commit_details"
# os.makedirs(output_dir, exist_ok=True)

# # Output file to store all commit data
# output_filename = os.path.join(output_dir, "all_commit_details.json")

# # List to store all commit details
# all_commits_data = []

# # Fetch the commit history
# response = requests.get(repo_api_url)

# if response.status_code == 200:
#     commits = response.json()
    
#     # Loop through all the commits
#     for commit in commits:
#         # Extract commit details
#         commit_sha = commit['sha']
#         commit_message = commit['commit']['message']
#         commit_date = commit['commit']['author']['date']
#         commit_author = commit['commit']['author']['name']

#         print(f"Processing Commit SHA: {commit_sha}")
        
#         # Construct the URL to get the notebook content at this commit
#         notebook_url = f"https://raw.githubusercontent.com/reema-dass26/Thesis_Infrastructure_setup_repo/{commit_sha}/final_infra.ipynb"
        
#         # Fetch the notebook content
#         notebook_response = requests.get(notebook_url)
        
#         # Create the data structure to hold both commit and notebook metadata
#         commit_data = {
#             "commit_details": {
#                 "sha": commit_sha,
#                 "message": commit_message,
#                 "date": commit_date,
#                 "author": commit_author
#             }
#         }

#         # If the notebook was found, add its metadata to the commit data
#         if notebook_response.status_code == 200:
#             notebook_content = notebook_response.json()
#             notebook_metadata = notebook_content.get('metadata', {})
#             commit_data["notebook_metadata"] = notebook_metadata
#         else:
#             commit_data["notebook_metadata"] = "Not Found"

#         # Add the commit data to the list of all commits
#         all_commits_data.append(commit_data)
#         print(f"Commit details added to list for {commit_sha}")
    
#     # Write all commit details and notebook metadata to a single JSON file
#     with open(output_filename, 'w') as outfile:
#         json.dump(all_commits_data, outfile, indent=4)

#     print(f"All commit details have been saved to {output_filename}")

# else:
#     print("Failed to fetch commits.")


Processing Commit SHA: 8d8a2cc34778ad4fd87346e17a0bca2e185df74b
Commit details added to list for 8d8a2cc34778ad4fd87346e17a0bca2e185df74b
Processing Commit SHA: a462afea2bb229fb7484c70f78e5877062400ef7
Commit details added to list for a462afea2bb229fb7484c70f78e5877062400ef7
Processing Commit SHA: f0ba7b3daf5e230c60420d678136e009d8649c6e
Commit details added to list for f0ba7b3daf5e230c60420d678136e009d8649c6e
Processing Commit SHA: f33505d5f2b50bf9a33d5a98756edfb9c11fbfdd
Commit details added to list for f33505d5f2b50bf9a33d5a98756edfb9c11fbfdd
Processing Commit SHA: 410b6393a984a1c9224df3f388464d6a3692c50d
Commit details added to list for 410b6393a984a1c9224df3f388464d6a3692c50d
Processing Commit SHA: 60eea06d5e756e301ae20f2b2d48df53987cc93f
Commit details added to list for 60eea06d5e756e301ae20f2b2d48df53987cc93f
Processing Commit SHA: fdf732a5e9d5285f1ca85aedf6b335014322ab9a
Commit details added to list for fdf732a5e9d5285f1ca85aedf6b335014322ab9a
Processing Commit SHA: 99014de333c

In [53]:
import mlflow
import mlflow.sklearn
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix

# Load Iris dataset
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['species'] = iris.target

# Define features (X) and target (y)
X = df[iris.feature_names]
y = df['species']

# MLflow setup
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("IRIS_PROVENANCE")
mlflow.sklearn.autolog()

with mlflow.start_run():
    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Impute missing values
    imputer = SimpleImputer(strategy="mean")
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)
    
    # Standardization
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # Train RandomForest model
    model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
    model.fit(X_train, y_train)
    
    # Predictions and evaluation
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric("accuracy", accuracy)
    
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=iris.target_names, yticklabels=iris.target_names)
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.tight_layout()
    plt.savefig("confusion_matrix.png")
    mlflow.log_artifact("confusion_matrix.png")
    
    # Feature Importance
    feature_importance = model.feature_importances_
    plt.figure(figsize=(10, 6))
    plt.barh(iris.feature_names, feature_importance)
    plt.title("Feature Importance")
    plt.xlabel("Importance")
    plt.ylabel("Features")
    plt.tight_layout()
    plt.savefig("feature_importance.png")
    mlflow.log_artifact("feature_importance.png")
    
    # Save model as .pkl file
    model_filename = "RF_iris.pkl"
    joblib.dump(model, model_filename)
    mlflow.log_artifact(model_filename)
    
mlflow.end_run()




🏃 View run mercurial-bear-885 at: http://127.0.0.1:5000/#/experiments/2/runs/8808fd40cf664f659aa62c891b0e1c29
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2


In [3]:
import requests

mlflow_url = "http://127.0.0.1:5000/api/2.0/mlflow/experiments/get-by-name"
experiment_name = "IRIS_PROVENANCE"

response = requests.get(mlflow_url, params={"experiment_name": experiment_name})

if response.status_code == 200:
    print(response.json())  # Returns experiment details
else:
    print("Error:", response.text)


{'experiment': {'experiment_id': '2', 'name': 'IRIS_PROVENANCE', 'artifact_location': 'file:///C:/Users/reema/mlruns/2', 'lifecycle_stage': 'active', 'last_update_time': 1743451061454, 'creation_time': 1743451061454}}


In [5]:
import requests
import json
import os

# MLflow Tracking Server URL
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"

# Directory to save JSON file
output_dir = "mlflow_logs"
os.makedirs(output_dir, exist_ok=True)
json_filename = os.path.join(output_dir, "mlflow_experiments.json")


def get_experiments():
    """Fetch all MLflow experiments"""
    url = f"{MLFLOW_TRACKING_URI}/api/2.0/mlflow/experiments/list"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()["experiments"]
    return []


def get_runs(experiment_id):
    """Fetch all runs for a given experiment"""
    url = f"{MLFLOW_TRACKING_URI}/api/2.0/mlflow/runs/search"
    response = requests.post(url, json={"experiment_ids": [experiment_id]})
    if response.status_code == 200:
        return response.json().get("runs", [])
    return []


def get_run_details(run_id):
    """Fetch details for a specific run"""
    url = f"{MLFLOW_TRACKING_URI}/api/2.0/mlflow/runs/get"
    response = requests.get(url, params={"run_id": run_id})
    if response.status_code == 200:
        return response.json()["run"]
    return {}


def list_artifacts(run_id):
    """List all artifacts for a run"""
    url = f"{MLFLOW_TRACKING_URI}/api/2.0/mlflow/artifacts/list"
    response = requests.get(url, params={"run_id": run_id})
    if response.status_code == 200:
        return response.json().get("files", [])
    return []


def collect_all_data():
    """Collect all experiments, runs, and details"""
    all_data = {"experiments": []}
    
    experiments = get_experiments()
    for exp in experiments:
        exp_data = {
            "experiment_id": exp["experiment_id"],
            "name": exp["name"],
            "lifecycle_stage": exp["lifecycle_stage"],
            "runs": []
        }
        
        runs = get_runs(exp["experiment_id"])
        for run in runs:
            run_id = run["info"]["run_id"]
            run_details = get_run_details(run_id)
            artifacts = list_artifacts(run_id)

            run_data = {
                "run_id": run_id,
                "status": run["info"]["status"],
                "start_time": run["info"].get("start_time"),
                "end_time": run["info"].get("end_time"),
                "params": run_details["data"].get("params", {}),
                "metrics": run_details["data"].get("metrics", {}),
                "artifacts": [artifact["path"] for artifact in artifacts]
            }
            exp_data["runs"].append(run_data)
        
        all_data["experiments"].append(exp_data)

    return all_data


# Fetch and save data
mlflow_data = collect_all_data()

with open(json_filename, "w", encoding="utf-8") as f:
    json.dump(mlflow_data, f, indent=4)

print(f"MLflow experiment data saved to: {json_filename}")


MLflow experiment data saved to: mlflow_logs\mlflow_experiments.json


In [45]:
import requests

MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
get_url = f"{MLFLOW_TRACKING_URI}/api/2.0/mlflow/experiments/get"

# Make the request with the experiment_id parameter
response = requests.get(get_url, params={"experiment_id": 2})

# Check if the request was successful (HTTP Status Code 200)
if response.status_code == 200:
    # Print the response as JSON
    print(response.json())
    folder_path = "ML_RUNS_MANUAL_LOGGING"  # Replace with the desired folder path
    file_name = "experiment_id_get_.json"  # Name of the file
    file_path = os.path.join(folder_path, file_name)
    os.makedirs(folder_path, exist_ok=True)

    # Save the response data to the file
    with open(file_path, 'w') as json_file:
        json.dump(response.json(), json_file, indent=4)
else:
    print("Error:", response.status_code, response.text)


{'experiment': {'experiment_id': '2', 'name': 'IRIS_PROVENANCE', 'artifact_location': 'file:///C:/Users/reema/mlruns/2', 'lifecycle_stage': 'active', 'last_update_time': 1743451061454, 'creation_time': 1743451061454}}


In [46]:
import requests
import json

MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
get_url = f"{MLFLOW_TRACKING_URI}/api/2.0/mlflow/experiments/get"

# Make the request with the experiment_id parameter
response = requests.get(get_url, params={"experiment_id": 2})

# Check if the request was successful (HTTP Status Code 200)
if response.status_code == 200:
    # Get the response data in JSON format
    response_data = response.json()
    
    # Define the filename to save the response
    folder_path = "ML_RUNS_MANUAL_LOGGING"  # Replace with the desired folder path
    file_name = "experiment_get.json"  # Name of the file
    file_path = os.path.join(folder_path, file_name)
    os.makedirs(folder_path, exist_ok=True)

    # Save the response data to the file
    with open(file_path, 'w') as json_file:
        json.dump(response_data, json_file, indent=4)
    print(f"Data saved to {filename}")
else:
    print("Error:", response.status_code, response.text)


Data saved to ML_runs_test_experiment_data.json


In [40]:
import requests
import json

MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"

# Get Experiment by Name
def get_experiment_by_name(experiment_name):
    url = f"{MLFLOW_TRACKING_URI}/api/2.0/mlflow/experiments/get-by-name"
    params = {"experiment_name": experiment_name}
    response = requests.get(url, params=params)
    if response.status_code == 200:
        data = response.json()
        folder_path = "ML_RUNS_MANUAL_LOGGING"  # Replace with the desired folder path
        file_name = "experiment_by_name.json"  # Name of the file
        file_path = os.path.join(folder_path, file_name)
        os.makedirs(folder_path, exist_ok=True)

    # Save the response data to the file
        with open(file_path, 'w') as json_file:
            json.dump(data, json_file, indent=4)
        print(f"Experiment {experiment_name} saved.")
    else:
        print(f"Failed to get experiment {experiment_name}:", response.status_code)

# Example usage:
get_experiment_by_name("IRIS_PROVENANCE")  # Replace with your experiment name


Experiment IRIS_PROVENANCE saved.


In [47]:
import requests
import json

# Set the MLFlow tracking URI
MLFLOW_TRACKING_URI = "http://localhost:5000"

# API endpoint to search runs
url = f"{MLFLOW_TRACKING_URI}/api/2.0/mlflow/runs/search"

# Create the payload (data)
payload = {
    "experiment_ids": ["2"]
}

# Set headers for JSON content
headers = {
    "Content-Type": "application/json"
}

# Make the POST request to the server
response = requests.post(url, json=payload, headers=headers)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the response JSON
    data = response.json()
    
    folder_path = "ML_RUNS_MANUAL_LOGGING"  # Replace with the desired folder path
    file_name = "run_search.json"  # Name of the file
    file_path = os.path.join(folder_path, file_name)
    os.makedirs(folder_path, exist_ok=True)

    # Save the response data to the file
    with open(file_path, 'w') as json_file:
        json.dump(data, json_file, indent=4)
    
    print("Search results saved successfully.")
else:
    print(f"Failed to search runs. Status code: {response.status_code}")


Search results saved successfully.


In [48]:
import requests
import json

MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
url = f"{MLFLOW_TRACKING_URI}/api/2.0/mlflow/runs/get"

# Replace with the actual run_id you want to fetch
run_id = "a39946f6de364ffd8b6c9a1c036e6c8b"

# Prepare the parameters (run_id)
params = {
    "run_id": run_id
}

# Send the request to fetch the run details
response = requests.get(url, params=params)

# Check if the request was successful
if response.status_code == 200:
    run_data = response.json()
    print(json.dumps(run_data, indent=4))
     # Ensure the directory exists
    folder_path = "ML_RUNS_MANUAL_LOGGING"  # Replace with the desired folder path
    file_name = "run_data.json"  # Name of the file
    file_path = os.path.join(folder_path, file_name)
    os.makedirs(folder_path, exist_ok=True)

    # Save the response data to the file
    with open(file_path, 'w') as json_file:
        json.dump(run_data, json_file, indent=4)

else:
    print(f"Error: {response.status_code}")
    print(response.text)


{
    "run": {
        "info": {
            "run_uuid": "a39946f6de364ffd8b6c9a1c036e6c8b",
            "experiment_id": "2",
            "run_name": "placid-hog-117",
            "user_id": "reema",
            "status": "FINISHED",
            "start_time": 1743451062244,
            "end_time": 1743451069862,
            "artifact_uri": "file:///C:/Users/reema/mlruns/2/a39946f6de364ffd8b6c9a1c036e6c8b/artifacts",
            "lifecycle_stage": "active",
            "run_id": "a39946f6de364ffd8b6c9a1c036e6c8b"
        },
        "data": {
            "metrics": [
                {
                    "key": "training_precision_score",
                    "value": 1.0,
                    "timestamp": 1743451062464,
                    "step": 0
                },
                {
                    "key": "training_recall_score",
                    "value": 1.0,
                    "timestamp": 1743451062464,
                    "step": 0
                },
                {
    

In [50]:
# Get Run by ID
def get_run_by_id(run_id):
    url = f"{MLFLOW_TRACKING_URI}/api/2.0/mlflow/runs/get"
    params = {"run_id": run_id}
    response = requests.get(url, params=params)
    if response.status_code == 200:
        data = response.json()
        with open(f"run_{run_id}.json", "w") as json_file:
            json.dump(data, json_file, indent=4)
        print(f"Run {run_id} saved.")
    else:
        print(f"Failed to get run {run_id}:", response.status_code)

# Example usage:
get_run_by_id("a39946f6de364ffd8b6c9a1c036e6c8b")  # Replace with your run ID


Run a39946f6de364ffd8b6c9a1c036e6c8b saved.


In [51]:
import requests

MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
url = f"{MLFLOW_TRACKING_URI}/api/2.0/mlflow/registered-models/search"

# Send a GET request to search for registered models
response = requests.get(url)

if response.status_code == 200:
    models_data = response.json()
    for model in models_data['registered_models']:
        print(f"Model Name: {model['name']}")
else:
    print(f"Error: {response.status_code}")
    print(response.text)


Model Name: tracking-quickstart


In [52]:
import requests
import json

# MLflow tracking URI
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
url = f"{MLFLOW_TRACKING_URI}/api/2.0/mlflow/registered-models/get"

# Replace with the actual model name you want to fetch
model_name = "tracking-quickstart"  # Replace with your registered model name

# Prepare the parameters (name)
params = {
    "name": model_name
}

# Send the request to fetch the registered model details
response = requests.get(url, params=params)

# Check if the request was successful
if response.status_code == 200:
    model_data = response.json()

    # Define the folder and file path where you want to save the JSON
    folder_path = "ML_RUNS_MANUAL_LOGGING"  # Replace with the desired folder path
    file_name = f"{model_name}_registered_model.json"  # Name of the file
    file_path = os.path.join(folder_path, file_name)

    # Ensure the directory exists
    os.makedirs(folder_path, exist_ok=True)

    # Save the response data to the file
    with open(file_path, 'w') as json_file:
        json.dump(model_data, json_file, indent=4)

    print(f"Registered model data saved to {file_path}")
else:
    print(f"Error: {response.status_code}")
    print(response.text)


Registered model data saved to ML_RUNS_MANUAL_LOGGING\tracking-quickstart_registered_model.json


File auto generation for db repo

In [55]:
import json
file_path
# Load JSON files
def load_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Save JSON file
def save_json(file_path, data):
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=2, ensure_ascii=False)
identifier = data.get("tables", [])[0].get("id") if data.get("tables") else None

# Field Mapping (Source Field -> Destination Field)
field_mapping = {
 
    "identifier": source_file.,
    "title": "",
    "description": "",
    "keywords": [],
    "creators": "",
    "created_date": "",
    "modified_date": "",
    "provenance": "",
    "access_rights": "",
    "license": "",
    "source": "",
    "relations": [],
    "storage_location": "",
    "metadata_schema": "",
    "format": "",
    "size": "",
    "checksum": "",
    "modification_history": "",
    "funding": "",
    "contributors": [],
    "derived_from": "",
    "is_version": "",
    "contact_point": "",
    "FAIR_compliance_score": "",
    "documentation": "",
    "reference_strategy": "",
    "dataset": {
      "collection_method": "",
      "data_structure": "",
      "num_records": "",
      "num_columns": "",
      "column_details": {
        "id": "",
        "datatype": "",
        "description": "",
        "size": "",
        "mean": "",
        "median": "",
        "standard_deviation": "",
        "is_null_allowed": ""
      },
      "categorical": {
        "unique_values": "",
        "most_frequent": ""
      }
    },
    "missing_data": {
      "total_missing_values": "",
      "missing_columns": [],
      "missing_handling": ""
    },
    "outliers": {
      "columns_with_outliers": [],
      "handling_method": ""
    },
    "sampling_strategy": "",
    "data_quality": "",
    "bias_check": "",
    "anonymization": "",
    "geospatial_info": "",
    "temporal_coverage": "",
    "usage_guidelines": "",
    "known_issues": "",
    "citations": "",
    "ethics_approval": ""
  }




# Recursive function to update destination JSON with values from source JSON
def update_json(target, source, mapping):
    for src_key, dest_key in mapping.items():
        if src_key in source:
            target_path = dest_key.split(".")  # Handle nested fields
            update_nested_field(target, target_path, source[src_key])

# Helper function to update nested fields
def update_nested_field(target, path, value):
    key = path[0]
    if len(path) == 1:
        target[key] = value
    else:
        if key not in target:
            target[key] = {}
        update_nested_field(target[key], path[1:], value)

# Paths
destination_file = "Autogenerated_json/auto_gen_metadata.json"
source_file = "Code_fetched_metadata_dbrep/DB_REPO_database_details.json"

# Load data
destination_data = load_json(destination_file)
source_data = load_json(source_file)

# Update destination JSON
update_json(destination_data["structured_repository_dbrepo"], source_data, field_mapping)

# Save the updated destination JSON
save_json(destination_file, destination_data)

print(f"Updated JSON saved to {destination_file}")


Updated JSON saved to Autogenerated_json/auto_gen_metadata.json


In [70]:
source_file = "Code_fetched_metadata_dbrep/DB_REPO_database_details.json"
source_data = load_json(source_file)
identifier = source_data.get("id")
# title = source_data.get("tables", [])[0].get("is_versioned") if data.get("tables") else None
title = source_data.get("tables")[0].get("columns")

print(f"Fetched identifier: {title}")

Fetched identifier: [{'id': '75811fa4-5d84-4e37-b256-09fafdb00f6a', 'name': 'Id', 'alias': None, 'size': None, 'd': None, 'mean': 76, 'median': 76, 'concept': None, 'unit': None, 'description': None, 'enums': [], 'sets': [], 'database_id': 'c3a42d17-42b7-43c9-a504-2363fb4c9c8d', 'table_id': '991f4e40-4524-4dd5-a94b-aa0bf4862f6f', 'ord': 0, 'internal_name': 'id', 'index_length': None, 'length': None, 'type': 'bigint', 'data_length': None, 'max_data_length': None, 'num_rows': None, 'val_min': None, 'val_max': None, 'std_dev': 43, 'is_null_allowed': False}, {'id': '92330917-0d50-44e4-acb2-bef47475c8ad', 'name': 'SepalLengthCm', 'alias': None, 'size': 40, 'd': 20, 'mean': 6, 'median': 6, 'concept': None, 'unit': None, 'description': None, 'enums': [], 'sets': [], 'database_id': 'c3a42d17-42b7-43c9-a504-2363fb4c9c8d', 'table_id': '991f4e40-4524-4dd5-a94b-aa0bf4862f6f', 'ord': 1, 'internal_name': 'sepallengthcm', 'index_length': None, 'length': None, 'type': 'decimal', 'data_length': None, '

In [58]:
import json

# Function to load a JSON file
def load_json(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

# Function to save a JSON file
def save_json(file_path, data):
    with open(file_path, 'w') as file:
        json.dump(data, file, indent=4)

# Source file path
source_file = "Code_fetched_metadata_dbrep/DB_REPO_database_details.json"

# Load source data to fetch the identifier
source_data = load_json(source_file)
identifier = source_data.get("id")  
print(f"Fetched identifier: {identifier}")

title = source_data.get("name")  
print(f"Fetched identifier: {title}")

description = source_data.get("description")  
print(f"Fetched identifier: {description}")

identifier = source_data.get("creators")  
print(f"Fetched identifier: {identifier}")

created_date = "Data not available" 
print(f"Fetched identifier: {created_date}")

##modified_date--from history data
##### provenance---> DOI???
##### license--> DOI?
# source: "Original source if derived from another dataset (PROV-O)",
# relations: ["Links to related datasets (FAIR-DATA, DataCite)"],
# storage_location: "Physical or digital location of dataset (FAIR-DATA)",
access_rights = source_data.get("accesses")  
print(f"Fetched identifier: {access_rights}")

metadata_schema ="DataCite Metadata Schema"
print(f"Fetched identifier: {metadata_schema}")

format = "csv/ tsv"
print(f"Fetched identifier: {format}")

is_versioned = source_data.get("tables")[0].get("is_versioned")
print(f"Fetched identifier: {is_versioned}")

contact_point = source_data.get("contact")  
print(f"Fetched identifier: {contact_point}")

num_columns = len(source_data.get("tables")[0].get("columns")) 
print(f"Fetched identifier: {num_columns}")

column_details = source_data.get("tables")[0].get("columns")  
print(f"Fetched identifier: {column_details}")

identifier = source_data.get("id")  
print(f"Fetched identifier: {identifier}")

identifier = source_data.get("id")  
print(f"Fetched identifier: {identifier}")


# Destination file path (where you want to map the value of 'identifier')
destination_file = "Autogenerated_json/auto_gen_metadata.json"

# Load destination data
destination_data = load_json(destination_file)

# Set the identifier value in the destination JSON (update the desired field)
destination_data['identifier'] = identifier


for key in destination_data.keys():
    # You can add conditions to update other fields if needed
    if isinstance(destination_data[key], dict):  # If the field is a dictionary, you can iterate further
        for sub_key in destination_data[key].keys():
            print(f"Key: {sub_key}, Value: {destination_data[key][sub_key]}")
    else:
        print(f"Key: {key}, Value: {destination_data[key]}")

# Save the modified destination JSON back to the file
save_json(destination_file, destination_data)

print(f"Updated {destination_file} with the new identifier.")


Fetched identifier: c3a42d17-42b7-43c9-a504-2363fb4c9c8d
Key: identifier, Value: 
Key: title, Value: 
Key: description, Value: 
Key: keywords, Value: []
Key: creators, Value: 
Key: created_date, Value: 
Key: modified_date, Value: 
Key: provenance, Value: 
Key: access_rights, Value: 
Key: license, Value: 
Key: source, Value: 
Key: relations, Value: []
Key: storage_location, Value: 
Key: metadata_schema, Value: 
Key: format, Value: 
Key: size, Value: 
Key: checksum, Value: 
Key: modification_history, Value: 
Key: funding, Value: 
Key: contributors, Value: []
Key: derived_from, Value: 
Key: is_version, Value: 
Key: contact_point, Value: 
Key: FAIR_compliance_score, Value: 
Key: documentation, Value: 
Key: reference_strategy, Value: 
Key: dataset, Value: {'collection_method': '', 'data_structure': '', 'num_records': '', 'num_columns': '', 'column_details': {'id': '', 'datatype': '', 'description': '', 'size': '', 'mean': '', 'median': '', 'standard_deviation': '', 'is_null_allowed': ''}, '

In [None]:
dbrepo, mlflow,jupyter, git hub refined and found the data source. 


    Next
    1. dbrepo file update and clean up the dtorage n fetching
2. github file update and clean up the dtorage n fetching
3. NLFLOW file update and clean up the dtorage n fetching
4. jupyter file update and clean up the dtorage n fetching

automate generation of file and make a lisr of invenio vs dbrepo
then refine invenio meta
auto gen file for invenio

clean up code

proposal
PPT
ans questions to solve
plan visualization
and mail
implement viz

start report

Extend to time series
redo experiment

Extend to DNN
redo experiment