## Provena Summaries

Query Provena APIs for summary information.

### Provena workflow configuration setup

In [10]:
# This is a small helper class which provides a config object for validation and
# a loader function
import example_workflow_config

# this contains helpers for interacting with the registry
import registry

# This is a helper function for managing authentication with Provena
import mdsisclienttools.auth.TokenManager as ProvenaAuth
import mdsisclienttools.datastore.ReadWriteHelper as ProvenaRW
from typing import Dict, Any, Callable


import json
import time
import requests

In [2]:
# Provena deployment configuration - point these at your own Provena instance.

# Base domain shared by every Provena API endpoint built below.
PROVENA_DOMAIN = "dev.rrap-is.com"

# Keycloak realm URL used for authentication; edit to match your instance.
kc_endpoint = "https://auth.dev.rrap-is.com/auth/realms/rrap"

# Stage label passed to the auth manager.
stage = "DEV"

# Each service endpoint is a fixed sub-domain prefix on the base domain.
registry_endpoint = f"https://registry-api.{PROVENA_DOMAIN}"
provenance_endpoint = f"https://prov-api.{PROVENA_DOMAIN}"
data_store_endpoint = f"https://data-api.{PROVENA_DOMAIN}"
job_endpoint = f"https://job-api.{PROVENA_DOMAIN}"

In [3]:
# To authenticate with an offline token (API key) instead of the device flow,
# set `offline_mode` below to True AND make sure the token is available in a
# '.env' file in the format PROVENA_API_TOKEN="yourtokenhere"
import os
from dotenv import load_dotenv

# Offline (API key) authentication is opt-in; leave this False to use the
# device flow instead.
offline_mode = False

if offline_mode:
    # Read the .env file so that PROVENA_API_TOKEN becomes visible through
    # the process environment.
    load_dotenv('.env')
    offline_token = os.environ.get('PROVENA_API_TOKEN')
    # Stop early when the token is missing or empty.
    assert offline_token, "Offline token must be present in .env file e.g. PROVENA_API_TOKEN=1234."
    print("Offline mode activated and token found in .env file.")

In [4]:
# sets up auth connections - could potentially open browser window if not signed
# in recently - caches in .tokens.json - ensure this is included in gitignore

# Arguments common to both auth flows are collected first; the flow-specific
# ones are layered on top before the manager is constructed once.
auth_manager_kwargs = {
    "stage": stage,
    "keycloak_endpoint": kc_endpoint,
}

if offline_mode:
    # These settings allow for offline access.
    auth_manager_kwargs.update(
        # use the offline auth flow
        auth_flow=ProvenaAuth.AuthFlow.OFFLINE,
        # specify the existing offline token
        offline_token=offline_token,
        # use the automated access client which accepts offline token workflows
        client_id='automated-access',
    )
else:
    auth_manager_kwargs["auth_flow"] = ProvenaAuth.AuthFlow.DEVICE

provena_auth = ProvenaAuth.DeviceFlowManager(**auth_manager_kwargs)

# Expose the manager's get_auth callable; the registry calls in this notebook
# pass its result as their `auth` argument.
get_auth = provena_auth.get_auth

No storage or object provided, using default location: .tokens.json.
Using storage type: FILE.
Using DEVICE auth flow.
Attempting to generate authorisation tokens.

Looking for existing tokens in local storage.

Validating found tokens

Trying to use found tokens to refresh the access token.

Token refresh successful.



In [5]:
def pprint_json(content) -> None:
    """Print ``content`` as JSON text indented by two spaces.

    Args:
        content: Any value that ``json.dumps`` can serialise.
    """
    rendered = json.dumps(content, indent=2)
    print(rendered)
  
# Running tally of registry item counts, keyed by entity type.
counts = dict(datasets=0, model_runs=0, persons=0)

# Request body sent with the dataset listing call.
payload = {
    "filter_by": {"record_type": "COMPLETE_ONLY"},
    "sort_by": {"ascending": False},
    "page_size": 1000,
}

list_datasets = registry.list_datasets(
    registry_endpoint=registry_endpoint,
    data=payload,
    auth=get_auth(),
)
# To inspect the full response, uncomment: pprint_json(list_datasets)

# Only the total reported by the response is recorded.
counts["datasets"] = list_datasets['total_item_count']


Fetching from registry using post


In [6]:
# Request body sent with the model run listing call.
payload = {
    "filter_by": {"record_type": "COMPLETE_ONLY"},
    "sort_by": {"ascending": False},
    "page_size": 1000,
}

list_model_runs = registry.list_model_runs(
    registry_endpoint=registry_endpoint,
    data=payload,
    auth=get_auth(),
)
# To inspect the full response, uncomment: pprint_json(list_model_runs)

# Only the total reported by the response is recorded.
counts["model_runs"] = list_model_runs['total_item_count']


Fetching from registry using post


In [19]:
def do_listing(key: str,  func: Callable, count_arr: Dict[str,int]) -> Dict[str, int]:
    """Run one registry listing call and record its total item count.

    Args:
        key: Name under which the count is stored in ``count_arr``.
        func: Listing callable. It is invoked with the keyword arguments
            ``registry_endpoint``, ``data`` and ``auth`` and must return a
            mapping that contains ``'total_item_count'``.
        count_arr: Dictionary of counts to update. It is modified in place.

    Returns:
        The same ``count_arr`` object, now including ``key``.

    Note:
        The request body and endpoint are read from the notebook-level
        ``payload`` and ``registry_endpoint`` variables, and the auth value
        comes from calling the notebook-level ``get_auth``. Those cells must
        have been run before this function is called.
    """
    res = func(registry_endpoint=registry_endpoint, data=payload, auth=get_auth())
    # Write into the dictionary the caller supplied. The argument used to be
    # ignored in favour of the global `counts`, which made the parameter
    # misleading and the function unusable with any other dictionary.
    count_arr[key] = res['total_item_count']
    print(count_arr)
    return count_arr

# Remaining entity types to tally, paired with the registry helper that lists
# each one. They are processed in the order given.
remaining_listings = (
    ("persons", registry.list_persons),
    ("organisations", registry.list_organisations),
    ("model_run_workflow_templates", registry.list_model_run_workflow_templates),
    ("dataset_templates", registry.list_dataset_templates),
)

for listing_key, listing_func in remaining_listings:
    counts = do_listing(listing_key, listing_func, count_arr=counts)

Fetching from registry using post
{'datasets': 72, 'model_runs': 38, 'persons': 14, 'organisations': 7}
Fetching from registry using post
{'datasets': 72, 'model_runs': 38, 'persons': 14, 'organisations': 7}
Fetching from registry using post
{'datasets': 72, 'model_runs': 38, 'persons': 14, 'organisations': 7, 'model_run_workflow_templates': 11}
Fetching from registry using post
{'datasets': 72, 'model_runs': 38, 'persons': 14, 'organisations': 7, 'model_run_workflow_templates': 11, 'dataset_templates': 18}


In [20]:
# Show the accumulated counts dictionary as this cell's output.
counts

{'datasets': 72,
 'model_runs': 38,
 'persons': 14,
 'organisations': 7,
 'model_run_workflow_templates': 11,
 'dataset_templates': 18}

In [21]:
import pandas as pd

# Turn the tally into a table: one row per entity type and a single "count"
# column.
count_series = pd.Series(counts)
df = pd.DataFrame({"count": count_series})
print(df)

                              count
datasets                         72
model_runs                       38
persons                          14
organisations                     7
model_run_workflow_templates     11
dataset_templates                18
