# Provena Client Library Workflow Guide and Example. 

This notebook contains guidance and examples on how to use the Provena Client Library with common Provena operations (create, fetch, model run lodge etc.)

The client library is a user-friendly interface for interacting with the various APIs of Provena (Registry, Prov, Datastore, etc.) through code, and is currently compatible with the Python programming language.

To find further information or explore other Provena operations of the client library: https://provena.github.io/provena-python-client/

### Client Configuration and Initialisation

In [1]:
# Import initial modules needed.
from provenaclient import ProvenaClient, Config
from provenaclient.auth import DeviceFlow
from provenaclient.auth.implementations import OfflineFlow
from pprint import pprint


Instantiate the client library by providing the domain your Provena instance is hosted on and the name of your Keycloak realm. 

In [2]:
# Provena config - replace with your Provena instance endpoints.
client_config = Config(
    domain="dev.rrap-is.com",
    realm_name="rrap"
)

# Set to True to authenticate with an offline token read from the
# PROVENA_API_TOKEN environment variable (loaded from a .env file) instead of
# the interactive device flow.
offline_mode = False

if offline_mode:
    # Imported here because only the offline branch needs these names; the
    # original cell called load_dotenv()/os.getenv() without importing either,
    # which raised NameError as soon as offline_mode was switched on.
    import os

    from dotenv import load_dotenv  # python-dotenv package

    load_dotenv()
    offline_token = os.getenv('PROVENA_API_TOKEN')
    assert offline_token, "Offline token must be present in .env file e.g. PROVENA_API_TOKEN=1234."
    print("Offline mode activated and token found in .env file.")
    auth = OfflineFlow(config=client_config, client_id="automated-access", offline_token=offline_token)
else:
    # Interactive login: prints a verification URL and user code to follow.
    auth = DeviceFlow(config=client_config, client_id="client-tools")

# Instantiate the client.
client = ProvenaClient(config=client_config, auth=auth)

2024-07-31 23:45:43,213 - auth-logger - ERROR - The token used for refresh is invalid or has potentially expired. Something went wrong during token refresh. Status code: 400.


Verification URL: https://auth.dev.rrap-is.com/auth/realms/rrap/device?user_code=MYIC-VLRE
User Code: MYIC-VLRE


## Querying Datastore API.

We will take a look at querying and interacting with the Datastore API, exploring common operations: fetching a dataset, minting a dataset, and fetching all datasets in various formats (paginated, all).

In [3]:

dataset = await client.datastore.fetch_dataset(id = "10378.1/1908974")

print(dataset) # Fetched dataset pythonic object.
print()
print("Dataset Query Details:", dataset.status.details) # Accessing fetched dataset query details
print()
print("Dataset Display Name:", dataset.item.display_name) # Accessing fetched dataset name

status=Status(success=True, details="Successfully fetched data for handle '10378.1/1908974'") item=ItemDataset(display_name='TEst', user_metadata=None, collection_format=CollectionFormat(associations=CollectionFormatAssociations(organisation_id='10378.1/1877551', data_custodian_id=None, point_of_contact=None), approvals=CollectionFormatApprovals(ethics_registration=DatasetEthicsRegistrationCheck(relevant=False, obtained=False), ethics_access=DatasetEthicsAccessCheck(relevant=False, obtained=False), indigenous_knowledge=IndigenousKnowledgeCheck(relevant=False, obtained=False), export_controls=ExportControls(relevant=False, obtained=False)), dataset_info=CollectionFormatDatasetInfo(name='TEst', description='TEst', access_info=AccessInfo(reposited=False, uri='http://google.com', description='test'), publisher_id='10378.1/1877551', created_date=CreatedDate(relevant=True, value=datetime.date(2024, 6, 5)), published_date=PublishedDate(relevant=True, value=datetime.date(2024, 6, 21)), license

In [None]:
from ProvenaInterfaces.RegistryModels import *

dataset_to_create = CollectionFormat(
        associations=CollectionFormatAssociations(
        organisation_id="10378.1/1893860",
        data_custodian_id="10378.1/1893843",
        point_of_contact= None
        ),
        approvals=CollectionFormatApprovals(
            ethics_registration = DatasetEthicsRegistrationCheck(relevant=False, obtained=False),
            ethics_access=DatasetEthicsAccessCheck(relevant= False, obtained= False),
            indigenous_knowledge=IndigenousKnowledgeCheck(relevant=False, obtained= False),
            export_controls=ExportControls(relevant=False, obtained=False)
        ),
        dataset_info=CollectionFormatDatasetInfo(
            name="Parth testing",
            description="testing dataset",
            access_info=AccessInfo(reposited=True, uri=None, description=None),
            publisher_id="10378.1/1893860",
            published_date=date.today(),
            license = "https://www.google.com", #type:ignore
            created_date=date.today(),
            purpose= None,
            rights_holder=None,
            usage_limitations=None,
            preferred_citation=None,
            formats = None,
            keywords= None,
            user_metadata= None,
            version = None
        )
    )

created_dataset = await client.datastore.mint_dataset(dataset_mint_info=dataset_to_create)

print("Created Dataset handle is:", created_dataset.handle)
print("Created Dataset reqeuest details:", created_dataset.status.details)

In [24]:
from ProvenaInterfaces.RegistryAPI import *

# Sort criteria to receive datasets.
sort_criteria = NoFilterSubtypeListRequest(
            sort_by=SortOptions(sort_type=SortType.DISPLAY_NAME, ascending=False, begins_with=None), 
            pagination_key=None, 
            page_size=10
        )


list_datasets = await client.datastore.list_datasets(list_dataset_request=sort_criteria)

for i in list_datasets:
    print(i)


('status', Status(success=True, details='Successfully listed items.'))
('items', [ItemDataset(display_name='WaveEcologyModel Output Grapes', user_metadata=None, collection_format=CollectionFormat(associations=CollectionFormatAssociations(organisation_id='10378.1/1875957', data_custodian_id=None, point_of_contact=None), approvals=CollectionFormatApprovals(ethics_registration=DatasetEthicsRegistrationCheck(relevant=False, obtained=False), ethics_access=DatasetEthicsAccessCheck(relevant=False, obtained=False), indigenous_knowledge=IndigenousKnowledgeCheck(relevant=False, obtained=False), export_controls=ExportControls(relevant=False, obtained=False)), dataset_info=CollectionFormatDatasetInfo(name='WaveEcologyModel Output Grapes', description='An opus of oceanic orchestration, where data motifs crescendo into a symphony of ecological enlightenment, harmonizing the cacophony of coral complexity into a serenade of predictive precision.', access_info=AccessInfo(reposited=True, uri=None, descr

In [26]:
# Getting all datasets in datastore with specified sort criteria.
all_datasets = await client.datastore.list_all_datasets(sort_criteria=sort_criteria)
pprint(all_datasets)

[ItemDataset(display_name='TEst', user_metadata=None, collection_format=CollectionFormat(associations=CollectionFormatAssociations(organisation_id='10378.1/1877551', data_custodian_id=None, point_of_contact=None), approvals=CollectionFormatApprovals(ethics_registration=DatasetEthicsRegistrationCheck(relevant=False, obtained=False), ethics_access=DatasetEthicsAccessCheck(relevant=False, obtained=False), indigenous_knowledge=IndigenousKnowledgeCheck(relevant=False, obtained=False), export_controls=ExportControls(relevant=False, obtained=False)), dataset_info=CollectionFormatDatasetInfo(name='TEst', description='TEst', access_info=AccessInfo(reposited=False, uri='http://google.com', description='test'), publisher_id='10378.1/1877551', created_date=datetime.date(2024, 6, 5), published_date=datetime.date(2024, 6, 21), license=AnyHttpUrl('https://gbrrestoration.github.io/rrap-mds-knowledge-hub/information-system/licenses.html#copyright-all-rights-reserved-', ), purpose=None, rights_holder=No

In [28]:
# Get a total number of datasets (specified limit) with provided sort criteria. 
async for dataset in client.datastore.for_all_datasets(list_dataset_request=sort_criteria, total_limit=30):
    pprint(dataset)


ItemDataset(display_name='Parth testing', user_metadata=None, collection_format=CollectionFormat(associations=CollectionFormatAssociations(organisation_id='10378.1/1893860', data_custodian_id='10378.1/1893843', point_of_contact=None), approvals=CollectionFormatApprovals(ethics_registration=DatasetEthicsRegistrationCheck(relevant=False, obtained=False), ethics_access=DatasetEthicsAccessCheck(relevant=False, obtained=False), indigenous_knowledge=IndigenousKnowledgeCheck(relevant=False, obtained=False), export_controls=ExportControls(relevant=False, obtained=False)), dataset_info=CollectionFormatDatasetInfo(name='Parth testing', description='testing dataset', access_info=AccessInfo(reposited=True, uri=None, description=None), publisher_id='10378.1/1893860', created_date=datetime.date(2024, 5, 28), published_date=datetime.date(2024, 5, 28), license=AnyHttpUrl('https://www.google.com', ), purpose=None, rights_holder=None, usage_limitations=None, preferred_citation=None, spatial_info=None, t

## Querying Provenance API.

#### We will now take a look at exploring some of the common operations of the PROV-API with existing and valid entities. 

Exploring Lineage

In [4]:
# Upstream

print("Exploring upstream query")

upstream_result = await client.prov_api.explore_upstream(starting_id="10378.1/1904964")
pprint(upstream_result)
print()
pprint(upstream_result.graph.get('nodes'))

print()

print("Exploring downstream query")

downstream_result = await client.prov_api.explore_downstream(starting_id="10378.1/1904961")
pprint(downstream_result)
print()
pprint(downstream_result.graph.get('nodes'))


Exploring upstream query
LineageResponse(status=Status(success=True, details='Made lineage query (with depth 3) to neo4j backend.'), record_count=5, graph={'directed': True, 'multigraph': False, 'graph': {}, 'nodes': [{'item_category': 'ENTITY', 'item_subtype': 'DATASET', 'id': '10378.1/1904964'}, {'item_category': 'ACTIVITY', 'item_subtype': 'CREATE', 'id': '10378.1/1904975'}, {'item_category': 'AGENT', 'item_subtype': 'PERSON', 'id': '10378.1/1893843'}, {'item_category': 'ENTITY', 'item_subtype': 'DATASET_TEMPLATE', 'id': '10378.1/1905250'}, {'item_category': 'ACTIVITY', 'item_subtype': 'CREATE', 'id': '10378.1/1905252'}], 'links': [{'type': 'wasGeneratedBy', 'source': '10378.1/1904964', 'target': '10378.1/1904975'}, {'type': 'wasAttributedTo', 'source': '10378.1/1904964', 'target': '10378.1/1893843'}, {'type': 'wasInfluencedBy', 'source': '10378.1/1904964', 'target': '10378.1/1905250'}, {'type': 'wasAssociatedWith', 'source': '10378.1/1904975', 'target': '10378.1/1893843'}, {'type':

In [9]:
# Contributing and Effected Datasets

contributing_datasets = await client.prov_api.get_contributing_datasets(starting_id = "10378.1/1904964")
print("Contributing datasets " + "\n", contributing_datasets)

effected_datasets = await client.prov_api.get_effected_datasets(starting_id = "10378.1/1904964")
print("Effected datasets " + "\n", effected_datasets)

Contributing datasets 
 status=Status(success=True, details='Made upstream contribution query (with depth 3) to neo4j backend.') record_count=0 graph={'directed': True, 'multigraph': False, 'graph': {}, 'nodes': [], 'links': []}
Effected datasets 
 status=Status(success=True, details='Made downstream effect query (with depth 3) to neo4j backend.') record_count=5 graph={'directed': True, 'multigraph': False, 'graph': {}, 'nodes': [{'item_category': 'ENTITY', 'item_subtype': 'DATASET', 'id': '10378.1/1904961'}, {'item_category': 'ACTIVITY', 'item_subtype': 'MODEL_RUN', 'id': '10378.1/1926259'}, {'item_category': 'ENTITY', 'item_subtype': 'DATASET', 'id': '10378.1/1904964'}, {'item_category': 'ACTIVITY', 'item_subtype': 'MODEL_RUN', 'id': '10378.1/1926270'}, {'item_category': 'ACTIVITY', 'item_subtype': 'MODEL_RUN', 'id': '10378.1/1926271'}], 'links': [{'type': 'wasGeneratedBy', 'source': '10378.1/1904961', 'target': '10378.1/1926259'}, {'type': 'wasGeneratedBy', 'source': '10378.1/190496

Lodging Model Runs && Querying with Job-API

In [12]:
from ProvenaInterfaces.ProvenanceAPI import ModelRunRecord, TemplatedDataset, DatasetType, AssociationInfo
from ProvenaInterfaces.AsyncJobAPI import JobStatus

# Handles of the registered entities the model run refers to.
# The original cell read these from a `config` object that is never defined in
# this notebook, so it failed with NameError on a fresh kernel.
# NOTE(review): the example values below are inferred from the outputs recorded
# elsewhere in this notebook (dev instance) - TODO confirm them, and replace
# them with handles that exist in your own Provena instance.
WORKFLOW_TEMPLATE_ID = "10378.1/1905251"
INPUT_DATASET_TEMPLATE_ID = "10378.1/1905250"
INPUT_DATASET_ID = "10378.1/1904964"
OUTPUT_DATASET_TEMPLATE_ID = "10378.1/1926245"
OUTPUT_DATASET_ID = "10378.1/1904961"
MODELLER_ID = "10378.1/1893843"
REQUESTING_ORGANISATION_ID = "10378.1/1893860"

# Building the Model Run Payload: one templated input dataset, one templated
# output dataset, and the person/organisation the run is associated with.
model_run_payload = ModelRunRecord(
    workflow_template_id=WORKFLOW_TEMPLATE_ID,
    model_version=None,
    inputs=[
        TemplatedDataset(
            dataset_template_id=INPUT_DATASET_TEMPLATE_ID,
            dataset_id=INPUT_DATASET_ID,
            dataset_type=DatasetType.DATA_STORE,
        )
    ],
    outputs=[
        TemplatedDataset(
            dataset_template_id=OUTPUT_DATASET_TEMPLATE_ID,
            dataset_id=OUTPUT_DATASET_ID,
            dataset_type=DatasetType.DATA_STORE,
        )
    ],
    annotations=None,
    display_name="Notebook Model Run Testing",
    description="Standard Provena Model Run Example",
    study_id=None,
    associations=AssociationInfo(
        modeller_id=MODELLER_ID,
        requesting_organisation_id=REQUESTING_ORGANISATION_ID,
    ),
    # Placeholder times kept from the original example.
    # NOTE(review): presumably epoch timestamps - confirm the expected unit.
    start_time=0,
    end_time=1,
)


In [13]:
# Registering Model Run
model_run_register_result = await client.prov_api.register_model_run(model_run_payload=model_run_payload)


In [14]:
# Check the response of the model run registration
print("Status of registration", model_run_register_result.status)
print("Job Session ID", model_run_register_result.session_id)


# Check the job to see if it's complete. We will do this by polling the job_api
job_result = await client.job_api.await_successful_job_completion(session_id=model_run_register_result.session_id)

while job_result.status != JobStatus.SUCCEEDED: # Keep polling on this cell till this turns to "SUCCEEDED"
    
    job_result = await client.job_api.await_successful_job_completion(session_id=model_run_register_result.session_id)
    pprint(job_result.result)
    pprint(job_result.job_type)


print()
print("Current job status:", job_result.status) 

Status of registration success=True details='Job dispatched, monitor session ID using the job API to see progress.'
Job Session ID a76fdebf-b70d-48c9-a54c-6e979161abfb


## Querying Registry API.

We will take a look at creating various entities with different subtypes (org, model) and then fetching those newly created entities through the client library. 

In [None]:
# Organisation 
org_domain_info = OrganisationDomainInfo(
    display_name="Test org",
    name="Test org",
    ror="http://example.org/test-org", #type:ignore
    user_metadata={
        "my custom": "annotation",
        "another custom": "annotation"
    }
)
created_organisation = await client.registry.organisation.create_item(create_item_request=org_domain_info)
print("Created Organisation", created_organisation)

# Model 
model_domain_info = ModelDomainInfo(
    display_name="Example model",
    name="Example model",
    description="This is a fake model",
    documentation_url="https://example_model.org", #type:ignore
    source_url="https://example_model.org", #type:ignore
    user_metadata={
        "my custom": "annotation",
        "another custom": "annotation"
    }
)
created_model = await client.registry.model.create_item(create_item_request=model_domain_info)
print("Created Model", created_model)


# Fetching items...
fetched_org = await client.registry.organisation.fetch(id = created_organisation.created_item.id)
print("Fetched Organisation", fetched_org)

fetched_model = await client.registry.model.fetch(id = created_model.created_item.id)
print("Fetched model", fetched_model)

We will take a look at listing all items present in registry based on their subtypes (Organisation, Model) for this example. 

In [22]:
from ProvenaInterfaces.RegistryAPI import GeneralListRequest

general_list_request = GeneralListRequest(
    filter_by=None,
    sort_by=None,
    pagination_key=None
)

list_org = await client.registry.organisation.list_items(list_items_payload=general_list_request)

for item in list_org.items:
    print(item, "\n")



display_name='Test org' user_metadata={'another custom': 'annotation', 'my custom': 'annotation'} name='Test org' ror=AnyHttpUrl('http://example.org/test-org', ) history=[HistoryEntry[OrganisationDomainInfo](id=0, timestamp=1720676140, reason='Initial record creation', username='parth', item=OrganisationDomainInfo(display_name='Test org', name='Test org', ror=AnyHttpUrl('http://example.org/test-org', ), user_metadata={'another custom': 'annotation', 'my custom': 'annotation'}))] id='10378.1/1925648' owner_username='parth' created_timestamp=1720676140 updated_timestamp=1720676140 item_category=<ItemCategory.AGENT: 'AGENT'> item_subtype=<ItemSubType.ORGANISATION: 'ORGANISATION'> record_type=<RecordType.COMPLETE_ITEM: 'COMPLETE_ITEM'> workflow_links=None versioning_info=None 

display_name='Test org' user_metadata={'another custom': 'annotation', 'my custom': 'annotation'} name='Test org' ror=AnyHttpUrl('http://example.org/test-org', ) history=[HistoryEntry[OrganisationDomainInfo](id=0, t

In [23]:
list_models = await client.registry.model.list_items(list_items_payload=general_list_request)

for item in list_models.items:
    print(item, "\n")



display_name='ADRIA (Test)' user_metadata={'git_release': '', 'git_branch': 'provena-update', 'git_hash': 'e1cba08dddafc9df47523676b43869beceb6c68f'} name='ADRIA (Test)' description='ADRIA (Test) for testing GitHub integration.' documentation_url=AnyHttpUrl('https://github.com/open-AIMS/ADRIA.jl', ) source_url=AnyHttpUrl('https://github.com/open-AIMS/ADRIA.jl', ) history=[HistoryEntry[ModelDomainInfo](id=1, timestamp=1721804016, reason='Updating metadata attributes to specify new git hash, branch and release if available.', username='adria-bot', item=ModelDomainInfo(display_name='ADRIA (Test)', name='ADRIA (Test)', description='ADRIA (Test) for testing GitHub integration.', documentation_url=AnyHttpUrl('https://github.com/open-AIMS/ADRIA.jl', ), source_url=AnyHttpUrl('https://github.com/open-AIMS/ADRIA.jl', ), user_metadata={'git_release': '', 'git_branch': 'provena-update', 'git_hash': 'e1cba08dddafc9df47523676b43869beceb6c68f'})), HistoryEntry[ModelDomainInfo](id=0, timestamp=1721804

General Registry Actions (Fetching without subtype, Listing All Registry Items and Count of all items in registry (client library special))

In [29]:
# Fetching without subtype. 
fetch_result = await client.registry.general_fetch_item(id = "10378.1/1876000")
for item in fetch_result.item: 
    print("Fetched item", item)


Fetched item s3
Fetched item versioning_info
Fetched item release_history
Fetched item owner_username
Fetched item workflow_links
Fetched item updated_timestamp
Fetched item release_status
Fetched item item_category
Fetched item display_name
Fetched item collection_format
Fetched item history
Fetched item item_subtype
Fetched item created_timestamp
Fetched item id
Fetched item universal_partition_key
Fetched item record_type


In [30]:
# Listing all registry items. 
all_general_registry_items = await client.registry.list_general_registry_items(general_list_request=general_list_request)
for item in all_general_registry_items.items: 
    print(item)


{'owner_username': 'parth', 'prov_serialisation': '{"prefix": {"default": "http://hdl.handle.net/"}, "activity": {"10378.1/1935470": {"model_run/10378.1/1935470": true, "item_category": "ACTIVITY", "item_subtype": "MODEL_RUN"}}, "entity": {"10378.1/1904964": {"model_run/10378.1/1935470": true, "item_category": "ENTITY", "item_subtype": "DATASET"}, "10378.1/1904961": {"model_run/10378.1/1935470": true, "item_category": "ENTITY", "item_subtype": "DATASET"}, "10378.1/1905251": {"model_run/10378.1/1935470": true, "item_category": "ENTITY", "item_subtype": "MODEL_RUN_WORKFLOW_TEMPLATE", "prov:type": {"$": "prov:Collection", "type": "prov:QUALIFIED_NAME"}}, "10378.1/1905250": {"model_run/10378.1/1935470": true, "item_category": "ENTITY", "item_subtype": "DATASET_TEMPLATE"}, "10378.1/1926245": {"model_run/10378.1/1935470": true, "item_category": "ENTITY", "item_subtype": "DATASET_TEMPLATE"}, "10378.1/1924630": {"model_run/10378.1/1935470": true, "item_category": "ENTITY", "item_subtype": "MOD

In [33]:
# Count of all items based on subtypes. 
count_of_all_items = await client.registry.list_registry_items_with_count()
print(count_of_all_items)

{'MODEL_RUN': 112, 'VERSION': 21, 'MODEL': 26, 'CREATE': 258, 'PERSON': 29, 'DATASET': 210, 'MODEL_RUN_WORKFLOW_TEMPLATE': 7, 'DATASET_TEMPLATE': 14, 'ORGANISATION': 17, 'STUDY': 4}
