# Provena Dataset Query

This notebook demonstrates how a Provena user can query datasets in the Provena Data Store. This will focus on these use cases:
- Finding a dataset by name or title 
- Fetching a dataset metadata record and assembling its representation in JSON
- Exporting a dataset metadata record in a different format




## Configuration setup

In [1]:
# This is a small helper class which provides a config object for validation and
# a loader function
import example_workflow_config

# this contains helpers for interacting with the registry
import registry

import search


# This is a helper function for managing authentication with Provena
import mdsisclienttools.auth.TokenManager as ProvenaAuth


import json
import time
import requests

In [2]:
# Provena deployment settings - point these at your own Provena instance

# Base domain shared by all of the Provena service endpoints built below
PROVENA_DOMAIN = "dev.rrap-is.com"

# Keycloak realm URL for the instance. This is NOT derived from PROVENA_DOMAIN,
# so it has to be edited separately when the domain changes.
kc_endpoint = "https://auth.dev.rrap-is.com/auth/realms/rrap"

# Deployment stage handed to the auth manager
stage = "DEV"

# Service endpoints, each one a sub-domain of PROVENA_DOMAIN
registry_endpoint = f"https://registry-api.{PROVENA_DOMAIN}"
provenance_endpoint = f"https://prov-api.{PROVENA_DOMAIN}"
data_store_endpoint = f"https://data-api.{PROVENA_DOMAIN}"
job_endpoint = f"https://job-api.{PROVENA_DOMAIN}"
search_endpoint = f"https://search.{PROVENA_DOMAIN}"

In [3]:
# Build the device-flow auth manager. This may open a browser window if you have
# not signed in recently. Tokens are cached in .tokens.json, so make sure that
# file is listed in .gitignore.
provena_auth = ProvenaAuth.DeviceFlowManager(keycloak_endpoint=kc_endpoint, stage=stage)

# Callable passed to the Provena helper methods whenever they need credentials
get_auth = provena_auth.get_auth

No storage or object provided, using default location: .tokens.json.
Using storage type: FILE.
Using DEVICE auth flow.
Attempting to generate authorisation tokens.

Looking for existing tokens in local storage.

Validating found tokens

Trying to use found tokens to refresh the access token.

Token refresh successful.



## 1. Finding a dataset by name or title, then getting its metadata record

- Finding a dataset by name or title 
- Fetching a dataset metadata record and assembling its representation in JSON

Issue a search query. This returns the ID and the relevance score for each search result.

We then iterate over each search result's ID and fetch the dataset metadata.


In [4]:
def pprint_json(content) -> None:
    """Print `content` as JSON text indented by two spaces."""
    rendered = json.dumps(content, indent=2)
    print(rendered)

# Free-text query used to look up datasets
q = "flood"

# Run the search, restricted to the DATASET subtype, then print the results
# as indented JSON.
search_results = search.search_dataset(
    search_endpoint=search_endpoint,
    registry_endpoint=registry_endpoint,
    query=q,
    subtype_filter="DATASET",
    # NOTE(review): the limit is passed as a string - confirm that
    # search.search_dataset expects a str here rather than an int.
    record_limit="100",
    auth=get_auth(),
)
pprint_json(search_results)

Fetching from registry, id: 10378.1/1759047...
Fetching from registry, id: 10378.1/1759043...
Fetching from registry, id: 10378.1/1759051...
Fetching from registry, id: 10378.1/1759045...
Fetching from registry, id: 10378.1/1759053...
Fetching from registry, id: 10378.1/1759037...
Fetching from registry, id: 10378.1/1759039...
Fetching from registry, id: 10378.1/1759049...
Fetching from registry, id: 10378.1/1759041...
[
  {
    "id": "10378.1/1759047",
    "score": 7.4073105,
    "dataset_metadata": {
      "display_name": "Merged Murray Darling Basin (MDB) FwDET flood depth estimates",
      "collection_format": {
        "associations": {
          "organisation_id": "10378.1/1758950",
          "data_custodian_id": null,
          "point_of_contact": null
        },
        "approvals": {
          "ethics_registration": {
            "relevant": false,
            "obtained": false
          },
          "ethics_access": {
            "relevant": false,
            "obtained": fal

## 2. Transform the first result to a different format


In [9]:
# Pick the first search hit and keep only its metadata record.
# Fail with a clear message when the search matched nothing, instead of the
# bare "list index out of range" that indexing an empty list would give.
if not search_results:
    raise IndexError(
        "The search returned no datasets; adjust the query and re-run the search cell."
    )
dataset_record = search_results[0]['dataset_metadata']


In [10]:
# Export the metadata record as block-style YAML
import yaml

dataset_yaml = yaml.dump(dataset_record, default_flow_style=False)
print(dataset_yaml)


collection_format:
  approvals:
    ethics_access:
      obtained: false
      relevant: false
    ethics_registration:
      obtained: false
      relevant: false
    export_controls:
      obtained: false
      relevant: false
    indigenous_knowledge:
      obtained: false
      relevant: false
  associations:
    data_custodian_id: null
    organisation_id: 10378.1/1758950
    point_of_contact: null
  dataset_info:
    access_info:
      description: null
      reposited: true
      uri: null
    created_date: '2023-09-25'
    description: "Merged outputs of MDB FwDET flood depth estimation workflow collated\
      \ from per region outputs. \n\nSee https://github.com/csiro-hydroinformatics/water-depth-estimation\
      \ and https://doi.org/10.1038/s41597-023-02559-4 for more information about\
      \ process details."
    formats: null
    keywords: null
    license: https://gbrrestoration.github.io/rrap-mds-knowledge-hub/information-system/licenses.html#copyright-all-rights-res

## 2.1 Try outputting a Word Document


In [43]:
# Setting up the Doc python code

import docx

# Only needed when hyperlink runs use the built-in "Hyperlink" style (see add_hyperlink)
def get_or_create_hyperlink_style(d):
    """Return the name of the "Hyperlink" character style, adding it if missing.

    A document that has never contained a hyperlink will likely lack the
    built-in "Hyperlink" style. Word versions define that style differently and
    there is no single predefined value; the definition used here follows the
    Word 2019 default theme, without the theme reference.

    Args:
        d: The document whose ``styles`` collection is checked and, when
            necessary, extended.

    Returns:
        The string ``"Hyperlink"``.
    """
    styles = d.styles
    if "Hyperlink" in styles:
        return "Hyperlink"

    character_style = docx.enum.style.WD_STYLE_TYPE.CHARACTER

    # "Hyperlink" is based on "Default Character Font", so make sure that exists first
    if "Default Character Font" not in styles:
        default_font = styles.add_style("Default Character Font", character_style, True)
        default_font.element.set(docx.oxml.shared.qn('w:default'), "1")
        default_font.priority = 1
        default_font.hidden = True
        default_font.unhide_when_used = True

    link_style = styles.add_style("Hyperlink", character_style, True)
    link_style.base_style = styles["Default Character Font"]
    link_style.unhide_when_used = True
    # Blue (0x0563C1), underlined text
    link_style.font.color.rgb = docx.shared.RGBColor(0x05, 0x63, 0xC1)
    link_style.font.underline = True

    return "Hyperlink"

def add_hyperlink(paragraph, text, url):
    """Append a clickable hyperlink to the end of `paragraph`.

    Args:
        paragraph: The paragraph that receives the link.
        text: The visible link text.
        url: The external target address.

    Returns:
        The ``w:hyperlink`` XML element that was appended to the paragraph.
    """
    part = paragraph.part

    # Register the URL as an external relationship of the part; the returned
    # relationship id is what the hyperlink element refers to.
    r_id = part.relate_to(
        url, docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external=True
    )

    oxml = docx.oxml.shared

    # <w:hyperlink r:id="...">
    link_el = oxml.OxmlElement('w:hyperlink')
    link_el.set(oxml.qn('r:id'), r_id)

    # Run (wrapper around a fresh <w:r> element) holding the visible text
    run = docx.text.run.Run(oxml.OxmlElement('w:r'), paragraph)
    run.text = text

    # Use the built-in hyperlink style, defining it first if the document lacks it.
    # Alternative: format the run directly, e.g. set run.font.color.rgb to
    # docx.shared.RGBColor(0, 0, 255) and run.font.underline to True.
    run.style = get_or_create_hyperlink_style(part.document)

    # Nest the run inside the hyperlink and the hyperlink inside the paragraph
    link_el.append(run._element)
    paragraph._p.append(link_el)
    return link_el

def _format_cell_value(value):
    """Render one metadata value as the text shown in a table cell."""
    if value is None:
        return ""
    if isinstance(value, str):
        return value
    if isinstance(value, (list, tuple)):
        return ", ".join(_format_cell_value(item) for item in value)
    return str(value)


def create_word_doc_for_dataset_record(dataset_record, title, filename):
    """Write a Word document summarising a dataset metadata record.

    The document holds a heading followed by a two-column "Attribute" / "Value"
    table. The first data row is the dataset ID, rendered as a hyperlink to its
    hdl.handle.net address. The remaining rows are the entries of
    ``dataset_record['collection_format']['dataset_info']``, except
    ``access_info``, which is left out.

    Args:
        dataset_record: Metadata record; must provide ``'id'`` and
            ``['collection_format']['dataset_info']``.
        title: Text of the document heading.
        filename: Path the .docx file is saved to.

    Raises:
        KeyError: If ``dataset_record`` lacks one of the keys listed above.
    """
    doc = docx.Document()

    # Level 0 heading, i.e. the document title
    doc.add_heading(title, 0)

    dataset_info = dataset_record['collection_format']['dataset_info']
    rows = [
        (key, value) for key, value in dataset_info.items() if key != "access_info"
    ]

    # Table with a header row
    table = doc.add_table(rows=1, cols=2)
    header_cells = table.rows[0].cells
    header_cells[0].text = 'Attribute'
    header_cells[1].text = 'Value'

    # Dataset ID row: the value is a hyperlink rather than plain text
    id_cells = table.add_row().cells
    id_cells[0].text = "Dataset ID"
    dataset_id = dataset_record['id']
    # The base URL has no trailing slash of its own, so joining with "/" gives
    # exactly one slash before the handle.
    handle_url = "https://hdl.handle.net/{}".format(dataset_id)
    add_hyperlink(id_cells[1].paragraphs[0], dataset_id, handle_url)

    # One row per remaining attribute. Table cells only take text, so every
    # value (None, list, dict, bool, ...) is converted to a string first.
    for attr, val in rows:
        cells = table.add_row().cells
        cells[0].text = str(attr)
        cells[1].text = _format_cell_value(val)

    doc.save(filename)

In [45]:
# Write the selected record out as a Word document
create_word_doc_for_dataset_record(
    dataset_record=dataset_record,
    title="Dataset example",
    filename="test2.docx",
)