# Harmony EOSS ZARR API Tutorial

In [None]:
# !{sys.executable} -m pip install zarr harmony-py curlify pystac fsspec boto3 s3fs

## Before you start
Before you beginning this tutorial, make sure you have an account in the Earthdata Login UAT or Production environment, which 
will be used for this notebook by visiting [https://uat.urs.earthdata.nasa.gov](https://uat.urs.earthdata.nasa.gov).
These accounts, as all Earthdata Login accounts, are free to create and only take a moment to set up.

## Set Up Authentication

We need some boilerplate up front to log in to Earthdata Login.  The function below will allow Python
scripts to log into any Earthdata Login application programmatically.  To avoid being prompted for
credentials every time you run and also allow clients such as curl to log in, you can add the following
to a `.netrc` (`_netrc` on Windows) file in your home directory:

```
machine uat.urs.earthdata.nasa.gov
    login <your username>
    password <your password>
    
machine urs.earthdata.nasa.gov
    login <your username>
    password <your password>
```

Make sure that this file is only readable by the current user or you will receive an error stating
"netrc access too permissive."

`$ chmod 0600 ~/.netrc` 


In [5]:
from urllib import request
from http.cookiejar import CookieJar
import getpass
import netrc
import json
import requests
import sys
import shutil
import harmony
import zarr
import pystac
from IPython.display import display, JSON


def setup_earthdata_login_auth(endpoint):
    """
    Set up the request library so that it authenticates against the given Earthdata Login
    endpoint and is able to track cookies between requests.  This looks in the .netrc file 
    first and if no credentials are found, it prompts for them.

    Valid endpoints include:
        uat.urs.earthdata.nasa.gov - Earthdata Login UAT (Harmony's current default)
        urs.earthdata.nasa.gov - Earthdata Login production
    """
    try:
        username, _, password = netrc.netrc().authenticators(endpoint)
    except (FileNotFoundError, TypeError):
        # FileNotFound = There's no .netrc file
        # TypeError = The endpoint isn't in the netrc file, causing the above to try unpacking None
        print('Please provide your Earthdata Login credentials to allow data access')
        print('Your credentials will only be passed to %s and will not be exposed in Jupyter' % (endpoint))
        username = input('Username:')
        password = getpass.getpass()

    manager = request.HTTPPasswordMgrWithDefaultRealm()
    manager.add_password(None, endpoint, username, password)
    auth = request.HTTPBasicAuthHandler(manager)

    jar = CookieJar()
    processor = request.HTTPCookieProcessor(jar)
    opener = request.build_opener(auth, processor)
    request.install_opener(opener)


# GET TOKEN FROM CMR 
def get_token( url: str,client_id: str, user_ip: str,endpoint: str) -> str:
    try:
        token: str = ''
        username, _, password = netrc.netrc().authenticators(endpoint)
        xml: str = """<?xml version='1.0' encoding='utf-8'?>
        <token><username>{}</username><password>{}</password><client_id>{}</client_id>
        <user_ip_address>{}</user_ip_address></token>""".format(username, password, client_id, user_ip)
        headers: Dict = {'Content-Type': 'application/xml','Accept': 'application/json'}
        resp = requests.post(url, headers=headers, data=xml)
        
        response_content: Dict = json.loads(resp.content)
        token = response_content['token']['id']
    except:
        print("Error getting the token - check user name and password", sys.exc_info()[0])
    return token

### Setup Environment Parameters



In [6]:
venue = 'prod'

In [7]:
# Defaults
cmr_root = 'cmr.earthdata.nasa.gov'
harmony_client = harmony.Client(env=harmony.Environment.PROD)
edl_root = 'urs.earthdata.nasa.gov'
zarr_service_id = 'S2009180097-POCLOUD'

In [8]:
if venue == 'uat':
    cmr_root = 'cmr.uat.earthdata.nasa.gov'
    harmony_client = harmony.Client(env=harmony.Environment.UAT)
    edl_root = 'uat.urs.earthdata.nasa.gov'
    zarr_service_id = 'TBD'

print ("Environments: ")
print ("\t" + cmr_root)
print ("\t" + harmony_client.config.root_url)
print ("\t" + edl_root)
print ("\t" + zarr_service_id)

Environments: 
	cmr.earthdata.nasa.gov
	https://harmony.earthdata.nasa.gov
	urs.earthdata.nasa.gov
	S2009180097-POCLOUD


Now call the above function to set up Earthdata Login for subsequent requests

In [9]:
setup_earthdata_login_auth(edl_root)
token_url="https://"+cmr_root+"/legacy-services/rest/tokens"
token=get_token(token_url,'jupyter', '127.0.0.1',edl_root)

##  Find POCLOUD collections associated to Zarr service

This cell searches CMR for all POCLOUD collections associated to the Zarr reformatter service. For each collection it then searches for a single granule that can be used to try requesting as Zarr.

In [10]:
cmr_url = "https://"+cmr_root+"/search/collections.umm_json?service_concept_id="+zarr_service_id
response = requests.get(cmr_url)
response.raise_for_status()
response = response.json()
collections=[item['meta']['concept-id'] for item in response['items']]

collection_granule = []
for c in collections:
    cmr_url = "https://"+cmr_root+"/search/granules.umm_json"
    response = requests.post(cmr_url, data={'collection_concept_id': c, 'page_size':1})
    response.raise_for_status()
    response = response.json()
    collection_granule += [(c, response['items'][0]['meta']['concept-id'])]
collection_granule

[('C1996881862-POCLOUD', 'G2028350353-POCLOUD'),
 ('C1990404801-POCLOUD', 'G1991231042-POCLOUD'),
 ('C1991543823-POCLOUD', 'G1992649875-POCLOUD'),
 ('C1990404814-POCLOUD', 'G1991243029-POCLOUD'),
 ('C1991543805-POCLOUD', 'G1992672377-POCLOUD'),
 ('C1991543818-POCLOUD', 'G1997506185-POCLOUD'),
 ('C1991543733-POCLOUD', 'G1996984837-POCLOUD'),
 ('C1990404807-POCLOUD', 'G1991274267-POCLOUD'),
 ('C1991543824-POCLOUD', 'G1995305376-POCLOUD'),
 ('C1990404805-POCLOUD', 'G1991290412-POCLOUD')]

## Send Harmony requests

Send a Harmony request for each collection, granule tuple that should convert the requested granule to Zarr

In [11]:
def send_request(args):
    print(args)
    collection, granule = args
    request = harmony.Request(
        collection=harmony.Collection(id=collection),
        format='application/x-zarr',
        granule_id=granule
    )
    assert request.is_valid()
    print(harmony_client.request_as_curl(request))
    job_id = harmony_client.submit(request)
    return collection, granule, job_id

collection_granule_jobid = list(map(send_request, collection_granule))
collection_granule_jobid

('C1996881862-POCLOUD', 'G2028350353-POCLOUD')
curl -X GET -H 'Accept: */*' -H 'Accept-Encoding: gzip, deflate, br' -H 'Connection: keep-alive' -H 'Cookie: urs_user_already_logged=yes; token=*****; _urs-gui_session=c04c1e17fc8f8d53c97a5b7d23ef11c3' -H 'User-Agent: harmony-py/0.3.0 python-requests/2.26.0 Linux/3.10.0-1160.21.1.el7.x86_64 CPython/3.9.0' 'https://harmony.earthdata.nasa.gov/C1996881862-POCLOUD/ogc-api-coverages/1.0.0/collections/all/coverage/rangeset?forceAsync=true&granuleId=G2028350353-POCLOUD&format=application%2Fx-zarr'
('C1990404801-POCLOUD', 'G1991231042-POCLOUD')
curl -X GET -H 'Accept: */*' -H 'Accept-Encoding: gzip, deflate, br' -H 'Connection: keep-alive' -H 'Cookie: urs_user_already_logged=yes; token=*****; _urs-gui_session=c04c1e17fc8f8d53c97a5b7d23ef11c3' -H 'User-Agent: harmony-py/0.3.0 python-requests/2.26.0 Linux/3.10.0-1160.21.1.el7.x86_64 CPython/3.9.0' 'https://harmony.earthdata.nasa.gov/C1990404801-POCLOUD/ogc-api-coverages/1.0.0/collections/all/coverag

[('C1996881862-POCLOUD',
  'G2028350353-POCLOUD',
  '3b4d98f1-0da5-4d61-8e0f-abbf4b4fb479'),
 ('C1990404801-POCLOUD',
  'G1991231042-POCLOUD',
  '82a16e70-c5e2-48dc-8e0f-3a683b894162'),
 ('C1991543823-POCLOUD',
  'G1992649875-POCLOUD',
  '18e2e952-8054-4138-ba1f-5c6db2ff22e3'),
 ('C1990404814-POCLOUD',
  'G1991243029-POCLOUD',
  '60abfa28-094c-4048-9bc7-45eb57075d2e'),
 ('C1991543805-POCLOUD',
  'G1992672377-POCLOUD',
  '32b30b30-0cc4-413e-8893-17e8cb961600'),
 ('C1991543818-POCLOUD',
  'G1997506185-POCLOUD',
  'a90649aa-1466-4b69-96a0-e4f4e63d13b0'),
 ('C1991543733-POCLOUD',
  'G1996984837-POCLOUD',
  'bc7d94d7-b4e1-450f-bdb0-9c9fe738ea11'),
 ('C1990404807-POCLOUD',
  'G1991274267-POCLOUD',
  'ed826f9a-2aef-4786-87d5-f20d4faa4aeb'),
 ('C1991543824-POCLOUD',
  'G1995305376-POCLOUD',
  '304f728c-a7c4-4e31-9312-e57e5ca2530d'),
 ('C1990404805-POCLOUD',
  'G1991290412-POCLOUD',
  '9fa0cc27-4a1f-4430-aabb-b064a03b9bab')]

## Wait for results

Wait for all Harmony jobs to complete

In [12]:
def get_results(args):
    collection, granule, job_id = args
    harmony_client.wait_for_processing(job_id)
    job_result = harmony_client.result_json(job_id)
    display(JSON(job_result))
    return collection, granule, job_id, job_result

collection_granule_jobid_resultjson = list(map(get_results, collection_granule_jobid))

<IPython.core.display.JSON object>

<IPython.core.display.JSON object>

<IPython.core.display.JSON object>

<IPython.core.display.JSON object>

<IPython.core.display.JSON object>

<IPython.core.display.JSON object>

<IPython.core.display.JSON object>

<IPython.core.display.JSON object>

<IPython.core.display.JSON object>

<IPython.core.display.JSON object>

## Start verifying zarr was returned

Now we inspect each job result using `pystac`. We open the catalog returned from the Harmony job and check how many assets are present. We would expect there to be one for each request.

In [14]:
# Make sure we can navigate the catalog for each result
for collection, granule, job_id, result_json in collection_granule_jobid_resultjson:
    cat = pystac.Catalog.from_file([l['href'] for l in result_json['links'] if l['rel'] == 'stac-catalog-json'][0])
    print(cat.description)
    print('Contains {} items.'.format(len(cat.get_item_links())))
    assert len(cat.get_item_links()) == 1, f'Result catalog for collection {collection} and granule {granule} did not contain exactly 1 item'
    zarr_item = next(cat.get_items())
    zarr_item_asset = zarr_item.assets[next(iter(zarr_item.assets))]
    zarr_item_asset = zarr_item_asset.to_dict()
    display(JSON(zarr_item_asset))

Harmony output for https://harmony.earthdata.nasa.gov/C1996881862-POCLOUD/ogc-api-coverages/1.0.0/collections/all/coverage/rangeset?forceAsync=true&granuleId=G2028350353-POCLOUD&format=application%2Fx-zarr
Contains 1 items.


<IPython.core.display.JSON object>

Harmony output for https://harmony.earthdata.nasa.gov/C1990404801-POCLOUD/ogc-api-coverages/1.0.0/collections/all/coverage/rangeset?forceAsync=true&granuleId=G1991231042-POCLOUD&format=application%2Fx-zarr
Contains 1 items.


<IPython.core.display.JSON object>

Harmony output for https://harmony.earthdata.nasa.gov/C1991543823-POCLOUD/ogc-api-coverages/1.0.0/collections/all/coverage/rangeset?forceAsync=true&granuleId=G1992649875-POCLOUD&format=application%2Fx-zarr
Contains 1 items.


<IPython.core.display.JSON object>

Harmony output for https://harmony.earthdata.nasa.gov/C1990404814-POCLOUD/ogc-api-coverages/1.0.0/collections/all/coverage/rangeset?forceAsync=true&granuleId=G1991243029-POCLOUD&format=application%2Fx-zarr
Contains 1 items.


<IPython.core.display.JSON object>

Harmony output for https://harmony.earthdata.nasa.gov/C1991543805-POCLOUD/ogc-api-coverages/1.0.0/collections/all/coverage/rangeset?forceAsync=true&granuleId=G1992672377-POCLOUD&format=application%2Fx-zarr
Contains 1 items.


<IPython.core.display.JSON object>

Harmony output for https://harmony.earthdata.nasa.gov/C1991543818-POCLOUD/ogc-api-coverages/1.0.0/collections/all/coverage/rangeset?forceAsync=true&granuleId=G1997506185-POCLOUD&format=application%2Fx-zarr
Contains 1 items.


<IPython.core.display.JSON object>

Harmony output for https://harmony.earthdata.nasa.gov/C1991543733-POCLOUD/ogc-api-coverages/1.0.0/collections/all/coverage/rangeset?forceAsync=true&granuleId=G1996984837-POCLOUD&format=application%2Fx-zarr
Contains 1 items.


<IPython.core.display.JSON object>

Harmony output for https://harmony.earthdata.nasa.gov/C1990404807-POCLOUD/ogc-api-coverages/1.0.0/collections/all/coverage/rangeset?forceAsync=true&granuleId=G1991274267-POCLOUD&format=application%2Fx-zarr
Contains 1 items.


<IPython.core.display.JSON object>

Harmony output for https://harmony.earthdata.nasa.gov/C1991543824-POCLOUD/ogc-api-coverages/1.0.0/collections/all/coverage/rangeset?forceAsync=true&granuleId=G1995305376-POCLOUD&format=application%2Fx-zarr
Contains 1 items.


<IPython.core.display.JSON object>

Harmony output for https://harmony.earthdata.nasa.gov/C1990404805-POCLOUD/ogc-api-coverages/1.0.0/collections/all/coverage/rangeset?forceAsync=true&granuleId=G1991290412-POCLOUD&format=application%2Fx-zarr
Contains 1 items.


<IPython.core.display.JSON object>

## Validate that zarr can be opened

This cell actually tries to open the zarr item and print the tree. It will only run if this notebook is being executed in AWS us-west-2.

TBD: what other validation can be done here?

In [15]:
# Actually open the zarr file. Will only work if this notebook is running in us-west-2
for collection, granule, job_id, result_json in collection_granule_jobid_resultjson:
    s3_creds = requests.get([l['href'] for l in result_json['links'] if l['rel'] == 'cloud-access-json'][0]).json()
    s3_key = s3_creds['AccessKeyId']
    s3_secret = s3_creds['SecretAccessKey']
    s3_token = s3_creds['SessionToken']
    
    cat = pystac.Catalog.from_file([l['href'] for l in result_json['links'] if l['rel'] == 'stac-catalog-json'][0])
    zarr_item = next(cat.get_items())
    zarr_item_asset = zarr_item.assets[next(iter(zarr_item.assets))]
    zarr_item_asset = zarr_item_asset.to_dict()

    z_store = zarr.storage.FSStore(zarr_item_asset['href'], **{'key': s3_key, 'secret': s3_secret, 'token': s3_token})
    zarr_file = zarr.open(z_store)
    
    print(f'{collection}\t{granule}')
    print(zarr_file.tree())

PermissionError: Forbidden