# Working with GRFN v2 data accessible via CMR

More about the dataset here:
https://aria.jpl.nasa.gov/node/97

Earthdata Search Link:
https://search.earthdata.nasa.gov/search/granules?p=C1379535891-ASF&pg[0][id]=*-137-*v2_0_0*&m=30.5859375!-120.515625!4!1!0!0%2C2&tl=1534106273!4!!&q=sentinel-1_insar&ok=sentinel-1_insar&sb=-123.32%2C42.00%2C-120.13%2C42.35

Things to think about:

* How to make things easier for users (in particular browsing catalog and getting into lazy xarray structure for analysis)
    * integration of CMR, STAC, intake?

In [None]:
import json
from io import StringIO
import requests
from time import sleep
import os
import pandas as pd
import boto3

In [None]:
# Define a simple area of interest (AOI). The latitudes used here (~42N) lie
# along the Oregon - California border. Polygon drawn with http://geojson.io
#
# It is not yet clear how to pass GeoJSON to CMR for an 'intersects' search,
# so a plain bounding-box string is used for now, ordered as:
# (lower-left longitude, lower-left latitude, upper-right longitude, upper-right latitude)
# NOTE: the full box overlaps adjacent swath 'frames'; raising the southern
# limit keeps things simpler for now.
#bbox = '-123.32,41.55,-120.13,42.35'
bbox = '-123.32,42.00,-120.13,42.35'

# The same region as a GeoJSON FeatureCollection holding a single closed
# rectangular ring (the first vertex is repeated at the end). Its southern
# edge is the original, un-trimmed ~41.56N limit.
aoi = {
    'type': 'FeatureCollection',
    'features': [
        {
            'type': 'Feature',
            'properties': {},
            'geometry': {
                'type': 'Polygon',
                'coordinates': [
                    [
                        [-123.321533203125, 41.5579215778042],    # south-west
                        [-120.13549804687501, 41.5579215778042],  # south-east
                        [-120.13549804687501, 42.35042512243457], # north-east
                        [-123.321533203125, 42.35042512243457],   # north-west
                        [-123.321533203125, 41.5579215778042],    # close the ring
                    ],
                ],
            },
        },
    ],
}

In [None]:
# Query the CMR granule search API. The results should match these Earthdata
# Search results:
# two frames
# https://search.earthdata.nasa.gov/search/granules?p=C1379535891-ASF&pg[0][id]=*-137-*v2_0_0*&m=30.5859375!-120.515625!4!1!0!0%2C2&tl=1534106273!4!!&q=sentinel-1_insar&ok=sentinel-1_insar&sb=-123.32%2C41.55%2C-120.13%2C42.35
# single frame:
# https://search.earthdata.nasa.gov/search/granules?p=C1379535891-ASF&pg[0][id]=*-137-*v2_0_0*&m=30.5859375!-120.515625!4!1!0!0%2C2&tl=1534106273!4!!&q=sentinel-1_insar&ok=sentinel-1_insar&sb=-123.32%2C42.00%2C-120.13%2C42.35

# Path number to search for; it is interpolated into the granule-name pattern
# below so that the query and the later "Found ... for path" report agree.
path = 137
fmt = 'json'
url = f'https://cmr.earthdata.nasa.gov/search/granules.{fmt}'

params = {
    'collection_concept_id': 'C1379535891-ASF',
    # Path and version appear to be encoded in the granule file name, so they
    # are selected with a wildcard pattern on producer_granule_id.
    # NOTE: the simpler pattern f'*-{path}-*v2_0_0*' was also tried; not sure
    # how to get wildcards to work in general...
    'producer_granule_id': f'*GUNW*-{path}-*v2_0_0*',
    # Wildcards are only honoured when the pattern option is switched on
    # (this seems overly complicated...).
    'options[producer_granule_id][pattern]': 'true',
    # Untested alternative: filter on an additional attribute instead.
    #'attribute[]' : f'int,PATH_NUMBER,{path}',
    # NOTE(review): single timestamp, presumably treated as the start of an
    # open-ended range - confirm against the CMR API documentation.
    'temporal': '2014-01-01T00:00:00Z',
    'bounding_box': bbox,
    'page_size': 2000,
}

r = requests.get(url, params=params, timeout=100)
print(r.url)
# Fail here on an HTTP error instead of later with a confusing KeyError when
# the response body is parsed.
r.raise_for_status()

In [None]:
# Turn the granule entries of the CMR JSON feed into a table, one row per granule
df = pd.DataFrame(json.loads(r.text)['feed']['entry'])
n = df.shape[0]
print(f'Found {n} interferograms for path {path}')
# Order by acquisition start time, most recent first
df = df.sort_values(by='time_start', ascending=False)

In [None]:
# Total size of all matching granules. granule_size arrives as text, so cast it
# before summing; the /1e3 presumably converts MB to GB - TODO confirm units.
Gb = df['granule_size'].astype('float32').sum() / 1e3
print(f'Size of Archive [Gb] = {Gb:.2f}')

In [None]:
# Quick look at the search results. By default a notebook only renders the
# value of the last expression, so only the tail of the table is displayed.
df['collection_concept_id'].unique()
df.tail()

In [None]:
# Can a granule be read straight from its URL using URS (Earthdata login)?
# In theory this should work for GDAL>2.4:
#!GDAL_HTTP_COOKIEJAR=.urs_cookies GDAL_HTTP_COOKIEFILE=.urs_cookies gdalinfo /vsicurl/{url}

# Take the first row of the table: show its granule id, derive the netCDF file
# name from it, and pick the href of its first link.
print(df['producer_granule_id'].iloc[0])
filename = f"{df['producer_granule_id'].iloc[0]}.nc"
print(filename)
url = df['links'].iloc[0][0]['href']
print(url)

# Ok, what about reading the file directly into memory with fsspec?
#import fsspec
#files = fsspec.open_files(url, mode='r')
#with files[0] as f:
#    print(f)

## CMR returns URLs requiring URS authentication, but the data is actually on S3. How do we access it directly?

In [None]:
# CMR returns URLs that require URS authentication. Can we access these files
# directly via S3 instead?
# https://www.asf.alaska.edu/sar_datasets/sentinel-1-interferograms-beta/command-line-tools/gdal/

# Request temporary AWS credentials from the GRFN credentials endpoint.
credential_url = 'https://grfn.asf.alaska.edu/door/credentials'
# Use a timeout so a stalled endpoint cannot hang the notebook (same value as
# the CMR request).
response = requests.get(credential_url, timeout=100)
response.raise_for_status()
credentials = json.loads(response.text)['Credentials']
# Do not print the credential values: the secret key and session token would
# be stored in the notebook output. Listing the field names is enough to
# confirm that the request worked.
print(f'Received credential fields: {sorted(credentials)}')
print('Setting up new AWS Session, expires in 1 hour!')
session = boto3.session.Session(
    aws_access_key_id=credentials['AccessKeyId'],
    aws_secret_access_key=credentials['SecretAccessKey'],
    aws_session_token=credentials['SessionToken'],
    region_name='us-east-1',
)

In [None]:
# Bucket that holds the GRFN products
#BUCKET = 'gsfc-ngap-asf-grfn-private-prod' # Changed feb 13
BUCKET = 'grfn-content-prod'

# Object to look up: the key in the bucket is simply the file name
filename = 'S1-GUNW-A-R-137-tops-20181129_20181123-020010-43220N_41518N-PP-e2c7-v2_0_0.nc'
KEY = filename

# Low-level S3 client created from the temporary-credential session
s3 = session.client('s3')
#s3.list_buckets() #access denied error
# All files:
#s3.list_objects_v2(Bucket=BUCKET)
# single file:
s3.list_objects_v2(Prefix=KEY, Bucket=BUCKET)

In [None]:
# s3fs offers a nicer, filesystem-like interface than boto
import s3fs

# Authenticate with the same temporary credentials fetched earlier
fs = s3fs.S3FileSystem(
    token=credentials['SessionToken'],
    secret=credentials['SecretAccessKey'],
    key=credentials['AccessKeyId'],
)


In [None]:
# Listing the bucket is slow because everything is stored in it (all GRFN v2 scenes!)
#fs.ls(BUCKET)

# Copy one granule (~60Mb) from the bucket to the local file system, keeping
# the same file name locally
s3Path = '/'.join([BUCKET, filename])
fs.get(s3Path, filename)

In [None]:
import xarray as xr
# Open the local file `filename` with xarray's default settings (no group specified)
ds = xr.open_dataset(filename)
# Bare expression so the notebook renders the dataset summary
ds

In [None]:
# The data is stored in an HDF5 'group', so the group path must be given
# explicitly when opening the file.
ds.close() # Can't open HDF5 file twice it seems...
ds = xr.open_dataset(
    filename,  # the local file written by fs.get(); `localPath` was never defined
    group='/science/grids/data',
    engine='h5netcdf',
    # NOTE: optionally pass chunks=dict(latitude=682, longitude=1386);
    # determine chunks in advance or autochunk based on underlying data?
)
ds

In [None]:
# Opening the S3 object directly as an in-memory dataset doesn't work yet, but
# should soon: https://github.com/pydata/xarray/pull/2782
# This actually seems like a bug with s3fs, creating an issue

# Fresh filesystem handle built from the temporary credentials
fs = s3fs.S3FileSystem(
    token=credentials['SessionToken'],
    secret=credentials['SecretAccessKey'],
    key=credentials['AccessKeyId'],
)
# Hand xarray a file-like object instead of a local path
fileObj = fs.open(s3Path)
ds = xr.open_dataset(
    fileObj,
    engine='h5netcdf',
    group='/science/grids/data',
)
ds

In [None]:
# The dataset has been mirrored to Google Cloud for testing
# (data copied from ASF to a Google Storage bucket)
import gcsfs

# No credentials are passed (original note: "default anonymous access")
fs = gcsfs.GCSFileSystem()
#images = fs.ls('pangeo-data/grfn-v2/137/')
# Throws lots of warnings the first time it is run
fileObj = fs.open('pangeo-data/grfn-v2/137/' + filename)

In [None]:
# Open the data group from the file object, split into fixed-size chunks
# along latitude and longitude
ds = xr.open_dataset(
    fileObj,
    engine='h5netcdf',
    group='/science/grids/data',
    chunks={'latitude': 682, 'longitude': 1386},
)
ds

In [None]:
# Similarly, we'll want to load a temporal stack as an xarray dataset:
# Example, all the products from the last month
#ds = xr.open_mfdataset('S1*nc', concat_dim='band', group='/science/grids/data', engine='h5netcdf')