# S3 Access ICESat-2 ATL03

In [1]:
# hide warning messages 
import warnings
warnings.filterwarnings('ignore')
# import python modules
import geopandas as gpd
import requests
import pandas as pd
import datetime as dt 
import h5py
import numpy as np
import s3fs
from shapely.ops import orient
from os import path, remove
from posixpath import splitext
from requests.adapters import HTTPAdapter, Retry
import concurrent.futures

The following input parameters are needed, passed through the front-end.

In [2]:
# temporal window of the search: first and last acquisition time
start_date = dt.datetime(2017, 1, 1)
end_date = dt.datetime(2020, 1, 31)

# ATL03 variables of interest, given as HDF5 paths relative to a beam group
variables = [
    'heights/lat_ph',
    'heights/lon_ph',
    'heights/h_ph',
    'heights/signal_conf_ph',
    'geophys_corr/dem_h',
    'bckgrd_atlas/bckgrd_int_height',
    'signal_find_output/land/z_pc_delta',
]

# GeoJSON file with the polygon that defines the subset area,
# loaded into a GeoDataFrame
poly_f = 'poly.json'
poly_gpd = gpd.read_file(poly_f)

Let's define some additional variables.

In [3]:
# NSIDC endpoint that hands out S3 credentials
nsidc_s3 = "https://data.nsidc.earthdatacloud.nasa.gov/s3credentials"

# base URL of the CMR search API
cmrurl = 'https://cmr.earthdata.nasa.gov/search/'

# concept_id of the ATL03 collection in Earthdata Cloud
concept_id = 'C2596864127-NSIDC_CPRD'

# datetime format used in CMR temporal queries
dt_format = '%Y-%m-%dT%H:%M:%SZ'

# names of the six ICESat-2 beam groups
beams = [
    'gt1l', 'gt1r',
    'gt2l', 'gt2r',
    'gt3l', 'gt3r',
]

# core columns included in every subset request; the first five are HDF5
# paths relative to a beam group, the last two are filled in while subsetting
headers = [
    'geolocation/reference_photon_lat',
    'geolocation/reference_photon_lon',
    'geolocation/ph_index_beg',
    'geolocation/segment_ph_cnt',
    'geolocation/delta_time',
    'beam',
    'granule',
]

Let's create a session that automatically retries requests that fail with transient 5xx server errors (500, 502, 503, 504).

In [4]:
# retry policy: up to 3 attempts with a short backoff whenever the server
# answers with one of the listed 5xx status codes
retries = Retry(
    total=3,
    backoff_factor=0.1,
    status_forcelist=[500, 502, 503, 504],
)
# session whose HTTPS requests go through the retrying adapter
s = requests.Session()
s.mount('https://', HTTPAdapter(max_retries=retries))

Now, let's search for ATL03 granules.

In [5]:
# CMR temporal filter: "<start>,<end>" in the CMR datetime format
temporal = start_date.strftime(dt_format) + ',' + end_date.strftime(dt_format)
# S3 links to the ATL03 granules, unique and in the order CMR returned them
alt03_f = []
# set used only for fast duplicate detection while filling `alt03_f`
seen_links = set()

# orient(..., 1) makes the exterior rings counter-clockwise before the
# polygon is uploaded to CMR as a shapefile
poly_gpd.geometry = poly_gpd.geometry.apply(orient, args=(1,))
geojson = {"shapefile": ("poly.json", poly_gpd.geometry.to_json(), "application/geo+json")}
# granule search endpoint (same for every page)
granulesearch = cmrurl + 'granules.json'

# page through the CMR results until an empty page is returned
page_num = 1
while True:
    # defining parameters
    cmr_param = {
        "collection_concept_id": concept_id,
        "temporal": temporal,
        "page_size": 2000,
        "page_num": page_num,
        "simplify-shapefile": 'true'}

    # use the shared session `s` so connections are reused across pages.
    # NOTE: urllib3's Retry does not re-send POST requests on the
    # status_forcelist codes unless POST is added to its allowed methods,
    # so a failing page is reported explicitly instead.
    response = s.post(granulesearch, data=cmr_param, files=geojson)
    response.raise_for_status()
    granules = response.json()['feed']['entry']
    if not granules:
        break

    for g in granules:
        # keep only the S3 links that point to the HDF5 file itself
        for links in g['links']:
            href = links['href']
            if href.startswith('s3://') and href.endswith('.h5') and href not in seen_links:
                seen_links.add(href)
                alt03_f.append(href)

    page_num += 1

alt03_f

['s3://nsidc-cumulus-prod-protected/ATLAS/ATL03/006/2018/10/20/ATL03_20181020171526_03370109_006_02.h5',
 's3://nsidc-cumulus-prod-protected/ATLAS/ATL03/006/2018/10/23/ATL03_20181023052534_03750113_006_02.h5',
 's3://nsidc-cumulus-prod-protected/ATLAS/ATL03/006/2019/01/19/ATL03_20190119125513_03370209_006_02.h5',
 's3://nsidc-cumulus-prod-protected/ATLAS/ATL03/006/2019/01/22/ATL03_20190122010525_03750213_006_02.h5',
 's3://nsidc-cumulus-prod-protected/ATLAS/ATL03/006/2019/04/20/ATL03_20190420083508_03370309_006_02.h5',
 's3://nsidc-cumulus-prod-protected/ATLAS/ATL03/006/2019/04/22/ATL03_20190422204515_03750313_006_02.h5',
 's3://nsidc-cumulus-prod-protected/ATLAS/ATL03/006/2019/07/20/ATL03_20190720041439_03370409_006_02.h5',
 's3://nsidc-cumulus-prod-protected/ATLAS/ATL03/006/2019/07/22/ATL03_20190722162449_03750413_006_02.h5',
 's3://nsidc-cumulus-prod-protected/ATLAS/ATL03/006/2019/08/18/ATL03_20190818025051_07790409_006_02.h5',
 's3://nsidc-cumulus-prod-protected/ATLAS/ATL03/006/201

The list `alt03_f` will now have S3 links to ATL03 granules.

We will now get S3 temporary credentials and define the S3FS S3FileSystem object. Make sure that `.netrc` file is defined as described here: https://wiki.earthdata.nasa.gov/display/EL/How+To+Access+Data+With+cURL+And+Wget.

In [6]:
# request the S3 credentials through the retrying session; fail early with
# the HTTP status if the request is rejected (e.g. Earthdata login/.netrc
# problems) instead of failing later while decoding the response
s3cred_response = s.get(nsidc_s3)
s3cred_response.raise_for_status()
s3credentials = s3cred_response.json()
# defining S3FS object with the returned key, secret and session token
fs_s3 = s3fs.S3FileSystem(anon=False,
                          key=s3credentials['accessKeyId'],
                          secret=s3credentials['secretAccessKey'],
                          token=s3credentials['sessionToken'])

We will now define a function to spatially subset the ATL03 granules. 

In [7]:
def _contiguous_runs(labels):
    """Return (start, stop) positions of each run of consecutive integer labels.

    `labels` is a sorted 1-D integer array; a new run starts wherever the
    step between neighbouring labels is larger than 1. `stop` is exclusive.
    """
    if len(labels) == 0:
        return []
    breaks = np.flatnonzero(np.diff(labels) > 1) + 1
    starts = np.concatenate(([0], breaks)).tolist()
    stops = np.concatenate((breaks, [len(labels)])).tolist()
    return list(zip(starts, stops))


def _store_values(column, start, count, values, exact, name):
    """Copy `values` element by element into `column[start:start + count]`.

    With `exact=True` the number of values must equal `count`; otherwise the
    values fill the first rows of the run and the remaining rows keep None.
    """
    if len(values) > count or (exact and len(values) != count):
        raise ValueError(
            f'{name}: got {len(values)} values for a run of {count} segments')
    # element-wise so that list-valued cells (ragged photon lists) are stored
    # as single objects rather than being broadcast
    for offset, value in enumerate(values):
        column[start + offset] = value


def subset_atl03(t, var_l, poly, s3_url):
    """Spatially subset one ATL03 granule on S3 and write the result to a CSV.

    For every beam group of the file (name starting with ``gt`` and holding a
    ``geolocation`` subgroup) the first five entries of `var_l` are read as
    per-segment columns. Segments whose (lon, lat) point lies within
    ``poly.geometry[0]`` are kept, and the remaining variables are attached
    per run of consecutive segments:

    * ``heights/...``: one list per segment, sliced from the photon-rate
      dataset with that segment's first-photon index and photon count;
    * ``bckgrd_atlas/...`` and ``signal_find_output/<surface>/...``: the
      values whose group ``delta_time`` falls inside the run's delta-time
      range, stored in the first rows of the run;
    * anything else: sliced directly with the segment index range.

    The file is opened through the module-level ``fs_s3`` file system.

    Parameters
    ----------
    t : int
        Number of leading entries of `var_l` that are core columns.
    var_l : list of str
        Output column names in CSV order. Entries 0-4 must be the HDF5 paths
        (relative to a beam group) of latitude, longitude, first-photon
        index, photon count and delta time. The columns ``'beam'`` and
        ``'granule'`` are created here, so they are expected among the core
        entries. Entries from `t` on are the variables of interest.
    poly : geopandas.GeoDataFrame
        Subset area; only the geometry with index label 0 is used.
    s3_url : str
        ``s3://`` link to the ATL03 HDF5 granule.

    Returns
    -------
    str or None
        Name of the CSV file written to the working directory, or None when
        no such file exists after processing (no segment inside the polygon).

    Raises
    ------
    ValueError
        If a variable yields more values than there are segments in a run
        (or, for per-segment variables, a different number).
    """
    granule_f = path.basename(s3_url)
    # the timestamp only serves to make the output file name unique
    stamp = int(dt.datetime.now(dt.timezone.utc).timestamp())
    out_csv = f'{splitext(granule_f)[0]}_{stamp}.csv'
    # requested variables that are not already provided as core columns
    extra_vars = [v for v in var_l[t:] if v not in var_l[:t]]

    with fs_s3.open(s3_url, mode='rb') as fh:
        with h5py.File(fh) as hf:
            for var in list(hf.keys()):
                if not (var.startswith('gt') and 'geolocation' in list(hf[var].keys())):
                    continue

                # core per-segment columns: lat, lon, first-photon index,
                # photon count, delta time
                df = pd.DataFrame({name: hf[f'{var}/{name}'][:] for name in var_l[:5]})
                gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df[var_l[1]], df[var_l[0]]))
                # explicit copy: new columns are added to the filtered frame below
                gdf_poly = gdf[gdf['geometry'].within(poly.geometry[0])].copy()
                if gdf_poly.empty:
                    continue

                # the header line is written once per output file
                if not path.exists(out_csv):
                    with open(out_csv, "w") as f:
                        f.write(','.join(var_l)+'\n')

                gdf_poly['beam'] = str(var)
                gdf_poly['granule'] = str(granule_f)

                # Result columns are collected in plain object arrays and
                # attached to the frame in one step. The previous chained
                # assignment (`.loc[...][:] = ...`) only worked when pandas
                # returned a view and does nothing under Copy-on-Write.
                labels = gdf_poly.index.to_numpy()
                columns = {v: np.full(len(labels), None, dtype=object) for v in extra_vars}
                # delta_time arrays of the lower-rate groups, read once per beam
                delta_cache = {}

                # retrieving variables of interests, one run of consecutive
                # segments at a time
                for start, stop in _contiguous_runs(labels):
                    run = gdf_poly.iloc[start:stop]
                    i, j = labels[start], labels[stop - 1]
                    t_min = run[var_l[4]].min()
                    t_max = run[var_l[4]].max()

                    for v, column in columns.items():
                        if v.startswith('heights'):
                            dset = hf[f'{var}/{v}']
                            # the first-photon index is treated as 1-based
                            values = [dset[x1-1:x1+x2-1].tolist()
                                      for x1, x2 in zip(run[var_l[2]], run[var_l[3]])]
                            exact = True
                        elif v.startswith('bckgrd_atlas') or v.startswith('signal_find_output'):
                            if v.startswith('bckgrd_atlas'):
                                time_path = f'{var}/bckgrd_atlas/delta_time'
                            else:
                                time_path = f'{var}/signal_find_output/{v.split("/")[-2]}/delta_time'
                            if time_path not in delta_cache:
                                delta_cache[time_path] = hf[time_path][:]
                            delta_t = delta_cache[time_path]
                            values = hf[f'{var}/{v}'][(delta_t >= t_min) &
                                                      (delta_t <= t_max)].tolist()
                            # these values are not per segment: they fill the
                            # first rows of the run, the rest stays empty
                            exact = False
                        else:
                            values = hf[f'{var}/{v}'][i:j+1].tolist()
                            exact = True

                        _store_values(column, start, stop - start, values, exact, f'{var}/{v}')

                for v, column in columns.items():
                    gdf_poly[v] = column

                # saving the output file
                gdf_poly.to_csv(out_csv, mode='a', index=False, header=False, columns=var_l)
    if path.exists(out_csv):
        return out_csv

We will call the above function `subset_atl03` once per granule, passing the number of core columns (`len(headers)`), the combined list `headers + variables` of output columns, the geopandas dataframe `poly_gpd` containing the subset area polygon, and one S3 link from the list `alt03_f`. The calls run concurrently in a thread pool.

In [8]:
# subset the granules concurrently in a thread pool
subset_f = []
# the context manager shuts the pool down once all submitted jobs are done
with concurrent.futures.ThreadPoolExecutor() as subset_pool:
    futures = [
        subset_pool.submit(subset_atl03, len(headers), headers+variables, poly_gpd, s3_url)
        for s3_url in sorted(alt03_f)
    ]
    # collect the results in submission order so that `subset_f` follows the
    # sorted granule list; result() blocks until the job is finished and
    # re-raises any exception raised inside subset_atl03
    for future in futures:
        subset_f.append(future.result())

The variable `subset_f` will now contain the list of CSV files that were created in the above step (with `None` for granules that had no data inside the polygon). These CSV files contain the subset data. Now, we will read these subset CSV files and print out the results.

In [9]:
# keep only the granules that produced a CSV (None means no file was written)
subset_f = [x for x in subset_f if x is not None]
if subset_f:
    # reading the CSV; reset_index() keeps each file's row number as the
    # 'index' column and gives the combined frame a unique index
    subset_df = pd.concat(pd.read_csv(f, header=0) for f in subset_f).reset_index()
else:
    # nothing intersected the polygon: return an empty table with the same
    # columns instead of failing in pd.concat
    subset_df = pd.DataFrame(columns=headers + variables).reset_index()
# deleting the CSV
for f in subset_f:
    if path.isfile(f):
        remove(f)
# printing the pandas dataframe as json
subset_df.to_json()

'{"index":{"0":0,"1":1,"2":2,"3":3,"4":4,"5":5,"6":6,"7":7,"8":8,"9":9,"10":10,"11":11,"12":12,"13":13,"14":14,"15":15,"16":16,"17":17,"18":18,"19":19,"20":20,"21":21,"22":22,"23":23,"24":24,"25":25,"26":26,"27":27,"28":28,"29":29,"30":30,"31":31,"32":32,"33":33,"34":34,"35":35,"36":36,"37":37,"38":38,"39":39,"40":40,"41":41,"42":42,"43":43,"44":44,"45":45,"46":46,"47":47,"48":48,"49":49,"50":50,"51":0,"52":1,"53":2,"54":3,"55":4,"56":5,"57":6,"58":7,"59":8,"60":9,"61":10,"62":11,"63":12,"64":13,"65":14,"66":15,"67":16,"68":17,"69":18,"70":19,"71":20,"72":21,"73":22,"74":23,"75":24,"76":25,"77":26,"78":27,"79":28,"80":29,"81":30,"82":31,"83":32,"84":33,"85":34,"86":35,"87":36,"88":37,"89":38,"90":39,"91":40,"92":41,"93":42,"94":43,"95":44,"96":45,"97":46,"98":47,"99":48,"100":49,"101":50,"102":51,"103":52,"104":53,"105":54,"106":55,"107":56,"108":57,"109":58,"110":59,"111":60,"112":61,"113":62,"114":63,"115":64,"116":65,"117":66,"118":67,"119":68,"120":69,"121":70,"122":71,"123":72,"12