In [None]:
import s3fs
import requests
from urllib import request
from http.cookiejar import CookieJar
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
from json import dumps
from io import StringIO
from os.path import dirname, join
import netrc

import os
import fsspec
import ujson   # fast json
from fsspec_reference_maker.hdf import SingleHdf5ToZarr 
from fsspec_reference_maker.combine import MultiZarrToZarr
import xarray as xr
import dask
import hvplot.xarray

In [None]:
import fsspec_reference_maker
# Display the installed fsspec_reference_maker version as the cell output.
fsspec_reference_maker.__version__

## set up earthdata login credentials
- the code for setting up the Earthdata login is taken from [here](https://github.com/podaac/tutorials/blob/master/notebooks/cloudwebinar/cloud_direct_access_s3.py)
- for the Earthdata login to work you need to create a `.netrc` file in your home directory
- the `.netrc` file contains:\
machine urs.earthdata.nasa.gov\
login 'earthdata username'\
password 'password'

In [None]:
#########Setting up earthdata login credentials 
def setup_earthdata_login_auth(endpoint):
    """
    Set up urllib's ``request`` module so that it authenticates against the given
    Earthdata Login endpoint and is able to track cookies between requests.

    Credentials are looked up in the .netrc file first; if the file is missing or
    has no entry for ``endpoint``, the user is prompted for them.

    Valid endpoints:
        urs.earthdata.nasa.gov - Earthdata Login production

    Parameters
    ----------
    endpoint : str
        Host name of the Earthdata Login service to authenticate against.

    Returns
    -------
    None
        The configured opener is installed globally via ``request.install_opener``.
    """
    import getpass  # local import: only needed for the interactive fallback

    try:
        username, _, password = netrc.netrc().authenticators(endpoint)
    except (FileNotFoundError, TypeError):
        # FileNotFoundError = there's no .netrc file
        # TypeError = the endpoint isn't in the .netrc file, so unpacking None fails
        print("There's no .netrc file or the endpoint isn't in the .netrc file")
        print('Please provide your Earthdata Login credentials for %s' % endpoint)
        # Without this fallback username/password would be unbound below.
        username = input('Username: ')
        password = getpass.getpass('Password: ')

    # Basic-auth handler that answers challenges from the endpoint.
    manager = request.HTTPPasswordMgrWithDefaultRealm()
    manager.add_password(None, endpoint, username, password)
    auth = request.HTTPBasicAuthHandler(manager)

    # Cookie handling so the login session is kept between requests.
    jar = CookieJar()
    processor = request.HTTPCookieProcessor(jar)
    opener = request.build_opener(auth, processor)
    request.install_opener(opener)

###############################################################################
# Earthdata Login production host; register it with urllib's global opener.
edl = "urs.earthdata.nasa.gov"
setup_earthdata_login_auth(endpoint=edl)

def begin_s3_direct_access():
    """
    Request S3 credentials from the PO.DAAC endpoint and return a filesystem using them.

    Returns
    -------
    s3fs.S3FileSystem
        Filesystem built from the ``accessKeyId``, ``secretAccessKey`` and
        ``sessionToken`` fields of the JSON reply, with region ``us-west-2``.

    Raises
    ------
    requests.HTTPError
        If the credentials endpoint answers with an HTTP error status.
    """
    url = "https://archive.podaac.earthdata.nasa.gov/s3credentials"
    reply = requests.get(url)
    # Fail here with the HTTP status instead of an opaque JSON/KeyError further down.
    reply.raise_for_status()
    response = reply.json()
    return s3fs.S3FileSystem(
        key=response['accessKeyId'],
        secret=response['secretAccessKey'],
        token=response['sessionToken'],
        client_kwargs={'region_name': 'us-west-2'},
    )


In [None]:
import os
import sys
# Make the shared user library directory under $HOME importable so `ebdpy` can be found.
sys.path.append(os.path.join(os.environ['HOME'],'shared','users','lib'))
import ebdpy as ebd

# NOTE(review): this call looks redundant with the fuller set_credentials call below — confirm.
ebd.set_credentials(profile='esip-qhub')

# Settings shared by the credential setup and the cluster start-up.
profile = 'esip-qhub'
region = 'us-west-2'
endpoint = f's3.{region}.amazonaws.com'
ebd.set_credentials(profile=profile, region=region, endpoint=endpoint)
# Upper bound on workers handed to start_dask_cluster.
worker_max = 30
# `client` and `cluster` are used by later cells (closed / shut down after the JSONs are generated).
client,cluster = ebd.start_dask_cluster(profile=profile,worker_max=worker_max, 
                                      region=region, use_existing_cluster=True,
                                      adaptive_scaling=False, wait_for_cluster=False, 
                                      environment='pangeo', worker_profile='Medium Worker', 
                                      propagate_env=True)

## Create a list of all MUR files that are on the PODAAC Cloud

In [None]:
# Authenticated filesystem used to list (and later read) the source files.
fs = begin_s3_direct_access()

# Collect the files whose names start with YYYYMM, one glob per year/month.
# The ranges currently select December 2002 only; widen them
# (e.g. years up to 2022, months 1-12) to index more data.
flist = []
for lyr in range(2002, 2003):
    for imon in range(12, 13):
        pattern = f"{lyr}{imon:02d}*.nc"
        flist.extend(
            fs.glob(join("podaac-ops-cumulus-protected/", "MUR-JPL-L4-GLOB-v4.1", pattern))
        )
print('total number of individual netcdf files:',len(flist))

In [None]:
# Prefix every listed path with the s3:// scheme.
urls = [f"s3://{f}" for f in flist]

# Keyword arguments passed to fs.open() when each source file is read.
so = {
    'mode': 'rb',
    'anon': True,
    'default_fill_cache': False,
    'default_cache_type': 'first',
}

In [None]:
# Second, non-anonymous S3 filesystem: used for the reference JSON output location.
fs2 = fsspec.filesystem("s3", anon=False)

In [None]:
# S3 prefix (with trailing slash) under which the per-file reference JSONs are stored.
json_dir = "s3://esip-qhub/nasa/mur/jsons/"

- `json_dir` is the S3 directory that holds the per-file reference JSONs, which are later merged into a single MUR reference file

In [None]:
# If the directory exists, remove it (and all the files) so that reference JSONs
# left over from an earlier run are not listed again later.
try:
    fs2.rm(json_dir, recursive=True)
except FileNotFoundError:
    # Nothing to clean up: the directory has not been created yet.
    # Any other failure (e.g. missing permissions) is deliberately not swallowed.
    pass

In [None]:
def gen_json(u):
    """
    Build the reference JSON for one source file and write it under ``json_dir``.

    Parameters
    ----------
    u : str
        URL of the source file, e.g. ``s3://<bucket>/<collection>/<file>.nc``.
        The file name is the last ``/``-separated component; its first eight
        characters are used as the date prefix of the output name.

    Side effects
    ------------
    Opens ``u`` through the module-level ``fs`` with the options in ``so``,
    prints the output path, and writes ``{json_dir}{date}.{fname}.json``
    through the module-level ``fs2``.
    """
    # Use the last path component instead of a fixed index, so the function
    # keeps working when the bucket/prefix depth of the URL changes.
    fname = u.rsplit('/', 1)[-1]
    date = fname[0:8]
    outf = f'{json_dir}{date}.{fname}.json'
    with fs.open(u, **so) as infile:
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        print(outf)
        # translate() is called while the source file is still open.
        refs = h5chunks.translate()
    with fs2.open(outf, 'wb') as f:
        f.write(ujson.dumps(refs).encode())

In [None]:
%%time
_ = dask.compute(*[dask.delayed(gen_json)(u) for u in urls[0:30]], retries=10);

In [None]:
# List the generated reference JSONs and turn them into sorted s3:// URLs.
flist2 = fs2.ls(json_dir)
furls = sorted(f"s3://{f}" for f in flist2)
# Show the first URL as a quick sanity check.
furls[0]

In [None]:
# Release the cluster started earlier: close the client first, then shut the cluster down.
client.close()
cluster.shutdown()

In [None]:
from dask.distributed import Client

In [None]:
# Create a new dask.distributed client with default settings; this rebinds `client`,
# which previously referred to the (now closed) client of the remote cluster.
client = Client()

In [None]:
# Display the client as the cell output.
client

In [None]:
# Combine the per-file reference JSONs listed in `furls` along the time dimension.
mzz = MultiZarrToZarr(
    furls,
    # presumably the options used to open the reference JSON files in `furls`
    # (written with the non-anonymous fs2) — confirm against the library docs
    storage_options={'anon': False},
    remote_protocol='s3',
    # Boolean instead of the string 'True': any non-empty string (even 'False')
    # is truthy, so the string form was misleading.
    # NOTE(review): the test cells open the same remote data with explicit
    # credentials — confirm anonymous access is really intended here.
    remote_options={'anon': True},
    # Open each dataset without any CF decoding.
    xarray_open_kwargs={
        'decode_cf': False,
        'mask_and_scale': False,
        'decode_times': False,
        'use_cftime': False,
        'drop_variables': ['reference_time', 'crs'],
        'decode_coords': False,
    },
    xarray_concat_args={
        # Options left disabled by the original author:
        # "data_vars": "minimal",
        # "coords": "minimal",
        # "compat": "override",
        "join": "override",
        "combine_attrs": "override",
        "dim": "time",
    },
)

In [None]:
%%time
#%%prun -D multizarr_profile 
# Write the combined references to 'mur_consolidated.json' (uploaded to S3 in a later cell).
mzz.translate('mur_consolidated.json')

In [None]:
# Remote destination of the consolidated reference file.
rpath = "s3://esip-qhub-public/nasa/mur/mur4.1_consolidated.json"

In [None]:
# Upload the consolidated reference file written by mzz.translate() to `rpath`.
fs2.put_file(lpath='mur_consolidated.json', rpath=rpath)

# testing

#### Try a single json

In [None]:
# Fetch S3 credentials again; `response` is the decoded JSON reply
# whose fields are used to build `r_opts` in the cells below.
url = "https://archive.podaac.earthdata.nasa.gov/s3credentials"
response = requests.get(url).json()

In [None]:
# Reference JSON of a single source file, used for the first test.
turl = (
    "s3://esip-qhub/nasa/mur/jsons/"
    "20021201.20021201090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc.json"
)

In [None]:
# Options for reading the reference JSON itself.
s_opts = dict(requester_pays=True, skip_instance_cache=True)
# Options for reading the data the references point to (credentials from `response`).
r_opts = dict(
    key=response['accessKeyId'],
    secret=response['secretAccessKey'],
    token=response['sessionToken'],
    client_kwargs=dict(region_name='us-west-2'),
)

# NOTE: this rebinds `fs`, which earlier held the S3 filesystem used by gen_json.
fs = fsspec.filesystem(
    "reference",
    fo=turl,
    ref_storage_args=s_opts,
    remote_protocol='s3',
    remote_options=r_opts,
)
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr", consolidated=False)

In [None]:
# Display the dataset opened from the single reference JSON.
ds

#### Try the consolidated JSON

In [None]:
# Options for reading the referenced data, built from the fields of `response`.
r_opts = dict(
    key=response['accessKeyId'],
    secret=response['secretAccessKey'],
    token=response['sessionToken'],
    client_kwargs=dict(region_name='us-west-2'),
)

In [None]:
# Same as the single-file test, but pointing at the consolidated reference file in `rpath`.
s_opts = dict(requester_pays=True, skip_instance_cache=True)
fs = fsspec.filesystem(
    "reference",
    fo=rpath,
    ref_storage_args=s_opts,
    remote_protocol='s3',
    remote_options=r_opts,
)
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr", consolidated=False)

In [None]:
# Display the dataset opened from the consolidated reference file.
ds

In [None]:
import hvplot.xarray

In [None]:
# Pick the time step nearest to 2002-12-20 12:00 and load it into memory.
sst = (
    ds['analysed_sst']
    .sel(time='2002-12-20 12:00', method='nearest')
    .load()
)
# Rasterized quadmesh map of the selected field.
sst.hvplot.quadmesh(x='lon', y='lat', geo=True, rasterize=True, cmap='turbo')

In [None]:
# Final clean-up: close the current client, then call shutdown on `cluster`.
# NOTE(review): `cluster` was already shut down in an earlier cell — confirm the second call is harmless.
client.close()
cluster.shutdown()

In [None]:
# (scratch cell: the stray bare name `oh` was removed because it raised NameError on Run All)