# Create JSONs with fsspec-reference-maker
## 24 hours of GOES data
Needed:
- fsspec-reference-maker
    - `pip install git+https://github.com/intake/fsspec-reference-maker`
- adlfs >= 0.7.7
    - `pip install --upgrade "adlfs>=0.7.7"` (quote the requirement so the shell does not treat `>` as a redirect)

In [None]:
from fsspec_reference_maker.hdf import SingleHdf5ToZarr 
from fsspec_reference_maker.combine import MultiZarrToZarr

In [None]:
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import zipfile
import logging
import fsspec
import ujson
from tqdm import tqdm
from glob import glob
import os

from azure.storage.blob import ContainerClient
import tempfile

import dask

In [None]:
import adlfs
# Filesystem handle for the "goeseuwest" Azure storage account; no
# credentials are passed.
fs = adlfs.AzureBlobFileSystem(account_name="goeseuwest")

# List the top level of the "noaa-goes16" container.
fs.ls("noaa-goes16")


In [None]:
# Same storage account as before, this time obtained through fsspec's 'az'
# protocol.
fs = fsspec.filesystem('az', account_name='goeseuwest')
# All .nc files under ABI-L2-MCMIPF/2020/002/00/ in the noaa-goes16 container.
fs.glob('az://noaa-goes16/ABI-L2-MCMIPF/2020/002/00/*.nc')

In [None]:
# Display the bound method itself (it is not called here) to inspect it.
fs.glob 

In [None]:
# Add the shared user library directory under $HOME to sys.path so that ebdpy
# can be imported.
import os
import sys
sys.path.append(os.path.join(os.environ['HOME'],'shared','users','lib'))
import ebdpy as ebd

# NOTE(review): credentials are set here with the profile only, and again
# below with region and endpoint -- confirm whether this first call is needed.
ebd.set_credentials(profile='esip-qhub')

profile = 'esip-qhub'
region = 'us-west-2'
endpoint = f's3.{region}.amazonaws.com'
ebd.set_credentials(profile=profile, region=region, endpoint=endpoint)
# Ask ebdpy for a dask cluster (up to worker_max workers, fixed rather than
# adaptive scaling); the returned client and cluster are kept for later cells.
worker_max = 10
client,cluster = ebd.start_dask_cluster(profile=profile,worker_max=worker_max, 
                                      region=region, use_existing_cluster=True,
                                      adaptive_scaling=False, wait_for_cluster=False, 
                                      environment='pangeo', worker_profile='Pangeo Worker', 
                                      propagate_env=True)

In [None]:
# Start a dask client with default arguments and display it.
# NOTE(review): this rebinds `client`, replacing the one returned by
# ebd.start_dask_cluster in the previous cell -- presumably only one of the
# two cells is meant to be run; confirm.
from dask.distributed import Client
client = Client()
client

## Get urls

In [None]:
# Scratch directory for downloaded files, placed under the system temp dir.
tempdir = os.path.join(tempfile.gettempdir(), 'goes')
os.makedirs(tempdir, exist_ok=True)

# Product name plus the year / day / hour strings used to select blobs.
product = 'ABI-L2-MCMIPF'
syear, sday, shour = '2020', '002', '14'

# Storage account endpoint and container name; the blob root is their join
# with a trailing slash.
storage_account_url = 'https://goes.blob.core.windows.net'
container_name = 'noaa-goes16'
goes_blob_root = f'{storage_account_url}/{container_name}/'

# Client for `container_name` at `storage_account_url`; credential=None means
# no credential is supplied.
goes_container_client = ContainerClient(account_url=storage_account_url, container_name=container_name, credential=None)

def download_url(url):
    """Download `url` into `tempdir` and return the local file path.

    The local file name is derived from the URL by replacing '://' and every
    '/' with '_', so each distinct URL maps to its own file in `tempdir`.
    """
    # Imported locally: the notebook's import cell never brings in
    # urllib.request, so the original call raised NameError.
    import urllib.request

    url_as_filename = url.replace('://', '_').replace('/', '_')
    destination_filename = os.path.join(tempdir, url_as_filename)
    urllib.request.urlretrieve(url, destination_filename)
    return destination_filename

# The prefix is <product>/<year>/<day>/, so it selects every blob for the
# chosen product and day.
prefix = f'{product}/{syear}/{sday}/'
print(f'Finding blobs matching prefix: {prefix}')
generator = goes_container_client.list_blobs(name_starts_with=prefix)
# Keep only the blob names; they are turned into az:// URLs in a later cell.
blobs = [blob.name for blob in generator]

In [None]:
# Prefix every blob name with the az:// protocol and the container name.
urllist = [f'az://{container_name}/{blob_name}' for blob_name in blobs]

In [None]:
# Show the first URL to check its form.
urllist[0]

In [None]:
#storage_options = {'account_name': 'azureopendatastorage'}
#ddf = dd.read_parquet('az://nyctlc/green/puYear=2019/puMonth=*/*.parquet', storage_options=storage_options)

In [None]:
#storage_options = {'account_name': ''https://goes.blob.core.windows.net''}
#fs.ls('az://noaa-goes16/ABI-L2-MCMIPF/'')

In [None]:
# Close the dask client.
# NOTE(review): later cells still call dask.compute and client.shutdown() --
# confirm this cell is meant to run before them.
client.close()

In [None]:
# "noaa-goes16" is the container (the first path component of the az:// URLs),
# not the storage account. Azure storage account names cannot contain hyphens,
# so account_name must be the hosting account, as used elsewhere in this
# notebook.
fs = fsspec.filesystem("az", account_name="goeseuwest")

In [None]:
# List the contents of the ABI-L2-MCMIPF directory in the noaa-goes16 container.
fs.ls('az://noaa-goes16/ABI-L2-MCMIPF/')

### Generate json function

In [None]:
def gen_json(u):
    """Write a reference JSON for one remote HDF5/NetCDF4 file.

    Opens `u` through fsspec against the "goeseuwest" account, translates it
    with SingleHdf5ToZarr, and writes the resulting references to
    ``jsons/<last path component of u>.json``.

    Parameters
    ----------
    u : str
        URL of the source file, e.g. one of the ``az://`` entries in `urllist`.
    """
    so = dict(
        mode="rb", anon=True, default_fill_cache=False, default_cache_type="none"
    )
    # Create the output directory up front; exist_ok keeps this safe when many
    # dask tasks run this function at the same time.
    os.makedirs("jsons", exist_ok=True)
    with fsspec.open(u, **so, account_name='goeseuwest') as inf:
        h5chunks = SingleHdf5ToZarr(inf, u, inline_threshold=300)
        # Translate while the remote file is still open, and before the output
        # file is created, so a failure does not leave an empty JSON behind.
        refs = h5chunks.translate()
    with open(f"jsons/{u.split('/')[-1]}.json", 'wb') as outf:
        outf.write(ujson.dumps(refs).encode())


### Use dask to make jsons

In [None]:
%%time
# Build one delayed gen_json task per URL and compute them all; the trailing
# semicolon suppresses the notebook's display of the returned tuple.
dask.compute(*[dask.delayed(gen_json)(u) for u in urllist]);

In [None]:
`fs = fsspec.filesystem("az", account_name="noaa-goes16")` should work.


## MultiZarr

In [None]:
# Sorted paths of the per-file reference JSONs in the jsons/ directory.
json_list = sorted(glob("jsons/*.json"))

In [None]:
# Set up the combine step over the per-file reference JSONs.
mzz = MultiZarrToZarr(
    json_list,
    remote_protocol='az',
    remote_options=dict(account_name='goeseuwest'),
    # Every xarray decoding option is switched off when opening the inputs.
    xarray_open_kwargs=dict(
        decode_cf=False,
        mask_and_scale=False,
        decode_times=False,
        use_cftime=False,
        decode_coords=False,
    ),
    # Concatenate along "t"; everything else uses minimal / override settings.
    xarray_concat_args=dict(
        data_vars="minimal",
        coords="minimal",
        compat="override",
        join="override",
        combine_attrs="override",
        dim="t",
    ),
)

In [None]:
%%time
%%prun -D multizarr_profile 
# Run the combine step; 'combined.json' is presumably the output path --
# confirm against the MultiZarrToZarr.translate signature.
mzz.translate('combined.json')

In [None]:
# Shut down via the dask client once processing is finished.
# NOTE(review): an earlier cell already calls client.close() -- confirm which
# of the two is intended.
client.shutdown()

***
## Processing times:
|Action | Time | Note |
|-------:|:------| :---|
|Make individual jsons | 26min 39s | 4 workers, faster times can be achieved with more dask workers |
|Make combined json | 55min 6s | don't think this can be sped up w/ dask |
|Make combined v2 | 45min 52s | Actual CPU time is 25min, might try `az://` instead of `abfs`|
| __Total__ | __1h 21min__ | Individual jsons + first combined run |