# Create a virtual reference to several months of data

In [1]:
import earthaccess 
import xarray as xr
from virtualizarr import open_virtual_dataset
import dask

# To deal with NASA earthdata login :(
import netrc 
import fsspec
import aiohttp

In [2]:
# Query Earthdata for every M2SDNXSLV granule in the window
# 2024-01-01 through 2024-05-31.
results = earthaccess.search_data(
    short_name="M2SDNXSLV",
    temporal=("2024-01-01", "2024-05-31"),
)

Granules found: 122


In [3]:
# Keep the first external-access link of each granule.
# Use access='direct' instead when running in-region on AWS S3 us-west-2.
data_links = [
    result.data_links(access="external")[0]
    for result in results
]

In [4]:
# Look up the Earthdata Login credentials in the user's netrc file and set them
# as the default HTTP basic auth in fsspec's config for the 'https' protocol.
credentials = netrc.netrc().authenticators("urs.earthdata.nasa.gov")
if credentials is None:
    # authenticators() returns None when the netrc file has neither an entry for
    # this host nor a default entry; fail with an actionable message instead of
    # a "cannot unpack non-iterable NoneType" TypeError.
    raise RuntimeError(
        "No netrc entry found for urs.earthdata.nasa.gov; "
        "add your Earthdata Login credentials to your netrc file."
    )
(username, account, password) = credentials
fsspec.config.conf['https'] = dict(client_kwargs={'auth': aiohttp.BasicAuth(username, password)})

In [9]:
# Parallelize open_virtual_dataset: wrapping it in dask.delayed means each call
# below only builds a task; the files are opened when the tasks are computed.
lazy_open_virtual_dataset = dask.delayed(open_virtual_dataset)
lazy_vds = [
    lazy_open_virtual_dataset(
        f,
        filetype='hdf5',
        indexes={},
        # Pass a list of variable names: a bare string is itself an iterable of
        # single characters, so 'time' is not reliably treated as one name.
        loadable_variables=['time'],
        reader_options={},
    )
    for f in data_links
]


In [10]:
%%time 
# this takes a while b/c we open and scan all 122 remote netCDF files
vds_list = dask.compute(*lazy_vds) 

CPU times: user 6.8 s, sys: 1.88 s, total: 8.68 s
Wall time: 1min 18s


In [11]:
# Concatenate the per-file virtual datasets along 'time'.
# coords='minimal' / compat='override' ask xarray not to compare or concatenate
# variables that lack the 'time' dimension (the first dataset's copy is used).
# The reference file is written once, by the to_kerchunk call that follows.
combined_vds = xr.concat(vds_list, dim='time', coords='minimal', compat='override')

In [12]:
# Write the combined virtual dataset out as a kerchunk reference file in JSON format.
combined_vds.virtualize.to_kerchunk(
    "M2SDNXSLV.json",
    format="json",
)

In [13]:
# Shell out to list the reference file and show its size on disk.
!ls -ltrh M2SDNXSLV.json

-rw-r--r--@ 1 scotthenderson  staff   1.5M Jun 14 17:30 M2SDNXSLV.json


In [14]:
# From here on, the 1.5MB reference file is all that is needed to open the
# dataset efficiently through xarray's 'kerchunk' engine.
ds = xr.open_dataset(
    "M2SDNXSLV.json",
    engine="kerchunk",
)
ds