# Inspection of files


In [1]:
import fsspec
import xarray as xr
import hvplot.xarray

In [2]:
# S3 filesystem handle opened with anon=True, i.e. no AWS credentials are supplied.
fs = fsspec.filesystem('s3', anon=True)

In [3]:
# List the top-level entries of the bucket to see how it is organised.
flist = fs.ls('s3://noaa-nwm-retrospective-2-1-pds/')
flist

['noaa-nwm-retrospective-2-1-pds/forcing',
 'noaa-nwm-retrospective-2-1-pds/index.html',
 'noaa-nwm-retrospective-2-1-pds/model_output']

In [4]:
# Enumerate the entries under model_output/ and show only the first and
# last one, which is enough to see the range covered.
flist = fs.glob('noaa-nwm-retrospective-2-1-pds/model_output/*')
print(flist[0], flist[-1], sep='\n')

noaa-nwm-retrospective-2-1-pds/model_output/1979
noaa-nwm-retrospective-2-1-pds/model_output/2020


In [5]:
# Restrict the listing to a single year (1980) and to names containing "LDAS";
# preview the first five matches. `flist` is reused below to open a file.
flist = fs.glob('noaa-nwm-retrospective-2-1-pds/model_output/1980/*LDAS*')
flist[:5]

['noaa-nwm-retrospective-2-1-pds/model_output/1980/198001010000.LDASOUT_DOMAIN1.comp',
 'noaa-nwm-retrospective-2-1-pds/model_output/1980/198001010300.LDASOUT_DOMAIN1.comp',
 'noaa-nwm-retrospective-2-1-pds/model_output/1980/198001010600.LDASOUT_DOMAIN1.comp',
 'noaa-nwm-retrospective-2-1-pds/model_output/1980/198001010900.LDASOUT_DOMAIN1.comp',
 'noaa-nwm-retrospective-2-1-pds/model_output/1980/198001011200.LDASOUT_DOMAIN1.comp']

Okay, so at this point we've learned that we have 3-hourly output over roughly 40 years (the year directories run from 1979 to 2020).

In [6]:
# Globbing every year at once is slow, so the total number of files is
# estimated from the output cadence instead of being listed. For reference,
# the slow listing would be (prefix the cell with %%time to measure it):
# flist = fs.glob('noaa-nwm-retrospective-2-1-pds/model_output/*/*LDAS*')
N_YEARS = 40                # rough length of the record (directories run 1979-2020)
DAYS_PER_YEAR = 365         # leap days ignored; this is only an estimate
HOURS_PER_DAY = 24
OUTPUT_INTERVAL_HOURS = 3   # file names step by 3 hours

# Same arithmetic and evaluation order as 40 * 365 * 24 / 3.
n_files_estimate = N_YEARS * DAYS_PER_YEAR * HOURS_PER_DAY / OUTPUT_INTERVAL_HOURS
n_files_estimate

116800.0

So there are about 117,000 files of this type!

In [7]:
# Open the first listed file. Passing chunks={} makes the variables
# dask-backed, so no array data is read until it is explicitly requested.
first_file = fs.open(flist[0])
ds = xr.open_dataset(first_file, chunks={})

In [18]:
# Show the data variables in the file along with their dimensions and chunking.
ds.data_vars

Data variables:
    crs       |S1 ...
    COSZ      (time, y, x) float64 dask.array<chunksize=(1, 3840, 4608), meta=np.ndarray>
    FSA       (time, y, x) float64 dask.array<chunksize=(1, 3840, 4608), meta=np.ndarray>
    FIRA      (time, y, x) float64 dask.array<chunksize=(1, 3840, 4608), meta=np.ndarray>
    HFX       (time, y, x) float64 dask.array<chunksize=(1, 3840, 4608), meta=np.ndarray>
    LH        (time, y, x) float64 dask.array<chunksize=(1, 3840, 4608), meta=np.ndarray>
    EDIR      (time, y, x) float64 dask.array<chunksize=(1, 3840, 4608), meta=np.ndarray>
    ALBEDO    (time, y, x) float64 dask.array<chunksize=(1, 3840, 4608), meta=np.ndarray>
    UGDRNOFF  (time, y, x) float64 dask.array<chunksize=(1, 3840, 4608), meta=np.ndarray>
    TRAD      (time, y, x) float64 dask.array<chunksize=(1, 3840, 4608), meta=np.ndarray>
    SOIL_W    (time, y, soil_layers_stag, x) float64 dask.array<chunksize=(1, 3840, 4, 4608), meta=np.ndarray>
    SOIL_M    (time, y, soil_layers_stag,

In [19]:
# Narrow the dataset down to the three variables examined below.
keep_vars = ['ACCET', 'SNEQV', 'FSNO']
ds = ds[keep_vars]

In [20]:
# Display the reduced dataset (ACCET, SNEQV, FSNO only).
ds

Unnamed: 0,Array,Chunk
Bytes,135.00 MiB,135.00 MiB
Shape,"(1, 3840, 4608)","(1, 3840, 4608)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 135.00 MiB 135.00 MiB Shape (1, 3840, 4608) (1, 3840, 4608) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",4608  3840  1,

Unnamed: 0,Array,Chunk
Bytes,135.00 MiB,135.00 MiB
Shape,"(1, 3840, 4608)","(1, 3840, 4608)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,135.00 MiB,135.00 MiB
Shape,"(1, 3840, 4608)","(1, 3840, 4608)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 135.00 MiB 135.00 MiB Shape (1, 3840, 4608) (1, 3840, 4608) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",4608  3840  1,

Unnamed: 0,Array,Chunk
Bytes,135.00 MiB,135.00 MiB
Shape,"(1, 3840, 4608)","(1, 3840, 4608)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,135.00 MiB,135.00 MiB
Shape,"(1, 3840, 4608)","(1, 3840, 4608)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 135.00 MiB 135.00 MiB Shape (1, 3840, 4608) (1, 3840, 4608) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",4608  3840  1,

Unnamed: 0,Array,Chunk
Bytes,135.00 MiB,135.00 MiB
Shape,"(1, 3840, 4608)","(1, 3840, 4608)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [21]:
# Inspect a single variable to see its shape and chunk layout.
ds.ACCET

Unnamed: 0,Array,Chunk
Bytes,135.00 MiB,135.00 MiB
Shape,"(1, 3840, 4608)","(1, 3840, 4608)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 135.00 MiB 135.00 MiB Shape (1, 3840, 4608) (1, 3840, 4608) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",4608  3840  1,

Unnamed: 0,Array,Chunk
Bytes,135.00 MiB,135.00 MiB
Shape,"(1, 3840, 4608)","(1, 3840, 4608)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


The data is chunked with the full spatial domain and one time step per chunk, giving a chunk size of about 135 MB. This is actually great for visualizing maps at specific time steps or for calculations that involve the entire dataset, so kerchunking this data would be a nice first step.

Let's load one variable and see how long it takes:

In [28]:
%%time
# Read the ACCET values into memory; %%time reports how long the load takes.
da = ds.ACCET.load()

CPU times: user 664 ms, sys: 164 ms, total: 828 ms
Wall time: 3.81 s


In [29]:
# Plot the loaded ACCET field as a map over x/y. rasterize=True is requested
# because the grid is large (3840 x 4608); data_aspect=1 keeps x and y on the
# same scale.
da.hvplot(x='x', y='y', rasterize=True, cmap='turbo', data_aspect=1)