# AWS Zarr Data Tests

### Import Modules

In [5]:
import s3fs
import numpy as np
import xarray as xr
import fsspec
import zarr
import timeit
import matplotlib.pyplot as plt
from dask.distributed import Client

### Dataset URL
<br/>
We use this URL to reference the MUR 1-km L4 dataset on the AWS Open Registry.

In [6]:
URL = 's3://mur-sst/zarr'

## Setup for Global Tests

### Open MUR Dataset
<br/>
Open the remote dataset from its S3 endpoint. The dataset is pre-consolidated, meaning the metadata is stored in a single, optimized resource. We can take advantage of this by declaring 'consolidated=True' which uses that resource to reduce the number of read operations when opening the data.

In [13]:
start_time = timeit.default_timer()

mur_global = xr.open_zarr(
    fsspec.get_mapper(URL, anon=True), 
    consolidated=True, 
).chunk({"time": 30, "lat": 100, "lon": 100})

elapsed = timeit.default_timer() - start_time
print(elapsed)

844.6709574030001


In [14]:
mur_global

Unnamed: 0,Array,Chunk
Bytes,16.70 TB,1.20 MB
Shape,"(6443, 17999, 36000)","(30, 100, 100)"
Count,27928801 Tasks,13932000 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 16.70 TB 1.20 MB Shape (6443, 17999, 36000) (30, 100, 100) Count 27928801 Tasks 13932000 Chunks Type float32 numpy.ndarray",36000  17999  6443,

Unnamed: 0,Array,Chunk
Bytes,16.70 TB,1.20 MB
Shape,"(6443, 17999, 36000)","(30, 100, 100)"
Count,27928801 Tasks,13932000 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,16.70 TB,1.20 MB
Shape,"(6443, 17999, 36000)","(30, 100, 100)"
Count,27928801 Tasks,13932000 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 16.70 TB 1.20 MB Shape (6443, 17999, 36000) (30, 100, 100) Count 27928801 Tasks 13932000 Chunks Type float32 numpy.ndarray",36000  17999  6443,

Unnamed: 0,Array,Chunk
Bytes,16.70 TB,1.20 MB
Shape,"(6443, 17999, 36000)","(30, 100, 100)"
Count,27928801 Tasks,13932000 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.17 TB,300.00 kB
Shape,"(6443, 17999, 36000)","(30, 100, 100)"
Count,27928801 Tasks,13932000 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 4.17 TB 300.00 kB Shape (6443, 17999, 36000) (30, 100, 100) Count 27928801 Tasks 13932000 Chunks Type int8 numpy.ndarray",36000  17999  6443,

Unnamed: 0,Array,Chunk
Bytes,4.17 TB,300.00 kB
Shape,"(6443, 17999, 36000)","(30, 100, 100)"
Count,27928801 Tasks,13932000 Chunks
Type,int8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,16.70 TB,1.20 MB
Shape,"(6443, 17999, 36000)","(30, 100, 100)"
Count,27928801 Tasks,13932000 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 16.70 TB 1.20 MB Shape (6443, 17999, 36000) (30, 100, 100) Count 27928801 Tasks 13932000 Chunks Type float32 numpy.ndarray",36000  17999  6443,

Unnamed: 0,Array,Chunk
Bytes,16.70 TB,1.20 MB
Shape,"(6443, 17999, 36000)","(30, 100, 100)"
Count,27928801 Tasks,13932000 Chunks
Type,float32,numpy.ndarray


### Add in NAN Values for Land to MUR Data
<br/>
We use the mask dimension to replace temperature values from land observations with NaN so that they are not factored in to our calculations. The mask variable has a value for each coordinate pair representing which surface the temperature was collected from (land, open-sea, ice, etc.).

In [15]:
mur_global_sst = mur_global['analysed_sst'].where(mur_global.mask == 1)

### Convert Temperatures to Celsius
<br/>
The dataset is stored with temperatures measured in Kelvin. This converts it to Celsius for ease of understanding and analysis.

In [16]:
mur_global_sst = mur_global_sst - 273.15

### Find Daily Average Temperature for Time Series

In [None]:
mur_global_sst_mean_ts = mur_global_sst.mean(['lat', 'lon'])

### Find Average Temperature for Each Coordinate Pair for Spatial Plot

In [None]:
mur_global_sst_mean_sp = mur_global_sst.mean(['time'])

## Global Tests

### Global SST Averaged Time Series

In [None]:
start_time = timeit.default_timer()

mur_global_sst_mean_ts.plot()

elapsed = timeit.default_timer() - start_time
print(elapsed)

### Global SST Averaged Spatial Plot

In [None]:
start_time = timeit.default_timer()

mur_global_sst_mean_sp.plot()

elapsed = timeit.default_timer() - start_time
print(elapsed)

## Setup for Regional Tests

### Period and Region of Interest

In [19]:
start_date = "2019-08-01"
end_date = "2020-1-20"

minlat = 18
maxlat = 25
minlon = -160
maxlon = -150

### Open MUR Dataset
<br/>
Open the remote dataset from its S3 endpoint. The dataset is pre-consolidated, meaning the metadata is stored in a single, optimized resource. We can take advantage of this by declaring 'consolidated=True' which uses that resource to reduce the number of read operations when opening the data.

In [20]:
start_time = timeit.default_timer()


mur_global = xr.open_zarr(fsspec.get_mapper(URL, anon=True), consolidated=True)

variables=[
    'analysed_sst', 
    'mask'
]

mur_hawaii = mur_global[variables].sel(
    time=slice(start_date, end_date),
    lat=slice(minlat, maxlat), 
    lon=slice(minlon, maxlon),
).chunk({"time": 30, "lat": 100, "lon": 100})


elapsed = timeit.default_timer() - start_time
print(elapsed)

17.777727263000088


In [21]:
mur_hawaii

Unnamed: 0,Array,Chunk
Bytes,485.58 MB,1.20 MB
Shape,"(173, 701, 1001)","(30, 100, 100)"
Count,67307 Tasks,528 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 485.58 MB 1.20 MB Shape (173, 701, 1001) (30, 100, 100) Count 67307 Tasks 528 Chunks Type float32 numpy.ndarray",1001  701  173,

Unnamed: 0,Array,Chunk
Bytes,485.58 MB,1.20 MB
Shape,"(173, 701, 1001)","(30, 100, 100)"
Count,67307 Tasks,528 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,121.39 MB,300.00 kB
Shape,"(173, 701, 1001)","(30, 100, 100)"
Count,67307 Tasks,528 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 121.39 MB 300.00 kB Shape (173, 701, 1001) (30, 100, 100) Count 67307 Tasks 528 Chunks Type int8 numpy.ndarray",1001  701  173,

Unnamed: 0,Array,Chunk
Bytes,121.39 MB,300.00 kB
Shape,"(173, 701, 1001)","(30, 100, 100)"
Count,67307 Tasks,528 Chunks
Type,int8,numpy.ndarray


### Add in NAN Values for Land to MUR Data
<br/>
We use the mask dimension to replace temperature values from land observations with NaN so that they are not factored in to our calculations. The mask variable has a value for each coordinate pair representing which surface the temperature was collected from (land, open-sea, ice, etc.).

In [22]:
mur_hawaii_sst = mur_hawaii['analysed_sst'].where(mur_hawaii.mask == 1)

### Convert Temperatures to Celsius
<br/>
The dataset is stored with temperatures measured in Kelvin. This converts it to Celsius for ease of understanding and analysis.

In [23]:
mur_hawaii_sst = mur_hawaii_sst - 273.15

In [None]:
mur_hawaii_sst

### Open MUR Climatology for Hawaii

In [24]:
mur_clim = xr.open_dataarray(
    "../data/MURClimatology.nc", 
    chunks={"time": 30, "lat": 100, "lon": 100}
)

In [25]:
mur_clim

Unnamed: 0,Array,Chunk
Bytes,1.03 GB,1.20 MB
Shape,"(366, 701, 1001)","(30, 100, 100)"
Count,1145 Tasks,1144 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.03 GB 1.20 MB Shape (366, 701, 1001) (30, 100, 100) Count 1145 Tasks 1144 Chunks Type float32 numpy.ndarray",1001  701  366,

Unnamed: 0,Array,Chunk
Bytes,1.03 GB,1.20 MB
Shape,"(366, 701, 1001)","(30, 100, 100)"
Count,1145 Tasks,1144 Chunks
Type,float32,numpy.ndarray


### Drop the Leap Day

In [26]:
mur_clim = mur_clim.where(mur_clim["time"] != np.datetime64('2004-02-29T09:00:00', 'ns'), drop=True)

### Create Subset

In [39]:
mur_clim_jan = mur_clim[0:20]

In [40]:
mur_clim_subset = mur_clim[212:]

In [41]:
mur_clim_subset = xr.concat([mur_clim_subset, mur_clim_jan], dim="time")

In [43]:
mur_clim_subset = mur_clim_subset.assign_coords({"time": mur_hawaii_sst["time"]})

In [44]:
mur_clim_subset

Unnamed: 0,Array,Chunk
Bytes,485.58 MB,1.20 MB
Shape,"(173, 701, 1001)","(30, 100, 100)"
Count,4692 Tasks,616 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 485.58 MB 1.20 MB Shape (173, 701, 1001) (30, 100, 100) Count 4692 Tasks 616 Chunks Type float32 numpy.ndarray",1001  701  173,

Unnamed: 0,Array,Chunk
Bytes,485.58 MB,1.20 MB
Shape,"(173, 701, 1001)","(30, 100, 100)"
Count,4692 Tasks,616 Chunks
Type,float32,numpy.ndarray


### Create SST Anomaly Dataset

In [45]:
sst_anomaly = mur_hawaii_sst - mur_clim_subset

In [46]:
sst_anomaly

Unnamed: 0,Array,Chunk
Bytes,485.58 MB,1.08 MB
Shape,"(173, 701, 1001)","(27, 100, 100)"
Count,145994 Tasks,1056 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 485.58 MB 1.08 MB Shape (173, 701, 1001) (27, 100, 100) Count 145994 Tasks 1056 Chunks Type float32 numpy.ndarray",1001  701  173,

Unnamed: 0,Array,Chunk
Bytes,485.58 MB,1.08 MB
Shape,"(173, 701, 1001)","(27, 100, 100)"
Count,145994 Tasks,1056 Chunks
Type,float32,numpy.ndarray


### Find Daily Average SST Anomaly for Time Series

In [47]:
sst_anomaly_mean_ts = sst_anomaly.mean(['lat', 'lon'])

In [53]:
sst_anomaly_mean_ts

Unnamed: 0,Array,Chunk
Bytes,692 B,108 B
Shape,"(173,)","(27,)"
Count,147446 Tasks,12 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 692 B 108 B Shape (173,) (27,) Count 147446 Tasks 12 Chunks Type float32 numpy.ndarray",173  1,

Unnamed: 0,Array,Chunk
Bytes,692 B,108 B
Shape,"(173,)","(27,)"
Count,147446 Tasks,12 Chunks
Type,float32,numpy.ndarray


### Find Average SST Anomaly for Each Coordinate Pair for Spatial Plot

In [48]:
sst_anomaly_mean_sp = sst_anomaly.mean(['time'])

In [59]:
sst_anomaly_mean_sp

Unnamed: 0,Array,Chunk
Bytes,2.81 MB,40.00 kB
Shape,"(701, 1001)","(100, 100)"
Count,147402 Tasks,88 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 2.81 MB 40.00 kB Shape (701, 1001) (100, 100) Count 147402 Tasks 88 Chunks Type float32 numpy.ndarray",1001  701,

Unnamed: 0,Array,Chunk
Bytes,2.81 MB,40.00 kB
Shape,"(701, 1001)","(100, 100)"
Count,147402 Tasks,88 Chunks
Type,float32,numpy.ndarray


## Regional Tests

### Regional SST Anomaly Averaged Time Series, August 1st, 2019 - January 20th, 2020

In [49]:
start_time = timeit.default_timer()

sst_anomaly_mean_ts.plot()

elapsed = timeit.default_timer() - start_time
print(elapsed)

KeyboardInterrupt: 

### Regional SST Anomaly Averaged Spatial Plot, August 1st, 2019 - January 20th, 2020

In [None]:
start_time = timeit.default_timer()

sst_anomaly_mean_sp.plot()

elapsed = timeit.default_timer() - start_time
print(elapsed)