Import necessary libraries and packages

In [2]:
# Load some useful modules 
import numpy as np
import xarray as xr
import xrft
from xmitgcm import llcreader
from matplotlib import pyplot as plt
import cmocean.cm as cm
import gcm_filters
import zarr

%matplotlib inline

Import the data we want to handle

In [3]:
from intake import open_catalog

# Intake catalog (hosted in the pangeo-datastore repository) that lists the
# LLC4320 ocean entries used in this notebook.
llc4320_catalog_url = "https://raw.githubusercontent.com/pangeo-data/pangeo-datastore/master/intake-catalogs/ocean/llc4320.yaml"
cat = open_catalog(llc4320_catalog_url)

In [4]:
# Pull the SST and SSH entries out of the catalog; `to_dask()` hands each one
# back as a dataset object (the two are combined with `xr.merge` further down).
sst = cat.LLC4320_SST.to_dask()
ssh = cat.LLC4320_SSH.to_dask()

  'dims': dict(self._ds.dims),
  'dims': dict(self._ds.dims),


In [5]:
# Combine SST and SSH into one dataset and pass it through llcreader's
# faces -> lat-lon conversion. Built in a single expression so `ds` is never
# left holding the intermediate per-face dataset if the cell is re-run.
# `metric_vector_pairs=[]` is forwarded unchanged (presumably because these
# datasets carry no grid-metric vector pairs -- confirm).
ds = llcreader.llcmodel.faces_dataset_to_latlon(
    xr.merge([sst, ssh]), metric_vector_pairs=[]
)

# Open the grid entry of the catalog once and reuse it for both the lat-lon
# coordinates and the single-face coordinates below.
grid_faces = cat.LLC4320_grid.to_dask()

# Grid variables on the lat-lon layout; `reset_coords()` turns the non-index
# coordinates into data variables before the conversion.
coords = llcreader.llcmodel.faces_dataset_to_latlon(grid_faces.reset_coords())

# Get data on a single face
ds_face = xr.merge([ssh.sel(face=1), sst.sel(face=1)])

# Get coordinates for the same face from the catalog grid
coords_face = grid_faces.sel(face=1)

  data = np.arange(ifac * coords.dims[vname])
  data = np.arange(jfac * coords.dims[vname])
  'dims': dict(self._ds.dims),
  data = np.arange(ifac * coords.dims[vname])
  data = np.arange(jfac * coords.dims[vname])


Specify the attributes of the filter we want to apply to our datasets

In [6]:
# Gaussian filter on a regular grid without a land mask (so `grid_vars` is empty).
# NOTE(review): the name `filter` shadows Python's builtin; it is kept because
# later cells call `filter.apply(...)`.
gaussian_spec = dict(
    filter_scale=16,
    dx_min=1,
    filter_shape=gcm_filters.FilterShape.GAUSSIAN,
)
filter = gcm_filters.Filter(
    grid_type=gcm_filters.GridType.REGULAR,
    grid_vars={},
    **gaussian_spec,
)

In [7]:
# Region 1: every 120th time index below 9030, restricted to the i/j index
# window 12000-13279 x 6000-7279 of the `Eta` variable.
ds_in_1 = ds["Eta"].isel(
    time=slice(0, 9030, 120),
    i=slice(12000, 13280),
    j=slice(6000, 7280),
)

In [8]:
# Region 2: same time sub-sampling as region 1 (every 120th index below 9030),
# restricted to the i/j index window 0-1279 x 4000-5279 of the `Eta` variable.
ds_in_2 = ds["Eta"].isel(
    time=slice(0, 9030, 120),
    i=slice(0, 1280),
    j=slice(4000, 5280),
)

In [9]:
# Build float32 masks that are 1 where Eta holds a value and 0 where it is NaN.
# They are handed to gcm_filters as `wet_mask` in the next cells.
missing1 = np.isnan(ds_in_1)
missing2 = np.isnan(ds_in_2)
mask1 = (~missing1).astype('float32')
mask2 = (~missing2).astype('float32')

In [10]:
# Same Gaussian settings as the unmasked filter, but on the REGULAR_WITH_LAND
# grid type, which takes region 1's mask as `wet_mask`.
# The mask is rechunked so that i and j each sit in a single chunk.
wet_mask1 = mask1.chunk({'i':-1,'j':-1})
filter_masked1 = gcm_filters.Filter(
    grid_type=gcm_filters.GridType.REGULAR_WITH_LAND,
    grid_vars={'wet_mask': wet_mask1},
    filter_shape=gcm_filters.FilterShape.GAUSSIAN,
    filter_scale=16,
    dx_min=1,
)

In [11]:
# Masked Gaussian filter for region 2: identical settings to region 1's masked
# filter, but using region 2's mask as `wet_mask`.
# The mask is rechunked so that i and j each sit in a single chunk.
wet_mask2 = mask2.chunk({'i':-1,'j':-1})
filter_masked2 = gcm_filters.Filter(
    grid_type=gcm_filters.GridType.REGULAR_WITH_LAND,
    grid_vars={'wet_mask': wet_mask2},
    filter_shape=gcm_filters.FilterShape.GAUSSIAN,
    filter_scale=16,
    dx_min=1,
)

Filter the data and then compute it so we can use it 

In [12]:
# Apply the unmasked filter to region 1 over the (j, i) dimensions, after
# rechunking so that i and j each sit in a single chunk.
# NOTE(review): the few-millisecond timing printed for this cell suggests only
# a lazy task graph is built here, not the filtered values -- confirm.
%time ds_in_1_filtered = filter.apply(ds_in_1.chunk({'i':-1,'j':-1}), dims=['j','i'])

CPU times: user 4.16 ms, sys: 0 ns, total: 4.16 ms
Wall time: 4.13 ms


In [13]:
# Apply the unmasked filter to region 2 over the (j, i) dimensions, after
# rechunking so that i and j each sit in a single chunk.
# NOTE(review): the few-millisecond timing printed for this cell suggests only
# a lazy task graph is built here, not the filtered values -- confirm.
%time ds_in_2_filtered = filter.apply(ds_in_2.chunk({'i':-1,'j':-1}), dims=['j','i'])

CPU times: user 3.14 ms, sys: 15 μs, total: 3.15 ms
Wall time: 3.12 ms


In [14]:
# `.compute()` returns a new in-memory object and leaves `ds_in_1_filtered`
# itself lazy, so the computed values were discarded once the cell had been
# displayed. `.load()` evaluates the array in place and returns it, so the cell
# still shows the result and later cells can reuse it without recomputing.
ds_in_1_filtered.load()

In [15]:
# `.compute()` returns a new in-memory object and leaves `ds_in_2_filtered`
# itself lazy, so the computed values were discarded once the cell had been
# displayed. `.load()` evaluates the array in place and returns it, so the cell
# still shows the result and later cells can reuse it without recomputing.
ds_in_2_filtered.load()

In [14]:
# Apply the masked (REGULAR_WITH_LAND) filter to region 1 over (j, i); the input
# is rechunked the same way as the wet mask given to `filter_masked1`.
# NOTE(review): the few-millisecond timing printed for this cell suggests only
# a lazy task graph is built here -- confirm.
%time ds_in_1_filtered_masked = filter_masked1.apply(ds_in_1.chunk({'i':-1,'j':-1}), dims=['j','i'])

CPU times: user 3.95 ms, sys: 0 ns, total: 3.95 ms
Wall time: 3.87 ms


In [15]:
# Apply the masked (REGULAR_WITH_LAND) filter to region 2 over (j, i); the input
# is rechunked the same way as the wet mask given to `filter_masked2`.
# NOTE(review): the few-millisecond timing printed for this cell suggests only
# a lazy task graph is built here -- confirm.
%time ds_in_2_filtered_masked = filter_masked2.apply(ds_in_2.chunk({'i':-1,'j':-1}), dims=['j','i'])

CPU times: user 3.59 ms, sys: 0 ns, total: 3.59 ms
Wall time: 3.54 ms


In [16]:
# Cut region 1 into square tiles and keep, for every tile, both the raw field
# and its masked-filtered counterpart.
tile_size = 128  # tile edge length in grid points, before sub-sampling
subsample = 4    # keep every 4th grid point along i and j inside a tile

smaller_datasets1_unfiltered = []
smaller_datasets1_filtered = []

# `DataArray.sizes` maps dimension name -> length directly. The previous
# `to_dataset().dims['i']` lookup raised a FutureWarning.
original_i_size1 = ds_in_1.sizes['i']
original_j_size1 = ds_in_1.sizes['j']

# Walk over the region tile by tile
for i_start in range(0, original_i_size1, tile_size):
    for j_start in range(0, original_j_size1, tile_size):
        # Clamp the tile's end index to the extent of the region
        i_end = min(i_start + tile_size, original_i_size1)
        j_end = min(j_start + tile_size, original_j_size1)

        # Same index window for the filtered and the unfiltered field
        tile_window = dict(
            time=slice(0,76),
            i=slice(i_start, i_end, subsample),
            j=slice(j_start, j_end, subsample),
        )

        # Drop the i/j coordinate labels (presumably so that tiles taken from
        # different positions can later be concatenated without being aligned
        # on those labels).
        ds_filtered_subset1 = ds_in_1_filtered_masked.isel(**tile_window).drop_vars(['i','j'])
        ds_unfiltered_subset1 = ds_in_1.isel(**tile_window).drop_vars(['i','j'])

        # Append the subsets to the respective lists
        smaller_datasets1_filtered.append(ds_filtered_subset1)
        smaller_datasets1_unfiltered.append(ds_unfiltered_subset1)


  original_i_size1 = ds_in_1.to_dataset().dims['i']
  original_j_size1 = ds_in_1.to_dataset().dims['j']


We do the exact same thing with our second dataset

In [17]:
# Cut region 2 into square tiles and keep, for every tile, both the raw field
# and its masked-filtered counterpart.
tile_size = 128  # tile edge length in grid points, before sub-sampling
subsample = 4    # keep every 4th grid point along i and j inside a tile

smaller_datasets2_unfiltered = []
smaller_datasets2_filtered = []

# `DataArray.sizes` maps dimension name -> length directly. The previous
# `to_dataset().dims['i']` lookup raised a FutureWarning.
original_i_size2 = ds_in_2.sizes['i']
original_j_size2 = ds_in_2.sizes['j']

# Walk over the region tile by tile
for i_start in range(0, original_i_size2, tile_size):
    for j_start in range(0, original_j_size2, tile_size):
        # Clamp the tile's end index to the extent of the region
        i_end = min(i_start + tile_size, original_i_size2)
        j_end = min(j_start + tile_size, original_j_size2)

        # Same index window for the unfiltered and the filtered field
        tile_window = dict(
            time=slice(0,76),
            i=slice(i_start, i_end, subsample),
            j=slice(j_start, j_end, subsample),
        )

        # Drop the i/j coordinate labels (presumably so that tiles taken from
        # different positions can later be concatenated without being aligned
        # on those labels).
        ds_unfiltered_subset2 = ds_in_2.isel(**tile_window).drop_vars(['i','j'])
        ds_filtered_subset2 = ds_in_2_filtered_masked.isel(**tile_window).drop_vars(['i','j'])

        # Append the subsets to the respective lists
        smaller_datasets2_unfiltered.append(ds_unfiltered_subset2)
        smaller_datasets2_filtered.append(ds_filtered_subset2)

  original_i_size2 = ds_in_2.to_dataset().dims['i']
  original_j_size2 = ds_in_2.to_dataset().dims['j']


Now we concatenate the tiles of each region along a new `dataset` dimension

In [18]:
# Stack the per-tile arrays of each region along a new "dataset" dimension,
# giving one entry per tile. Region 1 first, then region 2.
ds1_unfiltered = xr.concat(smaller_datasets1_unfiltered, dim='dataset')
ds1_filtered = xr.concat(smaller_datasets1_filtered, dim='dataset')
ds2_unfiltered = xr.concat(smaller_datasets2_unfiltered, dim='dataset')
ds2_filtered = xr.concat(smaller_datasets2_filtered, dim='dataset')

In [29]:
# Open the `ds1_unfiltered` zarr store from the bucket; `chunks={}` asks xarray
# for a dask-backed dataset. NOTE(review): the step that wrote this store is not
# shown in this notebook -- presumably it holds the `ds1_unfiltered` tiles.
ds4 = xr.open_dataset(
    "gs://leap-persistent/funky-user/ds1_unfiltered.zarr",
    engine="zarr",
    chunks={},
)

In [33]:
# Open the `ds1_filtered` zarr store from the bucket; `chunks={}` asks xarray
# for a dask-backed dataset. NOTE(review): the step that wrote this store is not
# shown in this notebook -- presumably it holds the `ds1_filtered` tiles.
ds5 = xr.open_dataset(
    "gs://leap-persistent/funky-user/ds1_filtered.zarr",
    engine="zarr",
    chunks={},
)

In [30]:
# Display the dataset opened from the `ds1_unfiltered` store to check its shape, chunking and dtype.
ds4

Unnamed: 0,Array,Chunk
Bytes,29.69 MiB,2.00 kiB
Shape,"(100, 76, 32, 32)","(1, 1, 32, 16)"
Dask graph,15200 chunks in 2 graph layers,15200 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 29.69 MiB 2.00 kiB Shape (100, 76, 32, 32) (1, 1, 32, 16) Dask graph 15200 chunks in 2 graph layers Data type float32 numpy.ndarray",100  1  32  32  76,

Unnamed: 0,Array,Chunk
Bytes,29.69 MiB,2.00 kiB
Shape,"(100, 76, 32, 32)","(1, 1, 32, 16)"
Dask graph,15200 chunks in 2 graph layers,15200 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [36]:
# Display the dataset opened from the `ds1_filtered` store to check its shape, chunking and dtype.
ds5

Unnamed: 0,Array,Chunk
Bytes,29.69 MiB,4.00 kiB
Shape,"(100, 76, 32, 32)","(1, 1, 32, 32)"
Dask graph,7600 chunks in 2 graph layers,7600 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 29.69 MiB 4.00 kiB Shape (100, 76, 32, 32) (1, 1, 32, 32) Dask graph 7600 chunks in 2 graph layers Data type float32 numpy.ndarray",100  1  32  32  76,

Unnamed: 0,Array,Chunk
Bytes,29.69 MiB,4.00 kiB
Shape,"(100, 76, 32, 32)","(1, 1, 32, 32)"
Dask graph,7600 chunks in 2 graph layers,7600 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
