In [1]:
import pandas as pd
import xarray as xr
import os
from pathlib import Path
from glob import glob
import numpy as np

In [2]:
# glider id from ngdac
glider = "ng230-20210928T0000"

# urls and directories of data sources
url_glider = "https://data.ioos.us/gliders/erddap"

# Setup data paths
data_dir = Path("/Users/mikesmith/Documents/data/")
glider_dir = data_dir / "gliders"
impact_dir = data_dir / "impact_metrics"
impact_calculated_dir = impact_dir / "calculated"
impact_model_dir = impact_dir / "models"

# Create directories if they don't exist
os.makedirs(data_dir, exist_ok=True)
os.makedirs(glider_dir, exist_ok=True)
os.makedirs(impact_dir, exist_ok=True)
os.makedirs(impact_model_dir, exist_ok=True)
os.makedirs(impact_calculated_dir, exist_ok=True)

In [3]:
def find_nearest(array, value):
    idx = (np.abs(array-value)).argmin()
    return array.flat[idx], idx

In [4]:
# Read glider dataframe output from erddap
glider_pickle = glider_dir / f"{glider}_data.pkl"

try:
    df = pd.read_pickle(glider_pickle)
except FileNotFoundError:
    # Download glider data from erddap with dataset id
    df = get_glider_by_id(glider)
    df.to_pickle(glider_pickle) # Save glider data to pickle file
    
t0 = df.index.min().strftime("%Y-%m-%d")
t1 = df.index.max().strftime("%Y-%m-%d")
df.reset_index(inplace=True)

# Glider - Iterate grouped glider times (each time is a profile)
glider_time = []
glider_lon = []
glider_lat = []

for time, group in df.groupby('time (UTC)'):
    glider_time.append(time)
    glider_lon.append(group['longitude (degrees_east)'].unique()[0])
    glider_lat.append(group['latitude (degrees_north)'].unique()[0])

In [5]:
# # RTOFS
# # Load in RTOFS files locally
# rtofs_files = []
# for date in pd.date_range(t0, t1).to_list():
#     files = glob(os.path.join(url_rtofs, date.strftime('rtofs.%Y%m%d'), '*.nc'))
#     for f in files:
#         if f == '':
#             continue
#         else:
#             rtofs_files.append(f)

In [6]:
# # Load rtofs data and rename the variables
# rtofs = xr.open_mfdataset(sorted(rtofs_files),
#                           parallel=True,
#                           )
rtofs = xr.open_zarr("/Users/mikesmith/Documents/data/rtofs_rechunked.zarr")
rtofs

1. Consolidating metadata in this existing store with zarr.consolidate_metadata().
2. Explicitly setting consolidated=False, to avoid trying to read consolidate metadata, or
3. Explicitly setting consolidated=True, to raise an error in this case instead of falling back to try reading non-consolidated metadata.
  rtofs = xr.open_zarr("/Users/mikesmith/Documents/data/rtofs_rechunked.zarr")


Unnamed: 0,Array,Chunk
Bytes,4.84 MiB,310.97 kiB
Shape,"(1710, 742)","(428, 186)"
Count,17 Tasks,16 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 4.84 MiB 310.97 kiB Shape (1710, 742) (428, 186) Count 17 Tasks 16 Chunks Type float32 numpy.ndarray",742  1710,

Unnamed: 0,Array,Chunk
Bytes,4.84 MiB,310.97 kiB
Shape,"(1710, 742)","(428, 186)"
Count,17 Tasks,16 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.84 MiB,310.97 kiB
Shape,"(1710, 742)","(428, 186)"
Count,17 Tasks,16 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 4.84 MiB 310.97 kiB Shape (1710, 742) (428, 186) Count 17 Tasks 16 Chunks Type float32 numpy.ndarray",742  1710,

Unnamed: 0,Array,Chunk
Bytes,4.84 MiB,310.97 kiB
Shape,"(1710, 742)","(428, 186)"
Count,17 Tasks,16 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,207.22 GiB,0.99 GiB
Shape,"(1096, 40, 1710, 742)","(1096, 40, 57, 106)"
Count,211 Tasks,210 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 207.22 GiB 0.99 GiB Shape (1096, 40, 1710, 742) (1096, 40, 57, 106) Count 211 Tasks 210 Chunks Type float32 numpy.ndarray",1096  1  742  1710  40,

Unnamed: 0,Array,Chunk
Bytes,207.22 GiB,0.99 GiB
Shape,"(1096, 40, 1710, 742)","(1096, 40, 57, 106)"
Count,211 Tasks,210 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,207.22 GiB,0.99 GiB
Shape,"(1096, 40, 1710, 742)","(1096, 40, 57, 106)"
Count,211 Tasks,210 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 207.22 GiB 0.99 GiB Shape (1096, 40, 1710, 742) (1096, 40, 57, 106) Count 211 Tasks 210 Chunks Type float32 numpy.ndarray",1096  1  742  1710  40,

Unnamed: 0,Array,Chunk
Bytes,207.22 GiB,0.99 GiB
Shape,"(1096, 40, 1710, 742)","(1096, 40, 57, 106)"
Count,211 Tasks,210 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,207.22 GiB,0.99 GiB
Shape,"(1096, 40, 1710, 742)","(1096, 40, 57, 106)"
Count,211 Tasks,210 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 207.22 GiB 0.99 GiB Shape (1096, 40, 1710, 742) (1096, 40, 57, 106) Count 211 Tasks 210 Chunks Type float32 numpy.ndarray",1096  1  742  1710  40,

Unnamed: 0,Array,Chunk
Bytes,207.22 GiB,0.99 GiB
Shape,"(1096, 40, 1710, 742)","(1096, 40, 57, 106)"
Count,211 Tasks,210 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,207.22 GiB,0.99 GiB
Shape,"(1096, 40, 1710, 742)","(1096, 40, 57, 106)"
Count,211 Tasks,210 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 207.22 GiB 0.99 GiB Shape (1096, 40, 1710, 742) (1096, 40, 57, 106) Count 211 Tasks 210 Chunks Type float32 numpy.ndarray",1096  1  742  1710  40,

Unnamed: 0,Array,Chunk
Bytes,207.22 GiB,0.99 GiB
Shape,"(1096, 40, 1710, 742)","(1096, 40, 57, 106)"
Count,211 Tasks,210 Chunks
Type,float32,numpy.ndarray


In [None]:
rtofs.close()

In [None]:
# Save rtofs lon and lat as variables to speed up indexing calculation
# rtofs_time = rtofs.time.values
rtofs_lon = rtofs.lon.values
rtofs_lat = rtofs.lat.values

# Find index of nearest lon and lat points
idx_lon = []
for lon in glider_lon:
    val, ind = find_nearest(rtofs_lon[0, :], lon)
    idx_lon.append(ind)
    
idx_lat = []
for lat in glider_lat:
    val, ind = find_nearest(rtofs_lat[:, 0], lat)
    idx_lat.append(ind)

In [None]:
# Create dataarrays for pointwise indexing
# https://stackoverflow.com/questions/40544846/read-multiple-coordinates-with-xarray
lons_rtofs = xr.DataArray(idx_lon, dims='point')
lats_rtofs = xr.DataArray(idx_lat, dims='point')
times = xr.DataArray(glider_time, dims='point')

In [None]:
# Select by time and spatial using nearest neighbor selection
trtofs_orig = rtofs[['temperature', 'salinity']].sel(Y=lats_rtofs, X=lons_rtofs, method='nearest')
trtofs_orig

In [None]:
from dask.diagnostics import ProgressBar

# or distributed.progress when using the distributed scheduler
delayed_obj = rtofs.to_netcdf(impact_model_dir / f"{glider}_chunk_rtofs_data.nc", compute=False)

with ProgressBar():
    results = delayed_obj.compute()