Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preserve nanosecond resolution when encoding/decoding times #7827

Merged
merged 26 commits into from Sep 17, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
db94bb9
preserve nanosecond resolution when encoding/decoding times.
kmuehlbauer Aug 31, 2023
82b74ec
Apply suggestions from code review
kmuehlbauer Sep 4, 2023
8da55ac
use emit_user_level_warning
kmuehlbauer Sep 4, 2023
96f60fc
move time alignment for nc3 to encode_nc3_variable
kmuehlbauer Sep 4, 2023
4f6440a
fix test for encode_cf_timedelta
kmuehlbauer Sep 4, 2023
f8e961f
fix CFMaskCoder for time-like (also allow timedelta64), add first tests
kmuehlbauer Sep 4, 2023
c97d7b4
Merge branch 'main' into nanoseconds-resolution
kmuehlbauer Sep 5, 2023
06ed4bf
Merge branch 'main' into nanoseconds-resolution
kmuehlbauer Sep 5, 2023
5e4a5ee
Merge branch 'main' into nanoseconds-resolution
kmuehlbauer Sep 7, 2023
2365da0
Merge branch 'main' into nanoseconds-resolution
kmuehlbauer Sep 11, 2023
d020bbc
rename to _unpack_time_units_and_ref_date as suggested in review
kmuehlbauer Sep 10, 2023
a8c6057
refactor delta -> time_units as suggested in review
kmuehlbauer Sep 10, 2023
9b96ff7
refactor out function _time_units_to_timedelta64, reorder flow and re…
kmuehlbauer Sep 10, 2023
5adb58e
import _is_time_like from coding.variables
kmuehlbauer Sep 11, 2023
87fbb1a
adapt tests, add _numpy_to_netcdf_timeunit-conversion function
kmuehlbauer Sep 11, 2023
1fbc881
adapt tests, add _numpy_to_netcdf_timeunit-conversion function
kmuehlbauer Sep 11, 2023
8a6ecb5
Merge branch 'main' into nanoseconds-resolution
kmuehlbauer Sep 11, 2023
c1f810c
Merge branch 'main' into nanoseconds-resolution
kmuehlbauer Sep 11, 2023
7f2b8b4
Merge branch 'main' into nanoseconds-resolution
kmuehlbauer Sep 13, 2023
d538ea9
adapt test as per review, remove arm_xfail for backend test
kmuehlbauer Sep 13, 2023
a75e4b8
add whats-new.rst entry
kmuehlbauer Sep 13, 2023
d4a71cd
Update doc/whats-new.rst
kmuehlbauer Sep 14, 2023
4dca66e
Update doc/whats-new.rst
kmuehlbauer Sep 14, 2023
ee1b939
Merge remote-tracking branch 'origin/main' into nanoseconds-resolution
kmuehlbauer Sep 16, 2023
ebb00b8
fix whats-new.rst
kmuehlbauer Sep 16, 2023
c4d4a92
Merge remote-tracking branch 'origin/main' into nanoseconds-resolution
kmuehlbauer Sep 17, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
29 changes: 28 additions & 1 deletion xarray/backends/netcdf3.py
Expand Up @@ -88,13 +88,40 @@ def encode_nc3_attrs(attrs):
return {k: encode_nc3_attr_value(v) for k, v in attrs.items()}


def _maybe_prepare_times(var):
# checks for integer-based time-like and
# replaces np.iinfo(np.int64).min with _FillValue or np.nan
# this keeps backwards compatibility

# should we import this from coding.times here?
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
time_strings = [
"days",
"hours",
"minutes",
"seconds",
"milliseconds",
"microseconds",
"since",
]
data = var.data
if data.dtype.kind in "iu":
units = var.attrs.get("units", None)
if units is not None:
if any(tstr in units for tstr in time_strings):
mask = data == np.iinfo(np.int64).min
if mask.any():
data = np.where(mask, var.attrs.get("_FillValue", np.nan), data)
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
return data


def encode_nc3_variable(var):
    """Encode a Variable so it can be written to a netCDF3 file.

    Strings are down-converted to fixed-width character arrays, the integer
    NaT sentinel in time-like data is replaced via ``_maybe_prepare_times``,
    and the data/attributes are coerced to netCDF3-compatible dtypes.
    """
    string_coders = (
        coding.strings.EncodedStringCoder(allows_unicode=False),
        coding.strings.CharacterArrayCoder(),
    )
    for string_coder in string_coders:
        var = string_coder.encode(var)
    prepared = _maybe_prepare_times(var)
    data = coerce_nc3_dtype(prepared)
    attrs = encode_nc3_attrs(var.attrs)
    return Variable(var.dims, data, attrs, var.encoding)

Expand Down
85 changes: 65 additions & 20 deletions xarray/coding/times.py
Expand Up @@ -25,6 +25,7 @@
from xarray.core.formatting import first_n_items, format_timestamp, last_item
from xarray.core.pdcompat import nanosecond_precision_timestamp
from xarray.core.pycompat import is_duck_dask_array
from xarray.core.utils import emit_user_level_warning
from xarray.core.variable import Variable

try:
Expand Down Expand Up @@ -171,6 +172,20 @@ def _unpack_netcdf_time_units(units: str) -> tuple[str, str]:
return delta_units, ref_date


def _unpack_delta_ref_date(units):
    """Split CF time ``units`` into ``(delta, ref_date)``.

    Same as ``_unpack_netcdf_time_units`` but additionally finalizes the
    reference date — as a timezone-naive nanosecond-precision Timestamp —
    for processing in ``encode_cf_datetime``.
    """
    delta, _ref_date = _unpack_netcdf_time_units(units)
    # TODO: the strict enforcement of nanosecond precision Timestamps can be
    # relaxed when addressing GitHub issue #7493.
    ref_date = nanosecond_precision_timestamp(_ref_date)
    # If the ref_date Timestamp is timezone-aware, convert to UTC and
    # make it timezone-naive (GH 2649).
    if ref_date.tz is not None:
        ref_date = ref_date.tz_convert(None)
    return delta, ref_date


def _decode_cf_datetime_dtype(
data, units: str, calendar: str, use_cftime: bool | None
) -> np.dtype:
Expand Down Expand Up @@ -251,9 +266,12 @@ def _decode_datetime_with_pandas(

# Cast input ordinals to integers of nanoseconds because pd.to_timedelta
# works much faster when dealing with integers (GH 1399).
flat_num_dates_ns_int = (flat_num_dates * _NS_PER_TIME_DELTA[delta]).astype(
np.int64
)
# properly handle NaN/NaT to prevent casting NaN to int
nan = np.isnan(flat_num_dates) | (flat_num_dates == np.iinfo(np.int64).min)
flat_num_dates = flat_num_dates * _NS_PER_TIME_DELTA[delta]
flat_num_dates_ns_int = np.zeros_like(flat_num_dates, dtype=np.int64)
flat_num_dates_ns_int[nan] = np.iinfo(np.int64).min
flat_num_dates_ns_int[~nan] = flat_num_dates[~nan].astype(np.int64)

# Use pd.to_timedelta to safely cast integer values to timedeltas,
# and add those to a Timestamp to safely produce a DatetimeIndex. This
Expand Down Expand Up @@ -573,6 +591,9 @@ def _should_cftime_be_used(

def _cleanup_netcdf_time_units(units: str) -> str:
delta, ref_date = _unpack_netcdf_time_units(units)
delta = delta.lower()
if not delta.endswith("s"):
delta = f"{delta}s"
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
try:
units = f"{delta} since {format_timestamp(ref_date)}"
except (OutOfBoundsDatetime, ValueError):
Expand Down Expand Up @@ -633,32 +654,41 @@ def encode_cf_datetime(
"""
dates = np.asarray(dates)

data_units = infer_datetime_units(dates)

if units is None:
units = infer_datetime_units(dates)
units = data_units
else:
units = _cleanup_netcdf_time_units(units)

if calendar is None:
calendar = infer_calendar_name(dates)

delta, _ref_date = _unpack_netcdf_time_units(units)
try:
if not _is_standard_calendar(calendar) or dates.dtype.kind == "O":
# parse with cftime instead
raise OutOfBoundsDatetime
assert dates.dtype == "datetime64[ns]"

delta, ref_date = _unpack_delta_ref_date(units)
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
delta_units = _netcdf_to_numpy_timeunit(delta)
time_delta = np.timedelta64(1, delta_units).astype("timedelta64[ns]")

# TODO: the strict enforcement of nanosecond precision Timestamps can be
# relaxed when addressing GitHub issue #7493.
ref_date = nanosecond_precision_timestamp(_ref_date)

# If the ref_date Timestamp is timezone-aware, convert to UTC and
# make it timezone-naive (GH 2649).
if ref_date.tz is not None:
ref_date = ref_date.tz_convert(None)
# check if times can be represented with given units
if data_units != units:
data_delta, data_ref_date = _unpack_delta_ref_date(data_units)
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
needed_delta = _infer_time_units_from_diff(
(data_ref_date - ref_date).to_timedelta64()
)
needed_time_delta = np.timedelta64(
1, _netcdf_to_numpy_timeunit(needed_delta)
).astype("timedelta64[ns]")
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
if needed_delta != delta and time_delta > needed_time_delta:
emit_user_level_warning(
f"Times can't be serialized faithfully with requested units {units!r}. "
f"Resolution of {needed_delta!r} needed. "
f"Serializing timeseries to floating point."
)
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved

# Wrap the dates in a DatetimeIndex to do the subtraction to ensure
# an OverflowError is raised if the ref_date is too far away from
Expand All @@ -668,8 +698,12 @@ def encode_cf_datetime(

# Use floor division if time_delta evenly divides all differences
# to preserve integer dtype if possible (GH 4045).
if np.all(time_deltas % time_delta == np.timedelta64(0, "ns")):
num = time_deltas // time_delta
# NaT prevents us from using datetime64 directly, but we can safely coerce
# to int64 in presence of NaT, so we just dropna before check (GH 7817).
if np.all(time_deltas.dropna() % time_delta == np.timedelta64(0, "ns")):
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
# calculate int64 floor division
num = time_deltas // time_delta.astype(np.int64)
num = num.astype(np.int64, copy=False)
else:
num = time_deltas / time_delta
num = num.values.reshape(dates.shape)
Expand All @@ -686,8 +720,18 @@ def encode_cf_timedelta(timedeltas, units: str | None = None) -> tuple[np.ndarra
units = infer_timedelta_units(timedeltas)

np_unit = _netcdf_to_numpy_timeunit(units)
num = 1.0 * timedeltas / np.timedelta64(1, np_unit)
num = np.where(pd.isnull(timedeltas), np.nan, num)

time_delta = np.timedelta64(1, np_unit).astype("timedelta64[ns]")
time_deltas = pd.TimedeltaIndex(timedeltas.ravel())

if np.all(time_deltas.dropna() % time_delta == np.timedelta64(0, "ns")):
# calculate int64 floor division
num = time_deltas // time_delta.astype(np.int64)
num = num.astype(np.int64, copy=False)
else:
num = time_deltas / time_delta
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
num = num.values.reshape(timedeltas.shape)

num = cast_to_int_if_safe(num)
return (num, units)

Expand All @@ -702,9 +746,10 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
) or contains_cftime_datetimes(variable):
dims, data, attrs, encoding = unpack_for_encoding(variable)

(data, units, calendar) = encode_cf_datetime(
data, encoding.pop("units", None), encoding.pop("calendar", None)
)
units = encoding.pop("units", None)
calendar = encoding.pop("calendar", None)
(data, units, calendar) = encode_cf_datetime(data, units, calendar)

safe_setitem(attrs, "units", units, name=name)
safe_setitem(attrs, "calendar", calendar, name=name)

Expand Down
38 changes: 35 additions & 3 deletions xarray/coding/variables.py
Expand Up @@ -215,6 +215,21 @@ def _apply_mask(
return np.where(condition, decoded_fill_value, data)


def _is_time_like(units):
# test for time-like
time_strings = [
"since",
"days",
"hours",
"minutes",
"seconds",
"milliseconds",
"microseconds",
"nanoseconds",
]
return any(tstr in str(units) for tstr in time_strings)


class CFMaskCoder(VariableCoder):
"""Mask or unmask fill values according to CF conventions."""

Expand All @@ -236,19 +251,32 @@ def encode(self, variable: Variable, name: T_Name = None):
f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data."
)

# special case DateTime to properly handle NaT
is_time_like = _is_time_like(attrs.get("units"))

if fv_exists:
# Ensure _FillValue is cast to same dtype as data's
encoding["_FillValue"] = dtype.type(fv)
fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
if not pd.isnull(fill_value):
data = duck_array_ops.fillna(data, fill_value)
if is_time_like and data.dtype.kind in "iu":
data = duck_array_ops.where(
data != np.iinfo(np.int64).min, data, fill_value
)
else:
data = duck_array_ops.fillna(data, fill_value)

if mv_exists:
# Ensure missing_value is cast to same dtype as data's
encoding["missing_value"] = dtype.type(mv)
fill_value = pop_to(encoding, attrs, "missing_value", name=name)
if not pd.isnull(fill_value) and not fv_exists:
data = duck_array_ops.fillna(data, fill_value)
if is_time_like and data.dtype.kind in "iu":
data = duck_array_ops.where(
data != np.iinfo(np.int64).min, data, fill_value
)
else:
data = duck_array_ops.fillna(data, fill_value)

return Variable(dims, data, attrs, encoding, fastpath=True)

Expand All @@ -275,7 +303,11 @@ def decode(self, variable: Variable, name: T_Name = None):
stacklevel=3,
)

dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
# special case DateTime to properly handle NaT
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
else:
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)

if encoded_fill_values:
transform = partial(
Expand Down
62 changes: 59 additions & 3 deletions xarray/tests/test_coding_times.py
Expand Up @@ -576,10 +576,10 @@ def test_infer_cftime_datetime_units(calendar, date_args, expected) -> None:
("1ms", "milliseconds", np.int64(1)),
("1us", "microseconds", np.int64(1)),
("1ns", "nanoseconds", np.int64(1)),
(["NaT", "0s", "1s"], None, [np.nan, 0, 1]),
(["NaT", "0s", "1s"], None, [np.iinfo(np.int64).min, 0, 1]),
(["30m", "60m"], "hours", [0.5, 1.0]),
("NaT", "days", np.nan),
(["NaT", "NaT"], "days", [np.nan, np.nan]),
("NaT", "days", np.iinfo(np.int64).min),
(["NaT", "NaT"], "days", [np.iinfo(np.int64).min, np.iinfo(np.int64).min]),
],
)
def test_cf_timedelta(timedeltas, units, numbers) -> None:
Expand Down Expand Up @@ -1191,3 +1191,59 @@ def test_contains_cftime_lazy() -> None:
)
array = FirstElementAccessibleArray(times)
assert _contains_cftime_datetimes(array)


@pytest.mark.parametrize(
    "time, dtype, fill_value",
    [
        # near the lower bound of datetime64[ns], integer encoding
        (
            np.datetime64("1677-09-21T00:12:43.145224193", "ns"),
            np.int64,
            20,
        ),
        # mid-range value, float encoding with a large fill value
        (
            np.datetime64("1970-09-21T00:12:44.145224808", "ns"),
            np.float64,
            1e30,
        ),
        # float encoding whose fill value coincides with int64-min as float
        (
            np.datetime64("1677-09-21T00:12:43.145225216", "ns"),
            np.float64,
            -9.223372036854776e18,
        ),
    ],
)
def test_roundtrip_datetime64_nanosecond_precision(
    time: np.datetime64, dtype: np.typing.DTypeLike, fill_value: int | float
) -> None:
    # test for GH7817: nanosecond-resolution datetimes (including NaT) must
    # survive a CF encode/decode round trip without loss of precision.
    times = [np.datetime64("1970-01-01", "ns"), np.datetime64("NaT"), time]
    encoding = dict(dtype=dtype, _FillValue=fill_value)
    var = Variable(["time"], times, encoding=encoding)

    encoded_var = conventions.encode_cf_variable(var)
    decoded_var = conventions.decode_cf_variable("foo", encoded_var)
    assert_identical(var, decoded_var)


@pytest.mark.parametrize(
    "dtype, fill_value",
    [(np.int64, 20), (np.int64, np.iinfo(np.int64).min), (np.float64, 1e30)],
)
def test_roundtrip_timedelta64_nanosecond_precision(
    dtype: np.typing.DTypeLike, fill_value: int | float
) -> None:
    # test for GH7942: nanosecond-resolution timedeltas (including NaT)
    # must survive a CF encode/decode round trip without loss of precision.
    # Use a 1 ns step so the round trip must preserve full nanosecond
    # resolution (the variable was previously misnamed ``one_day``).
    one_nanosecond = np.timedelta64(1, "ns")
    nat = np.timedelta64("nat", "ns")
    timedelta_values = (np.arange(5) * one_nanosecond).astype("timedelta64[ns]")
    # inject NaT both in the interior and at the end of the series
    timedelta_values[2] = nat
    timedelta_values[4] = nat

    encoding = dict(dtype=dtype, _FillValue=fill_value)
    var = Variable(["time"], timedelta_values, encoding=encoding)

    encoded_var = conventions.encode_cf_variable(var)
    decoded_var = conventions.decode_cf_variable("foo", encoded_var)

    assert_identical(var, decoded_var)