From d66b5b56b7a10f1668fbba0ef58dfed7567ecc42 Mon Sep 17 00:00:00 2001 From: Robert Schmidtke Date: Fri, 23 Feb 2024 00:53:20 +0100 Subject: [PATCH] BUG: Fix near-minimum timestamp handling (#57314) * attempt failing test * expand test for demonstration purposes * fix near-minimum timestamp overflow when scaling from microseconds to nanoseconds * minor refactor * add comments around specifically handling near-minimum microsecond and nanosecond timestamps * consolidate comments --------- Co-authored-by: Robert Schmidtke --- doc/source/whatsnew/v2.2.1.rst | 1 + .../src/vendored/numpy/datetime/np_datetime.c | 18 ++++++++++++++---- pandas/tests/tslibs/test_array_to_datetime.py | 17 +++++++++++++++++ 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 35d64c99f2002..fa5304c1c3b56 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -21,6 +21,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed memory leak in :func:`read_csv` (:issue:`57039`) - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) +- Fixed regression causing overflow for near-minimum timestamps (:issue:`57150`) - Fixed regression in :func:`concat` changing long-standing behavior that always sorted the non-concatenation axis when the axis was a :class:`DatetimeIndex` (:issue:`57006`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`pandas.testing.assert_series_equal` defaulting to ``check_exact=True`` when checking the :class:`Index` (:issue:`57067`) diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c index 06e3251db8315..277d01807f2f3 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c @@ -482,10 +482,20 @@ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, if (base == NPY_FR_ns) { int64_t nanoseconds; - PD_CHECK_OVERFLOW( - scaleMicrosecondsToNanoseconds(microseconds, &nanoseconds)); - PD_CHECK_OVERFLOW( - checked_int64_add(nanoseconds, dts->ps / 1000, &nanoseconds)); + + // Minimum valid timestamp in nanoseconds (1677-09-21 00:12:43.145224193). + const int64_t min_nanoseconds = NPY_MIN_INT64 + 1; + if (microseconds == min_nanoseconds / 1000 - 1) { + // For values within one microsecond of min_nanoseconds, use it as base + // and offset it with nanosecond delta to avoid overflow during scaling. + PD_CHECK_OVERFLOW(checked_int64_add( + min_nanoseconds, (dts->ps - _NS_MIN_DTS.ps) / 1000, &nanoseconds)); + } else { + PD_CHECK_OVERFLOW( + scaleMicrosecondsToNanoseconds(microseconds, &nanoseconds)); + PD_CHECK_OVERFLOW( + checked_int64_add(nanoseconds, dts->ps / 1000, &nanoseconds)); + } return nanoseconds; } diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index 30ea3a70552aa..89328e4bb0032 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -262,6 +262,23 @@ def test_to_datetime_barely_out_of_bounds(): tslib.array_to_datetime(arr) +@pytest.mark.parametrize( + "timestamp", + [ + # Close enough to bounds that scaling micros to nanos overflows + # but adding nanos would result in an in-bounds datetime. + "1677-09-21T00:12:43.145224193", + "1677-09-21T00:12:43.145224999", + # this always worked + "1677-09-21T00:12:43.145225000", + ], +) +def test_to_datetime_barely_inside_bounds(timestamp): + # see gh-57150 + result, _ = tslib.array_to_datetime(np.array([timestamp], dtype=object)) + tm.assert_numpy_array_equal(result, np.array([timestamp], dtype="M8[ns]")) + + class SubDatetime(datetime): pass