
Commit

Merge branch 'main' into feature/44764_perf_issue_new
smarie committed Jan 13, 2024
2 parents 404ab84 + 612823e commit 1726096
Showing 144 changed files with 1,210 additions and 1,050 deletions.
16 changes: 0 additions & 16 deletions ci/code_checks.sh
@@ -71,11 +71,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then

MSG='Partially validate docstrings (EX03)' ; echo $MSG
$BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX03 --ignore_functions \
pandas.Series.dt.day_name \
pandas.Series.str.len \
pandas.Series.cat.set_categories \
pandas.Series.plot.bar \
pandas.Series.plot.hist \
pandas.Series.plot.line \
pandas.Series.to_sql \
pandas.Series.to_latex \
@@ -106,7 +101,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
pandas.DataFrame.to_sql \
pandas.read_stata \
pandas.core.resample.Resampler.pipe \
pandas.core.resample.Resampler.fillna \
pandas.core.resample.Resampler.interpolate \
pandas.plotting.scatter_matrix \
pandas.pivot \
@@ -128,11 +122,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
pandas.core.groupby.SeriesGroupBy.transform \
pandas.core.groupby.SeriesGroupBy.pipe \
pandas.core.groupby.DataFrameGroupBy.pipe \
pandas.core.groupby.DataFrameGroupBy.describe \
pandas.core.groupby.DataFrameGroupBy.idxmax \
pandas.core.groupby.DataFrameGroupBy.idxmin \
pandas.core.groupby.DataFrameGroupBy.value_counts \
pandas.core.groupby.SeriesGroupBy.describe \
pandas.core.groupby.DataFrameGroupBy.boxplot \
pandas.core.groupby.DataFrameGroupBy.hist \
pandas.io.formats.style.Styler.map \
@@ -151,22 +140,17 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
pandas.io.formats.style.Styler.background_gradient \
pandas.io.formats.style.Styler.text_gradient \
pandas.DataFrame.values \
pandas.DataFrame.loc \
pandas.DataFrame.iloc \
pandas.DataFrame.groupby \
pandas.DataFrame.describe \
pandas.DataFrame.skew \
pandas.DataFrame.var \
pandas.DataFrame.idxmax \
pandas.DataFrame.idxmin \
pandas.DataFrame.last \
pandas.DataFrame.pivot \
pandas.DataFrame.sort_values \
pandas.DataFrame.tz_convert \
pandas.DataFrame.tz_localize \
pandas.DataFrame.plot.bar \
pandas.DataFrame.plot.hexbin \
pandas.DataFrame.plot.hist \
pandas.DataFrame.plot.line \
pandas.DataFrame.hist \
RET=$(($RET + $?)) ; echo $MSG "DONE"
2 changes: 1 addition & 1 deletion doc/source/development/contributing_docstring.rst
@@ -939,7 +939,7 @@ Each shared docstring will have a base template with variables, like
Finally, docstrings can also be appended to with the ``doc`` decorator.

In this example, we'll create a parent docstring normally (this is like
``pandas.core.generic.NDFrame``. Then we'll have two children (like
``pandas.core.generic.NDFrame``). Then we'll have two children (like
``pandas.core.series.Series`` and ``pandas.core.frame.DataFrame``). We'll
substitute the class names in this docstring.
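A minimal sketch of the pattern described above, using pandas' ``doc`` decorator (the class and method names here are illustrative, not taken from the diff):

    from pandas.util._decorators import doc

    class Parent:
        @doc(klass="Parent")
        def my_function(self):
            """Apply my function to {klass}."""

    class ChildA(Parent):
        @doc(Parent.my_function, klass="ChildA")
        def my_function(self):
            ...

    # ChildA.my_function.__doc__ should now read "Apply my function to ChildA.";
    # a second child would do the same with its own ``klass`` value.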

9 changes: 7 additions & 2 deletions doc/source/whatsnew/v2.2.0.rst
@@ -354,6 +354,7 @@ Other enhancements
- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
- :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area``; 3rd party :class:`.ExtensionArray` authors need to add this argument to the method ``_pad_or_backfill`` (:issue:`56492`)
- Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`)
- Implement :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` for :class:`ArrowDtype` and masked dtypes (:issue:`56267`)
- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
- Implemented :meth:`Series.dt` methods and attributes for :class:`ArrowDtype` with ``pyarrow.duration`` type (:issue:`52284`)
- Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`)
@@ -672,14 +673,15 @@ Other Deprecations
- Changed :meth:`Timedelta.resolution_string` to return ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns`` instead of ``H``, ``T``, ``S``, ``L``, ``U``, and ``N``, for compatibility with respective deprecations in frequency aliases (:issue:`52536`)
- Deprecated :attr:`offsets.Day.delta`, :attr:`offsets.Hour.delta`, :attr:`offsets.Minute.delta`, :attr:`offsets.Second.delta`, :attr:`offsets.Milli.delta`, :attr:`offsets.Micro.delta`, :attr:`offsets.Nano.delta`, use ``pd.Timedelta(obj)`` instead (:issue:`55498`)
- Deprecated :func:`pandas.api.types.is_interval` and :func:`pandas.api.types.is_period`, use ``isinstance(obj, pd.Interval)`` and ``isinstance(obj, pd.Period)`` instead (:issue:`55264`)
- Deprecated :func:`pd.core.internals.api.make_block`, use public APIs instead (:issue:`40226`)
- Deprecated :func:`read_gbq` and :meth:`DataFrame.to_gbq`. Use ``pandas_gbq.read_gbq`` and ``pandas_gbq.to_gbq`` instead https://pandas-gbq.readthedocs.io/en/latest/api.html (:issue:`55525`)
- Deprecated :meth:`.DataFrameGroupBy.fillna` and :meth:`.SeriesGroupBy.fillna`; use :meth:`.DataFrameGroupBy.ffill`, :meth:`.DataFrameGroupBy.bfill` for forward and backward filling or :meth:`.DataFrame.fillna` to fill with a single value (or the Series equivalents) (:issue:`55718`)
- Deprecated :meth:`DateOffset.is_anchored`, use ``obj.n == 1`` for non-Tick subclasses (for Tick this was always False) (:issue:`55388`)
- Deprecated :meth:`DatetimeArray.__init__` and :meth:`TimedeltaArray.__init__`, use :func:`array` instead (:issue:`55623`)
- Deprecated :meth:`Index.format`, use ``index.astype(str)`` or ``index.map(formatter)`` instead (:issue:`55413`)
- Deprecated :meth:`Series.ravel`, the underlying array is already 1D, so ravel is not necessary (:issue:`52511`)
- Deprecated :meth:`Series.resample` and :meth:`DataFrame.resample` with a :class:`PeriodIndex` (and the 'convention' keyword), convert to :class:`DatetimeIndex` (with ``.to_timestamp()``) before resampling instead (:issue:`53481`)
- Deprecated :meth:`Series.view`, use :meth:`Series.astype` instead to change the dtype (:issue:`20251`)
- Deprecated :meth:`offsets.Tick.is_anchored`, use ``False`` instead (:issue:`55388`)
- Deprecated ``core.internals`` members ``Block``, ``ExtensionBlock``, and ``DatetimeTZBlock``, use public APIs instead (:issue:`55139`)
- Deprecated ``year``, ``month``, ``quarter``, ``day``, ``hour``, ``minute``, and ``second`` keywords in the :class:`PeriodIndex` constructor, use :meth:`PeriodIndex.from_fields` instead (:issue:`55960`)
- Deprecated accepting a type as an argument in :meth:`Index.view`, call without any arguments instead (:issue:`55709`)
@@ -730,6 +732,7 @@ Other Deprecations
- Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`)
- Deprecated the option ``mode.data_manager`` and the ``ArrayManager``; only the ``BlockManager`` will be available in future versions (:issue:`55043`)
- Deprecated the previous implementation of :class:`DataFrame.stack`; specify ``future_stack=True`` to adopt the future version (:issue:`53515`)
-
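For the ``DataFrameGroupBy.fillna`` deprecation listed above, the suggested migration looks roughly like this (illustrative data, not part of the diff):

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, None, None]})
    # previously: df.groupby("key").fillna(method="ffill")
    filled = df.groupby("key").ffill()  # forward fill within each group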

.. ---------------------------------------------------------------------------
.. _whatsnew_220.performance:
Expand Down Expand Up @@ -828,6 +831,7 @@ Conversion
- Bug in :meth:`DataFrame.astype` when called with ``str`` on unpickled array - the array might change in-place (:issue:`54654`)
- Bug in :meth:`DataFrame.astype` where ``errors="ignore"`` had no effect for extension types (:issue:`54654`)
- Bug in :meth:`Series.convert_dtypes` not converting all NA column to ``null[pyarrow]`` (:issue:`55346`)
- Bug in ``DataFrame.loc`` was not throwing "incompatible dtype warning" (see `PDEP6 <https://pandas.pydata.org/pdeps/0006-ban-upcasting.html>`_) when assigning a ``Series`` with a different dtype using a full column setter (e.g. ``df.loc[:, 'a'] = incompatible_value``) (:issue:`39584`)

Strings
^^^^^^^
@@ -943,14 +947,15 @@ Other
- Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`)
- Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`)
- Bug in :func:`infer_freq` and :meth:`DatetimeIndex.inferred_freq` with weekly frequencies and non-nanosecond resolutions (:issue:`55609`)
- Bug in :func:`pd.api.interchange.from_dataframe` where it raised ``NotImplementedError`` when handling empty string columns (:issue:`56703`)
- Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`)
- Bug in :meth:`DataFrame.from_dict` which would always sort the rows of the created :class:`DataFrame`. (:issue:`55683`)
- Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` raising a ``ValueError`` (:issue:`56478`)
- Bug in rendering ``inf`` values inside a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`)
- Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`)
- Bug in the error message when assigning an empty :class:`DataFrame` to a column (:issue:`55956`)
- Bug when time-like strings were being cast to :class:`ArrowDtype` with ``pyarrow.time64`` type (:issue:`56463`)

- Fixed a spurious deprecation warning from ``numba`` >= 0.58.0 when passing a numpy ufunc in :class:`pandas.core.window.Rolling.apply` with ``engine="numba"`` (:issue:`55247`)

.. ---------------------------------------------------------------------------
.. _whatsnew_220.contributors:
6 changes: 4 additions & 2 deletions doc/source/whatsnew/v2.3.0.rst
@@ -101,7 +101,9 @@ Deprecations

Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
-
- Performance improvement in :meth:`DataFrame.join` for sorted but non-unique indexes (:issue:`56941`)
- Performance improvement in :meth:`DataFrame.join` when left and/or right are non-unique and ``how`` is ``"left"``, ``"right"``, or ``"inner"`` (:issue:`56817`)
- Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`)
-

.. ---------------------------------------------------------------------------
Expand All @@ -119,7 +121,7 @@ Categorical

Datetimelike
^^^^^^^^^^^^
-
- Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`)
-

Timedelta
54 changes: 52 additions & 2 deletions pandas/_libs/tslibs/offsets.pyx
@@ -756,18 +756,27 @@ cdef class BaseOffset:
raise ValueError(f"{self} is a non-fixed frequency")

def is_anchored(self) -> bool:
# TODO: Does this make sense for the general case? It would help
# if there were a canonical docstring for what is_anchored means.
# GH#55388
"""
Return boolean whether the frequency is a unit frequency (n=1).

.. deprecated:: 2.2.0
is_anchored is deprecated and will be removed in a future version.
Use ``obj.n == 1`` instead.

Examples
--------
>>> pd.DateOffset().is_anchored()
True
>>> pd.DateOffset(2).is_anchored()
False
"""
warnings.warn(
f"{type(self).__name__}.is_anchored is deprecated and will be removed "
f"in a future version, please use \'obj.n == 1\' instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
return self.n == 1
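As the new warning message suggests, downstream code can switch to the ``n == 1`` check directly (a sketch, not part of the diff):

    >>> import pandas as pd
    >>> off = pd.DateOffset(2)
    >>> off.n == 1  # replaces the deprecated off.is_anchored()
    False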

# ------------------------------------------------------------------
Expand Down Expand Up @@ -954,6 +963,27 @@ cdef class Tick(SingleConstructorOffset):
return True

def is_anchored(self) -> bool:
# GH#55388
"""
Return False.

.. deprecated:: 2.2.0
is_anchored is deprecated and will be removed in a future version.
Use ``False`` instead.

Examples
--------
>>> pd.offsets.Hour().is_anchored()
False
>>> pd.offsets.Hour(2).is_anchored()
False
"""
warnings.warn(
f"{type(self).__name__}.is_anchored is deprecated and will be removed "
f"in a future version, please use False instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
return False

# This is identical to BaseOffset.__hash__, but has to be redefined here
@@ -2663,6 +2693,13 @@ cdef class QuarterOffset(SingleConstructorOffset):
return f"{self._prefix}-{month}"

def is_anchored(self) -> bool:
warnings.warn(
f"{type(self).__name__}.is_anchored is deprecated and will be removed "
f"in a future version, please use \'obj.n == 1 "
f"and obj.startingMonth is not None\' instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
return self.n == 1 and self.startingMonth is not None

def is_on_offset(self, dt: datetime) -> bool:
@@ -3308,6 +3345,13 @@ cdef class Week(SingleConstructorOffset):
self._cache = state.pop("_cache", {})

def is_anchored(self) -> bool:
warnings.warn(
f"{type(self).__name__}.is_anchored is deprecated and will be removed "
f"in a future version, please use \'obj.n == 1 "
f"and obj.weekday is not None\' instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
return self.n == 1 and self.weekday is not None

@apply_wraps
@@ -3597,6 +3641,12 @@ cdef class FY5253Mixin(SingleConstructorOffset):
self.variation = state.pop("variation")

def is_anchored(self) -> bool:
warnings.warn(
f"{type(self).__name__}.is_anchored is deprecated and will be removed "
f"in a future version, please use \'obj.n == 1\' instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
return (
self.n == 1 and self.startingMonth is not None and self.weekday is not None
)
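The anchored offset variants follow the same pattern; for ``Week``, for example, the replacement the warning recommends would look like this (illustrative, not part of the diff):

    >>> import pandas as pd
    >>> wk = pd.offsets.Week(weekday=0)  # anchored to Mondays
    >>> wk.n == 1 and wk.weekday is not None  # replaces wk.is_anchored()
    True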
3 changes: 3 additions & 0 deletions pandas/conftest.py
@@ -1973,4 +1973,7 @@ def warsaw(request) -> str:

@pytest.fixture
def arrow_string_storage():
"""
Fixture that lists possible PyArrow values for StringDtype storage field.
"""
return ("pyarrow", "pyarrow_numpy")
40 changes: 40 additions & 0 deletions pandas/core/arrays/arrow/array.py
@@ -184,6 +184,7 @@ def floordiv_compat(
AxisInt,
Dtype,
FillnaOptions,
InterpolateOptions,
Iterator,
NpDtype,
NumpySorter,
@@ -2068,6 +2069,45 @@ def _maybe_convert_setitem_value(self, value):
raise TypeError(msg) from err
return value

def interpolate(
self,
*,
method: InterpolateOptions,
axis: int,
index,
limit,
limit_direction,
limit_area,
copy: bool,
**kwargs,
) -> Self:
"""
See NDFrame.interpolate.__doc__.
"""
# NB: we return type(self) even if copy=False
mask = self.isna()
if self.dtype.kind == "f":
data = self._pa_array.to_numpy()
elif self.dtype.kind in "iu":
data = self.to_numpy(dtype="f8", na_value=0.0)
else:
raise NotImplementedError(
f"interpolate is not implemented for dtype={self.dtype}"
)

missing.interpolate_2d_inplace(
data,
method=method,
axis=0,
index=index,
limit=limit,
limit_direction=limit_direction,
limit_area=limit_area,
mask=mask,
**kwargs,
)
return type(self)(self._box_pa_array(pa.array(data, mask=mask)))

@classmethod
def _if_else(
cls,
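A rough sketch of what the new ``interpolate`` support enables at the Series level (hypothetical data; assumes pyarrow is installed, and exact output formatting may differ):

    >>> import pandas as pd
    >>> ser = pd.Series([1.0, None, 3.0], dtype="float64[pyarrow]")
    >>> ser.interpolate()  # linear interpolation, Arrow-backed dtype preserved
    0    1.0
    1    2.0
    2    3.0
    dtype: double[pyarrow]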
2 changes: 1 addition & 1 deletion pandas/core/arrays/categorical.py
@@ -1082,7 +1082,7 @@ def set_categories(
For :class:`pandas.Series`:
>>> raw_cat = pd.Categorical(['a', 'b', 'c', 'A'],
... categories=['a', 'b', 'c'], ordered=True)
... categories=['a', 'b', 'c'], ordered=True)
>>> ser = pd.Series(raw_cat)
>>> ser
0 a
7 changes: 1 addition & 6 deletions pandas/core/arrays/datetimes.py
@@ -1370,7 +1370,7 @@ def day_name(self, locale=None) -> npt.NDArray[np.object_]:
>>> idx
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'],
dtype='datetime64[ns]', freq='D')
>>> idx.day_name(locale='pt_BR.utf8') # doctest: +SKIP
>>> idx.day_name(locale='pt_BR.utf8') # doctest: +SKIP
Index(['Segunda', 'Terça', 'Quarta'], dtype='object')
"""
values = self._local_timestamps()
@@ -2780,11 +2780,6 @@ def _generate_range(
# variable has type "Optional[Timestamp]")
start = offset.rollforward(start) # type: ignore[assignment]

elif end and not offset.is_on_offset(end):
# Incompatible types in assignment (expression has type "datetime",
# variable has type "Optional[Timestamp]")
end = offset.rollback(end) # type: ignore[assignment]

# Unsupported operand types for < ("Timestamp" and "None")
if periods is None and end < start and offset.n >= 0: # type: ignore[operator]
end = None
