Merge branch 'main' into feature/44764_perf_issue_new
smarie committed Jan 9, 2024
2 parents cc1a4d2 + c84f989 commit 9cff856
Showing 355 changed files with 4,222 additions and 4,262 deletions.
15 changes: 3 additions & 12 deletions .pre-commit-config.yaml
@@ -18,11 +18,6 @@ ci:
# manual stage hooks
skip: [pylint, pyright, mypy]
repos:
- repo: https://github.com/hauntsaninja/black-pre-commit-mirror
# black compiled with mypyc
rev: 23.11.0
hooks:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.6
hooks:
@@ -35,6 +30,9 @@ repos:
files: ^pandas
exclude: ^pandas/tests
args: [--select, "ANN001,ANN2", --fix-only, --exit-non-zero-on-fix]
- id: ruff-format
# TODO: "." not needed in ruff 0.1.8
args: ["."]
- repo: https://github.com/jendrikseipp/vulture
rev: 'v2.10'
hooks:
@@ -274,13 +272,6 @@ repos:
language: python
types: [rst]
files: ^doc/source/(development|reference)/
- id: unwanted-patterns-bare-pytest-raises
name: Check for use of bare pytest raises
language: python
entry: python scripts/validate_unwanted_patterns.py --validation-type="bare_pytest_raises"
types: [python]
files: ^pandas/tests/
exclude: ^pandas/tests/extension/
- id: unwanted-patterns-private-function-across-module
name: Check for use of private functions across modules
language: python
3 changes: 2 additions & 1 deletion Dockerfile
@@ -5,7 +5,8 @@ RUN apt-get update && apt-get -y upgrade
RUN apt-get install -y build-essential

# hdf5 needed for pytables installation
RUN apt-get install -y libhdf5-dev
# libgles2-mesa needed for pytest-qt
RUN apt-get install -y libhdf5-dev libgles2-mesa-dev

RUN python -m pip install --upgrade pip
RUN python -m pip install \
4 changes: 1 addition & 3 deletions asv_bench/benchmarks/indexing.py
@@ -84,9 +84,7 @@ def time_loc_slice(self, index, index_structure):

class NumericMaskedIndexing:
monotonic_list = list(range(10**6))
non_monotonic_list = (
list(range(50)) + [54, 53, 52, 51] + list(range(55, 10**6 - 1))
)
non_monotonic_list = list(range(50)) + [54, 53, 52, 51] + list(range(55, 10**6 - 1))

params = [
("Int64", "UInt64", "Float64"),
3 changes: 2 additions & 1 deletion asv_bench/benchmarks/io/style.py
@@ -76,7 +76,8 @@ def _style_format(self):
# apply a formatting function
# subset is flexible but hinders vectorised solutions
self.st = self.df.style.format(
"{:,.3f}", subset=IndexSlice["row_1":f"row_{ir}", "float_1":f"float_{ic}"]
"{:,.3f}",
subset=IndexSlice["row_1" : f"row_{ir}", "float_1" : f"float_{ic}"],
)

def _style_apply_format_hide(self):
10 changes: 8 additions & 2 deletions ci/code_checks.sh
@@ -16,12 +16,18 @@

set -uo pipefail

[[ -z "$1" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "single-docs" || "$1" == "notebooks" ]] || \
if [[ -v 1 ]]; then
    CHECK=$1
else
    # script will fail if it uses an unset variable (i.e. $1 is not provided)
    CHECK=""
fi

[[ -z "$CHECK" || "$CHECK" == "code" || "$CHECK" == "doctests" || "$CHECK" == "docstrings" || "$CHECK" == "single-docs" || "$CHECK" == "notebooks" ]] || \
{ echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 9999; }

BASE_DIR="$(dirname $0)/.."
RET=0
CHECK=$1

### CODE ###
if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then
2 changes: 1 addition & 1 deletion doc/source/development/contributing_codebase.rst
@@ -38,7 +38,7 @@ Pre-commit
----------

Additionally, :ref:`Continuous Integration <contributing.ci>` will run code formatting checks
like ``black``, ``ruff``,
like ``ruff``,
``isort``, and ``clang-format`` and more using `pre-commit hooks <https://pre-commit.com/>`_.
Any warnings from these checks will cause the :ref:`Continuous Integration <contributing.ci>` to fail; therefore,
it is helpful to run the check yourself before submitting code. This
1 change: 1 addition & 0 deletions doc/source/reference/series.rst
@@ -177,6 +177,7 @@ Reindexing / selection / label manipulation
:toctree: api/

Series.align
Series.case_when
Series.drop
Series.droplevel
Series.drop_duplicates
19 changes: 7 additions & 12 deletions doc/source/user_guide/io.rst
@@ -3471,20 +3471,15 @@ saving a ``DataFrame`` to Excel. Generally the semantics are
similar to working with :ref:`csv<io.read_csv_table>` data.
See the :ref:`cookbook<cookbook.excel>` for some advanced strategies.

.. warning::

The `xlrd <https://xlrd.readthedocs.io/en/latest/>`__ package is now only for reading
old-style ``.xls`` files.
.. note::

Before pandas 1.3.0, the default argument ``engine=None`` to :func:`~pandas.read_excel`
would result in using the ``xlrd`` engine in many cases, including new
Excel 2007+ (``.xlsx``) files. pandas will now default to using the
`openpyxl <https://openpyxl.readthedocs.io/en/stable/>`__ engine.
When ``engine=None``, the following logic will be used to determine the engine:

It is strongly encouraged to install ``openpyxl`` to read Excel 2007+
(``.xlsx``) files.
**Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.**
This is no longer supported, switch to using ``openpyxl`` instead.
- If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
then `odf <https://pypi.org/project/odfpy/>`_ will be used.
- Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used.
- Otherwise if ``path_or_buffer`` is in xlsb format, ``pyxlsb`` will be used.
- Otherwise ``openpyxl`` will be used.
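
    As a rough illustration of the dispatch above (the file names are hypothetical,
    and each engine must be installed for the corresponding call to succeed):

    .. code-block:: python

        import pandas as pd

        # engine=None (the default) picks the reader from the file format:
        pd.read_excel("book.ods")   # OpenDocument -> odf
        pd.read_excel("book.xls")   # legacy Excel -> xlrd
        pd.read_excel("book.xlsb")  # binary workbook -> pyxlsb
        pd.read_excel("book.xlsx")  # everything else -> openpyxl

        # The choice can always be made explicit:
        pd.read_excel("book.xlsx", engine="openpyxl")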

.. _io.excel_reader:

2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.1.0.rst
@@ -432,7 +432,7 @@ In a future version, these will raise an error and you should cast to a common d
In [3]: ser[0] = 'not an int64'
FutureWarning:
Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas.
Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas.
Value 'not an int64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
In [4]: ser
40 changes: 39 additions & 1 deletion doc/source/whatsnew/v2.2.0.rst
@@ -199,6 +199,26 @@ For a full list of ADBC drivers and their development status, see the `ADBC Driv
Implementation Status <https://arrow.apache.org/adbc/current/driver/status.html>`_
documentation.

.. _whatsnew_220.enhancements.case_when:

Create a pandas Series based on one or more conditions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The :meth:`Series.case_when` function has been added to create a Series object based on one or more conditions. (:issue:`39154`)

.. ipython:: python

    import pandas as pd

    df = pd.DataFrame(dict(a=[1, 2, 3], b=[4, 5, 6]))
    default = pd.Series('default', index=df.index)
    default.case_when(
        caselist=[
            (df.a == 1, 'first'),                 # condition, replacement
            (df.a.gt(1) & df.b.eq(5), 'second'),  # condition, replacement
        ],
    )

.. _whatsnew_220.enhancements.to_numpy_ea:

``to_numpy`` for NumPy nullable and Arrow types converts to suitable NumPy dtype
@@ -262,6 +282,14 @@ DataFrame. (:issue:`54938`)
)
series.struct.explode()
Use :meth:`Series.struct.field` to index into a (possibly nested)
struct field.


.. ipython:: python

    series.struct.field("project")

.. _whatsnew_220.enhancements.list_accessor:

Series.list accessor for PyArrow list data
@@ -324,7 +352,7 @@ Other enhancements
- :meth:`DataFrame.apply` now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
- :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area`` (:issue:`56492`)
- :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area``; 3rd party :class:`.ExtensionArray` authors need to add this argument to the method ``_pad_or_backfill`` (:issue:`56492`)
- Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`)
- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
- Implemented :meth:`Series.dt` methods and attributes for :class:`ArrowDtype` with ``pyarrow.duration`` type (:issue:`52284`)
@@ -743,6 +771,7 @@ Categorical
^^^^^^^^^^^
- :meth:`Categorical.isin` raising ``InvalidIndexError`` for categorical containing overlapping :class:`Interval` values (:issue:`34974`)
- Bug in :meth:`CategoricalDtype.__eq__` returning ``False`` for unordered categorical data with mixed types (:issue:`55468`)
- Bug when casting ``pa.dictionary`` to :class:`CategoricalDtype` using a ``pa.DictionaryArray`` as categories (:issue:`56672`)

Datetimelike
^^^^^^^^^^^^
@@ -788,8 +817,11 @@ Timezones
Numeric
^^^^^^^
- Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`)
- Bug in :meth:`Series.__floordiv__` and :meth:`Series.__truediv__` for :class:`ArrowDtype` with integral dtypes raising for large divisors (:issue:`56706`)
- Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`)
- Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`)
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` matching float ``0.0`` with ``False`` and vice versa (:issue:`55398`)
- Bug in :meth:`Series.round` raising for nullable boolean dtype (:issue:`55936`)

Conversion
^^^^^^^^^^
@@ -808,13 +840,15 @@ Strings
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`)
- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for :class:`ArrowDtype` with ``pyarrow.string`` dtype (:issue:`56579`)
- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`)
- Bug in :meth:`str.fullmatch` with ``dtype=pandas.ArrowDtype(pyarrow.string())`` allowing partial matches when the regex ends in a literal ``$`` (:issue:`56652`)
- Bug in comparison operations for ``dtype="string[pyarrow_numpy]"`` raising if dtypes can't be compared (:issue:`56008`)

Interval
^^^^^^^^
- Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown (:issue:`55015`)
- Bug in :meth:`IntervalIndex.factorize` and :meth:`Series.factorize` with :class:`IntervalDtype` with datetime64 or timedelta64 intervals not preserving non-nanosecond units (:issue:`56099`)
- Bug in :meth:`IntervalIndex.from_arrays` when passed ``datetime64`` or ``timedelta64`` arrays with mismatched resolutions constructing an invalid ``IntervalArray`` object (:issue:`55714`)
- Bug in :meth:`IntervalIndex.from_tuples` raising if subtype is a nullable extension dtype (:issue:`56765`)
- Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`)
- Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`)
- Bug in setting values on a :class:`Series` with an :class:`IntervalIndex` using a slice incorrectly raising (:issue:`54722`)
@@ -846,6 +880,7 @@ I/O
- Bug in :func:`read_json` not handling dtype conversion properly if ``infer_string`` is set (:issue:`56195`)
- Bug in :meth:`DataFrame.to_excel`, with ``OdsWriter`` (``ods`` files) writing Boolean/string value (:issue:`54994`)
- Bug in :meth:`DataFrame.to_hdf` and :func:`read_hdf` with ``datetime64`` dtypes with non-nanosecond resolution failing to round-trip correctly (:issue:`55622`)
- Bug in :meth:`DataFrame.to_stata` raising for extension dtypes (:issue:`54671`)
- Bug in :meth:`~pandas.read_excel` with ``engine="odf"`` (``ods`` files) when a string cell contains an annotation (:issue:`55200`)
- Bug in :meth:`~pandas.read_excel` with an ODS file without cached formatted cell for float values (:issue:`55219`)
- Bug where :meth:`DataFrame.to_json` would raise an ``OverflowError`` instead of a ``TypeError`` with unsupported NumPy types (:issue:`55403`)
@@ -873,6 +908,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrame.asfreq` and :meth:`Series.asfreq` with a :class:`DatetimeIndex` with non-nanosecond resolution incorrectly converting to nanosecond resolution (:issue:`55958`)
- Bug in :meth:`DataFrame.ewm` when passed ``times`` with non-nanosecond ``datetime64`` or :class:`DatetimeTZDtype` dtype (:issue:`56262`)
- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` where grouping by a combination of ``Decimal`` and NA values would fail when ``sort=True`` (:issue:`54847`)
- Bug in :meth:`DataFrame.groupby` for DataFrame subclasses when selecting a subset of columns to apply the function to (:issue:`56761`)
- Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`)
- Bug in :meth:`DataFrame.resample` when resampling on a :class:`ArrowDtype` of ``pyarrow.timestamp`` or ``pyarrow.duration`` type (:issue:`55989`)
- Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`)
@@ -888,6 +924,7 @@ Reshaping
- Bug in :func:`merge_asof` when using a :class:`Timedelta` tolerance on a :class:`ArrowDtype` column (:issue:`56486`)
- Bug in :func:`merge` not raising when merging datetime columns with timedelta columns (:issue:`56455`)
- Bug in :func:`merge` not raising when merging string columns with numeric columns (:issue:`56441`)
- Bug in :func:`merge` not sorting for new string dtype (:issue:`56442`)
- Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`)
- Bug in :meth:`DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`)
- Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`)
@@ -901,6 +938,7 @@ Sparse

Other
^^^^^
- :meth:`DataFrame.__dataframe__` did not support pyarrow large strings (:issue:`56702`)
- Bug in :func:`DataFrame.describe` where the percentile 99.999% was rounded to 100% when formatting percentiles in the result (:issue:`55765`)
- Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`)
- Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`)
5 changes: 4 additions & 1 deletion doc/source/whatsnew/v2.3.0.rst
@@ -92,7 +92,8 @@ Other API changes

Deprecations
~~~~~~~~~~~~
-
- Deprecated :meth:`Timestamp.utcfromtimestamp`, use ``Timestamp.fromtimestamp(ts, "UTC")`` instead (:issue:`56680`)
- Deprecated :meth:`Timestamp.utcnow`, use ``Timestamp.now("UTC")`` instead (:issue:`56680`)
-
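
A minimal migration sketch for the two deprecations above (the epoch value is an
arbitrary example):

.. code-block:: python

    import pandas as pd

    pd.Timestamp.utcnow()                             # deprecated
    pd.Timestamp.now("UTC")                           # replacement

    pd.Timestamp.utcfromtimestamp(1_700_000_000)      # deprecated
    pd.Timestamp.fromtimestamp(1_700_000_000, "UTC")  # replacement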

.. ---------------------------------------------------------------------------
@@ -108,6 +109,8 @@ Performance improvements

Bug fixes
~~~~~~~~~
- Fixed bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument (:issue:`56607`)


Categorical
^^^^^^^^^^^
34 changes: 25 additions & 9 deletions pandas/_libs/hashtable.pyi
@@ -2,6 +2,7 @@ from typing import (
Any,
Hashable,
Literal,
overload,
)

import numpy as np
@@ -180,18 +181,33 @@ class HashTable:
na_value: object = ...,
mask=...,
) -> npt.NDArray[np.intp]: ...
@overload
def unique(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
return_inverse: bool = ...,
mask=...,
) -> (
tuple[
np.ndarray, # np.ndarray[subclass-specific]
npt.NDArray[np.intp],
]
| np.ndarray
): ... # np.ndarray[subclass-specific]
*,
return_inverse: Literal[False] = ...,
mask: None = ...,
) -> np.ndarray: ... # np.ndarray[subclass-specific]
@overload
def unique(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
*,
return_inverse: Literal[True],
mask: None = ...,
) -> tuple[np.ndarray, npt.NDArray[np.intp]]: ... # np.ndarray[subclass-specific]
@overload
def unique(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
*,
return_inverse: Literal[False] = ...,
mask: npt.NDArray[np.bool_],
) -> tuple[
np.ndarray,
npt.NDArray[np.bool_],
]: ... # np.ndarray[subclass-specific]
def factorize(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
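
The stub rewrite above replaces one union-returning signature with ``@overload``
variants keyed on ``Literal`` values of ``return_inverse`` (and on ``mask``), so
type checkers can narrow the return type at each call site. A simplified,
self-contained sketch of the same pattern (the class and implementation here are
illustrative, not the actual pandas code):

    from typing import Literal, overload

    import numpy as np
    import numpy.typing as npt

    class Table:
        @overload
        def unique(
            self, values: np.ndarray, *, return_inverse: Literal[False] = ...
        ) -> np.ndarray: ...
        @overload
        def unique(
            self, values: np.ndarray, *, return_inverse: Literal[True]
        ) -> tuple[np.ndarray, npt.NDArray[np.intp]]: ...
        def unique(self, values, *, return_inverse=False):
            # np.unique always computes the inverse here; drop it if not requested
            uniques, inverse = np.unique(values, return_inverse=True)
            return (uniques, inverse) if return_inverse else uniques
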
6 changes: 3 additions & 3 deletions pandas/_libs/hashtable_class_helper.pxi.in
@@ -755,7 +755,7 @@ cdef class {{name}}HashTable(HashTable):
return uniques.to_array(), result_mask.to_array()
return uniques.to_array()

def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False, object mask=None):
def unique(self, const {{dtype}}_t[:] values, *, bint return_inverse=False, object mask=None):
"""
Calculate unique values and labels (no sorting!)

@@ -1180,7 +1180,7 @@ cdef class StringHashTable(HashTable):
return uniques.to_array(), labels.base # .base -> underlying ndarray
return uniques.to_array()

def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
def unique(self, ndarray[object] values, *, bint return_inverse=False, object mask=None):
"""
Calculate unique values and labels (no sorting!)

@@ -1438,7 +1438,7 @@ cdef class PyObjectHashTable(HashTable):
return uniques.to_array(), labels.base # .base -> underlying ndarray
return uniques.to_array()

def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
def unique(self, ndarray[object] values, *, bint return_inverse=False, object mask=None):
"""
Calculate unique values and labels (no sorting!)

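
Because ``return_inverse`` and ``mask`` become keyword-only in each ``unique``
implementation, positional calls for those parameters now raise ``TypeError``. A
rough usage sketch against the internal ``Int64HashTable`` (a private API, shown
purely for illustration and subject to change):

    import numpy as np
    from pandas._libs import hashtable as ht

    table = ht.Int64HashTable()
    values = np.array([3, 1, 3, 2], dtype=np.int64)

    uniques = table.unique(values)                                # values stays positional
    uniques, inverse = table.unique(values, return_inverse=True)  # keyword required
    # table.unique(values, True) now raises TypeError
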
6 changes: 4 additions & 2 deletions pandas/_libs/lib.pyi
@@ -179,7 +179,8 @@ def indices_fast(
sorted_labels: list[npt.NDArray[np.int64]],
) -> dict[Hashable, npt.NDArray[np.intp]]: ...
def generate_slices(
labels: np.ndarray, ngroups: int # const intp_t[:]
labels: np.ndarray,
ngroups: int, # const intp_t[:]
) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: ...
def count_level_2d(
mask: np.ndarray, # ndarray[uint8_t, ndim=2, cast=True],
@@ -209,5 +210,6 @@ def get_reverse_indexer(
def is_bool_list(obj: list) -> bool: ...
def dtypes_all_equal(types: list[DtypeObj]) -> bool: ...
def is_range_indexer(
left: np.ndarray, n: int # np.ndarray[np.int64, ndim=1]
left: np.ndarray,
n: int, # np.ndarray[np.int64, ndim=1]
) -> bool: ...
