Merge branch 'main' into feature/44764_perf_issue_new
smarie committed Jan 9, 2024
2 parents cc1a4d2 + c84f989 commit 9cff856
Showing 355 changed files with 4,222 additions and 4,262 deletions.
15 changes: 3 additions & 12 deletions .pre-commit-config.yaml
@@ -18,11 +18,6 @@ ci:
# manual stage hooks
skip: [pylint, pyright, mypy]
repos:
- repo: https://github.com/hauntsaninja/black-pre-commit-mirror
# black compiled with mypyc
rev: 23.11.0
hooks:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.6
hooks:
@@ -35,6 +30,9 @@ repos:
files: ^pandas
exclude: ^pandas/tests
args: [--select, "ANN001,ANN2", --fix-only, --exit-non-zero-on-fix]
- id: ruff-format
# TODO: "." not needed in ruff 0.1.8
args: ["."]
- repo: https://github.com/jendrikseipp/vulture
rev: 'v2.10'
hooks:
@@ -274,13 +272,6 @@ repos:
language: python
types: [rst]
files: ^doc/source/(development|reference)/
- id: unwanted-patterns-bare-pytest-raises
name: Check for use of bare pytest raises
language: python
entry: python scripts/validate_unwanted_patterns.py --validation-type="bare_pytest_raises"
types: [python]
files: ^pandas/tests/
exclude: ^pandas/tests/extension/
- id: unwanted-patterns-private-function-across-module
name: Check for use of private functions across modules
language: python
3 changes: 2 additions & 1 deletion Dockerfile
@@ -5,7 +5,8 @@ RUN apt-get update && apt-get -y upgrade
RUN apt-get install -y build-essential

# hdf5 needed for pytables installation
RUN apt-get install -y libhdf5-dev
# libgles2-mesa needed for pytest-qt
RUN apt-get install -y libhdf5-dev libgles2-mesa-dev

RUN python -m pip install --upgrade pip
RUN python -m pip install \
4 changes: 1 addition & 3 deletions asv_bench/benchmarks/indexing.py
@@ -84,9 +84,7 @@ def time_loc_slice(self, index, index_structure):

class NumericMaskedIndexing:
monotonic_list = list(range(10**6))
non_monotonic_list = (
list(range(50)) + [54, 53, 52, 51] + list(range(55, 10**6 - 1))
)
non_monotonic_list = list(range(50)) + [54, 53, 52, 51] + list(range(55, 10**6 - 1))

params = [
("Int64", "UInt64", "Float64"),
3 changes: 2 additions & 1 deletion asv_bench/benchmarks/io/style.py
@@ -76,7 +76,8 @@ def _style_format(self):
# apply a formatting function
# subset is flexible but hinders vectorised solutions
self.st = self.df.style.format(
"{:,.3f}", subset=IndexSlice["row_1":f"row_{ir}", "float_1":f"float_{ic}"]
"{:,.3f}",
subset=IndexSlice["row_1" : f"row_{ir}", "float_1" : f"float_{ic}"],
)

def _style_apply_format_hide(self):
10 changes: 8 additions & 2 deletions ci/code_checks.sh
@@ -16,12 +16,18 @@

set -uo pipefail

[[ -z "$1" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "single-docs" || "$1" == "notebooks" ]] || \
if [[ -v 1 ]]; then
    CHECK=$1
else
    # script will fail if it uses an unset variable (i.e. $1 is not provided)
    CHECK=""
fi

[[ -z "$CHECK" || "$CHECK" == "code" || "$CHECK" == "doctests" || "$CHECK" == "docstrings" || "$CHECK" == "single-docs" || "$CHECK" == "notebooks" ]] || \
{ echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 9999; }

BASE_DIR="$(dirname $0)/.."
RET=0
CHECK=$1

### CODE ###
if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then
2 changes: 1 addition & 1 deletion doc/source/development/contributing_codebase.rst
@@ -38,7 +38,7 @@ Pre-commit
----------

Additionally, :ref:`Continuous Integration <contributing.ci>` will run code formatting checks
like ``black``, ``ruff``,
like ``ruff``,
``isort``, and ``clang-format`` and more using `pre-commit hooks <https://pre-commit.com/>`_.
Any warnings from these checks will cause the :ref:`Continuous Integration <contributing.ci>` to fail; therefore,
it is helpful to run the check yourself before submitting code. This
1 change: 1 addition & 0 deletions doc/source/reference/series.rst
@@ -177,6 +177,7 @@ Reindexing / selection / label manipulation
:toctree: api/

Series.align
Series.case_when
Series.drop
Series.droplevel
Series.drop_duplicates
19 changes: 7 additions & 12 deletions doc/source/user_guide/io.rst
@@ -3471,20 +3471,15 @@ saving a ``DataFrame`` to Excel. Generally the semantics are
similar to working with :ref:`csv<io.read_csv_table>` data.
See the :ref:`cookbook<cookbook.excel>` for some advanced strategies.

.. warning::

The `xlrd <https://xlrd.readthedocs.io/en/latest/>`__ package is now only for reading
old-style ``.xls`` files.
.. note::

Before pandas 1.3.0, the default argument ``engine=None`` to :func:`~pandas.read_excel`
would result in using the ``xlrd`` engine in many cases, including new
Excel 2007+ (``.xlsx``) files. pandas will now default to using the
`openpyxl <https://openpyxl.readthedocs.io/en/stable/>`__ engine.
When ``engine=None``, the following logic will be used to determine the engine:

It is strongly encouraged to install ``openpyxl`` to read Excel 2007+
(``.xlsx``) files.
**Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.**
This is no longer supported, switch to using ``openpyxl`` instead.
- If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
then `odf <https://pypi.org/project/odfpy/>`_ will be used.
- Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used.
- Otherwise if ``path_or_buffer`` is in xlsb format, ``pyxlsb`` will be used.
- Otherwise ``openpyxl`` will be used.
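
    As a rough illustration of the dispatch above (the file names are hypothetical,
    and each engine must be installed for the corresponding call to succeed):

    .. code-block:: python

        import pandas as pd

        # engine=None (the default) picks the reader from the file format:
        pd.read_excel("book.ods")   # OpenDocument -> odf
        pd.read_excel("book.xls")   # legacy Excel -> xlrd
        pd.read_excel("book.xlsb")  # binary workbook -> pyxlsb
        pd.read_excel("book.xlsx")  # everything else -> openpyxl

        # The choice can always be made explicit:
        pd.read_excel("book.xlsx", engine="openpyxl")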

.. _io.excel_reader:

2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.1.0.rst
@@ -432,7 +432,7 @@ In a future version, these will raise an error and you should cast to a common d
In [3]: ser[0] = 'not an int64'
FutureWarning:
Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas.
Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas.
Value 'not an int64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
In [4]: ser
40 changes: 39 additions & 1 deletion doc/source/whatsnew/v2.2.0.rst
@@ -199,6 +199,26 @@ For a full list of ADBC drivers and their development status, see the `ADBC Driv
Implementation Status <https://arrow.apache.org/adbc/current/driver/status.html>`_
documentation.

.. _whatsnew_220.enhancements.case_when:

Create a pandas Series based on one or more conditions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The :meth:`Series.case_when` function has been added to create a Series object based on one or more conditions. (:issue:`39154`)

.. ipython:: python

    import pandas as pd

    df = pd.DataFrame(dict(a=[1, 2, 3], b=[4, 5, 6]))
    default = pd.Series('default', index=df.index)
    default.case_when(
        caselist=[
            (df.a == 1, 'first'),                 # condition, replacement
            (df.a.gt(1) & df.b.eq(5), 'second'),  # condition, replacement
        ],
    )

.. _whatsnew_220.enhancements.to_numpy_ea:

``to_numpy`` for NumPy nullable and Arrow types converts to suitable NumPy dtype
@@ -262,6 +282,14 @@ DataFrame. (:issue:`54938`)
)
series.struct.explode()
Use :meth:`Series.struct.field` to index into a (possibly nested)
struct field.


.. ipython:: python

    series.struct.field("project")

.. _whatsnew_220.enhancements.list_accessor:

Series.list accessor for PyArrow list data
@@ -324,7 +352,7 @@ Other enhancements
- :meth:`DataFrame.apply` now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
- :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area`` (:issue:`56492`)
- :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area``; 3rd party :class:`.ExtensionArray` authors need to add this argument to the method ``_pad_or_backfill`` (:issue:`56492`)
- Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`)
- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
- Implemented :meth:`Series.dt` methods and attributes for :class:`ArrowDtype` with ``pyarrow.duration`` type (:issue:`52284`)
@@ -743,6 +771,7 @@ Categorical
^^^^^^^^^^^
- :meth:`Categorical.isin` raising ``InvalidIndexError`` for categorical containing overlapping :class:`Interval` values (:issue:`34974`)
- Bug in :meth:`CategoricalDtype.__eq__` returning ``False`` for unordered categorical data with mixed types (:issue:`55468`)
- Bug when casting ``pa.dictionary`` to :class:`CategoricalDtype` using a ``pa.DictionaryArray`` as categories (:issue:`56672`)

Datetimelike
^^^^^^^^^^^^
@@ -788,8 +817,11 @@ Timezones
Numeric
^^^^^^^
- Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`)
- Bug in :meth:`Series.__floordiv__` and :meth:`Series.__truediv__` for :class:`ArrowDtype` with integral dtypes raising for large divisors (:issue:`56706`)
- Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`)
- Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`)
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` matching float ``0.0`` with ``False`` and vice versa (:issue:`55398`)
- Bug in :meth:`Series.round` raising for nullable boolean dtype (:issue:`55936`)

Conversion
^^^^^^^^^^
@@ -808,13 +840,15 @@ Strings
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`)
- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for :class:`ArrowDtype` with ``pyarrow.string`` dtype (:issue:`56579`)
- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`)
- Bug in :meth:`str.fullmatch` with ``dtype=pandas.ArrowDtype(pyarrow.string())`` allowing partial matches when the regex ends in a literal ``$`` (:issue:`56652`)
- Bug in comparison operations for ``dtype="string[pyarrow_numpy]"`` raising if dtypes can't be compared (:issue:`56008`)

Interval
^^^^^^^^
- Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown (:issue:`55015`)
- Bug in :meth:`IntervalIndex.factorize` and :meth:`Series.factorize` with :class:`IntervalDtype` with datetime64 or timedelta64 intervals not preserving non-nanosecond units (:issue:`56099`)
- Bug in :meth:`IntervalIndex.from_arrays` when passed ``datetime64`` or ``timedelta64`` arrays with mismatched resolutions constructing an invalid ``IntervalArray`` object (:issue:`55714`)
- Bug in :meth:`IntervalIndex.from_tuples` raising if subtype is a nullable extension dtype (:issue:`56765`)
- Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`)
- Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`)
- Bug in setting values on a :class:`Series` with an :class:`IntervalIndex` using a slice incorrectly raising (:issue:`54722`)
@@ -846,6 +880,7 @@ I/O
- Bug in :func:`read_json` not handling dtype conversion properly if ``infer_string`` is set (:issue:`56195`)
- Bug in :meth:`DataFrame.to_excel`, with ``OdsWriter`` (``ods`` files) writing Boolean/string value (:issue:`54994`)
- Bug in :meth:`DataFrame.to_hdf` and :func:`read_hdf` with ``datetime64`` dtypes with non-nanosecond resolution failing to round-trip correctly (:issue:`55622`)
- Bug in :meth:`DataFrame.to_stata` raising for extension dtypes (:issue:`54671`)
- Bug in :meth:`~pandas.read_excel` with ``engine="odf"`` (``ods`` files) when a string cell contains an annotation (:issue:`55200`)
- Bug in :meth:`~pandas.read_excel` with an ODS file without cached formatted cell for float values (:issue:`55219`)
- Bug where :meth:`DataFrame.to_json` would raise an ``OverflowError`` instead of a ``TypeError`` with unsupported NumPy types (:issue:`55403`)
@@ -873,6 +908,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrame.asfreq` and :meth:`Series.asfreq` with a :class:`DatetimeIndex` with non-nanosecond resolution incorrectly converting to nanosecond resolution (:issue:`55958`)
- Bug in :meth:`DataFrame.ewm` when passed ``times`` with non-nanosecond ``datetime64`` or :class:`DatetimeTZDtype` dtype (:issue:`56262`)
- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` where grouping by a combination of ``Decimal`` and NA values would fail when ``sort=True`` (:issue:`54847`)
- Bug in :meth:`DataFrame.groupby` for DataFrame subclasses when selecting a subset of columns to apply the function to (:issue:`56761`)
- Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`)
- Bug in :meth:`DataFrame.resample` when resampling on a :class:`ArrowDtype` of ``pyarrow.timestamp`` or ``pyarrow.duration`` type (:issue:`55989`)
- Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`)
@@ -888,6 +924,7 @@ Reshaping
- Bug in :func:`merge_asof` when using a :class:`Timedelta` tolerance on a :class:`ArrowDtype` column (:issue:`56486`)
- Bug in :func:`merge` not raising when merging datetime columns with timedelta columns (:issue:`56455`)
- Bug in :func:`merge` not raising when merging string columns with numeric columns (:issue:`56441`)
- Bug in :func:`merge` not sorting for new string dtype (:issue:`56442`)
- Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`)
- Bug in :meth:`DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`)
- Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`)
@@ -901,6 +938,7 @@ Sparse

Other
^^^^^
- :meth:`DataFrame.__dataframe__` did not support pyarrow large strings (:issue:`56702`)
- Bug in :func:`DataFrame.describe` where the percentile 99.999% was rounded to 100% when formatting percentiles in the result (:issue:`55765`)
- Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`)
- Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`)
5 changes: 4 additions & 1 deletion doc/source/whatsnew/v2.3.0.rst
@@ -92,7 +92,8 @@ Other API changes

Deprecations
~~~~~~~~~~~~
-
- Deprecated :meth:`Timestamp.utcfromtimestamp`, use ``Timestamp.fromtimestamp(ts, "UTC")`` instead (:issue:`56680`)
- Deprecated :meth:`Timestamp.utcnow`, use ``Timestamp.now("UTC")`` instead (:issue:`56680`)
-
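
A minimal migration sketch for the two deprecations above (the epoch value is an
arbitrary example):

.. code-block:: python

    import pandas as pd

    pd.Timestamp.utcnow()                             # deprecated
    pd.Timestamp.now("UTC")                           # replacement

    pd.Timestamp.utcfromtimestamp(1_700_000_000)      # deprecated
    pd.Timestamp.fromtimestamp(1_700_000_000, "UTC")  # replacement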

.. ---------------------------------------------------------------------------
@@ -108,6 +109,8 @@ Performance improvements

Bug fixes
~~~~~~~~~
- Fixed bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument (:issue:`56607`)


Categorical
^^^^^^^^^^^
34 changes: 25 additions & 9 deletions pandas/_libs/hashtable.pyi
@@ -2,6 +2,7 @@ from typing import (
Any,
Hashable,
Literal,
overload,
)

import numpy as np
@@ -180,18 +181,33 @@ class HashTable:
na_value: object = ...,
mask=...,
) -> npt.NDArray[np.intp]: ...
@overload
def unique(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
return_inverse: bool = ...,
mask=...,
) -> (
tuple[
np.ndarray, # np.ndarray[subclass-specific]
npt.NDArray[np.intp],
]
| np.ndarray
): ... # np.ndarray[subclass-specific]
*,
return_inverse: Literal[False] = ...,
mask: None = ...,
) -> np.ndarray: ... # np.ndarray[subclass-specific]
@overload
def unique(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
*,
return_inverse: Literal[True],
mask: None = ...,
) -> tuple[np.ndarray, npt.NDArray[np.intp]]: ... # np.ndarray[subclass-specific]
@overload
def unique(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
*,
return_inverse: Literal[False] = ...,
mask: npt.NDArray[np.bool_],
) -> tuple[
np.ndarray,
npt.NDArray[np.bool_],
]: ... # np.ndarray[subclass-specific]
def factorize(
self,
values: np.ndarray, # np.ndarray[subclass-specific]
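
The stub rewrite above replaces one union-returning signature with ``@overload``
variants keyed on ``Literal`` values of ``return_inverse`` (and on ``mask``), so
type checkers can narrow the return type at each call site. A simplified,
self-contained sketch of the same pattern (the class and implementation here are
illustrative, not the actual pandas code):

    from typing import Literal, overload

    import numpy as np
    import numpy.typing as npt

    class Table:
        @overload
        def unique(
            self, values: np.ndarray, *, return_inverse: Literal[False] = ...
        ) -> np.ndarray: ...
        @overload
        def unique(
            self, values: np.ndarray, *, return_inverse: Literal[True]
        ) -> tuple[np.ndarray, npt.NDArray[np.intp]]: ...
        def unique(self, values, *, return_inverse=False):
            # np.unique always computes the inverse here; drop it if not requested
            uniques, inverse = np.unique(values, return_inverse=True)
            return (uniques, inverse) if return_inverse else uniques
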
6 changes: 3 additions & 3 deletions pandas/_libs/hashtable_class_helper.pxi.in
@@ -755,7 +755,7 @@ cdef class {{name}}HashTable(HashTable):
return uniques.to_array(), result_mask.to_array()
return uniques.to_array()

def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False, object mask=None):
def unique(self, const {{dtype}}_t[:] values, *, bint return_inverse=False, object mask=None):
"""
Calculate unique values and labels (no sorting!)

@@ -1180,7 +1180,7 @@ cdef class StringHashTable(HashTable):
return uniques.to_array(), labels.base # .base -> underlying ndarray
return uniques.to_array()

def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
def unique(self, ndarray[object] values, *, bint return_inverse=False, object mask=None):
"""
Calculate unique values and labels (no sorting!)

@@ -1438,7 +1438,7 @@ cdef class PyObjectHashTable(HashTable):
return uniques.to_array(), labels.base # .base -> underlying ndarray
return uniques.to_array()

def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
def unique(self, ndarray[object] values, *, bint return_inverse=False, object mask=None):
"""
Calculate unique values and labels (no sorting!)

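
Because ``return_inverse`` and ``mask`` become keyword-only in each ``unique``
implementation, positional calls for those parameters now raise ``TypeError``. A
rough usage sketch against the internal ``Int64HashTable`` (a private API, shown
purely for illustration and subject to change):

    import numpy as np
    from pandas._libs import hashtable as ht

    table = ht.Int64HashTable()
    values = np.array([3, 1, 3, 2], dtype=np.int64)

    uniques = table.unique(values)                                # values stays positional
    uniques, inverse = table.unique(values, return_inverse=True)  # keyword required
    # table.unique(values, True) now raises TypeError
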
6 changes: 4 additions & 2 deletions pandas/_libs/lib.pyi
@@ -179,7 +179,8 @@ def indices_fast(
sorted_labels: list[npt.NDArray[np.int64]],
) -> dict[Hashable, npt.NDArray[np.intp]]: ...
def generate_slices(
labels: np.ndarray, ngroups: int # const intp_t[:]
labels: np.ndarray,
ngroups: int, # const intp_t[:]
) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: ...
def count_level_2d(
mask: np.ndarray, # ndarray[uint8_t, ndim=2, cast=True],
@@ -209,5 +210,6 @@ def get_reverse_indexer(
def is_bool_list(obj: list) -> bool: ...
def dtypes_all_equal(types: list[DtypeObj]) -> bool: ...
def is_range_indexer(
left: np.ndarray, n: int # np.ndarray[np.int64, ndim=1]
left: np.ndarray,
n: int, # np.ndarray[np.int64, ndim=1]
) -> bool: ...
