Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/pandas-dev/pandas into feat…
Browse files Browse the repository at this point in the history
…ure/44764_perf_issue_new
  • Loading branch information
Sylvain MARIE committed Apr 3, 2024
2 parents 847a9f3 + 05ab1af commit 338404c
Show file tree
Hide file tree
Showing 8 changed files with 122 additions and 50 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,7 @@ jobs:
shell: bash -el {0}
run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV"

- name: Build normal wheels
if: ${{ (env.IS_SCHEDULE_DISPATCH != 'true' || env.IS_PUSH == 'true') }}
- name: Build wheels
uses: pypa/cibuildwheel@v2.17.0
with:
package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }}
Expand Down
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,8 @@ Performance improvements

- :attr:`Categorical.categories` returns a :class:`RangeIndex` instead of an :class:`Index` if the constructed ``values`` was a ``range``. (:issue:`57787`)
- :class:`DataFrame` returns :class:`RangeIndex` columns when possible when ``data`` is a ``dict`` (:issue:`57943`)
- :class:`Series` returns a :class:`RangeIndex` index when possible when ``data`` is a ``dict`` (:issue:`58118`)
- :func:`concat` returns :class:`RangeIndex` columns when possible when ``objs`` contains :class:`Series` and :class:`DataFrame` and ``axis=0`` (:issue:`58119`)
- :func:`concat` returns a :class:`RangeIndex` level in the :class:`MultiIndex` result when ``keys`` is a ``range`` or :class:`RangeIndex` (:issue:`57542`)
- :meth:`RangeIndex.append` returns a :class:`RangeIndex` instead of a :class:`Index` when appending values that could continue the :class:`RangeIndex` (:issue:`57467`)
- :meth:`Series.str.extract` returns :class:`RangeIndex` columns instead of :class:`Index` columns when possible (:issue:`57542`)
Expand Down
136 changes: 94 additions & 42 deletions pandas/core/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ def __get__(self, obj, cls):
return accessor_obj


@doc(klass="", others="")
@doc(klass="", examples="", others="")
def _register_accessor(name: str, cls):
"""
Register a custom accessor on {klass} objects.
Expand All @@ -255,51 +255,26 @@ def _register_accessor(name: str, cls):
Notes
-----
When accessed, your accessor will be initialized with the pandas object
the user is interacting with. So the signature must be
This function allows you to register a custom-defined accessor class for {klass}.
The requirements for the accessor class are as follows:
.. code-block:: python
* Must contain an init method that:
def __init__(self, pandas_object): # noqa: E999
...
* accepts a single {klass} object
For consistency with pandas methods, you should raise an ``AttributeError``
if the data passed to your accessor has an incorrect dtype.
* raises an AttributeError if the {klass} object does not have correctly
matching inputs for the accessor
>>> pd.Series(["a", "b"]).dt
Traceback (most recent call last):
...
AttributeError: Can only use .dt accessor with datetimelike values
* Must contain a method for each access pattern.
Examples
--------
In your library code::
@pd.api.extensions.register_dataframe_accessor("geo")
class GeoAccessor:
def __init__(self, pandas_obj):
self._obj = pandas_obj
@property
def center(self):
# return the geographic center point of this DataFrame
lat = self._obj.latitude
lon = self._obj.longitude
return (float(lon.mean()), float(lat.mean()))
* The methods should be able to take any argument signature.
def plot(self):
# plot this array's data on a map, e.g., using Cartopy
pass
* Accessible using the @property decorator if no additional arguments are
needed.
Back in an interactive IPython session:
.. code-block:: ipython
In [1]: ds = pd.DataFrame({{"longitude": np.linspace(0, 10),
...: "latitude": np.linspace(0, 20)}})
In [2]: ds.geo.center
Out[2]: (5.0, 10.0)
In [3]: ds.geo.plot() # plots data on a map
Examples
--------
{examples}
"""

def decorator(accessor):
Expand All @@ -318,21 +293,98 @@ def decorator(accessor):
return decorator


@doc(_register_accessor, klass="DataFrame")
# Doctest examples substituted for the ``{examples}`` placeholder of the
# ``_register_accessor`` docstring template when documenting
# ``register_dataframe_accessor``.
# The expected traceback line must match the raised message exactly (no
# trailing period) so the example also passes under strict doctest matching.
_register_df_examples = """
An accessor that only accepts integers could
have a class defined like this:

>>> @pd.api.extensions.register_dataframe_accessor("int_accessor")
... class IntAccessor:
...     def __init__(self, pandas_obj):
...         if not all(pandas_obj[col].dtype == 'int64' for col in pandas_obj.columns):
...             raise AttributeError("All columns must contain integer values only")
...         self._obj = pandas_obj
...
...     def sum(self):
...         return self._obj.sum()
...
>>> df = pd.DataFrame([[1, 2], ['x', 'y']])
>>> df.int_accessor
Traceback (most recent call last):
...
AttributeError: All columns must contain integer values only
>>> df = pd.DataFrame([[1, 2], [3, 4]])
>>> df.int_accessor.sum()
0    4
1    6
dtype: int64"""


@doc(_register_accessor, klass="DataFrame", examples=_register_df_examples)
def register_dataframe_accessor(name: str):
    # No literal docstring here: it is presumably rendered by ``doc`` from the
    # ``_register_accessor`` template with the DataFrame-specific examples.
    # pandas is imported at call time rather than at module import time.
    from pandas import DataFrame as frame_cls

    return _register_accessor(name, frame_cls)


@doc(_register_accessor, klass="Series")
# Doctest examples substituted for the ``{examples}`` placeholder of the
# ``_register_accessor`` docstring template when documenting
# ``register_series_accessor``.
# The expected traceback line must match the raised message exactly (no
# trailing period) so the example also passes under strict doctest matching.
_register_series_examples = """
An accessor that only accepts integers could
have a class defined like this:

>>> @pd.api.extensions.register_series_accessor("int_accessor")
... class IntAccessor:
...     def __init__(self, pandas_obj):
...         if not pandas_obj.dtype == 'int64':
...             raise AttributeError("The series must contain integer data only")
...         self._obj = pandas_obj
...
...     def sum(self):
...         return self._obj.sum()
...
>>> ser = pd.Series([1, 2, 'x'])
>>> ser.int_accessor
Traceback (most recent call last):
...
AttributeError: The series must contain integer data only
>>> ser = pd.Series([1, 2, 3])
>>> ser.int_accessor.sum()
6"""


@doc(_register_accessor, klass="Series", examples=_register_series_examples)
def register_series_accessor(name: str):
    # No literal docstring here: it is presumably rendered by ``doc`` from the
    # ``_register_accessor`` template with the Series-specific examples.
    # pandas is imported at call time rather than at module import time.
    from pandas import Series as series_cls

    return _register_accessor(name, series_cls)


@doc(_register_accessor, klass="Index")
# Doctest examples substituted for the ``{examples}`` placeholder of the
# ``_register_accessor`` docstring template when documenting
# ``register_index_accessor``.
# The expected traceback line must match the raised message exactly (no
# trailing period) so the example also passes under strict doctest matching.
_register_index_examples = """
An accessor that only accepts integers could
have a class defined like this:

>>> @pd.api.extensions.register_index_accessor("int_accessor")
... class IntAccessor:
...     def __init__(self, pandas_obj):
...         if not all(isinstance(x, int) for x in pandas_obj):
...             raise AttributeError("The index must only be an integer value")
...         self._obj = pandas_obj
...
...     def even(self):
...         return [x for x in self._obj if x % 2 == 0]
...
>>> df = pd.DataFrame.from_dict(
...     {"row1": {"1": 1, "2": "a"}, "row2": {"1": 2, "2": "b"}}, orient="index"
... )
>>> df.index.int_accessor
Traceback (most recent call last):
...
AttributeError: The index must only be an integer value
>>> df = pd.DataFrame(
...     {"col1": [1, 2, 3, 4], "col2": ["a", "b", "c", "d"]}, index=[1, 2, 5, 8]
... )
>>> df.index.int_accessor.even()
[2, 8]"""


@doc(_register_accessor, klass="Index", examples=_register_index_examples)
def register_index_accessor(name: str):
from pandas import Index

Expand Down
5 changes: 4 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7144,7 +7144,10 @@ def maybe_sequence_to_range(sequence) -> Any | range:
return sequence
if len(sequence) == 0:
return range(0)
np_sequence = np.asarray(sequence, dtype=np.int64)
try:
np_sequence = np.asarray(sequence, dtype=np.int64)
except OverflowError:
return sequence
diff = np_sequence[1] - np_sequence[0]
if diff == 0:
return sequence
Expand Down
7 changes: 5 additions & 2 deletions pandas/core/reshape/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,8 +518,11 @@ def _sanitize_mixed_ndim(
# to have unique names
name = current_column
current_column += 1

obj = sample._constructor({name: obj}, copy=False)
obj = sample._constructor(obj, copy=False)
if isinstance(obj, ABCDataFrame):
obj.columns = range(name, name + 1, 1)
else:
obj = sample._constructor({name: obj}, copy=False)

new_objs.append(obj)

Expand Down
5 changes: 2 additions & 3 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@
PeriodIndex,
default_index,
ensure_index,
maybe_sequence_to_range,
)
import pandas.core.indexes.base as ibase
from pandas.core.indexes.multi import maybe_droplevels
Expand Down Expand Up @@ -538,16 +539,14 @@ def _init_dict(
_data : BlockManager for the new Series
index : index for the new Series
"""
keys: Index | tuple

# Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')]
# raises KeyError), so we iterate the entire dict, and align
if data:
# GH:34717, issue was using zip to extract key and values from data.
# Using generators there adversely affects performance.
# Below is the new way of extracting the keys and values

keys = tuple(data.keys())
keys = maybe_sequence_to_range(tuple(data.keys()))
values = list(data.values()) # Generating list of values- faster way
elif index is not None:
# fastpath for Series(data=None). Just use broadcasting a scalar
Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/reshape/concat/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -912,3 +912,11 @@ def test_concat_none_with_timezone_timestamp():
result = concat([df1, df2], ignore_index=True)
expected = DataFrame({"A": [None, pd.Timestamp("1990-12-20 00:00:00+00:00")]})
tm.assert_frame_equal(result, expected)


def test_concat_with_series_and_frame_returns_rangeindex_columns():
ser = Series([0])
df = DataFrame([1, 2])
result = concat([ser, df])
expected = DataFrame([0, 1, 2], index=[0, 0, 1])
tm.assert_frame_equal(result, expected, check_column_type=True)
6 changes: 6 additions & 0 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2251,3 +2251,9 @@ def test_series_with_complex_nan(input_list):
result = Series(ser.array)
assert ser.dtype == "complex128"
tm.assert_series_equal(ser, result)


def test_dict_keys_rangeindex():
result = Series({0: 1, 1: 2})
expected = Series([1, 2], index=RangeIndex(2))
tm.assert_series_equal(result, expected, check_index_type=True)

0 comments on commit 338404c

Please sign in to comment.