Merge branch 'main' of https://github.com/pandas-dev/pandas into feature/44764_perf_issue_new
smarie · Apr 3, 2024 · 338404c · 338404c
2 parents 847a9f3 + 05ab1af
commit 338404c
Show file tree

Hide file tree

Showing 8 changed files with 122 additions and 50 deletions.
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
@@ -139,8 +139,7 @@ jobs:
         shell: bash -el {0}
         run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV"
 
-      - name: Build normal wheels
-        if: ${{ (env.IS_SCHEDULE_DISPATCH != 'true' || env.IS_PUSH == 'true') }}
+      - name: Build wheels
         uses: pypa/cibuildwheel@v2.17.0
         with:
          package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }}

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -304,6 +304,8 @@ Performance improvements
 
 - :attr:`Categorical.categories` returns a :class:`RangeIndex` instead of an :class:`Index` if the constructed ``values`` was a ``range``. (:issue:`57787`)
 - :class:`DataFrame` returns a :class:`RangeIndex` columns when possible when ``data`` is a ``dict`` (:issue:`57943`)
+- :class:`Series` returns a :class:`RangeIndex` index when possible when ``data`` is a ``dict`` (:issue:`58118`)
+- :func:`concat` returns a :class:`RangeIndex` column when possible when ``objs`` contains :class:`Series` and :class:`DataFrame` and ``axis=0`` (:issue:`58119`)
 - :func:`concat` returns a :class:`RangeIndex` level in the :class:`MultiIndex` result when ``keys`` is a ``range`` or :class:`RangeIndex` (:issue:`57542`)
 - :meth:`RangeIndex.append` returns a :class:`RangeIndex` instead of a :class:`Index` when appending values that could continue the :class:`RangeIndex` (:issue:`57467`)
 - :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`)

diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py
@@ -231,7 +231,7 @@ def __get__(self, obj, cls):
         return accessor_obj
 
 
-@doc(klass="", others="")
+@doc(klass="", examples="", others="")
 def _register_accessor(name: str, cls):
     """
     Register a custom accessor on {klass} objects.
@@ -255,51 +255,26 @@ def _register_accessor(name: str, cls):
 
     Notes
     -----
-    When accessed, your accessor will be initialized with the pandas object
-    the user is interacting with. So the signature must be
+    This function allows you to register a custom-defined accessor class for {klass}.
+    The requirements for the accessor class are as follows:
 
-    .. code-block:: python
+    * Must contain an init method that:
 
-        def __init__(self, pandas_object):  # noqa: E999
-            ...
+      * accepts a single {klass} object
 
-    For consistency with pandas methods, you should raise an ``AttributeError``
-    if the data passed to your accessor has an incorrect dtype.
+      * raises an AttributeError if the {klass} object is not valid input for the
+        accessor, for example because it has an incorrect dtype
 
-    >>> pd.Series(["a", "b"]).dt
-    Traceback (most recent call last):
-    ...
-    AttributeError: Can only use .dt accessor with datetimelike values
+    * Must contain a method for each access pattern.
 
-    Examples
-    --------
-    In your library code::
-
-        @pd.api.extensions.register_dataframe_accessor("geo")
-        class GeoAccessor:
-            def __init__(self, pandas_obj):
-                self._obj = pandas_obj
-
-            @property
-            def center(self):
-                # return the geographic center point of this DataFrame
-                lat = self._obj.latitude
-                lon = self._obj.longitude
-                return (float(lon.mean()), float(lat.mean()))
+      * The methods should be able to take any argument signature.
 
-            def plot(self):
-                # plot this array's data on a map, e.g., using Cartopy
-                pass
+      * May be exposed as attributes with the @property decorator if no additional
+        arguments are needed.
 
-    Back in an interactive IPython session:
-
-        .. code-block:: ipython
-
-            In [1]: ds = pd.DataFrame({{"longitude": np.linspace(0, 10),
-               ...:                    "latitude": np.linspace(0, 20)}})
-            In [2]: ds.geo.center
-            Out[2]: (5.0, 10.0)
-            In [3]: ds.geo.plot()  # plots data on a map
+    Examples
+    --------
+    {examples}
     """
 
     def decorator(accessor):
@@ -318,21 +293,98 @@ def decorator(accessor):
     return decorator
 
 
-@doc(_register_accessor, klass="DataFrame")
+_register_df_examples = """
+An accessor that only accepts integers could
+have a class defined like this:
+
+>>> @pd.api.extensions.register_dataframe_accessor("int_accessor")
+... class IntAccessor:
+...     def __init__(self, pandas_obj):
+...         if not all(pandas_obj[col].dtype == 'int64' for col in pandas_obj.columns):
+...             raise AttributeError("All columns must contain integer values only")
+...         self._obj = pandas_obj
+...
+...     def sum(self):
+...         return self._obj.sum()
+...
+>>> df = pd.DataFrame([[1, 2], ['x', 'y']])
+>>> df.int_accessor
+Traceback (most recent call last):
+...
+AttributeError: All columns must contain integer values only
+>>> df = pd.DataFrame([[1, 2], [3, 4]])
+>>> df.int_accessor.sum()
+0    4
+1    6
+dtype: int64"""
+
+
+@doc(_register_accessor, klass="DataFrame", examples=_register_df_examples)
 def register_dataframe_accessor(name: str):
     from pandas import DataFrame
 
     return _register_accessor(name, DataFrame)
 
 
-@doc(_register_accessor, klass="Series")
+_register_series_examples = """
+An accessor that only accepts integers could
+have a class defined like this:
+
+>>> @pd.api.extensions.register_series_accessor("int_accessor")
+... class IntAccessor:
+...     def __init__(self, pandas_obj):
+...         if not pandas_obj.dtype == 'int64':
+...             raise AttributeError("The series must contain integer data only")
+...         self._obj = pandas_obj
+...
+...     def sum(self):
+...         return self._obj.sum()
+...
+>>> df = pd.Series([1, 2, 'x'])
+>>> df.int_accessor
+Traceback (most recent call last):
+...
+AttributeError: The series must contain integer data only
+>>> df = pd.Series([1, 2, 3])
+>>> df.int_accessor.sum()
+6"""
+
+
+@doc(_register_accessor, klass="Series", examples=_register_series_examples)
 def register_series_accessor(name: str):
     from pandas import Series
 
     return _register_accessor(name, Series)
 
 
-@doc(_register_accessor, klass="Index")
+_register_index_examples = """
+An accessor that only accepts integers could
+have a class defined like this:
+
+>>> @pd.api.extensions.register_index_accessor("int_accessor")
+... class IntAccessor:
+...     def __init__(self, pandas_obj):
+...         if not all(isinstance(x, int) for x in pandas_obj):
+...             raise AttributeError("The index must only be an integer value")
+...         self._obj = pandas_obj
+...
+...     def even(self):
+...         return [x for x in self._obj if x % 2 == 0]
+>>> df = pd.DataFrame.from_dict(
+...     {"row1": {"1": 1, "2": "a"}, "row2": {"1": 2, "2": "b"}}, orient="index"
+... )
+>>> df.index.int_accessor
+Traceback (most recent call last):
+...
+AttributeError: The index must only be an integer value
+>>> df = pd.DataFrame(
+...     {"col1": [1, 2, 3, 4], "col2": ["a", "b", "c", "d"]}, index=[1, 2, 5, 8]
+... )
+>>> df.index.int_accessor.even()
+[2, 8]"""
+
+
+@doc(_register_accessor, klass="Index", examples=_register_index_examples)
 def register_index_accessor(name: str):
     from pandas import Index
 

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -7144,7 +7144,10 @@ def maybe_sequence_to_range(sequence) -> Any | range:
         return sequence
     if len(sequence) == 0:
         return range(0)
-    np_sequence = np.asarray(sequence, dtype=np.int64)
+    try:
+        np_sequence = np.asarray(sequence, dtype=np.int64)
+    except OverflowError:
+        return sequence
     diff = np_sequence[1] - np_sequence[0]
     if diff == 0:
         return sequence

diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
@@ -518,8 +518,11 @@ def _sanitize_mixed_ndim(
                         # to have unique names
                         name = current_column
                         current_column += 1
-
-                obj = sample._constructor({name: obj}, copy=False)
+                    obj = sample._constructor(obj, copy=False)
+                    if isinstance(obj, ABCDataFrame):
+                        obj.columns = range(name, name + 1, 1)
+                else:
+                    obj = sample._constructor({name: obj}, copy=False)
 
             new_objs.append(obj)
 

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -132,6 +132,7 @@
     PeriodIndex,
     default_index,
     ensure_index,
+    maybe_sequence_to_range,
 )
 import pandas.core.indexes.base as ibase
 from pandas.core.indexes.multi import maybe_droplevels
@@ -538,16 +539,14 @@ def _init_dict(
         _data : BlockManager for the new Series
         index : index for the new Series
         """
-        keys: Index | tuple
-
         # Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')]
         # raises KeyError), so we iterate the entire dict, and align
         if data:
             # GH:34717, issue was using zip to extract key and values from data.
             # Using generators adversely affects performance.
             # Below is the new way of extracting the keys and values
 
-            keys = tuple(data.keys())
+            keys = maybe_sequence_to_range(tuple(data.keys()))
             values = list(data.values())  # Generating list of values- faster way
         elif index is not None:
             # fastpath for Series(data=None). Just use broadcasting a scalar

diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
@@ -912,3 +912,11 @@ def test_concat_none_with_timezone_timestamp():
         result = concat([df1, df2], ignore_index=True)
     expected = DataFrame({"A": [None, pd.Timestamp("1990-12-20 00:00:00+00:00")]})
     tm.assert_frame_equal(result, expected)
+
+
+def test_concat_with_series_and_frame_returns_rangeindex_columns():
+    ser = Series([0])
+    df = DataFrame([1, 2])
+    result = concat([ser, df])
+    expected = DataFrame([0, 1, 2], index=[0, 0, 1])
+    tm.assert_frame_equal(result, expected, check_column_type=True)
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
@@ -2251,3 +2251,9 @@ def test_series_with_complex_nan(input_list):
     result = Series(ser.array)
     assert ser.dtype == "complex128"
     tm.assert_series_equal(ser, result)
+
+
+def test_dict_keys_rangeindex():
+    result = Series({0: 1, 1: 2})
+    expected = Series([1, 2], index=RangeIndex(2))
+    tm.assert_series_equal(result, expected, check_index_type=True)