Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/master' into bisect
Browse files Browse the repository at this point in the history
  • Loading branch information
simonjayhawkins committed Aug 25, 2021
2 parents 641695f + e218f05 commit 704c32e
Show file tree
Hide file tree
Showing 43 changed files with 888 additions and 431 deletions.
8 changes: 8 additions & 0 deletions asv_bench/benchmarks/frame_ctor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pandas as pd
from pandas import (
Categorical,
DataFrame,
MultiIndex,
Series,
Expand Down Expand Up @@ -31,6 +32,9 @@ def setup(self):
self.dict_list = frame.to_dict(orient="records")
self.data2 = {i: {j: float(j) for j in range(100)} for i in range(2000)}

# arrays which we won't consolidate
self.dict_of_categoricals = {i: Categorical(np.arange(N)) for i in range(K)}

def time_list_of_dict(self):
DataFrame(self.dict_list)

Expand All @@ -50,6 +54,10 @@ def time_nested_dict_int64(self):
# nested dict, integer indexes, regression described in #621
DataFrame(self.data2)

def time_dict_of_categoricals(self):
# dict of arrays that we wont consolidate
DataFrame(self.dict_of_categoricals)


class FromSeries:
def setup(self):
Expand Down
35 changes: 24 additions & 11 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,7 @@ def time_srs_bfill(self):

class GroupByMethods:

param_names = ["dtype", "method", "application"]
param_names = ["dtype", "method", "application", "ncols"]
params = [
["int", "float", "object", "datetime", "uint"],
[
Expand Down Expand Up @@ -443,15 +443,23 @@ class GroupByMethods:
"var",
],
["direct", "transformation"],
[1, 2, 5, 10],
]

def setup(self, dtype, method, application):
def setup(self, dtype, method, application, ncols):
if method in method_blocklist.get(dtype, {}):
raise NotImplementedError # skip benchmark

if ncols != 1 and method in ["value_counts", "unique"]:
# DataFrameGroupBy doesn't have these methods
raise NotImplementedError

ngroups = 1000
size = ngroups * 2
rng = np.arange(ngroups)
values = rng.take(np.random.randint(0, ngroups, size=size))
rng = np.arange(ngroups).reshape(-1, 1)
rng = np.broadcast_to(rng, (len(rng), ncols))
taker = np.random.randint(0, ngroups, size=size)
values = rng.take(taker, axis=0)
if dtype == "int":
key = np.random.randint(0, size, size=size)
elif dtype == "uint":
Expand All @@ -465,22 +473,27 @@ def setup(self, dtype, method, application):
elif dtype == "datetime":
key = date_range("1/1/2011", periods=size, freq="s")

df = DataFrame({"values": values, "key": key})
cols = [f"values{n}" for n in range(ncols)]
df = DataFrame(values, columns=cols)
df["key"] = key

if len(cols) == 1:
cols = cols[0]

if application == "transform":
if method == "describe":
raise NotImplementedError

self.as_group_method = lambda: df.groupby("key")["values"].transform(method)
self.as_field_method = lambda: df.groupby("values")["key"].transform(method)
self.as_group_method = lambda: df.groupby("key")[cols].transform(method)
self.as_field_method = lambda: df.groupby(cols)["key"].transform(method)
else:
self.as_group_method = getattr(df.groupby("key")["values"], method)
self.as_field_method = getattr(df.groupby("values")["key"], method)
self.as_group_method = getattr(df.groupby("key")[cols], method)
self.as_field_method = getattr(df.groupby(cols)["key"], method)

def time_dtype_as_group(self, dtype, method, application):
def time_dtype_as_group(self, dtype, method, application, ncols):
self.as_group_method()

def time_dtype_as_field(self, dtype, method, application):
def time_dtype_as_field(self, dtype, method, application, ncols):
self.as_field_method()


Expand Down
9 changes: 9 additions & 0 deletions asv_bench/benchmarks/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,11 +366,20 @@ class InsertColumns:
def setup(self):
self.N = 10 ** 3
self.df = DataFrame(index=range(self.N))
self.df2 = DataFrame(np.random.randn(self.N, 2))

def time_insert(self):
for i in range(100):
self.df.insert(0, i, np.random.randn(self.N), allow_duplicates=True)

def time_insert_middle(self):
# same as time_insert but inserting to a middle column rather than
# front or back (which have fast-paths)
for i in range(100):
self.df2.insert(
1, "colname", np.random.randn(self.N), allow_duplicates=True
)

def time_assign_with_setitem(self):
for i in range(100):
self.df[i] = np.random.randn(self.N)
Expand Down
1 change: 1 addition & 0 deletions ci/deps/actions-39.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ dependencies:
- python-dateutil
- pytz
- s3fs>=0.4.2
- aiobotocore<=1.3.3
- scipy
- sqlalchemy
- xlrd
Expand Down
1 change: 1 addition & 0 deletions ci/deps/azure-windows-38.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ dependencies:
- python-dateutil
- pytz
- s3fs>=0.4.0
- aiobotocore<=1.3.3
- scipy
- xlrd
- xlsxwriter
Expand Down
1 change: 1 addition & 0 deletions ci/deps/azure-windows-39.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ dependencies:
- python-dateutil
- pytz
- s3fs>=0.4.2
- aiobotocore<=1.3.3
- scipy
- sqlalchemy
- xlrd
Expand Down
6 changes: 6 additions & 0 deletions doc/source/getting_started/comparison/comparison_with_sql.rst
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,12 @@ default, :meth:`~pandas.DataFrame.join` will join the DataFrames on their indice
parameters allowing you to specify the type of join to perform (``LEFT``, ``RIGHT``, ``INNER``,
``FULL``) or the columns to join on (column names or indices).

.. warning::

If both key columns contain rows where the key is a null value, those
rows will be matched against each other. This is different from usual SQL
join behaviour and can lead to unexpected results.

.. ipython:: python
df1 = pd.DataFrame({"key": ["A", "B", "C", "D"], "value": np.random.randn(4)})
Expand Down
27 changes: 19 additions & 8 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,21 @@ In Pandas 2.0, :class:`NumericIndex` will become the default numeric index type

See :ref:`here <advanced.numericindex>` for more.

.. _whatsnew_140.enhancements.styler:

Styler
^^^^^^

:class:`.Styler` has been further developed in 1.4.0. The following enhancements have been made:

- Styling of indexing has been added, with :meth:`.Styler.apply_index` and :meth:`.Styler.applymap_index`. These mirror the signature of the methods already used to style data values, and work with both HTML and LaTeX format (:issue:`41893`).
- :meth:`.Styler.bar` introduces additional arguments to control alignment and display (:issue:`26070`, :issue:`36419`), and it also validates the input arguments ``width`` and ``height`` (:issue:`42511`).
- :meth:`.Styler.to_latex` introduces keyword argument ``environment``, which also allows a specific "longtable" entry through a separate jinja2 template (:issue:`41866`).
- :meth:`.Styler.to_html` introduces keyword arguments ``sparse_index`` and ``sparse_columns`` (:issue:`41946`)
- Keyword argument ``level`` is added to :meth:`.Styler.hide_index` and :meth:`.Styler.hide_columns` for optionally controlling hidden levels in a MultiIndex (:issue:`25475`)

There are also bug fixes and deprecations listed below.

.. _whatsnew_140.enhancements.enhancement2:

enhancement2
Expand All @@ -75,13 +90,7 @@ Other enhancements
- :class:`DataFrameGroupBy` operations with ``as_index=False`` now correctly retain ``ExtensionDtype`` dtypes for columns being grouped on (:issue:`41373`)
- Add support for assigning values to ``by`` argument in :meth:`DataFrame.plot.hist` and :meth:`DataFrame.plot.box` (:issue:`15079`)
- :meth:`Series.sample`, :meth:`DataFrame.sample`, and :meth:`.GroupBy.sample` now accept a ``np.random.Generator`` as input to ``random_state``. A generator will be more performant, especially with ``replace=False`` (:issue:`38100`)
- Additional options added to :meth:`.Styler.bar` to control alignment and display, with keyword only arguments (:issue:`26070`, :issue:`36419`)
- :meth:`Styler.bar` now validates the input argument ``width`` and ``height`` (:issue:`42511`)
- Add keyword ``level`` to :meth:`.Styler.hide_index` and :meth:`.Styler.hide_columns` for optionally controlling hidden levels in a MultiIndex (:issue:`25475`)
- :meth:`Series.ewm`, :meth:`DataFrame.ewm`, now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview <window.overview>` for performance and functional benefits (:issue:`42273`)
- Added ``sparse_index`` and ``sparse_columns`` keyword arguments to :meth:`.Styler.to_html` (:issue:`41946`)
- Added keyword argument ``environment`` to :meth:`.Styler.to_latex` also allowing a specific "longtable" entry with a separate jinja2 template (:issue:`41866`)
- :meth:`.Styler.apply_index` and :meth:`.Styler.applymap_index` added to allow conditional styling of index and column header values for HTML and LaTeX (:issue:`41893`)
- :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`)
-

Expand Down Expand Up @@ -239,7 +248,6 @@ Categorical
- Bug in :meth:`Series.where` with ``CategoricalDtype`` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`)
- Bug in :meth:`Categorical.fillna` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`)
- Bug in :meth:`Categorical.fillna` with a tuple-like category raising ``ValueError`` instead of ``TypeError`` when filling with a non-category tuple (:issue:`41919`)
- Bug in :meth:`.Styler.copy` where ``uuid`` was not previously copied (:issue:`40675`)
-

Datetimelike
Expand Down Expand Up @@ -295,7 +303,7 @@ Indexing

Missing
^^^^^^^
-
- Bug in :meth:`DataFrame.fillna` with ``limit`` and no ``method`` ignoring ``axis='columns'`` or ``axis=1`` (:issue:`40989`)
-

MultiIndex
Expand Down Expand Up @@ -355,6 +363,9 @@ ExtensionArray
Styler
^^^^^^
- Minor bug in :class:`.Styler` where the ``uuid`` at initialization maintained a floating underscore (:issue:`43037`)
- Bug in :meth:`.Styler.to_html` where the ``Styler`` object was updated if the ``to_html`` method was called with some args (:issue:`43034`)
- Bug in :meth:`.Styler.copy` where ``uuid`` was not previously copied (:issue:`40675`)
- Bug in :meth:`.Styler.apply` where functions which returned Series objects were not correctly handled in terms of aligning their index labels (:issue:`13657`, :issue:`42014`)
-

Other
Expand Down
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ dependencies:

- pytables>=3.6.1 # pandas.read_hdf, DataFrame.to_hdf
- s3fs>=0.4.0 # file IO when using 's3://...' path
- aiobotocore<=1.3.3 # Remove when s3fs is at 2021.08.0
- fsspec>=0.7.4, <2021.6.0 # for generic remote file operations
- gcsfs>=0.6.0 # file IO when using 'gcs://...' path
- sqlalchemy # pandas.read_sql, DataFrame.to_sql
Expand Down
40 changes: 23 additions & 17 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -388,10 +388,10 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[intp_t] labels,

@cython.boundscheck(False)
@cython.wraparound(False)
def group_any_all(int8_t[::1] out,
const int8_t[::1] values,
def group_any_all(int8_t[:, ::1] out,
const int8_t[:, :] values,
const intp_t[::1] labels,
const uint8_t[::1] mask,
const uint8_t[:, :] mask,
str val_test,
bint skipna,
bint nullable) -> None:
Expand Down Expand Up @@ -426,9 +426,9 @@ def group_any_all(int8_t[::1] out,
-1 to signify a masked position in the case of a nullable input.
"""
cdef:
Py_ssize_t i, N = len(labels)
Py_ssize_t i, j, N = len(labels), K = out.shape[1]
intp_t lab
int8_t flag_val
int8_t flag_val, val

if val_test == 'all':
# Because the 'all' value of an empty iterable in Python is True we can
Expand All @@ -448,21 +448,27 @@ def group_any_all(int8_t[::1] out,
with nogil:
for i in range(N):
lab = labels[i]
if lab < 0 or (skipna and mask[i]):
if lab < 0:
continue

if nullable and mask[i]:
# Set the position as masked if `out[lab] != flag_val`, which
# would indicate True/False has not yet been seen for any/all,
# so by Kleene logic the result is currently unknown
if out[lab] != flag_val:
out[lab] = -1
continue
for j in range(K):
if skipna and mask[i, j]:
continue

if nullable and mask[i, j]:
# Set the position as masked if `out[lab] != flag_val`, which
# would indicate True/False has not yet been seen for any/all,
# so by Kleene logic the result is currently unknown
if out[lab, j] != flag_val:
out[lab, j] = -1
continue

val = values[i, j]

# If True and 'any' or False and 'all', the result is
# already determined
if values[i] == flag_val:
out[lab] = flag_val
# If True and 'any' or False and 'all', the result is
# already determined
if val == flag_val:
out[lab, j] = flag_val


# ----------------------------------------------------------------------
Expand Down
7 changes: 7 additions & 0 deletions pandas/_libs/internals.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import numpy as np
from pandas._typing import (
ArrayLike,
T,
npt,
)

from pandas import Index
Expand All @@ -25,6 +26,12 @@ def get_blkno_placements(
blknos: np.ndarray,
group: bool = ...,
) -> Iterator[tuple[int, BlockPlacement]]: ...
def update_blklocs_and_blknos(
blklocs: npt.NDArray[np.intp],
blknos: npt.NDArray[np.intp],
loc: int,
nblocks: int,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...

class BlockPlacement:
def __init__(self, val: int | slice | np.ndarray): ...
Expand Down
62 changes: 62 additions & 0 deletions pandas/_libs/internals.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,39 @@ cdef class BlockPlacement:

return self._as_slice

cpdef BlockPlacement increment_above(self, Py_ssize_t loc):
"""
Increment any entries of 'loc' or above by one.
"""
cdef:
slice nv, s = self._ensure_has_slice()
Py_ssize_t other_int, start, stop, step, l
ndarray newarr

if s is not None:
# see if we are either all-above or all-below, each of which
# have fastpaths available.

start, stop, step, l = slice_get_indices_ex(s)

if start < loc and stop <= loc:
# We are entirely below, nothing to increment
return self

if start >= loc and stop >= loc:
# We are entirely above, we can efficiently increment out slice
nv = slice(start + 1, stop + 1, step)
return BlockPlacement(nv)

if loc == 0:
# fastpath where we know everything is >= 0
newarr = self.as_array + 1
return BlockPlacement(newarr)

newarr = self.as_array.copy()
newarr[newarr >= loc] += 1
return BlockPlacement(newarr)

def tile_for_unstack(self, factor: int) -> np.ndarray:
"""
Find the new mgr_locs for the un-stacked version of a Block.
Expand Down Expand Up @@ -481,6 +514,35 @@ def get_blkno_placements(blknos, group: bool = True):
yield blkno, BlockPlacement(indexer)


cpdef update_blklocs_and_blknos(
ndarray[intp_t] blklocs, ndarray[intp_t] blknos, Py_ssize_t loc, intp_t nblocks
):
"""
Update blklocs and blknos when a new column is inserted at 'loc'.
"""
cdef:
Py_ssize_t i
cnp.npy_intp length = len(blklocs) + 1
ndarray[intp_t] new_blklocs, new_blknos

# equiv: new_blklocs = np.empty(length, dtype=np.intp)
new_blklocs = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0)
new_blknos = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0)

for i in range(loc):
new_blklocs[i] = blklocs[i]
new_blknos[i] = blknos[i]

new_blklocs[loc] = 0
new_blknos[loc] = nblocks

for i in range(loc, length - 1):
new_blklocs[i + 1] = blklocs[i]
new_blknos[i + 1] = blknos[i]

return new_blklocs, new_blknos


@cython.freelist(64)
cdef class SharedBlock:
"""
Expand Down
Loading

0 comments on commit 704c32e

Please sign in to comment.