Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/master' into bisect
Browse files Browse the repository at this point in the history
  • Loading branch information
simonjayhawkins committed Aug 25, 2021
2 parents 641695f + e218f05 commit 704c32e
Show file tree
Hide file tree
Showing 43 changed files with 888 additions and 431 deletions.
8 changes: 8 additions & 0 deletions asv_bench/benchmarks/frame_ctor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pandas as pd
from pandas import (
Categorical,
DataFrame,
MultiIndex,
Series,
Expand Down Expand Up @@ -31,6 +32,9 @@ def setup(self):
self.dict_list = frame.to_dict(orient="records")
self.data2 = {i: {j: float(j) for j in range(100)} for i in range(2000)}

# arrays which we won't consolidate
self.dict_of_categoricals = {i: Categorical(np.arange(N)) for i in range(K)}

def time_list_of_dict(self):
DataFrame(self.dict_list)

Expand All @@ -50,6 +54,10 @@ def time_nested_dict_int64(self):
# nested dict, integer indexes, regression described in #621
DataFrame(self.data2)

def time_dict_of_categoricals(self):
# dict of arrays that we wont consolidate
DataFrame(self.dict_of_categoricals)


class FromSeries:
def setup(self):
Expand Down
35 changes: 24 additions & 11 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,7 @@ def time_srs_bfill(self):

class GroupByMethods:

param_names = ["dtype", "method", "application"]
param_names = ["dtype", "method", "application", "ncols"]
params = [
["int", "float", "object", "datetime", "uint"],
[
Expand Down Expand Up @@ -443,15 +443,23 @@ class GroupByMethods:
"var",
],
["direct", "transformation"],
[1, 2, 5, 10],
]

def setup(self, dtype, method, application):
def setup(self, dtype, method, application, ncols):
if method in method_blocklist.get(dtype, {}):
raise NotImplementedError # skip benchmark

if ncols != 1 and method in ["value_counts", "unique"]:
# DataFrameGroupBy doesn't have these methods
raise NotImplementedError

ngroups = 1000
size = ngroups * 2
rng = np.arange(ngroups)
values = rng.take(np.random.randint(0, ngroups, size=size))
rng = np.arange(ngroups).reshape(-1, 1)
rng = np.broadcast_to(rng, (len(rng), ncols))
taker = np.random.randint(0, ngroups, size=size)
values = rng.take(taker, axis=0)
if dtype == "int":
key = np.random.randint(0, size, size=size)
elif dtype == "uint":
Expand All @@ -465,22 +473,27 @@ def setup(self, dtype, method, application):
elif dtype == "datetime":
key = date_range("1/1/2011", periods=size, freq="s")

df = DataFrame({"values": values, "key": key})
cols = [f"values{n}" for n in range(ncols)]
df = DataFrame(values, columns=cols)
df["key"] = key

if len(cols) == 1:
cols = cols[0]

if application == "transform":
if method == "describe":
raise NotImplementedError

self.as_group_method = lambda: df.groupby("key")["values"].transform(method)
self.as_field_method = lambda: df.groupby("values")["key"].transform(method)
self.as_group_method = lambda: df.groupby("key")[cols].transform(method)
self.as_field_method = lambda: df.groupby(cols)["key"].transform(method)
else:
self.as_group_method = getattr(df.groupby("key")["values"], method)
self.as_field_method = getattr(df.groupby("values")["key"], method)
self.as_group_method = getattr(df.groupby("key")[cols], method)
self.as_field_method = getattr(df.groupby(cols)["key"], method)

def time_dtype_as_group(self, dtype, method, application):
def time_dtype_as_group(self, dtype, method, application, ncols):
self.as_group_method()

def time_dtype_as_field(self, dtype, method, application):
def time_dtype_as_field(self, dtype, method, application, ncols):
self.as_field_method()


Expand Down
9 changes: 9 additions & 0 deletions asv_bench/benchmarks/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,11 +366,20 @@ class InsertColumns:
def setup(self):
self.N = 10 ** 3
self.df = DataFrame(index=range(self.N))
self.df2 = DataFrame(np.random.randn(self.N, 2))

def time_insert(self):
for i in range(100):
self.df.insert(0, i, np.random.randn(self.N), allow_duplicates=True)

def time_insert_middle(self):
# same as time_insert but inserting to a middle column rather than
# front or back (which have fast-paths)
for i in range(100):
self.df2.insert(
1, "colname", np.random.randn(self.N), allow_duplicates=True
)

def time_assign_with_setitem(self):
for i in range(100):
self.df[i] = np.random.randn(self.N)
Expand Down
1 change: 1 addition & 0 deletions ci/deps/actions-39.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ dependencies:
- python-dateutil
- pytz
- s3fs>=0.4.2
- aiobotocore<=1.3.3
- scipy
- sqlalchemy
- xlrd
Expand Down
1 change: 1 addition & 0 deletions ci/deps/azure-windows-38.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ dependencies:
- python-dateutil
- pytz
- s3fs>=0.4.0
- aiobotocore<=1.3.3
- scipy
- xlrd
- xlsxwriter
Expand Down
1 change: 1 addition & 0 deletions ci/deps/azure-windows-39.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ dependencies:
- python-dateutil
- pytz
- s3fs>=0.4.2
- aiobotocore<=1.3.3
- scipy
- sqlalchemy
- xlrd
Expand Down
6 changes: 6 additions & 0 deletions doc/source/getting_started/comparison/comparison_with_sql.rst
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,12 @@ default, :meth:`~pandas.DataFrame.join` will join the DataFrames on their indice
parameters allowing you to specify the type of join to perform (``LEFT``, ``RIGHT``, ``INNER``,
``FULL``) or the columns to join on (column names or indices).

.. warning::

If both key columns contain rows where the key is a null value, those
rows will be matched against each other. This is different from usual SQL
join behaviour and can lead to unexpected results.

.. ipython:: python
df1 = pd.DataFrame({"key": ["A", "B", "C", "D"], "value": np.random.randn(4)})
Expand Down
27 changes: 19 additions & 8 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,21 @@ In Pandas 2.0, :class:`NumericIndex` will become the default numeric index type

See :ref:`here <advanced.numericindex>` for more.

.. _whatsnew_140.enhancements.styler:

Styler
^^^^^^

:class:`.Styler` has been further developed in 1.4.0. The following enhancements have been made:

- Styling of indexing has been added, with :meth:`.Styler.apply_index` and :meth:`.Styler.applymap_index`. These mirror the signature of the methods already used to style data values, and work with both HTML and LaTeX format (:issue:`41893`).
- :meth:`.Styler.bar` introduces additional arguments to control alignment and display (:issue:`26070`, :issue:`36419`), and it also validates the input arguments ``width`` and ``height`` (:issue:`42511`).
- :meth:`.Styler.to_latex` introduces keyword argument ``environment``, which also allows a specific "longtable" entry through a separate jinja2 template (:issue:`41866`).
- :meth:`.Styler.to_html` introduces keyword arguments ``sparse_index`` and ``sparse_columns`` (:issue:`41946`)
- Keyword argument ``level`` is added to :meth:`.Styler.hide_index` and :meth:`.Styler.hide_columns` for optionally controlling hidden levels in a MultiIndex (:issue:`25475`)

There are also bug fixes and deprecations listed below.

.. _whatsnew_140.enhancements.enhancement2:

enhancement2
Expand All @@ -75,13 +90,7 @@ Other enhancements
- :class:`DataFrameGroupBy` operations with ``as_index=False`` now correctly retain ``ExtensionDtype`` dtypes for columns being grouped on (:issue:`41373`)
- Add support for assigning values to ``by`` argument in :meth:`DataFrame.plot.hist` and :meth:`DataFrame.plot.box` (:issue:`15079`)
- :meth:`Series.sample`, :meth:`DataFrame.sample`, and :meth:`.GroupBy.sample` now accept a ``np.random.Generator`` as input to ``random_state``. A generator will be more performant, especially with ``replace=False`` (:issue:`38100`)
- Additional options added to :meth:`.Styler.bar` to control alignment and display, with keyword only arguments (:issue:`26070`, :issue:`36419`)
- :meth:`Styler.bar` now validates the input argument ``width`` and ``height`` (:issue:`42511`)
- Add keyword ``level`` to :meth:`.Styler.hide_index` and :meth:`.Styler.hide_columns` for optionally controlling hidden levels in a MultiIndex (:issue:`25475`)
- :meth:`Series.ewm`, :meth:`DataFrame.ewm`, now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview <window.overview>` for performance and functional benefits (:issue:`42273`)
- Added ``sparse_index`` and ``sparse_columns`` keyword arguments to :meth:`.Styler.to_html` (:issue:`41946`)
- Added keyword argument ``environment`` to :meth:`.Styler.to_latex` also allowing a specific "longtable" entry with a separate jinja2 template (:issue:`41866`)
- :meth:`.Styler.apply_index` and :meth:`.Styler.applymap_index` added to allow conditional styling of index and column header values for HTML and LaTeX (:issue:`41893`)
- :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`)
-

Expand Down Expand Up @@ -239,7 +248,6 @@ Categorical
- Bug in :meth:`Series.where` with ``CategoricalDtype`` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`)
- Bug in :meth:`Categorical.fillna` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`)
- Bug in :meth:`Categorical.fillna` with a tuple-like category raising ``ValueError`` instead of ``TypeError`` when filling with a non-category tuple (:issue:`41919`)
- Bug in :meth:`.Styler.copy` where ``uuid`` was not previously copied (:issue:`40675`)
-

Datetimelike
Expand Down Expand Up @@ -295,7 +303,7 @@ Indexing

Missing
^^^^^^^
-
- Bug in :meth:`DataFrame.fillna` with ``limit`` and no ``method`` ignoring ``axis='columns'`` or ``axis=1`` (:issue:`40989`)
-

MultiIndex
Expand Down Expand Up @@ -355,6 +363,9 @@ ExtensionArray
Styler
^^^^^^
- Minor bug in :class:`.Styler` where the ``uuid`` at initialization maintained a floating underscore (:issue:`43037`)
- Bug in :meth:`.Styler.to_html` where the ``Styler`` object was updated if the ``to_html`` method was called with some args (:issue:`43034`)
- Bug in :meth:`.Styler.copy` where ``uuid`` was not previously copied (:issue:`40675`)
- Bug in :meth:`.Styler.apply` where functions which returned Series objects were not correctly handled in terms of aligning their index labels (:issue:`13657`, :issue:`42014`)
-

Other
Expand Down
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ dependencies:

- pytables>=3.6.1 # pandas.read_hdf, DataFrame.to_hdf
- s3fs>=0.4.0 # file IO when using 's3://...' path
- aiobotocore<=1.3.3 # Remove when s3fs is at 2021.08.0
- fsspec>=0.7.4, <2021.6.0 # for generic remote file operations
- gcsfs>=0.6.0 # file IO when using 'gcs://...' path
- sqlalchemy # pandas.read_sql, DataFrame.to_sql
Expand Down
40 changes: 23 additions & 17 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -388,10 +388,10 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[intp_t] labels,

@cython.boundscheck(False)
@cython.wraparound(False)
def group_any_all(int8_t[::1] out,
const int8_t[::1] values,
def group_any_all(int8_t[:, ::1] out,
const int8_t[:, :] values,
const intp_t[::1] labels,
const uint8_t[::1] mask,
const uint8_t[:, :] mask,
str val_test,
bint skipna,
bint nullable) -> None:
Expand Down Expand Up @@ -426,9 +426,9 @@ def group_any_all(int8_t[::1] out,
-1 to signify a masked position in the case of a nullable input.
"""
cdef:
Py_ssize_t i, N = len(labels)
Py_ssize_t i, j, N = len(labels), K = out.shape[1]
intp_t lab
int8_t flag_val
int8_t flag_val, val

if val_test == 'all':
# Because the 'all' value of an empty iterable in Python is True we can
Expand All @@ -448,21 +448,27 @@ def group_any_all(int8_t[::1] out,
with nogil:
for i in range(N):
lab = labels[i]
if lab < 0 or (skipna and mask[i]):
if lab < 0:
continue

if nullable and mask[i]:
# Set the position as masked if `out[lab] != flag_val`, which
# would indicate True/False has not yet been seen for any/all,
# so by Kleene logic the result is currently unknown
if out[lab] != flag_val:
out[lab] = -1
continue
for j in range(K):
if skipna and mask[i, j]:
continue

if nullable and mask[i, j]:
# Set the position as masked if `out[lab] != flag_val`, which
# would indicate True/False has not yet been seen for any/all,
# so by Kleene logic the result is currently unknown
if out[lab, j] != flag_val:
out[lab, j] = -1
continue

val = values[i, j]

# If True and 'any' or False and 'all', the result is
# already determined
if values[i] == flag_val:
out[lab] = flag_val
# If True and 'any' or False and 'all', the result is
# already determined
if val == flag_val:
out[lab, j] = flag_val


# ----------------------------------------------------------------------
Expand Down
7 changes: 7 additions & 0 deletions pandas/_libs/internals.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import numpy as np
from pandas._typing import (
ArrayLike,
T,
npt,
)

from pandas import Index
Expand All @@ -25,6 +26,12 @@ def get_blkno_placements(
blknos: np.ndarray,
group: bool = ...,
) -> Iterator[tuple[int, BlockPlacement]]: ...
def update_blklocs_and_blknos(
blklocs: npt.NDArray[np.intp],
blknos: npt.NDArray[np.intp],
loc: int,
nblocks: int,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...

class BlockPlacement:
def __init__(self, val: int | slice | np.ndarray): ...
Expand Down
62 changes: 62 additions & 0 deletions pandas/_libs/internals.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,39 @@ cdef class BlockPlacement:

return self._as_slice

cpdef BlockPlacement increment_above(self, Py_ssize_t loc):
"""
Increment any entries of 'loc' or above by one.
"""
cdef:
slice nv, s = self._ensure_has_slice()
Py_ssize_t other_int, start, stop, step, l
ndarray newarr

if s is not None:
# see if we are either all-above or all-below, each of which
# have fastpaths available.

start, stop, step, l = slice_get_indices_ex(s)

if start < loc and stop <= loc:
# We are entirely below, nothing to increment
return self

if start >= loc and stop >= loc:
# We are entirely above, we can efficiently increment out slice
nv = slice(start + 1, stop + 1, step)
return BlockPlacement(nv)

if loc == 0:
# fastpath where we know everything is >= 0
newarr = self.as_array + 1
return BlockPlacement(newarr)

newarr = self.as_array.copy()
newarr[newarr >= loc] += 1
return BlockPlacement(newarr)

def tile_for_unstack(self, factor: int) -> np.ndarray:
"""
Find the new mgr_locs for the un-stacked version of a Block.
Expand Down Expand Up @@ -481,6 +514,35 @@ def get_blkno_placements(blknos, group: bool = True):
yield blkno, BlockPlacement(indexer)


cpdef update_blklocs_and_blknos(
ndarray[intp_t] blklocs, ndarray[intp_t] blknos, Py_ssize_t loc, intp_t nblocks
):
"""
Update blklocs and blknos when a new column is inserted at 'loc'.
"""
cdef:
Py_ssize_t i
cnp.npy_intp length = len(blklocs) + 1
ndarray[intp_t] new_blklocs, new_blknos

# equiv: new_blklocs = np.empty(length, dtype=np.intp)
new_blklocs = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0)
new_blknos = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0)

for i in range(loc):
new_blklocs[i] = blklocs[i]
new_blknos[i] = blknos[i]

new_blklocs[loc] = 0
new_blknos[loc] = nblocks

for i in range(loc, length - 1):
new_blklocs[i + 1] = blklocs[i]
new_blknos[i + 1] = blknos[i]

return new_blklocs, new_blknos


@cython.freelist(64)
cdef class SharedBlock:
"""
Expand Down
Loading

0 comments on commit 704c32e

Please sign in to comment.