Skip to content

Commit

Permalink
Merge branch 'main' into feature/44764_perf_issue_new
Browse files Browse the repository at this point in the history
  • Loading branch information
smarie committed Jan 13, 2024
2 parents ea28669 + c778746 commit a94aec0
Show file tree
Hide file tree
Showing 9 changed files with 305 additions and 314 deletions.
21 changes: 0 additions & 21 deletions ci/code_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
pandas.Series.plot.line \
pandas.Series.to_sql \
pandas.Series.to_latex \
pandas.errors.CategoricalConversionWarning \
pandas.errors.ChainedAssignmentError \
pandas.errors.ClosedFileError \
pandas.errors.DatabaseError \
pandas.errors.IndexingError \
pandas.errors.InvalidColumnName \
pandas.errors.NumExprClobberingError \
pandas.errors.PossibleDataLossError \
pandas.errors.PossiblePrecisionLoss \
pandas.errors.SettingWithCopyError \
Expand Down Expand Up @@ -109,21 +105,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
pandas.Index.rename \
pandas.Index.droplevel \
pandas.Index.isin \
pandas.CategoricalIndex.set_categories \
pandas.MultiIndex.names \
pandas.MultiIndex.droplevel \
pandas.IndexSlice \
pandas.DatetimeIndex.month_name \
pandas.DatetimeIndex.day_name \
pandas.core.window.rolling.Rolling.corr \
pandas.Grouper \
pandas.core.groupby.SeriesGroupBy.apply \
pandas.core.groupby.DataFrameGroupBy.apply \
pandas.core.groupby.SeriesGroupBy.transform \
pandas.core.groupby.SeriesGroupBy.pipe \
pandas.core.groupby.DataFrameGroupBy.pipe \
pandas.core.groupby.DataFrameGroupBy.boxplot \
pandas.core.groupby.DataFrameGroupBy.hist \
pandas.io.formats.style.Styler.map \
pandas.io.formats.style.Styler.apply_index \
pandas.io.formats.style.Styler.map_index \
Expand All @@ -141,18 +126,12 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
pandas.io.formats.style.Styler.text_gradient \
pandas.DataFrame.values \
pandas.DataFrame.groupby \
pandas.DataFrame.skew \
pandas.DataFrame.var \
pandas.DataFrame.idxmax \
pandas.DataFrame.idxmin \
pandas.DataFrame.pivot \
pandas.DataFrame.sort_values \
pandas.DataFrame.tz_convert \
pandas.DataFrame.tz_localize \
pandas.DataFrame.plot.bar \
pandas.DataFrame.plot.hexbin \
pandas.DataFrame.plot.line \
pandas.DataFrame.hist \
RET=$(($RET + $?)) ; echo $MSG "DONE"

fi
Expand Down
6 changes: 3 additions & 3 deletions pandas/_libs/tslibs/offsets.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4860,15 +4860,15 @@ cpdef to_offset(freq, bint is_period=False):

tups = zip(split[0::4], split[1::4], split[2::4])
for n, (sep, stride, name) in enumerate(tups):
if is_period is False and name in c_OFFSET_DEPR_FREQSTR:
if is_period is False and name.upper() in c_OFFSET_DEPR_FREQSTR:
warnings.warn(
f"\'{name}\' is deprecated and will be removed "
f"in a future version, please use "
f"\'{c_OFFSET_DEPR_FREQSTR.get(name)}\' instead.",
f"\'{c_OFFSET_DEPR_FREQSTR.get(name.upper())}\' instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
name = c_OFFSET_DEPR_FREQSTR[name]
name = c_OFFSET_DEPR_FREQSTR[name.upper()]
if is_period is True and name in c_REVERSE_OFFSET_DEPR_FREQSTR:
if name.startswith("Y"):
raise ValueError(
Expand Down
18 changes: 9 additions & 9 deletions pandas/errors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,7 +469,7 @@ class ChainedAssignmentError(Warning):
--------
>>> pd.options.mode.copy_on_write = True
>>> df = pd.DataFrame({'A': [1, 1, 1, 2, 2]}, columns=['A'])
>>> df["A"][0:3] = 10 # doctest: +SKIP
>>> df["A"][0:3] = 10 # doctest: +SKIP
... # ChainedAssignmentError: ...
>>> pd.options.mode.copy_on_write = False
"""
Expand Down Expand Up @@ -561,10 +561,10 @@ class NumExprClobberingError(NameError):
Examples
--------
>>> df = pd.DataFrame({'abs': [1, 1, 1]})
>>> df.query("abs > 2") # doctest: +SKIP
>>> df.query("abs > 2") # doctest: +SKIP
... # NumExprClobberingError: Variables in expression "(abs) > (2)" overlap...
>>> sin, a = 1, 2
>>> pd.eval("sin + a", engine='numexpr') # doctest: +SKIP
>>> pd.eval("sin + a", engine='numexpr') # doctest: +SKIP
... # NumExprClobberingError: Variables in expression "(sin) + (a)" overlap...
"""

Expand Down Expand Up @@ -677,9 +677,9 @@ class ClosedFileError(Exception):
Examples
--------
>>> store = pd.HDFStore('my-store', 'a') # doctest: +SKIP
>>> store.close() # doctest: +SKIP
>>> store.keys() # doctest: +SKIP
>>> store = pd.HDFStore('my-store', 'a') # doctest: +SKIP
>>> store.close() # doctest: +SKIP
>>> store.keys() # doctest: +SKIP
... # ClosedFileError: my-store file is not open!
"""

Expand Down Expand Up @@ -773,9 +773,9 @@ class CategoricalConversionWarning(Warning):
Examples
--------
>>> from pandas.io.stata import StataReader
>>> with StataReader('dta_file', chunksize=2) as reader: # doctest: +SKIP
... for i, block in enumerate(reader):
... print(i, block)
>>> with StataReader('dta_file', chunksize=2) as reader: # doctest: +SKIP
... for i, block in enumerate(reader):
... print(i, block)
... # CategoricalConversionWarning: One or more series with value labels...
"""

Expand Down
116 changes: 116 additions & 0 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,33 @@ def test_agg_apply_corner(ts, tsframe):
tm.assert_frame_equal(res, exp_df)


def test_with_na_groups(any_real_numpy_dtype):
    """Aggregation leaves out rows whose group label is NaN."""
    idx = Index(np.arange(10))
    data = Series(np.ones(10), idx, dtype=any_real_numpy_dtype)
    keys = Series(
        [np.nan, "foo", "bar", "bar", np.nan, np.nan, "bar", "bar", np.nan, "foo"],
        index=idx,
    )
    gb = data.groupby(keys)

    # Group sizes via the ``len`` builtin. The result should ideally be an
    # integer dtype, but that is deliberately not enforced (check_dtype=False).
    counts = gb.agg(len)
    tm.assert_series_equal(
        counts, Series([4, 2], index=["bar", "foo"]), check_dtype=False
    )

    # A UDF that explicitly returns a float: here the dtype is checked too.
    def float_len(x):
        return float(len(x))

    float_counts = gb.agg(float_len)
    tm.assert_series_equal(float_counts, Series([4.0, 2.0], index=["bar", "foo"]))


def test_agg_grouping_is_list_tuple(ts):
df = DataFrame(
np.random.default_rng(2).standard_normal((30, 4)),
Expand Down Expand Up @@ -1049,6 +1076,73 @@ def test_grouby_agg_loses_results_with_as_index_false_relabel_multiindex():
tm.assert_frame_equal(result, expected)


def test_groupby_as_index_agg(df):
    """Compare ``agg`` with the matching reductions for both ``as_index`` modes."""
    # --- single key, as_index=False ---
    gb = df.groupby("A", as_index=False)

    tm.assert_frame_equal(
        gb[["C", "D"]].agg("mean"),
        gb.mean(numeric_only=True),
    )

    mixed = gb.agg({"C": "mean", "D": "sum"})
    mixed_expected = gb.mean(numeric_only=True)
    mixed_expected["D"] = gb.sum()["D"]
    tm.assert_frame_equal(mixed, mixed_expected)

    # With as_index=True a renaming dict on a single column must raise.
    indexed_gb = df.groupby("A", as_index=True)
    renamer_msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=renamer_msg):
        indexed_gb["C"].agg({"Q": "sum"})

    # --- multiple keys, as_index=False ---
    gb = df.groupby(["A", "B"], as_index=False)

    tm.assert_frame_equal(gb.agg("mean"), gb.mean())

    mixed = gb.agg({"C": "mean", "D": "sum"})
    mixed_expected = gb.mean()
    mixed_expected["D"] = gb.sum()["D"]
    tm.assert_frame_equal(mixed, mixed_expected)

    # With as_index=False the same renaming dict only warns and renames C -> Q.
    renamed_expected = DataFrame(gb["C"].sum()).rename(columns={"C": "Q"})
    deprecation_msg = "Passing a dictionary to SeriesGroupBy.agg is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=deprecation_msg):
        renamed = gb["C"].agg({"Q": "sum"})
    tm.assert_frame_equal(renamed, renamed_expected)

    # GH7115 & GH8112 & GH8582
    frame = DataFrame(
        np.random.default_rng(2).integers(0, 100, (50, 3)),
        columns=["jim", "joe", "jolie"],
    )
    by = Series(np.random.default_rng(2).integers(5, 10, 50), name="jim")

    gb = frame.groupby(by)
    gb.nth(0)  # invokes set_selection_from_grouper internally

    # Applying the builtin ``sum`` must give the same frame whether or not
    # ``nth`` was called on the groupby object beforehand.
    axis_msg = "The behavior of DataFrame.sum with axis=None is deprecated"
    with tm.assert_produces_warning(
        FutureWarning, match=axis_msg, check_stacklevel=False
    ):
        applied = gb.apply(sum)
    with tm.assert_produces_warning(
        FutureWarning, match=axis_msg, check_stacklevel=False
    ):
        applied_fresh = frame.groupby(by).apply(sum)
    tm.assert_frame_equal(applied, applied_fresh)

    # as_index=False must match as_index=True followed by dropping the index.
    for attr in ["mean", "max", "count", "idxmax", "cumsum", "all"]:
        left = getattr(frame.groupby(by, as_index=False), attr)()
        right = getattr(frame.groupby(by.values, as_index=True), attr)()
        tm.assert_frame_equal(left, right.reset_index(drop=True))


@pytest.mark.parametrize(
"func", [lambda s: s.mean(), lambda s: np.mean(s), lambda s: np.nanmean(s)]
)
Expand Down Expand Up @@ -1252,6 +1346,28 @@ def test_agg_multiple_lambda(self):
tm.assert_frame_equal(result2, expected)


def test_pass_args_kwargs_duplicate_columns(tsframe, as_index):
    """``agg`` forwards extra args/kwargs to the UDF with duplicate columns."""
    # go through _aggregate_frame with self.axis == 0 and duplicate columns
    tsframe.columns = ["A", "B", "A", "C"]
    by_month = tsframe.groupby(lambda x: x.month, as_index=as_index)

    # The "grouping excluded" FutureWarning is expected only for as_index=False.
    expected_warning = FutureWarning if not as_index else None
    excluded_msg = "A grouping .* was excluded from the result"
    with tm.assert_produces_warning(expected_warning, match=excluded_msg):
        result = by_month.agg(np.percentile, 80, axis=0)

    # Reference: the 0.8 quantile of each of the two months, one row per month.
    months = tsframe.index.month
    per_month = {m: tsframe[months == m].quantile(0.8) for m in (1, 2)}
    expected = DataFrame(per_month).T
    if not as_index:
        # TODO: try to get this more consistent?
        expected.index = Index(range(2))

    tm.assert_frame_equal(result, expected)


def test_groupby_get_by_index():
# GH 33439
df = DataFrame({"A": ["S", "W", "W"], "B": [1.0, 1.0, 2.0]})
Expand Down
72 changes: 72 additions & 0 deletions pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -1602,3 +1602,75 @@ def test_builtins_apply(keys, f):
tm.assert_frame_equal(result, expected, check_dtype=False)

tm.assert_series_equal(getattr(result, fname)(axis=0), getattr(df, fname)(axis=0))


def test_inconsistent_return_type():
    """``apply`` handles UDFs that return ``None`` for some groups (GH5592).

    Groups for which the function returns ``None`` are expected to show up
    as missing values (NaN / NaT) in the combined result.
    """
    df = DataFrame(
        {
            "A": ["Tiger", "Tiger", "Tiger", "Lamb", "Lamb", "Pony", "Pony"],
            "B": Series(np.arange(7), dtype="int64"),
            "C": pd.date_range("20130101", periods=7),
        }
    )
    warn_msg = "DataFrameGroupBy.apply operated on the grouping columns"

    def apply_by_a(func):
        # Every apply on the "A" groups must emit the grouping-columns warning.
        with tm.assert_produces_warning(FutureWarning, match=warn_msg):
            return df.groupby("A").apply(func)

    def first_row(grp):
        return grp.iloc[0]

    def first_row_unless(skipped):
        # Build a UDF that yields None for the group named ``skipped``.
        def picker(grp):
            if grp.name == skipped:
                return None
            return grp.iloc[0]

        return picker

    # Consistent return type: behaves like ``first``.
    first_b = df.groupby("A").first()[["B"]]
    tm.assert_frame_equal(apply_by_a(first_row)[["B"]], first_b)

    # None for one group ("Tiger", then "Pony").
    for skipped in ("Tiger", "Pony"):
        result = apply_by_a(first_row_unless(skipped))[["B"]]
        expected = first_b.copy()
        expected.loc[skipped] = np.nan
        tm.assert_frame_equal(result, expected)

    # 5592 revisited, with datetimes
    result = apply_by_a(first_row_unless("Pony"))[["C"]]
    expected = df.groupby("A").first()[["C"]]
    expected.loc["Pony"] = pd.NaT
    tm.assert_frame_equal(result, expected)

    # scalar outputs
    def c_value_unless_pony(grp):
        if grp.name == "Pony":
            return None
        return grp.iloc[0].loc["C"]

    result = apply_by_a(c_value_unless_pony)
    expected = df.groupby("A").first()["C"].copy()
    expected.loc["Pony"] = np.nan
    expected.name = None
    tm.assert_series_equal(result, expected)

0 comments on commit a94aec0

Please sign in to comment.