In [1]:
import pandas as pd
import numpy as np

In [2]:
speeds = pd.DataFrame(
    [
        ("bird", "Falconiformes", 389.0),
        ("bird", "Psittaciformes", 24.0),
        ("mammal", "Carnivora", 80.2),
        ("mammal", "Primates", np.nan),
        ("mammal", "Carnivora", 58),
    ],
    index=["falcon", "parrot", "lion", "monkey", "leopard"],
    columns=("class", "order", "max_speed"),
)

In [3]:
grouped = speeds.groupby("class")

In [4]:

grouped = speeds.groupby(["class", "order"])

In [5]:
# On a DataFrame, we obtain a GroupBy object by calling groupby(). This method returns a pandas.api.typing.DataFrameGroupBy instance. We could naturally group by either the A or B columns, or both:

In [6]:
df = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)



In [7]:
grouped = df.groupby("A")
grouped = df.groupby("B")

grouped = df.groupby(["A", "B"])

In [8]:
# If we also have a MultiIndex on columns A and B, we can group by all the columns except the one we specify:

In [9]:
df2 = df.set_index(["A", "B"])

grouped = df2.groupby(level=df2.index.names.difference(["B"]))

grouped.sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-1.248948,-0.568995
foo,-2.592153,2.237029


In [None]:
# If the MultiIndex has names specified, these can be passed instead of the level number:

In [211]:
s.groupby(level="second").sum()

second
one   -4.834216
two   -1.658858
dtype: float64

In [None]:
# Grouping with multiple levels is supported.

In [212]:
arrays = [
    ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
    ["doo", "doo", "bee", "bee", "bop", "bop", "bop", "bop"],
    ["one", "two", "one", "two", "one", "two", "one", "two"],
]


index = pd.MultiIndex.from_arrays(arrays, names=["first", "second", "third"])

s = pd.Series(np.random.randn(8), index=index)

s

first  second  third
bar    doo     one      0.938424
               two     -1.504585
baz    bee     one     -1.238113
               two      0.064703
foo    bop     one      0.246346
               two      1.917488
qux    bop     one     -1.929041
               two      0.248576
dtype: float64

In [213]:
s.groupby(level=["first", "second"]).sum()


first  second
bar    doo      -0.566161
baz    bee      -1.173410
foo    bop       2.163833
qux    bop      -1.680465
dtype: float64

In [None]:
# Index level names may be supplied as keys.

In [214]:
s.groupby(["first", "second"]).sum()

first  second
bar    doo      -0.566161
baz    bee      -1.173410
foo    bop       2.163833
qux    bop      -1.680465
dtype: float64

In [10]:
# The above GroupBy will split the DataFrame on its index (rows). To split by columns, first do a transpose:

In [11]:
def get_letter_type(letter):
    if letter.lower() in 'aeiou':
        return 'vowel'
    else:
        return 'consonant'
grouped = df.T.groupby(get_letter_type)

In [12]:
# pandas Index objects support duplicate values. If a non-unique index is used as the group key in a groupby operation, all values for the same index value will be considered to be in one group and thus the output of aggregation functions will only contain unique index values:

In [13]:
index = [1, 2, 3, 1, 2, 3]

s = pd.Series([1, 2, 3, 10, 20, 30], index=index)
grouped = s.groupby(level=0)

In [14]:
grouped = s.groupby(level=0)
grouped.first()

1    1
2    2
3    3
dtype: int64

In [15]:
grouped.last()

1    10
2    20
3    30
dtype: int64

In [16]:
# Note that no splitting occurs until it’s needed. Creating the GroupBy object only verifies that you’ve passed a valid mapping.

In [17]:
# roupBy sorting
# By default the group keys are sorted during the groupby operation. You may however pass sort=False for potential speedups. With sort=False the order among group-keys follows the order of appearance of the keys in the original dataframe:

In [18]:
df2 = pd.DataFrame({"X": ["B", "B", "A", "A"], "Y": [1, 2, 3, 4]})

df2.groupby(["X"]).sum()
df2.groupby(["X"], sort=False).sum()

Unnamed: 0_level_0,Y
X,Unnamed: 1_level_1
B,3
A,7


In [19]:
# Note that groupby will preserve the order in which observations are sorted within each group. For example, the groups created by groupby() below are in the order they appeared in the original DataFrame:

In [20]:
df3 = pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]})

df3.groupby("X").get_group("A")

Unnamed: 0,X,Y
0,A,1
2,A,3


In [21]:
df3.groupby(["X"]).get_group(("B",))

Unnamed: 0,X,Y
1,B,4
3,B,2


In [22]:
# GroupBy dropna
# By default NA values are excluded from group keys during the groupby operation. However, in case you want to include NA values in group keys, you could pass dropna=False to achieve it.

In [23]:
df_list = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]

df_dropna = pd.DataFrame(df_list, columns=["a", "b", "c"])

df_dropna

Unnamed: 0,a,b,c
0,1,2.0,3
1,1,,4
2,2,1.0,3
3,1,2.0,2


In [24]:
# # Default ``dropna`` is set to True, which will exclude NaNs in keys
df_dropna.groupby(by=["b"], dropna=True).sum()


Unnamed: 0_level_0,a,c
b,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,2,3
2.0,2,5


In [25]:
# In order to allow NaN in keys, set ``dropna`` to False
df_dropna.groupby(by=["b"], dropna=False).sum()


Unnamed: 0_level_0,a,c
b,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,2,3
2.0,2,5
,1,4


In [26]:
# The default setting of dropna argument is True which means NA are not included in group keys.

In [27]:
# GroupBy object attributes

In [28]:
# The groups attribute is a dictionary whose keys are the computed unique groups and corresponding values are the axis labels belonging to each group. In the above example we have:

In [29]:
df.groupby("A").groups
df.T.groupby(get_letter_type).groups

{'consonant': ['B', 'C', 'D'], 'vowel': ['A']}

In [30]:
# Calling the standard Python len function on the GroupBy object returns the number of groups, which is the same as the length of the groups dictionary:

In [31]:
grouped = df.groupby(["A", "B"])
grouped.groups

{('bar', 'one'): [1], ('bar', 'three'): [3], ('bar', 'two'): [5], ('foo', 'one'): [0, 6], ('foo', 'three'): [7], ('foo', 'two'): [2, 4]}

In [32]:
# GroupBy will tab complete column names, GroupBy operations, and other attributes

In [33]:
n = 10

weight = np.random.normal(166, 20, size=n)

height = np.random.normal(60, 10, size=n)

time = pd.date_range("1/1/2000", periods=n)

gender = np.random.choice(["male", "female"], size=n)

df = pd.DataFrame(
    {"height": height, "weight": weight, "gender": gender}, index=time
)

In [34]:
gb = df.groupby("gender")
gb

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001A17C156BD0>

In [35]:
# GroupBy with MultiIndex

In [None]:
# With hierarchically-indexed data, it’s quite natural to group by one of the levels of the hierarchy.

# Let’s create a Series with a two-level MultiIndex.

In [209]:
arrays = [
    ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
    ["one", "two", "one", "two", "one", "two", "one", "two"],
]
index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"])

s = pd.Series(np.random.randn(8), index=index)

s

first  second
bar    one      -0.320626
       two       0.646731
baz    one      -1.596884
       two       0.348157
foo    one      -1.127960
       two      -1.179154
qux    one      -1.788747
       two      -1.474592
dtype: float64

In [None]:
# We can then group by one of the levels in s.

In [210]:
grouped=s.groupby(level=0)
grouped.sum()

first
bar    0.326105
baz   -1.248728
foo   -2.307114
qux   -3.263339
dtype: float64

In [36]:
# Grouping DataFrame with Index levels and columns

In [37]:
# A DataFrame may be grouped by a combination of columns and index levels. You can specify both column and index names, or use a Grouper.

# Let’s first create a DataFrame with a MultiIndex:

In [38]:
arrays = [
    ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
    ["one", "two", "one", "two", "one", "two", "one", "two"],
]


index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"])

df = pd.DataFrame({"A": [1, 1, 1, 1, 2, 2, 3, 3], "B": np.arange(8)}, index=index)

df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,0
bar,two,1,1
baz,one,1,2
baz,two,1,3
foo,one,2,4
foo,two,2,5
qux,one,3,6
qux,two,3,7


In [39]:
# Then we group df by the second index level and the A column.

In [40]:
df.groupby([pd.Grouper(level=1), "A"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,B
second,A,Unnamed: 2_level_1
one,1,2
one,2,4
one,3,6
two,1,4
two,2,5
two,3,7


In [41]:
# Index levels may also be specified by name.
df.groupby([pd.Grouper(level="second"), "A"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,B
second,A,Unnamed: 2_level_1
one,1,2
one,2,4
one,3,6
two,1,4
two,2,5
two,3,7


In [42]:
# DataFrame column selection in GroupBy

In [43]:
# Once you have created the GroupBy object from a DataFrame, you might want to do something different for each of the columns. Thus, by using [] on the GroupBy object in a similar way as the one used to get a column from a DataFrame, you can do:

In [44]:
df = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)
grouped = df.groupby(["A"])

grouped_C = grouped["C"]

grouped_D = grouped["D"]


In [45]:
# This is mainly syntactic sugar for the alternative, which is much more verbose:

In [46]:
df["C"].groupby(df["A"])

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001A17A79F4D0>

In [47]:
# Additionally, this method avoids recomputing the internal grouping information derived from the passed key.

# You can also include the grouping columns if you want to operate on them.

In [48]:
grouped[["A", "B"]].sum()

Unnamed: 0_level_0,A,B
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,barbarbar,onethreetwo
foo,foofoofoofoofoo,onetwotwoonethree


In [49]:
# Iterating through groups

In [50]:
# With the GroupBy object in hand, iterating through the grouped data is very natural and functions similarly to itertools.groupby():

In [51]:
grouped = df.groupby('A')
for name,group in grouped:
    print(name)
    print(group)

bar
     A      B         C         D
1  bar    one  0.564427 -0.119328
3  bar  three -2.184516 -4.454747
5  bar    two -0.196985 -1.744903
foo
     A      B         C         D
0  foo    one -0.251731 -0.083246
2  foo    two  0.214375  0.374121
4  foo    two -1.115284  0.894033
6  foo    one  0.238427 -0.536058
7  foo  three -0.427488  0.571318


In [52]:
# In the case of grouping by multiple keys, the group name will be a tuple:

In [53]:
for name,group in df.groupby(['A','B']):
    print(name)
    print(group)

('bar', 'one')
     A    B         C         D
1  bar  one  0.564427 -0.119328
('bar', 'three')
     A      B         C         D
3  bar  three -2.184516 -4.454747
('bar', 'two')
     A    B         C         D
5  bar  two -0.196985 -1.744903
('foo', 'one')
     A    B         C         D
0  foo  one -0.251731 -0.083246
6  foo  one  0.238427 -0.536058
('foo', 'three')
     A      B         C         D
7  foo  three -0.427488  0.571318
('foo', 'two')
     A    B         C         D
2  foo  two  0.214375  0.374121
4  foo  two -1.115284  0.894033


In [54]:
# Selecting a group

In [55]:
# A single group can be selected using DataFrameGroupBy.get_group():

In [56]:
grouped.get_group("bar")

Unnamed: 0,A,B,C,D
1,bar,one,0.564427,-0.119328
3,bar,three,-2.184516,-4.454747
5,bar,two,-0.196985,-1.744903


In [57]:
# Or for an object grouped on multiple columns:

In [58]:
df.groupby(["A", "B"]).get_group(("bar", "one"))

Unnamed: 0,A,B,C,D
1,bar,one,0.564427,-0.119328


In [59]:
# # Aggregation
# An aggregation is a GroupBy operation that reduces the dimension of the grouping object. The result of an aggregation is, or at least is treated as, a scalar value for each column in a group. For example, producing the sum of each column in a group of values.

In [60]:
animals = pd.DataFrame(
    {
        "kind": ["cat", "dog", "cat", "dog"],
        "height": [9.1, 6.0, 9.5, 34.0],
        "weight": [7.9, 7.5, 9.9, 198.0],
    }
)

In [61]:
animals.groupby("kind").sum()

Unnamed: 0_level_0,height,weight
kind,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,18.6,17.8
dog,40.0,205.5


In [62]:
# In the result, the keys of the groups appear in the index by default. They can be instead included in the columns by passing as_index=False.

In [63]:
animals.groupby("kind", as_index=False).sum()

Unnamed: 0,kind,height,weight
0,cat,18.6,17.8
1,dog,40.0,205.5


In [64]:
# Built-in aggregation methods

In [65]:
# Many common aggregations are built-in to GroupBy objects as methods. Of the methods listed below, those with a * do not have an efficient, GroupBy-specific, implementation.

In [66]:
df.groupby("A")[["C", "D"]].max()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.564427,-0.119328
foo,0.238427,0.894033


In [67]:
df.groupby("A")[["C", "D"]].mean()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.605691,-2.106326
foo,-0.26834,0.244033


In [68]:
# Another aggregation example is to compute the size of each group. This is included in GroupBy as the size method. It returns a Series whose index consists of the group names and the values are the sizes of each group.

In [69]:
grouped = df.groupby(["A", "B"])
grouped.size()

A    B    
bar  one      1
     three    1
     two      1
foo  one      2
     three    1
     two      2
dtype: int64

In [70]:
# While the DataFrameGroupBy.describe() method is not itself a reducer, it can be used to conveniently produce a collection of summary statistics about each of the groups.

In [71]:
grouped.describe().T

Unnamed: 0_level_0,A,bar,bar,bar,foo,foo,foo
Unnamed: 0_level_1,B,one,three,two,one,three,two
C,count,1.0,1.0,1.0,2.0,1.0,2.0
C,mean,0.564427,-2.184516,-0.196985,-0.006652,-0.427488,-0.450454
C,std,,,,0.346594,,0.940211
C,min,0.564427,-2.184516,-0.196985,-0.251731,-0.427488,-1.115284
C,25%,0.564427,-2.184516,-0.196985,-0.129192,-0.427488,-0.782869
C,50%,0.564427,-2.184516,-0.196985,-0.006652,-0.427488,-0.450454
C,75%,0.564427,-2.184516,-0.196985,0.115887,-0.427488,-0.118039
C,max,0.564427,-2.184516,-0.196985,0.238427,-0.427488,0.214375
D,count,1.0,1.0,1.0,2.0,1.0,2.0
D,mean,-0.119328,-4.454747,-1.744903,-0.309652,0.571318,0.634077


In [72]:
# Another aggregation example is to compute the number of unique values of each group. This is similar to the DataFrameGroupBy.value_counts() function, except that it only counts the number of unique values.

In [73]:
ll = [['foo', 1], ['foo', 2], ['foo', 2], ['bar', 1], ['bar', 1]]

df4 = pd.DataFrame(ll, columns=["A", "B"])

In [74]:
df4.groupby("A")["B"].nunique()

A
bar    1
foo    2
Name: B, dtype: int64

In [75]:
# Aggregation functions will not return the groups that you are aggregating over as named columns when as_index=True, the default. The grouped columns will be the indices of the returned object.

# Passing as_index=False will return the groups that you are aggregating over as named columns, regardless if they are named indices or columns in the inputs.

In [76]:
# The aggregate() method
# Note

# The aggregate() method can accept many different types of inputs. This section details using string aliases for various GroupBy methods; other inputs are detailed in the sections below.

In [77]:
# Any reduction method that pandas implements can be passed as a string to aggregate(). Users are encouraged to use the shorthand, agg. It will operate as if the corresponding method was called.

In [78]:
grouped = df.groupby("A")
grouped[["C","D"]].aggregate("sum")

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-1.817073,-6.318978
foo,-1.341701,1.220167


In [79]:
grouped=df.groupby(["A","B"])
grouped.agg("sum")

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.564427,-0.119328
bar,three,-2.184516,-4.454747
bar,two,-0.196985,-1.744903
foo,one,-0.013305,-0.619305
foo,three,-0.427488,0.571318
foo,two,-0.900908,1.268154


In [80]:
# The result of the aggregation will have the group names as the new index. In the case of multiple keys, the result is a MultiIndex by default. As mentioned above, this can be changed by using the as_index option:

In [81]:
grouped = df.groupby(["A", "B"], as_index=False)
grouped.agg("sum")

Unnamed: 0,A,B,C,D
0,bar,one,0.564427,-0.119328
1,bar,three,-2.184516,-4.454747
2,bar,two,-0.196985,-1.744903
3,foo,one,-0.013305,-0.619305
4,foo,three,-0.427488,0.571318
5,foo,two,-0.900908,1.268154


In [82]:
df.groupby("A", as_index=False)[["C", "D"]].agg("sum")

Unnamed: 0,A,C,D
0,bar,-1.817073,-6.318978
1,foo,-1.341701,1.220167


In [83]:
# Note that you could use the DataFrame.reset_index() DataFrame function to achieve the same result as the column names are stored in the resulting MultiIndex, although this will make an extra copy.

In [84]:
df.groupby(["A", "B"]).agg("sum").reset_index()

Unnamed: 0,A,B,C,D
0,bar,one,0.564427,-0.119328
1,bar,three,-2.184516,-4.454747
2,bar,two,-0.196985,-1.744903
3,foo,one,-0.013305,-0.619305
4,foo,three,-0.427488,0.571318
5,foo,two,-0.900908,1.268154


In [85]:
# Aggregation with User-Defined Functions

In [86]:
# Users can also provide their own User-Defined Functions (UDFs) for custom aggregations.

In [87]:
animals.groupby("kind")[["height"]].agg(lambda x: set(x))

Unnamed: 0_level_0,height
kind,Unnamed: 1_level_1
cat,"{9.1, 9.5}"
dog,"{34.0, 6.0}"


In [88]:
# The resulting dtype will reflect that of the aggregating function. If the results from different groups have different dtypes, then a common dtype will be determined in the same way as DataFrame construction.



In [89]:
animals.groupby("kind")[["height"]].agg(lambda x: x.astype(int).sum())


Unnamed: 0_level_0,height
kind,Unnamed: 1_level_1
cat,18
dog,40


In [90]:
# Applying multiple functions at once

In [91]:
grouped = df.groupby("A")
grouped["C"].agg(['sum','mean','std'])

Unnamed: 0_level_0,sum,mean,std
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,-1.817073,-0.605691,1.419314
foo,-1.341701,-0.26834,0.555145


In [92]:
# On a grouped DataFrame, you can pass a list of functions to DataFrameGroupBy.agg() to aggregate each column, which produces an aggregated result with a hierarchical column index:

In [93]:
grouped[["C", "D"]].agg(["sum", "mean", "std"])

Unnamed: 0_level_0,C,C,C,D,D,D
Unnamed: 0_level_1,sum,mean,std,sum,mean,std
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
bar,-1.817073,-0.605691,1.419314,-6.318978,-2.106326,2.190191
foo,-1.341701,-0.26834,0.555145,1.220167,0.244033,0.561737


In [94]:
# The resulting aggregations are named after the functions themselves. If you need to rename, then you can add in a chained operation for a Series like this:

In [95]:
(
    grouped["C"]
    .agg(["sum", "mean", "std"])
    .rename(columns={"sum": "foo", "mean": "bar", "std": "baz"})
)


Unnamed: 0_level_0,foo,bar,baz
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,-1.817073,-0.605691,1.419314
foo,-1.341701,-0.26834,0.555145


In [96]:
# For a grouped DataFrame, you can rename in a similar manner:

In [97]:
(
    grouped[["C", "D"]].agg(["sum", "mean", "std"]).rename(
        columns={"sum": "foo", "mean": "bar", "std": "baz"}
    )
)


Unnamed: 0_level_0,C,C,C,D,D,D
Unnamed: 0_level_1,foo,bar,baz,foo,bar,baz
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
bar,-1.817073,-0.605691,1.419314,-6.318978,-2.106326,2.190191
foo,-1.341701,-0.26834,0.555145,1.220167,0.244033,0.561737


In [98]:
# In general, the output column names should be unique, but pandas will allow you apply to the same function (or two functions with the same name) to the same column.



In [99]:
grouped["C"].agg(["sum", "sum"])

Unnamed: 0_level_0,sum,sum
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-1.817073,-1.817073
foo,-1.341701,-1.341701


In [100]:
# pandas also allows you to provide multiple lambdas. In this case, pandas will mangle the name of the (nameless) lambda functions, appending _<i> to each subsequent lambda.

In [101]:
grouped["C"].agg([lambda x: x.max() - x.min(), lambda x: x.median() - x.mean()])


Unnamed: 0_level_0,<lambda_0>,<lambda_1>
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,2.748943,0.408706
foo,1.35371,0.016609


In [102]:
# Named aggregation

In [103]:
# o support column-specific aggregation with control over the output column names, pandas accepts the special syntax in DataFrameGroupBy.agg() and SeriesGroupBy.agg(), known as “named aggregation”, where

# The keywords are the output column names

# The values are tuples whose first element is the column to select and the second element is the aggregation to apply to that column. pandas provides the NamedAgg namedtuple with the fields ['column', 'aggfunc'] to make it clearer what the arguments are. As usual, the aggregation can be a callable or a string alias.

In [104]:
animals.groupby("kind").agg(
    min_height=pd.NamedAgg(column="height", aggfunc="min"),
    max_height=pd.NamedAgg(column="height", aggfunc="max"),
    average_weight=pd.NamedAgg(column="weight", aggfunc="mean"),
)

Unnamed: 0_level_0,min_height,max_height,average_weight
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cat,9.1,9.5,8.9
dog,6.0,34.0,102.75


In [105]:
# NamedAgg is just a namedtuple. Plain tuples are allowed as well.

In [106]:
animals.groupby("kind").agg(
    min_height=("height", "min"),
    max_height=("height", "max"),
    average_weight=("weight", "mean"),
)


Unnamed: 0_level_0,min_height,max_height,average_weight
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cat,9.1,9.5,8.9
dog,6.0,34.0,102.75


In [107]:
# If the column names you want are not valid Python keywords, construct a dictionary and unpack the keyword arguments

In [108]:
animals.groupby("kind").agg(
    **{
        "total weight": pd.NamedAgg(column="weight", aggfunc="sum")
    }
)

Unnamed: 0_level_0,total weight
kind,Unnamed: 1_level_1
cat,17.8
dog,205.5


In [109]:
# When using named aggregation, additional keyword arguments are not passed through to the aggregation functions; only pairs of (column, aggfunc) should be passed as **kwargs. If your aggregation functions require additional arguments, apply them partially with functools.partial().

# Named aggregation is also valid for Series groupby aggregations. In this case there’s no column selection, so the values are just the functions.

In [110]:
animals.groupby("kind").height.agg(
    min_height="min",
    max_height="max",
)


Unnamed: 0_level_0,min_height,max_height
kind,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,9.1,9.5
dog,6.0,34.0


In [111]:
# Applying different functions to DataFrame columns. By passing a dict to aggregate you can apply a different aggregation to the columns of a DataFrame:

In [112]:
grouped.agg({"C": "sum", "D": lambda x: np.std(x, ddof=1)})

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-1.817073,2.190191
foo,-1.341701,0.561737


In [113]:
# The function names can also be strings. In order for a string to be valid it must be implemented on GroupBy:

In [114]:
grouped.agg({"C": "sum", "D": "std"})

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-1.817073,2.190191
foo,-1.341701,0.561737


In [115]:
# Transformation

In [116]:
# A transformation is a GroupBy operation whose result is indexed the same as the one being grouped. Common examples include cumsum() and diff().

In [117]:
grouped = speeds.groupby("class")["max_speed"]

grouped.cumsum()

falcon     389.0
parrot     413.0
lion        80.2
monkey       NaN
leopard    138.2
Name: max_speed, dtype: float64

In [118]:
grouped.diff()

falcon       NaN
parrot    -365.0
lion         NaN
monkey       NaN
leopard      NaN
Name: max_speed, dtype: float64

In [119]:
# Unlike aggregations, the groupings that are used to split the original object are not included in the result.

In [120]:
# Since transformations do not include the groupings that are used to split the result, the arguments as_index and sort in DataFrame.groupby() and Series.groupby() have no effect.

In [121]:
# A common use of a transformation is to add the result back into the original DataFrame.

In [122]:
result = speeds.copy()

result["cumsum"] = grouped.cumsum()

result["diff"] = grouped.diff()
result

Unnamed: 0,class,order,max_speed,cumsum,diff
falcon,bird,Falconiformes,389.0,389.0,
parrot,bird,Psittaciformes,24.0,413.0,-365.0
lion,mammal,Carnivora,80.2,80.2,
monkey,mammal,Primates,,,
leopard,mammal,Carnivora,58.0,138.2,


In [123]:
# Built-in transformation methods

In [124]:
# Method

# Description

# bfill()

# Back fill NA values within each group

# cumcount()

# Compute the cumulative count within each group

# cummax()

# Compute the cumulative max within each group

# cummin()

# Compute the cumulative min within each group

# cumprod()

# Compute the cumulative product within each group

# cumsum()

# Compute the cumulative sum within each group

# diff()

# Compute the difference between adjacent values within each group

# ffill()

# Forward fill NA values within each group

# pct_change()

# Compute the percent change between adjacent values within each group

# rank()

# Compute the rank of each value within each group

# shift()

# Shift values up or down within each group

In [125]:
# In addition, passing any built-in aggregation method as a string to transform() (see the next section) will broadcast the result across the group, producing a transformed result. If the aggregation method has an efficient implementation, this will be performant as well.

In [126]:
# The transform() method

In [127]:
# Similar to the aggregation method, the transform() method can accept string aliases to the built-in transformation methods in the previous section. It can also accept string aliases to the built-in aggregation methods. When an aggregation method is provided, the result will be broadcast across the group.

In [128]:
grouped = speeds.groupby("class")[["max_speed"]]
grouped.transform("cumsum")

Unnamed: 0,max_speed
falcon,389.0
parrot,413.0
lion,80.2
monkey,
leopard,138.2


In [129]:
grouped.transform("sum")

Unnamed: 0,max_speed
falcon,413.0
parrot,413.0
lion,138.2
monkey,138.2
leopard,138.2


In [130]:
# In addition to string aliases, the transform() method can also accept User-Defined Functions (UDFs). The UDF must:

# Return a result that is either the same size as the group chunk or broadcastable to the size of the group chunk (e.g., a scalar, grouped.transform(lambda x: x.iloc[-1])).

# Operate column-by-column on the group chunk. The transform is applied to the first group chunk using chunk.apply.

# Not perform in-place operations on the group chunk. Group chunks should be treated as immutable, and changes to a group chunk may produce unexpected results. See Mutating with User Defined Function (UDF) methods for more information.

# (Optionally) operates on all columns of the entire group chunk at once. If this is supported, a fast path is used starting from the second chunk.

In [131]:
# Built-in filtrations

In [132]:
# The following methods on GroupBy act as filtrations. All these methods have an efficient, GroupBy-specific, implementation.

In [133]:
# Users can also use transformations along with Boolean indexing to construct complex filtrations within groups. For example, suppose we are given groups of products and their volumes, and we wish to subset the data to only the largest products capturing no more than 90% of the total volume within each group.

In [134]:
product_volumes = pd.DataFrame(
    {
        "group": list("xxxxyyy"),
        "product": list("abcdefg"),
        "volume": [10, 30, 20, 15, 40, 10, 20],
    }
)

In [135]:
product_volumes = product_volumes.sort_values("volume", ascending=False)

In [136]:
grouped = product_volumes.groupby("group")["volume"]
cumpct = grouped.cumsum() / grouped.transform("sum")

In [137]:
significant_products = product_volumes[cumpct <= 0.9]
significant_products.sort_values(["group", "product"])

Unnamed: 0,group,product,volume
1,x,b,30
2,x,c,20
3,x,d,15
4,y,e,40
6,y,g,20


In [138]:
# The filter method

In [139]:
# Filtering by supplying filter with a User-Defined Function (UDF) is often less performant than using the built-in methods on GroupBy. Consider breaking up a complex operation into a chain of operations that utilize the built-in methods.

In [140]:
# he filter method takes a User-Defined Function (UDF) that, when applied to an entire group, returns either True or False. The result of the filter method is then the subset of groups for which the UDF returned True.

In [141]:
# Suppose we want to take only elements that belong to groups with a group sum greater than 2.

In [142]:
sf = pd.Series([1, 1, 2, 3, 3, 3])
sf.groupby(sf).filter(lambda x:x.sum()>2)

3    3
4    3
5    3
dtype: int64

In [143]:
# Another useful operation is filtering out elements that belong to groups with only a couple members.

In [144]:
dff = pd.DataFrame({"A": np.arange(8), "B": list("aabbbbcc")})

dff.groupby("B").filter(lambda x: len(x) > 2)


Unnamed: 0,A,B
2,2,b
3,3,b
4,4,b
5,5,b


In [145]:
# Alternatively, instead of dropping the offending groups, we can return a like-indexed objects where the groups that do not pass the filter are filled with NaNs.

In [146]:
dff.groupby("B").filter(lambda x: len(x) > 2, dropna=False)

Unnamed: 0,A,B
0,,
1,,
2,2.0,b
3,3.0,b
4,4.0,b
5,5.0,b
6,,
7,,


In [147]:
# For DataFrames with multiple columns, filters should explicitly specify a column as the filter criterion.

In [148]:
dff["C"] = np.arange(8)
dff.groupby("B").filter(lambda x: len(x["C"]) > 2)


Unnamed: 0,A,B,C
2,2,b,2
3,3,b,3
4,4,b,4
5,5,b,5


In [149]:
# Flexible apply

In [150]:
# Some operations on the grouped data might not fit into the aggregation, transformation, or filtration categories. For these, you can use the apply function

In [151]:
# ss
# apply has to try to infer from the result whether it should act as a reducer, transformer, or filter, depending on exactly what is passed to it. Thus the grouped column(s) may be included in the output or not. While it tries to intelligently guess how to behave, it can sometimes guess wrong. All of the examples in this section can be more reliably, and more efficiently, computed using other pandas functionality.

In [152]:
grouped = df.groupby("A")

grouped["C"].apply(lambda x: x.describe())

A         
bar  count    3.000000
     mean    -0.605691
     std      1.419314
     min     -2.184516
     25%     -1.190750
     50%     -0.196985
     75%      0.183721
     max      0.564427
foo  count    5.000000
     mean    -0.268340
     std      0.555145
     min     -1.115284
     25%     -0.427488
     50%     -0.251731
     75%      0.214375
     max      0.238427
Name: C, dtype: float64

In [153]:
grouped = df.groupby('A')['C']
def f(group):
    return pd.DataFrame({'original': group,
                         'demeaned': group - group.mean()})


grouped.apply(f)

Unnamed: 0_level_0,Unnamed: 1_level_0,original,demeaned
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,1,0.564427,1.170118
bar,3,-2.184516,-1.578825
bar,5,-0.196985,0.408706
foo,0,-0.251731,0.016609
foo,2,0.214375,0.482716
foo,4,-1.115284,-0.846943
foo,6,0.238427,0.506767
foo,7,-0.427488,-0.159148


In [154]:
# apply on a Series can operate on a returned value from the applied function that is itself a series, and possibly upcast the result to a DataFrame:

In [155]:
def f(x):
    return pd.Series([x, x ** 2], index=["x", "x^2"])
s = pd.Series(np.random.rand(5))

s

0    0.708603
1    0.485002
2    0.388081
3    0.357014
4    0.894827
dtype: float64

In [156]:
# Similar to The aggregate() method, the resulting dtype will 
# reflect that of the apply function. 
# If the results from different groups have different dtypes, then a common dtype will be determined in the same way as DataFrame construction.

In [157]:
# Control grouped column(s) placement with group_keys

In [158]:
# To control whether the grouped column(s) are included in the indices, you can use the argument group_keys which defaults to True. Compare

In [159]:
df.groupby("A", group_keys=True).apply(lambda x: x, include_groups=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,1,one,0.564427,-0.119328
bar,3,three,-2.184516,-4.454747
bar,5,two,-0.196985,-1.744903
foo,0,one,-0.251731,-0.083246
foo,2,two,0.214375,0.374121
foo,4,two,-1.115284,0.894033
foo,6,one,0.238427,-0.536058
foo,7,three,-0.427488,0.571318


In [160]:
df.groupby("A", group_keys=False).apply(lambda x: x, include_groups=False)

Unnamed: 0,B,C,D
0,one,-0.251731,-0.083246
1,one,0.564427,-0.119328
2,two,0.214375,0.374121
3,three,-2.184516,-4.454747
4,two,-1.115284,0.894033
5,two,-0.196985,-1.744903
6,one,0.238427,-0.536058
7,three,-0.427488,0.571318


In [161]:
# Numba Accelerated Routines

In [162]:
# If Numba is installed as an optional dependency, the transform and aggregate methods support engine='numba' and engine_kwargs arguments. See enhancing performance with Numba for general usage of the arguments and performance considerations.

# The function signature must start with values, index exactly as the data belonging to each group will be passed into values, and the group index will be passed into index.

In [163]:
# Suppose we wish to compute the standard deviation grouped by the A column. There is a slight problem, namely that we don’t care about the data in column B because it is not numeric. You can avoid non-numeric columns by specifying numeric_only=True

In [164]:
df.groupby("A").std(numeric_only=True)

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,1.419314,2.190191
foo,0.555145,0.561737


In [165]:
# Other useful features
# Exclusion of non-numeric columns
# Again consider the example DataFrame we’ve been looking a

In [166]:
# Note that df.groupby('A').colname.std(). is more efficient than df.groupby('A').std().colname. So if the result of an aggregation function is only needed over one column (here colname), it may be filtered before applying the aggregation function.

In [167]:
from decimal import Decimal

df_dec = pd.DataFrame(
    {
        "id": [1, 2, 1, 2],
        "int_column": [1, 2, 3, 4],
        "dec_column": [
            Decimal("0.50"),
            Decimal("0.15"),
            Decimal("0.25"),
            Decimal("0.40"),
        ],
    }
)


df_dec.groupby(["id"])[["dec_column"]].sum()

Unnamed: 0_level_0,dec_column
id,Unnamed: 1_level_1
1,0.75
2,0.55


In [168]:
# Handling of (un)observed Categorical values

In [169]:
## N/A Group handling

In [170]:
# Taking the first rows of each group

In [171]:
# Just like for a DataFrame or Series you can call head and tail on a groupby:

In [172]:
df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])

g=df.groupby("A")

g.head(1)

Unnamed: 0,A,B
0,1,2
2,5,6


In [173]:
# df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])

g = df.groupby("A")

g.nth(0)
g.nth(-1)


Unnamed: 0,A,B
1,1,4
2,5,6


In [174]:
# If the nth element of a group does not exist, then no corresponding row is included in the result. In particular, if the specified n is larger than any group, the result will be an empty DataFrame.

In [175]:
g.nth(5)

Unnamed: 0,A,B


In [176]:
# If you want to select the nth not-null item, use the dropna kwarg. For a DataFrame this should be either 'any' or 'all' just like you would pass to dropna:

In [177]:
g.nth(0, dropna="any")

Unnamed: 0,A,B
0,1,2
2,5,6


In [178]:
g.nth(-1, dropna="any")
g.B.nth(0, dropna="all")

0    2
2    6
Name: B, dtype: int64

In [179]:
# You can also select multiple rows from each group by specifying multiple nth values as a list of ints.

In [180]:
business_dates = pd.date_range(start="4/1/2014", end="6/30/2014", freq="B")

df = pd.DataFrame(1, index=business_dates, columns=["a", "b"])

In [181]:
df.groupby([df.index.year, df.index.month]).nth([0, 3, -1])

Unnamed: 0,a,b
2014-04-01,1,1
2014-04-04,1,1
2014-04-30,1,1
2014-05-01,1,1
2014-05-06,1,1
2014-05-30,1,1
2014-06-02,1,1
2014-06-05,1,1
2014-06-30,1,1


In [182]:
# You may also use slices or lists of slices.

In [183]:
df.groupby([df.index.year, df.index.month]).nth[1:]
# Out[250]: 

Unnamed: 0,a,b
2014-04-02,1,1
2014-04-03,1,1
2014-04-04,1,1
2014-04-07,1,1
2014-04-08,1,1
...,...,...
2014-06-24,1,1
2014-06-25,1,1
2014-06-26,1,1
2014-06-27,1,1


In [184]:
df.groupby([df.index.year, df.index.month]).nth[1:, :-1]

Unnamed: 0,a,b
2014-04-01,1,1
2014-04-02,1,1
2014-04-03,1,1
2014-04-04,1,1
2014-04-07,1,1
...,...,...
2014-06-24,1,1
2014-06-25,1,1
2014-06-26,1,1
2014-06-27,1,1


In [185]:
# Enumerate group items
# To see the order in which each row appears within its group, use the cumcount method:

In [186]:
dfg = pd.DataFrame(list("aaabba"), columns=["A"])

dfg

Unnamed: 0,A
0,a
1,a
2,a
3,b
4,b
5,a


In [187]:
dfg.groupby("A").cumcount()

0    0
1    1
2    2
3    0
4    1
5    3
dtype: int64

In [188]:
dfg.groupby("A").cumcount(ascending=False)

0    3
1    2
2    1
3    1
4    0
5    0
dtype: int64

In [189]:
# Enumerate groups
# To see the ordering of the groups (as opposed to the order of rows within a group given by cumcount) you can use DataFrameGroupBy.ngroup().

# Note that the numbers given to the groups match the order in which the groups would be seen when iterating over the groupby object, not the order they are first observed.

In [190]:
dfg = pd.DataFrame(list("aaabba"), columns=["A"])

dfg
dfg.groupby("A").ngroup()
dfg.groupby("A").ngroup(ascending=False)

0    1
1    1
2    1
3    0
4    0
5    1
dtype: int64

In [216]:
dfg.groupby("A").cumcount()

0    0
1    1
2    0
3    0
4    1
dtype: int64

In [217]:
dfg.groupby("A").cumcount(ascending=False)

0    1
1    0
2    1
3    0
4    0
dtype: int64

In [191]:
## Plotting

In [192]:
# Groupby also works with some plotting methods. In this case, suppose we suspect that the values in column 1 are 3 times higher on average in group “B

In [193]:
Groupby also works with some plotting methods. In this case, suppose we suspect that the values in column 1 are 3 times higher on average in group “B”.    

SyntaxError: invalid character '“' (U+201C) (2855563306.py, line 1)

In [None]:
np.random.seed(1234)

df = pd.DataFrame(np.random.randn(50, 2))

df["g"] = np.random.choice(["A", "B"], size=50)

df.loc[df["g"] == "B", 1] += 3

In [None]:
# # Piping function calls
# Similar to the functionality provided by DataFrame and Series, functions that take GroupBy objects can be chained together using a pipe method to allow for a cleaner, more readable syntax. To read about .pipe in general terms, see here.

# Combining .groupby and .pipe is often useful when you need to reuse GroupBy objects.

# As an example, imagine having a DataFrame with columns for stores, products, revenue and quantity sold. We’d like to do a groupwise calculation of prices (i.e. revenue/quantity) per store and per product. We could do this in a multi-step operation, but expressing it in terms of piping can make the code more readable. First we set the data:

In [195]:
n = 1000

df = pd.DataFrame(
    {
        "Store": np.random.choice(["Store_1", "Store_2"], n),
        "Product": np.random.choice(["Product_1", "Product_2"], n),
        "Revenue": (np.random.random(n) * 50 + 10).round(2),
        "Quantity": np.random.randint(1, 10, size=n),
    }
)
df.head(2)


Unnamed: 0,Store,Product,Revenue,Quantity
0,Store_2,Product_1,48.76,5
1,Store_2,Product_1,49.62,9


In [196]:
df.groupby("g").boxplot()

KeyError: 'g'

In [None]:
# We now find the prices per store/product.

In [197]:
(
    df.groupby(["Store", "Product"])
    .pipe(lambda grp: grp.Revenue.sum() / grp.Quantity.sum())
    .unstack()
    .round(2)
)

Product,Product_1,Product_2
Store,Unnamed: 1_level_1,Unnamed: 2_level_1
Store_1,6.97,6.98
Store_2,7.15,7.17


In [None]:
# Piping can also be expressive when you want to deliver a grouped object to some arbitrary function, for example:

In [198]:
def mean(groupby):
    return groupby.mean()

In [199]:
df.groupby(["Store","Product"]).pipe(mean)

Unnamed: 0_level_0,Unnamed: 1_level_0,Revenue,Quantity
Store,Product,Unnamed: 2_level_1,Unnamed: 3_level_1
Store_1,Product_1,34.7739,4.988417
Store_1,Product_2,35.172319,5.041825
Store_2,Product_1,36.067082,5.046693
Store_2,Product_2,36.799412,5.135747


In [None]:
# Here mean takes a GroupBy object and finds the mean of the Revenue and Quantity columns respectively for each Store-Product combination. The mean function can be any function that takes in a GroupBy object; the .pipe will pass the GroupBy object as a parameter into the function you specify.

In [None]:
# Multi-column factorization
# By using DataFrameGroupBy.ngroup(), we can extract information about the groups in a way similar to factorize() (as described further in the reshaping API) but which applies naturally to multiple columns of mixed type and different sources. This can be useful as an intermediate categorical-like step in processing, when the relationships between the group rows are more important than their content, or as input to an algorithm which only accepts the integer encoding. (For more information about support in pandas for full categorical data, see the Categorical introduction and the API documentation.)

In [202]:
dfg = pd.DataFrame({"A": [1, 1, 2, 3, 2], "B": list("aaaba")})

dfg.groupby("A").ngroup()

0    0
1    0
2    1
3    2
4    1
dtype: int64

In [None]:
# Groupby by indexer to ‘resample’ data

In [None]:
# Resampling produces new hypothetical samples (resamples) from already existing observed data or from a model that generates data. These new samples are similar to the pre-existing samples.

# In order for resample to work on indices that are non-datetimelike, the following procedure can be utilized.

# In the following examples, df.index // 5 returns an integer array which is used to determine what gets selected for the groupby operation.

In [203]:
df = pd.DataFrame(np.random.randn(10, 2))

df

Unnamed: 0,0,1
0,0.312497,-0.79478
1,0.461732,-0.301024
2,1.666037,0.476523
3,-0.541154,-1.0404
4,0.547492,-0.615143
5,0.560791,0.002973
6,-0.296152,0.982263
7,0.429754,-2.044408
8,0.924316,0.819888
9,-1.139013,0.505119


In [204]:
df.index // 5

Index([0, 0, 0, 0, 0, 1, 1, 1, 1, 1], dtype='int64')

In [205]:
df.groupby(df.index // 5).std()

Unnamed: 0,0,1
0,0.78771,0.586352
1,0.820453,1.230618


In [None]:
# Returning a Series to propagate names

In [None]:
# Group DataFrame columns, compute a set of metrics and return a named Series. The Series name is used as the name for the column index. This is especially useful in conjunction with reshaping operations such as stacking, in which the column index name will be used as the name of the inserted column:

In [206]:
df = pd.DataFrame(
    {
        "a": [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2],
        "b": [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1],
        "c": [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
        "d": [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1],
    }
)

In [207]:
def compute_metrics(x):
    result = {"b_sum": x["b"].sum(), "c_mean": x["c"].mean()}
    return pd.Series(result, name="metrics")


result = df.groupby("a").apply(compute_metrics, include_groups=False)

result

metrics,b_sum,c_mean
a,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2.0,0.5
1,2.0,0.5
2,2.0,0.5


In [208]:
result.stack(future_stack=True)


a  metrics
0  b_sum      2.0
   c_mean     0.5
1  b_sum      2.0
   c_mean     0.5
2  b_sum      2.0
   c_mean     0.5
dtype: float64