In [1]:
from typing import List, Dict
import numpy as np
import pandas as pd

In [2]:
# Build the example frame used throughout this notebook.
# Both key columns deliberately contain missing values so that later cells can
# show how groupby treats nulls in group keys.
# A seeded Generator replaces the unseeded global RNG so that
# Restart Kernel -> Run All reproduces the same numbers every time.
rng = np.random.default_rng(seed=12345)
df = pd.DataFrame({'key1': ['a', 'a', None, 'b', 'b', 'a', None],
                   # nullable integer dtype keeps key2 integral despite the missing entry
                   'key2': pd.Series([1, 2, 1, 2, 1, None, 1], dtype='Int64'),
                   'data1': rng.standard_normal(7),
                   'data2': rng.standard_normal(7)})

In [3]:
# Display the frame; note the missing entries in both key1 and key2.
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-0.886677,-0.304075
1,a,2.0,-1.592214,-0.041614
2,,1.0,-2.185377,1.536094
3,b,2.0,2.155221,-0.882579
4,b,1.0,-0.62644,1.74491
5,a,,0.108844,0.356165
6,,1.0,1.44302,0.193343


In [4]:
# Compute the mean of the data1 column using the group labels in key1.
# One way: select data1 and call groupby, passing the key1 column (a Series) as the group key.
grouped = df['data1'].groupby(df['key1'])

In [5]:
# A GroupBy object: it holds the intermediate grouping information needed to
# apply an operation to each group.
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7ff66a752050>

In [6]:
# Calculate the per-group mean; rows whose key1 is missing do not appear in the result.
grouped.mean()

key1
a   -0.790016
b    0.764391
Name: data1, dtype: float64

In [7]:
# Above, the data was split on the group key and aggregated, producing a new
# Series indexed by the unique values of the key1 column.
# Repeat the same grouping for data2.
grouped2 = df['data2'].groupby(df['key1'])

In [8]:
# Same kind of SeriesGroupBy object as for data1.
grouped2

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7ff66a7e4400>

In [9]:
# Per-group mean of data2.
grouped2.mean()

key1
a    0.003492
b    0.431166
Name: data2, dtype: float64

In [10]:
# Pass several arrays as a list to group by more than one key.
means = df['data1'].groupby([df['key1'], df['key2']]).mean()

In [11]:
means  # now has a hierarchical index made of the (key1, key2) pairs

key1  key2
a     1      -0.886677
      2      -1.592214
b     1      -0.626440
      2       2.155221
Name: data1, dtype: float64

In [12]:
# Inspect the two-level index (key1, key2).
means.index

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           names=['key1', 'key2'])

In [13]:
# Pivot the inner index level (key2) into columns.
means.unstack()

key2,1,2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.886677,-1.592214
b,-0.62644,2.155221


In [14]:
# The group keys used so far were Series, but any array works as a key.
# The only limitation: its length must equal the length of the data being grouped.
states = np.array([
    'OH', 'CA', 'CA', 'OH',
    'OH', 'CA', 'OH',
])

In [15]:
# One year label per row of df, given as a plain Python list.
years: List[int] = [
    2005, 2005, 2006, 2005,
    2006, 2005, 2006,
]

In [16]:
# Group data1 by the external arrays: state is the outer index level, year the inner one.
df['data1'].groupby([states, years]).mean()

CA  2005   -0.741685
    2006   -2.185377
OH  2005    0.634272
    2006    0.408290
Name: data1, dtype: float64

In [17]:
# Swapping the key order swaps the index levels; the group means are the same.
df['data1'].groupby([years, states]).mean()

2005  CA   -0.741685
      OH    0.634272
2006  CA   -2.185377
      OH    0.408290
Name: data1, dtype: float64

In [18]:
# NOTE(review): this cell repeats the previous cell verbatim — candidate for removal.
df['data1'].groupby([years, states]).mean()

2005  CA   -0.741685
      OH    0.634272
2006  CA   -2.185377
      OH    0.408290
Name: data1, dtype: float64

In [19]:
# Frequently the grouping information lives in the same DataFrame as the data.
# In that case, pass the column name (or a list of column names) as the group key.
df.groupby('key1').mean()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1.5,-0.790016,0.003492
b,1.5,0.764391,0.431166


In [20]:
# key1 is absent from the result: it is non-numeric (a "nuisance column"),
# so numeric_only=True leaves it out of the mean.
df.groupby('key2').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-0.563868,0.792568
2,0.281504,-0.462096


In [21]:
# Multiple column names as keys: the result is indexed by (key1, key2).
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,-0.886677,-0.304075
a,2,-1.592214,-0.041614
b,1,-0.62644,1.74491
b,2,2.155221,-0.882579


In [22]:
# Another important GroupBy method is size(), which returns the number of rows in each group.
df.groupby(['key1', 'key2']).size()

key1  key2
a     1       1
      2       1
b     1       1
      2       1
dtype: int64

In [23]:
# Note: rows with a missing value in a group key are excluded from the result by default.
# To keep them as their own group, pass dropna=False to groupby.
df.groupby('key1', dropna=False).size()

key1
a      3
b      2
NaN    2
dtype: int64

In [24]:
# With several keys, combinations that involve a missing key are kept as well.
df.groupby(['key1', 'key2'], dropna=False).size()

key1  key2
a     1       1
      2       1
      <NA>    1
b     1       1
      2       1
NaN   1       2
dtype: int64

In [25]:
# count() computes the number of non-null values in each group, column by column
# (unlike size(), which counts rows).
df.groupby('key1').count()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,2,3,3
b,2,2,2


In [26]:
# Iterating over groups.
# The object returned by groupby supports iteration, yielding a sequence of 2-tuples
# that contain the group name along with the chunk of data.
# Consider the following:
for name, group in df.groupby('key1'):
    print(name)
    print(group)

a
  key1  key2     data1     data2
0    a     1 -0.886677 -0.304075
1    a     2 -1.592214 -0.041614
5    a  <NA>  0.108844  0.356165
b
  key1  key2     data1     data2
3    b     2  2.155221 -0.882579
4    b     1 -0.626440  1.744910


In [27]:
# With multiple keys, the first element of each yielded tuple is itself a tuple of key values:
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)

('a', 1)
  key1  key2     data1     data2
0    a     1 -0.886677 -0.304075
('a', 2)
  key1  key2     data1     data2
1    a     2 -1.592214 -0.041614
('b', 1)
  key1  key2    data1    data2
4    b     1 -0.62644  1.74491
('b', 2)
  key1  key2     data1     data2
3    b     2  2.155221 -0.882579


In [28]:
# Once grouped, we can do whatever we like with the pieces of data.
# E.g. collect them in a dictionary keyed by group name, still as a one-liner.
pieces = dict(list(df.groupby('key1')))
pieces['b']

Unnamed: 0,key1,key2,data1,data2
3,b,2,2.155221,-0.882579
4,b,1,-0.62644,1.74491


In [29]:
# By default groupby groups on axis='index' (rows), but we can group on the other axis too.
# Here a dict maps each column label to a group name, so the columns are split into
# a 'key' group and a 'data' group.
# NOTE(review): the `axis` argument of groupby is reported as deprecated in recent
# pandas releases — confirm against the installed version before relying on it.
# NOTE(review): this rebinds `grouped`, which earlier held the data1 SeriesGroupBy.
grouped = df.groupby({'key1': 'key', 'key2': 'key',
                      'data1': 'data', 'data2': 'data'}, axis='columns')

In [30]:
# This time the object is a DataFrameGroupBy.
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7ff66a81cca0>

In [31]:
# Print each column group: the key is the group name from the mapping,
# the value is the sub-frame holding that group's columns.
for group_key, group_values in grouped:
    print(group_key)
    print(group_values)

data
      data1     data2
0 -0.886677 -0.304075
1 -1.592214 -0.041614
2 -2.185377  1.536094
3  2.155221 -0.882579
4 -0.626440  1.744910
5  0.108844  0.356165
6  1.443020  0.193343
key
   key1  key2
0     a     1
1     a     2
2  None     1
3     b     2
4     b     1
5     a  <NA>
6  None     1


In [32]:
# Selecting a column or subset of columns.
# Indexing a GroupBy object with a column name or a list of column names
# selects those columns for aggregation, so
#     df.groupby('key1')['data1']
#     df.groupby('key1')[['data2']]
# are conveniences for
#     df['data1'].groupby(df['key1'])
#     df[['data2']].groupby(df['key1'])
# Reason: on large datasets we may want to aggregate only a few columns.
# Compute the means for the 'data2' column only:
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,1,-0.304075
a,2,-0.041614
b,1,1.74491
b,2,-0.882579


In [33]:
# The object returned by this indexing operation is:
# 1) a grouped DataFrame if we pass a list or array of column names
# 2) a grouped Series if we pass a single column name
s_grouped = df.groupby(['key1', 'key2'])['data2']
s_grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7ff66a81e1d0>

In [34]:
# Same numbers as the [['data2']] selection, but returned as a Series.
s_grouped.mean()

key1  key2
a     1      -0.304075
      2      -0.041614
b     1       1.744910
      2      -0.882579
Name: data2, dtype: float64

In [35]:
# Grouping with dictionaries and Series.
# Example frame: five people (rows) by five columns of standard-normal draws.
# The generator is seeded so that re-running the notebook reproduces the same values.
people_rng = np.random.default_rng(seed=2024)
people = pd.DataFrame(people_rng.standard_normal((5, 5)),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steven', 'Wanda', 'Jill', 'Trey'])
people

Unnamed: 0,a,b,c,d,e
Joe,0.108992,-0.441321,0.964288,0.266299,-0.133302
Steven,-0.708168,-0.378783,0.410194,1.125199,-0.25627
Wanda,0.077001,0.368164,-0.615041,0.576952,-0.716069
Jill,1.571196,-0.073099,-1.983265,-0.327292,1.506435
Trey,0.660479,0.532839,0.30019,0.443682,-0.414753


In [36]:
# Add a few NA values: blank out columns 'b' and 'c' for the row at position 2 (Wanda).
# This mutates people in place.
people.iloc[2:3, [1, 2]] = np.nan

In [37]:
# Confirm the missing values in Wanda's row.
people

Unnamed: 0,a,b,c,d,e
Joe,0.108992,-0.441321,0.964288,0.266299,-0.133302
Steven,-0.708168,-0.378783,0.410194,1.125199,-0.25627
Wanda,0.077001,,,0.576952,-0.716069
Jill,1.571196,-0.073099,-1.983265,-0.327292,1.506435
Trey,0.660479,0.532839,0.30019,0.443682,-0.414753


In [38]:
# Suppose we have a group correspondence for the columns and want to sum the
# columns together by group.
# 'f' is not a column of people; the extra key is simply unused when grouping.
mapping: Dict[str, str] = {
    'a': 'red',
    'b': 'red',
    'c': 'blue',
    'd': 'blue',
    'e': 'red',
    'f': 'orange',
}

In [45]:
# We could construct an array from this dictionary to pass to groupby,
# but here we just pass the dictionary itself.
# NOTE(review): the `axis` argument of groupby is reported as deprecated in recent
# pandas releases — confirm against the installed version before relying on it.
by_column = people.groupby(mapping, axis='columns')

In [46]:
# Sum the columns within each colour group, row by row.
by_column.sum()

Unnamed: 0,blue,red
Joe,1.230588,-0.465631
Steven,1.535393,-1.343221
Wanda,0.576952,-0.639068
Jill,-2.310557,3.004532
Trey,0.743872,0.778566


In [47]:
# Same as above, but with the axis given as a number instead of as 'columns'.
by_column2 = people.groupby(mapping, axis=1)

In [48]:
# Gives the same table as by_column.sum().
by_column2.sum()

Unnamed: 0,blue,red
Joe,1.230588,-0.465631
Steven,1.535393,-1.343221
Wanda,0.576952,-0.639068
Jill,-2.310557,3.004532
Trey,0.743872,0.778566


In [49]:
# The same functionality holds for pd.Series objects, which can be viewed as a
# fixed-size mapping from index label to value.
map_series = pd.Series(mapping)

In [50]:
# The index holds the column labels; the values hold the group names.
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [51]:
# Pass the Series to groupby and count the non-null values in each group, per row.
people.groupby(map_series, axis='columns').count()

Unnamed: 0,blue,red
Joe,2,3
Steven,2,3
Wanda,1,2
Jill,2,3
Trey,2,3


In [55]:
# Same as above, with the axis given as a number.
# Note: if we left out the axis (i.e. used the default, axis=0), the result would be an
# empty DataFrame — presumably because none of the row labels of people appear in the
# index of map_series.
people.groupby(map_series, axis=1).count()

Unnamed: 0,blue,red
Joe,2,3
Steven,2,3
Wanda,1,2
Jill,2,3
Trey,2,3


In [56]:
# Grouping with functions.
# A function passed as the group key is applied to the index values, and its return
# values are used as the group names.
# Example: len groups the people by the length of their names.
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,0.108992,-0.441321,0.964288,0.266299,-0.133302
4,2.231675,0.45974,-1.683075,0.116389,1.091682
5,0.077001,0.0,0.0,0.576952,-0.716069
6,-0.708168,-0.378783,0.410194,1.125199,-0.25627


In [None]:
#We can use arrays, dictionaries or Series. Pandas converts these into 