In [1]:
from typing import List, Dict
import numpy as np
import pandas as pd

In [2]:
#Toy frame for the groupby examples: two key columns (each containing missing
#entries) plus two columns of standard-normal draws.
df = pd.DataFrame(dict(
    key1=['a', 'a', None, 'b', 'b', 'a', None],
    key2=pd.Series([1, 2, 1, 2, 1, None, 1], dtype='Int64'),  #nullable integer dtype
    data1=np.random.standard_normal(7),
    data2=np.random.standard_normal(7),
))

In [3]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,0.43456,-0.238092
1,a,2.0,-1.605228,2.514051
2,,1.0,0.348471,-2.277146
3,b,2.0,-0.108427,-0.169864
4,b,1.0,-0.976403,-0.918126
5,a,,-0.584094,1.704576
6,,1.0,1.368047,-0.219631


In [4]:
#Goal: compute the mean of the data1 column within each label found in key1.
#One way: select data1 and call groupby on it, passing the key1 column (a Series) as the group key.
grouped = df['data1'].groupby(df['key1'])

In [5]:
#GroupBy object: contains intermediate data about the information needed to apply the operations
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f8cd0f68940>

In [6]:
#calculate the mean
grouped.mean()

key1
a   -0.584920
b   -0.542415
Name: data1, dtype: float64

In [7]:
#Above, the data was split on the group key, producing a new Series indexed by the values in the key1 column.
#The same grouping, applied to data2 this time:
grouped2 = df['data2'].groupby(df['key1'])

In [8]:
grouped2

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f8cd0f69d20>

In [9]:
grouped2.mean()

key1
a    1.326845
b   -0.543995
Name: data2, dtype: float64

In [10]:
#Several group keys can be passed at once as a list (here two Series: key1 and key2):
means = df['data1'].groupby([df['key1'], df['key2']]).mean()

In [11]:
means  #the result now has a hierarchical index (one level per group key)

key1  key2
a     1       0.434560
      2      -1.605228
b     1      -0.976403
      2      -0.108427
Name: data1, dtype: float64

In [12]:
means.index

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           names=['key1', 'key2'])

In [13]:
means.unstack()

key2,1,2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.43456,-1.605228
b,-0.976403,-0.108427


In [14]:
#In the examples above the group keys were Series, but any array can be used.
#The only limitation: every key must have the same length as the data being grouped (7 rows here).
states = np.array(['OH', 'CA', 'CA', 'OH', 'OH', 'CA', 'OH'])

In [15]:
#one year label per row of df, used below as a group key
years: List[int] = [2005, 2005, 2006, 2005, 2006, 2005, 2006]

In [16]:
#group data1 by (state, year); the plain arrays are matched to the rows of df by position
df['data1'].groupby([states, years]).mean()

CA  2005   -1.094661
    2006    0.348471
OH  2005    0.163067
    2006    0.195822
Name: data1, dtype: float64

In [17]:
#swapping the order of the keys swaps the levels of the resulting index
df['data1'].groupby([years, states]).mean()

2005  CA   -1.094661
      OH    0.163067
2006  CA    0.348471
      OH    0.195822
Name: data1, dtype: float64

In [18]:
#NOTE(review): this cell repeats the previous cell verbatim — presumably a leftover; confirm and remove
df['data1'].groupby([years, states]).mean()

2005  CA   -1.094661
      OH    0.163067
2006  CA    0.348471
      OH    0.195822
Name: data1, dtype: float64

In [19]:
#Frequently the grouping information lives in the same DataFrame we are working with.
#In that case we can pass the column name(s) as the group key(s) instead of the columns themselves.
df.groupby('key1').mean()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1.5,-0.58492,1.326845
b,1.5,-0.542415,-0.543995


In [20]:
#key1 is absent from the result: it is non-numeric (a "nuisance column"),
#and numeric_only=True restricts the aggregation to numeric columns
df.groupby('key2').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.293669,-0.913249
2,-0.856827,1.172094


In [21]:
#multiple columns as keys
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0.43456,-0.238092
a,2,-1.605228,2.514051
b,1,-0.976403,-0.918126
b,2,-0.108427,-0.169864


In [22]:
#another important method is .size()
df.groupby(['key1', 'key2']).size()

key1  key2
a     1       1
      2       1
b     1       1
      2       1
dtype: int64

In [23]:
#note: any missing values in a group key are excluded from the result by default.
#how to disable: pass dropna=False to groupby
df.groupby('key1', dropna=False).size()

key1
a      3
b      2
NaN    2
dtype: int64

In [24]:
df.groupby(['key1', 'key2'], dropna=False).size()

key1  key2
a     1       1
      2       1
      <NA>    1
b     1       1
      2       1
NaN   1       2
dtype: int64

In [25]:
#count computes the number of nonnull values in each group
df.groupby('key1').count()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,2,3,3
b,2,2,2


In [26]:
#Iterating over groups:
#the object returned by groupby can be iterated; each step yields a 2-tuple of
#(group name, chunk of the data belonging to that group)
for group_name, chunk in df.groupby('key1'):
    print(group_name)
    print(chunk)

a
  key1  key2     data1     data2
0    a     1  0.434560 -0.238092
1    a     2 -1.605228  2.514051
5    a  <NA> -0.584094  1.704576
b
  key1  key2     data1     data2
3    b     2 -0.108427 -0.169864
4    b     1 -0.976403 -0.918126


In [27]:
#with multiple keys, the first element of each yielded pair is a tuple of the key values:
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)

('a', 1)
  key1  key2    data1     data2
0    a     1  0.43456 -0.238092
('a', 2)
  key1  key2     data1     data2
1    a     2 -1.605228  2.514051
('b', 1)
  key1  key2     data1     data2
4    b     1 -0.976403 -0.918126
('b', 2)
  key1  key2     data1     data2
3    b     2 -0.108427 -0.169864


In [28]:
#The groups can be put to any use we like,
#e.g. collected into a dictionary that maps each key1 value to its piece of the data
pieces = dict(list(df.groupby('key1')))
pieces['b']

Unnamed: 0,key1,key2,data1,data2
3,b,2,-0.108427,-0.169864
4,b,1,-0.976403,-0.918126


In [29]:
#Grouping can be done along either axis; axis='index' (rows) is the default.
#Here the dict maps each column name to a group label, and the columns are what get grouped.
#NOTE(review): passing axis='columns' (or axis=1) to groupby is deprecated since pandas 2.1
#(see the pandas 2.1.0 release notes) — confirm the target pandas version before reusing this.
#NOTE: this rebinds `grouped`, which earlier held the data1-by-key1 grouping.
grouped = df.groupby({'key1': 'key', 'key2': 'key',
                     'data1': 'data', 'data2': 'data'}, axis='columns')

In [30]:
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f8cb0500e20>

In [31]:
#we can print out the groups like this:
for group_key, group_values in grouped:
    print(group_key)
    print(group_values)

data
      data1     data2
0  0.434560 -0.238092
1 -1.605228  2.514051
2  0.348471 -2.277146
3 -0.108427 -0.169864
4 -0.976403 -0.918126
5 -0.584094  1.704576
6  1.368047 -0.219631
key
   key1  key2
0     a     1
1     a     2
2  None     1
3     b     2
4     b     1
5     a  <NA>
6  None     1


In [32]:
#Selecting a column or a subset of columns.
#Indexing a GroupBy object created from a DataFrame with a column name or a list of names:
#    df.groupby('key1')['data1']
#    df.groupby('key1')[['data2']]
#is a convenience for:
#    df['data1'].groupby(df['key1'])
#    df[['data2']].groupby(df['key1'])
#Reason: on large datasets we may want to aggregate only a few of the columns.
#Compute the means for the 'data2' column only:
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,1,-0.238092
a,2,2.514051
b,1,-0.918126
b,2,-0.169864


In [33]:
#object returned by this indexing option:
#1) if we pass a list or series we get a grouped DataFrame 
#2) if we pass a single column, we get a grouped Series
s_grouped = df.groupby(['key1', 'key2'])['data2']
s_grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f8cb0501030>

In [34]:
s_grouped.mean()

key1  key2
a     1      -0.238092
      2       2.514051
b     1      -0.918126
      2      -0.169864
Name: data2, dtype: float64

In [120]:
#Five named rows by five lettered columns of standard-normal draws
people = pd.DataFrame(
    np.random.standard_normal((5, 5)),
    index=['Joe', 'Steve', 'Wanda', 'Jill', 'Trey'],
    columns=list('abcde'),
)
#blank out Wanda's 'b' and 'c' entries so the examples include missing values
people.loc['Wanda', ['b', 'c']] = np.nan

In [121]:
people

Unnamed: 0,a,b,c,d,e
Joe,-0.316701,0.027516,-0.636513,1.505289,-0.148749
Steve,-0.587987,0.644714,-0.650687,-0.586897,-0.049459
Wanda,-0.154651,,,0.878803,0.962772
Jill,-0.913367,-1.229924,-0.100297,0.151851,1.600257
Trey,0.467022,0.142363,-1.233588,0.520544,-1.627771


In [122]:
#Suppose we have a correspondence from column names to groups
#and we want to sum the columns group by group
mapping: Dict[str, str] = dict(a='red', b='red', c='blue',
                               d='blue', e='red', f='orange')

In [123]:
#We could build an array of group labels from this dictionary, but groupby accepts the dictionary itself.
#The unused key 'f' is harmless: no 'orange' group appears in the results below.
#NOTE(review): axis='columns' in groupby is deprecated since pandas 2.1; the documented
#replacement is to group the transposed frame (people.T.groupby(mapping)) — confirm target pandas version.
by_column = people.groupby(mapping, axis='columns')

In [124]:
by_column.sum()

Unnamed: 0,blue,red
Joe,0.868776,-0.437935
Steve,-1.237583,0.007267
Wanda,0.878803,0.808121
Jill,0.051554,-0.543034
Trey,-0.713045,-1.018386


In [125]:
#same functionality is true for Series, which we can view as a fixed-size mapping
map_series = pd.Series(mapping)


In [126]:
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [127]:
#Group the columns by the labels in map_series and count the non-null values per row.
#groupby(..., axis='columns') is deprecated since pandas 2.1, so group the transposed
#frame by its index and transpose the result back; the resulting table is the same.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wanda,1,2
Jill,2,3
Trey,2,3


In [128]:
#Grouping with functions:
#a function passed as the group key is applied to the index values and its return values
#become the group names — here len groups the rows by the length of each person's name,
#which is easier than computing an array of string lengths first
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-0.316701,0.027516,-0.636513,1.505289,-0.148749
4,-0.446345,-1.087561,-1.333885,0.672395,-0.027514
5,-0.742638,0.644714,-0.650687,0.291907,0.913312


In [129]:
#mixing functions with other data structures is okay because these all get converted to arrays
key_list: List[str] = ['one', 'one', 'one', 'two', 'two']

In [130]:
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.316701,0.027516,-0.636513,1.505289,-0.148749
4,two,-0.913367,-1.229924,-1.233588,0.151851,-1.627771
5,one,-0.587987,0.644714,-0.650687,-0.586897,-0.049459


In [132]:
#Grouping by index levels: a dataset can be aggregated using one of the levels of its MultiIndex.
#Build a two-level index: first level named 'cty', second level named 'tenor'.
cty_labels = ['US', 'US', 'US', 'JP', 'JP']
tenor_labels = [1, 3, 5, 1, 3]
columns = pd.MultiIndex.from_arrays([cty_labels, tenor_labels],
                                    names=['cty', 'tenor'])

In [133]:
columns

MultiIndex([('US', 1),
            ('US', 3),
            ('US', 5),
            ('JP', 1),
            ('JP', 3)],
           names=['cty', 'tenor'])

In [134]:
heir_df = pd.DataFrame(np.random.standard_normal((4, 5)),
                      columns=columns)

In [135]:
heir_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.285551,0.55978,0.227622,0.553688,1.560576
1,0.34954,-0.811207,0.305851,0.266787,1.151428
2,-0.55015,0.971595,-0.796299,1.343774,-0.254421
3,-0.98694,0.146472,-0.300453,0.317625,1.669011


In [139]:
#To group by an index level, pass the level number or name using the "level" keyword.
#The level lives on the columns here; groupby(..., axis='columns') is deprecated since
#pandas 2.1, so group the transposed frame and transpose the result back.
heir_df.T.groupby(level='cty').count().T

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


In [140]:
#same as above: axis=1 is the integer spelling of axis='columns'
#NOTE(review): both spellings of the axis argument are deprecated in groupby since pandas 2.1 — confirm target pandas version
heir_df.groupby(level='cty', axis=1).count()

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


In [142]:
#Same result again, this time naming the level by position (0) instead of by name.
#The transposed frame is grouped because groupby's axis argument is deprecated since
#pandas 2.1; transposing the result back gives the same table as before.
heir_df.T.groupby(level=0).count().T

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


In [143]:
#Data Aggregation: Aggregation is a data transformation that produces scalar values from arrays.
#mean, count, min, and sum are all examples of aggregations.
#we will use many aggregations of data at edgemont
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,0.43456,-0.238092
1,a,2.0,-1.605228,2.514051
2,,1.0,0.348471,-2.277146
3,b,2.0,-0.108427,-0.169864
4,b,1.0,-0.976403,-0.918126
5,a,,-0.584094,1.704576
6,,1.0,1.368047,-0.219631
