In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Build a small demo frame: two categorical key columns (k1, k2) and two
# numeric columns filled with standard-normal draws.
# A locally seeded generator keeps the values identical across kernel restarts
# without modifying NumPy's global random state.
rng = np.random.RandomState(42)
dframe = DataFrame({'k1': ['X', 'X', 'Y', 'Y', 'Z'],
                    'k2': ['alpha', 'beta', 'alpha', 'beta', 'alpha'],
                    'dataset1': rng.randn(5),
                    'dataset2': rng.randn(5)})

# Show
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,0.759171,0.743356,X,alpha
1,0.634783,0.800429,X,beta
2,-0.357208,0.911259,Y,alpha
3,-1.910981,0.084336,Y,beta
4,1.947184,-0.933854,Z,alpha


In [3]:
# Basic groupby usage

# Take the dataset1 column and split it by the values of the k1 key column
group1 = dframe['dataset1'].groupby(by=dframe['k1'])

# Displaying the result shows a groupby object rather than computed values
group1

<pandas.core.groupby.SeriesGroupBy object at 0x0000008156757898>

In [4]:
# Aggregations on the groupby object are applied per group:
# this returns the mean of dataset1 for each k1 key
group1.mean()

k1
X    0.696977
Y   -1.134094
Z    1.947184
Name: dataset1, dtype: float64

In [6]:
# Group the whole frame by a column name and average each group.
# The numeric columns are selected explicitly: on pandas >= 2.0 mean() no longer
# drops non-numeric columns silently, so the string column k2 would raise a TypeError.
dframe.groupby('k1')[['dataset1', 'dataset2']].mean()

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,0.696977,0.771892
Y,-1.134094,0.497798
Z,1.947184,-0.933854


In [7]:
# A list of column names groups by every combination of those keys
dframe.groupby(by=['k1', 'k2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,0.759171,0.743356
X,beta,0.634783,0.800429
Y,alpha,-0.357208,0.911259
Y,beta,-1.910981,0.084336
Z,alpha,1.947184,-0.933854


In [8]:
# size() reports how many rows fall into each group
dframe.groupby('k1').size()

k1
X    2
Y    2
Z    1
dtype: int64

In [10]:
# Row counts per k2 key
dframe.groupby('k2').size()

k2
alpha    3
beta     2
dtype: int64

In [11]:
# Redisplay the full frame for reference
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,0.759171,0.743356,X,alpha
1,0.634783,0.800429,X,beta
2,-0.357208,0.911259,Y,alpha
3,-1.910981,0.084336,Y,beta
4,1.947184,-0.933854,Z,alpha


In [17]:
# A groupby object is iterable: each step yields the group key and its sub-frame

# Walk through the k1 groups and print each one
for key, subset in dframe.groupby('k1'):
    header = "This is {} group".format(key)
    print(header)
    print(subset)
    print('\n')

This is X group
   dataset1  dataset2 k1     k2
0  0.759171  0.743356  X  alpha
1  0.634783  0.800429  X   beta


This is Y group
   dataset1  dataset2 k1     k2
2 -0.357208  0.911259  Y  alpha
3 -1.910981  0.084336  Y   beta


This is Z group
   dataset1  dataset2 k1     k2
4  1.947184 -0.933854  Z  alpha




In [18]:
# With several keys, each step yields a tuple of key values plus the sub-frame
for key_pair, subset in dframe.groupby(['k1', 'k2']):
    print("K1 = %s, K2= %s" % key_pair)
    print(subset)
    print("\n")

K1 = X, K2= alpha
   dataset1  dataset2 k1     k2
0  0.759171  0.743356  X  alpha


K1 = X, K2= beta
   dataset1  dataset2 k1    k2
1  0.634783  0.800429  X  beta


K1 = Y, K2= alpha
   dataset1  dataset2 k1     k2
2 -0.357208  0.911259  Y  alpha


K1 = Y, K2= beta
   dataset1  dataset2 k1    k2
3 -1.910981  0.084336  Y  beta


K1 = Z, K2= alpha
   dataset1  dataset2 k1     k2
4  1.947184 -0.933854  Z  alpha




In [21]:
# The (key, sub-frame) pairs can be collected into a dictionary keyed by group
group_dict = {key: subset for key, subset in dframe.groupby('k1')}

# Show the group with X
group_dict['X']

Unnamed: 0,dataset1,dataset2,k1,k2
0,0.759171,0.743356,X,alpha
1,0.634783,0.800429,X,beta


In [22]:
# Show the group with Y
group_dict['Y']

Unnamed: 0,dataset1,dataset2,k1,k2
2,-0.357208,0.911259,Y,alpha
3,-1.910981,0.084336,Y,beta


In [23]:
# Columns can be partitioned as well as rows.
# DataFrame.groupby(..., axis=1) is deprecated since pandas 2.1, so the column
# labels are bucketed by dtype explicitly instead of grouping along axis 1.

# Let's create a dictionary mapping each dtype to the sub-frame of its columns
columns_by_dtype = {}
for column, dtype in dframe.dtypes.items():
    columns_by_dtype.setdefault(dtype, []).append(column)

group_dict_axis1 = {dtype: dframe[columns]
                    for dtype, columns in columns_by_dtype.items()}

# Show
group_dict_axis1

{dtype('float64'):    dataset1  dataset2
 0  0.759171  0.743356
 1  0.634783  0.800429
 2 -0.357208  0.911259
 3 -1.910981  0.084336
 4  1.947184 -0.933854, dtype('O'):   k1     k2
 0  X  alpha
 1  X   beta
 2  Y  alpha
 3  Y   beta
 4  Z  alpha}

In [25]:
# Selecting columns on a groupby object limits the aggregation to them:
# here only dataset2 is grouped, using both key columns
dataset2_group = dframe.groupby(by=['k1', 'k2'])[['dataset2']]
dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
X,alpha,0.743356
X,beta,0.800429
Y,alpha,0.911259
Y,beta,0.084336
Z,alpha,-0.933854
