In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

In [3]:
# Build a small demo DataFrame: two categorical key columns (k1, k2)
# and two columns of standard-normal random numbers.
# Seeding the global NumPy RNG makes the random values identical on every
# Restart & Run All; outputs captured from an earlier, unseeded run will
# differ until the notebook is re-executed.
np.random.seed(42)
dframe = DataFrame({'k1':['X','X','Y','Y','Z'],
                    'k2':['alpha','beta','alpha','beta','alpha'],
                    'dataset1':np.random.randn(5),
                    'dataset2':np.random.randn(5)})

In [4]:
# Display the frame that was just built
dframe

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,-1.660858,-0.040607
1,X,beta,1.088033,0.514591
2,Y,alpha,0.010146,1.471799
3,Y,beta,1.263325,-0.040476
4,Z,alpha,-0.053644,0.273648


In [5]:
# groupby on a single column (a Series) produces a SeriesGroupBy object.

# Take the dataset1 column and split it using the values of the k1 column as keys
group1 = dframe['dataset1'].groupby(by=dframe['k1'])

# The bare object only shows its repr; nothing is computed until an aggregation is called
group1

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x111601b00>

In [6]:
# Aggregations are applied per group: here, the mean of dataset1 for each k1 key
group1.mean()

k1
X   -0.286413
Y    0.636735
Z   -0.053644
Name: dataset1, dtype: float64

In [7]:
# Group keys do not have to be columns of the frame: arrays work as keys too.

# Build two label arrays, one entry per row of dframe, to use as external keys
cities, month = (
    np.array(labels)
    for labels in (['NY','LA','LA','NY','NY'],
                   ['JAN','FEB','JAN','FEB','JAN'])
)

In [8]:
# Group the dataset1 column by the two external key arrays (city first, then month)
# and take the mean of each (city, month) group
dframe['dataset1'].groupby([cities,month]).mean()

LA  FEB    1.088033
    JAN    0.010146
NY  FEB    1.263325
    JAN   -0.857251
Name: dataset1, dtype: float64

In [9]:
# Show the frame again for reference before grouping by column names
dframe

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,-1.660858,-0.040607
1,X,beta,1.088033,0.514591
2,Y,alpha,0.010146,1.471799
3,Y,beta,1.263325,-0.040476
4,Z,alpha,-0.053644,0.273648


In [10]:
# Column names can be passed directly as group keys.
# Here the column name 'k1' is the key. numeric_only=True limits the mean to the
# numeric columns: since pandas 2.0 the string column 'k2' is no longer dropped
# silently, and averaging it raises a TypeError.
dframe.groupby('k1').mean(numeric_only=True)

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,-0.286413,0.236992
Y,0.636735,0.715661
Z,-0.053644,0.273648


In [11]:
# Passing a list of column names to .groupby() groups by each combination of the keys;
# the mean is then taken over the remaining columns
dframe.groupby(['k1','k2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,-1.660858,-0.040607
X,beta,1.088033,0.514591
Y,alpha,0.010146,1.471799
Y,beta,1.263325,-0.040476
Z,alpha,-0.053644,0.273648


In [12]:
# Another useful groupby method: size() returns the number of rows in each group
dframe.groupby(['k1']).size()

k1
X    2
Y    2
Z    1
dtype: int64

In [21]:
# A GroupBy object can be iterated: every step yields (group key, sub-frame).
for name, group in dframe.groupby('k1'):
    # A single print with sep='\n' writes the header, then the rows, then a '\n'
    # string followed by print's own line ending (i.e. two blank lines).
    print("This is the %s group" % name, group, '\n', sep='\n')

This is the X group
  k1     k2  dataset1  dataset2
0  X  alpha -1.660858 -0.040607
1  X   beta  1.088033  0.514591


This is the Y group
  k1     k2  dataset1  dataset2
2  Y  alpha  0.010146  1.471799
3  Y   beta  1.263325 -0.040476


This is the Z group
  k1     k2  dataset1  dataset2
4  Z  alpha -0.053644  0.273648




In [22]:
# Grouping a whole DataFrame (rather than one column) returns a DataFrameGroupBy object
dframe.groupby('k1')

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x111649278>

In [27]:
# With several keys, each group key arrives as a tuple that can be unpacked in the loop header.
for (k1, k2), group in dframe.groupby(['k1','k2']):
    # sep='\n' puts the header, the rows and the trailing '\n' string on separate lines
    print("Key 1 = %s Key 2 = %s" % (k1, k2), group, '\n', sep='\n')

Key 1 = X Key 2 = alpha
  k1     k2  dataset1  dataset2
0  X  alpha -1.660858 -0.040607


Key 1 = X Key 2 = beta
  k1    k2  dataset1  dataset2
1  X  beta  1.088033  0.514591


Key 1 = Y Key 2 = alpha
  k1     k2  dataset1  dataset2
2  Y  alpha  0.010146  1.471799


Key 1 = Y Key 2 = beta
  k1    k2  dataset1  dataset2
3  Y  beta  1.263325 -0.040476


Key 1 = Z Key 2 = alpha
  k1     k2  dataset1  dataset2
4  Z  alpha -0.053644  0.273648




In [28]:
# Show the frame again for reference before splitting it into a dictionary
dframe

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,-1.660858,-0.040607
1,X,beta,1.088033,0.514591
2,Y,alpha,0.010146,1.471799
3,Y,beta,1.263325,-0.040476
4,Z,alpha,-0.053644,0.273648


In [29]:
# Handy trick: turn the groups into a plain dict {group key: sub-frame} for lookup by key
group_dict = {key: piece for key, piece in dframe.groupby('k1')}

In [30]:
# Inspect the dictionary: one entry per k1 value, each holding that group's rows
group_dict

{'X':   k1     k2  dataset1  dataset2
 0  X  alpha -1.660858 -0.040607
 1  X   beta  1.088033  0.514591, 'Y':   k1     k2  dataset1  dataset2
 2  Y  alpha  0.010146  1.471799
 3  Y   beta  1.263325 -0.040476, 'Z':   k1     k2  dataset1  dataset2
 4  Z  alpha -0.053644  0.273648}

In [31]:
# Look up the rows belonging to the 'X' group
group_dict['X']

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,-1.660858,-0.040607
1,X,beta,1.088033,0.514591


In [33]:
# Columns can be grouped as well as rows (a split along axis=1).

# Let's create a dictionary keyed by dtype: {dtype: DataFrame with only the columns of that dtype}.
# DataFrame.groupby(..., axis=1) is deprecated since pandas 2.1, so the columns are
# selected with an explicit boolean mask per dtype instead.
# dict.fromkeys keeps each distinct dtype once, in order of first appearance.
group_dict_axis1 = {
    dtype: dframe.loc[:, [col_dtype == dtype for col_dtype in dframe.dtypes]]
    for dtype in dict.fromkeys(dframe.dtypes)
}

In [34]:
# The columns are now partitioned by dtype (a split across columns, i.e. axis=1),
# for example numeric columns versus text columns
group_dict_axis1

{dtype('float64'):    dataset1  dataset2
 0 -1.660858 -0.040607
 1  1.088033  0.514591
 2  0.010146  1.471799
 3  1.263325 -0.040476
 4 -0.053644  0.273648, dtype('O'):   k1     k2
 0  X  alpha
 1  X   beta
 2  Y  alpha
 3  Y   beta
 4  Z  alpha}

In [35]:
# Next we'll learn how to select columns from a groupby object

# For example, to group by both keys but keep only the dataset2 column.
# Indexing with a list (double brackets) gives a DataFrameGroupBy rather than a SeriesGroupBy.
dataset2_group = dframe.groupby(['k1','k2'])[['dataset2']]
dataset2_group


<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x111d21438>

In [36]:
# Mean of dataset2 for every (k1, k2) combination
dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
X,alpha,-0.040607
X,beta,0.514591
Y,alpha,1.471799
Y,beta,-0.040476
Z,alpha,0.273648


In [37]:
# Show the full frame again for comparison with the grouped results
dframe

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,-1.660858,-0.040607
1,X,beta,1.088033,0.514591
2,Y,alpha,0.010146,1.471799
3,Y,beta,1.263325,-0.040476
4,Z,alpha,-0.053644,0.273648


In [38]:
# For comparison: the same two-key grouping without selecting a column first,
# so the mean is computed for every remaining column (dataset1 and dataset2)
dframe.groupby(['k1','k2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,-1.660858,-0.040607
X,beta,1.088033,0.514591
Y,alpha,0.010146,1.471799
Y,beta,1.263325,-0.040476
Z,alpha,-0.053644,0.273648
