# Working with Data

## Groupby on Dataframe

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame,Series

In [2]:
import webbrowser
web = 'https://github.com/jmportilla/Udemy-notes/blob/master/Lec%2042%20-%20GroupBy%20on%20DataFrames.ipynb'
webbrowser.open(web)

True

In [7]:
# Create a Dataframe
dframe = DataFrame({'k1':['X','X','Y','Y','Z'],
                    'k2':['alpha','beta','alpha','beta','alpha'],
                    'dataset1':np.random.randint(0,20,size = 5),
                    'dataset2':np.random.randint(0,20,size = 5)})
dframe

Unnamed: 0,k1,k2,dataset1,dataset2
0,X,alpha,5,15
1,X,beta,3,12
2,Y,alpha,19,3
3,Y,beta,7,16
4,Z,alpha,7,13


In [8]:
# Now let's see how to use groupby

# Lets grab the dataset1 column and group it by the k1 key
group1 = dframe['dataset1'].groupby(dframe['k1'])

# show the groupby object
group1

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x00000000088614E0>

In [11]:
print(group1.sum())
print(group1.mean())

k1
X     8
Y    26
Z     7
Name: dataset1, dtype: int32
k1
X     4
Y    13
Z     7
Name: dataset1, dtype: int32


In [15]:
# We can use group keys that are series as well

# For example:

# We'll make some arrays for use as keys
cities = np.array(['NY','LA','LA','NY','NY'])
month = np.array(['JAN','FEB','JAN','FEB','JAN'])

# Now using the data from dataset1, group the means by city and month
dframe['dataset1'].groupby([cities,month]).sum()

LA  FEB     3
    JAN    19
NY  FEB     7
    JAN    12
Name: dataset1, dtype: int32

In [16]:
# WE can also pass column names as group keys
dframe.groupby('k1').mean()

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,4.0,13.5
Y,13.0,9.5
Z,7.0,13.0


In [17]:
# Or multiple column names
dframe.groupby(['k1','k2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,5,15
X,beta,3,12
Y,alpha,19,3
Y,beta,7,16
Z,alpha,7,13


In [24]:
# Another useful groupby method is getting the group sizes
dframe.groupby('k2').size()

k2
alpha    3
beta     2
dtype: int64

In [23]:
# We can also iterate over groups
a = iter(dframe.groupby('k1'))
next(a)

('X',   k1     k2  dataset1  dataset2
 0  X  alpha         5        15
 1  X   beta         3        12)

In [27]:
for name,group in dframe.groupby('k1'):
    print('This iteration is {}'.format(name))
    print(group)
    print('\n')

This iteration is X
  k1     k2  dataset1  dataset2
0  X  alpha         5        15
1  X   beta         3        12


This iteration is Y
  k1     k2  dataset1  dataset2
2  Y  alpha        19         3
3  Y   beta         7        16


This iteration is Z
  k1     k2  dataset1  dataset2
4  Z  alpha         7        13




In [34]:
# We can also iterate with multiple keys
for (k1,k2) , group in dframe.groupby(['k1','k2']):
    print('1:%s , 2:%s'%(k1,k2))
    print(group)
    print('\n')

1:X , 2:alpha
  k1     k2  dataset1  dataset2
0  X  alpha         5        15


1:X , 2:beta
  k1    k2  dataset1  dataset2
1  X  beta         3        12


1:Y , 2:alpha
  k1     k2  dataset1  dataset2
2  Y  alpha        19         3


1:Y , 2:beta
  k1    k2  dataset1  dataset2
3  Y  beta         7        16


1:Z , 2:alpha
  k1     k2  dataset1  dataset2
4  Z  alpha         7        13




In [41]:
# we can also list 
list(dframe.groupby('k1'))

[('X',   k1     k2  dataset1  dataset2
  0  X  alpha         5        15
  1  X   beta         3        12), ('Y',   k1     k2  dataset1  dataset2
  2  Y  alpha        19         3
  3  Y   beta         7        16), ('Z',   k1     k2  dataset1  dataset2
  4  Z  alpha         7        13)]

In [38]:
# A possibly useful tactic is creating a dictionary of the data pieces 
group_dict = dict(list(dframe.groupby('k1')))

#Show the group with X
print(group_dict['X'])
group_dict['Z']

  k1     k2  dataset1  dataset2
0  X  alpha         5        15
1  X   beta         3        12


Unnamed: 0,k1,k2,dataset1,dataset2
4,Z,alpha,7,13


In [61]:
# We could have also chosen to do this with axis = 1

# Let's creat a dictionary for dtypes of objects!
group_dict_axis1 = dict(list(dframe.groupby(dframe.dtypes,axis=1)))

print(group_dict_axis1)

## *** selection
list(group_dict_axis1.keys())[0]

print(group_dict_axis1[list(group_dict_axis1.keys())[1]])

group_dict_axis1[np.dtype('int32')]


{dtype('int32'):    dataset1  dataset2
0         5        15
1         3        12
2        19         3
3         7        16
4         7        13, dtype('O'):   k1     k2
0  X  alpha
1  X   beta
2  Y  alpha
3  Y   beta
4  Z  alpha}
  k1     k2
0  X  alpha
1  X   beta
2  Y  alpha
3  Y   beta
4  Z  alpha


Unnamed: 0,dataset1,dataset2
0,5,15
1,3,12
2,19,3
3,7,16
4,7,13


In [62]:
# For example if we only wanted to group the dataset2 column with both sets of keys
dataset2_group = dframe.groupby(['k1','k2'])[['dataset2']]

dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
X,alpha,15
X,beta,12
Y,alpha,3
Y,beta,16
Z,alpha,13


## Groupby on Dict & Series

we can groupby dict ,Series ,funtion

In [63]:
import numpy as np
import pandas as pd
from pandas import DataFrame,Series

In [87]:
# Let's make a Dframe

animals = DataFrame(np.arange(16).reshape(4, 4),
                   columns=['W', 'X', 'Y', 'Z'],
                   index=['Dog', 'Cat', 'Bird', 'Mouse'])

#Now lets add some NAN values
animals.loc['Cat', ['W', 'Y']]= np.nan 
#animals.ix[1:2, ['W', 'Y']] 

animals

Unnamed: 0,W,X,Y,Z
Dog,0.0,1,2.0,3
Cat,,5,,7
Bird,8.0,9,10.0,11
Mouse,12.0,13,14.0,15


In [88]:
# Now let's say I had a dictionary with ebhavior values in it
behavior_map = {'W': 'good', 'X': 'bad', 'Y': 'good','Z': 'bad'}

In [89]:
# Now we can groupby using that mapping
animal_col = animals.groupby(behavior_map, axis=1)

# Show the sum accroding to the groupby with the mapping
animal_col.sum()

# For example [dog][good] = [dog][Y]+[dog][W]

Unnamed: 0,bad,good
Dog,4.0,2.0
Cat,12.0,0.0
Bird,20.0,18.0
Mouse,28.0,26.0


In [90]:
# Now let's try it with a Series
behav_series = Series(behavior_map)

behav_series

W    good
X     bad
Y    good
Z     bad
dtype: object

In [91]:
# Now let's groupby the Series

animals.groupby(behav_series, axis=1).count()

Unnamed: 0,bad,good
Dog,2,2
Cat,2,0
Bird,2,2
Mouse,2,2


In [92]:
# We can also groupby with functions!
animals

Unnamed: 0,W,X,Y,Z
Dog,0.0,1,2.0,3
Cat,,5,,7
Bird,8.0,9,10.0,11
Mouse,12.0,13,14.0,15


In [93]:
# Lets assume we wanted to group by the length of the animal names, we can pass the len function into groupby!

animals.groupby(len).sum()

Unnamed: 0,W,X,Y,Z
3,0.0,6,2.0,10
4,8.0,9,10.0,11
5,12.0,13,14.0,15


In [94]:
# We can also mix functions with arrays,dicts, and Series for groupby methods

# Set a list for keys
keys = ['A', 'B', 'A', 'B']

# Now groupby length of name and the keys to show max values
animals.groupby([len, keys]).max()

Unnamed: 0,Unnamed: 1,W,X,Y,Z
3,A,0.0,1,2.0,3
3,B,,5,,7
4,A,8.0,9,10.0,11
5,B,12.0,13,14.0,15


In [95]:
# We can also use groupby with hierarchaly index levels

#Create a hierarchal column index
hier_col = pd.MultiIndex.from_arrays([['NY','NY','NY','SF','SF'],[1,2,3,1,2]],names=['City','sub_value'])

# Create a dframe with hierarchal index
dframe_hr = DataFrame(np.arange(25).reshape(5,5),columns=hier_col)

#Multiply values by 100 for clarity
dframe_hr = dframe_hr*100

#Show
dframe_hr

City,NY,NY,NY,SF,SF
sub_value,1,2,3,1,2
0,0,100,200,300,400
1,500,600,700,800,900
2,1000,1100,1200,1300,1400
3,1500,1600,1700,1800,1900
4,2000,2100,2200,2300,2400
