# Group Transforms and “Unwrapped” GroupBys

In [40]:
import numpy as np
import pandas as pd

In [41]:
# Twelve-row example frame: 'key' cycles through a, b, c four times and
# 'value' holds the floats 0.0 through 11.0.
df = pd.DataFrame(
    {
        'key': ['a', 'b', 'c'] * 4,
        'value': np.arange(12.0),
    }
)

In [42]:
# Display the frame: 12 rows, 'key' cycling a, b, c and 'value' running 0.0 to 11.0.
df

Unnamed: 0,key,value
0,a,0.0
1,b,1.0
2,c,2.0
3,a,3.0
4,b,4.0
5,c,5.0
6,a,6.0
7,b,7.0
8,c,8.0
9,a,9.0
10,b,10.0
11,c,11.0


In [43]:
# Group the rows by 'key' and select the 'value' column, giving a SeriesGroupBy.
g = df.groupby('key')['value']

In [44]:
# Displaying g shows only the SeriesGroupBy object's repr, not any grouped values.
g

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000025302F0DD48>

In [45]:
# Aggregate each group to a single number: one mean per key, indexed by 'key'.
g.mean()

key
a    4.5
b    5.5
c    6.5
Name: value, dtype: float64

In [46]:
# Suppose instead we want a Series with the same shape as df['value'], but with each
# value replaced by the mean of its 'key' group. Passing lambda x: x.mean() to transform
# does this: each group's mean is repeated on every row that belongs to that group.
g.transform(lambda x: x.mean())

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [47]:
# For built-in aggregation functions, we can pass a string alias, as with the
# GroupBy agg method:
g.transform('mean')

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [48]:
# Like apply, transform works with functions that return a Series, but the result must
# be the same size as the input. For example, we can multiply each group by 2 using a
# lambda function:
g.transform(lambda x: x * 2)

0      0.0
1      2.0
2      4.0
3      6.0
4      8.0
5     10.0
6     12.0
7     14.0
8     16.0
9     18.0
10    20.0
11    22.0
Name: value, dtype: float64

In [49]:
# As a more complicated example, we can compute the ranks in descending order within
# each group (rank 1.0 goes to the largest value of the group):
g.transform(lambda x: x.rank(ascending=False))

0     4.0
1     4.0
2     4.0
3     3.0
4     3.0
5     3.0
6     2.0
7     2.0
8     2.0
9     1.0
10    1.0
11    1.0
Name: value, dtype: float64

In [50]:
# Consider a group transformation function composed from simple aggregations:
def normalize(x):
    """Center ``x`` on its mean, then scale by its standard deviation.

    ``x`` is expected to provide ``mean()`` and ``std()`` and to support
    elementwise subtraction and division (e.g. a pandas Series). The result
    has the same shape as ``x``.
    """
    centered = x - x.mean()
    return centered / x.std()
# We can obtain equivalent results in this case using either transform or apply:
g.transform(normalize)

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64

In [51]:
# The same function through apply; in the output shown here it matches g.transform(normalize).
g.apply(normalize)

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64

In [52]:
# The per-group mean aligned to df's original index; the next cell uses it as an
# operand in ordinary Series arithmetic.
g.transform('mean')

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [53]:
# "Unwrapped" group operation: rather than passing a function to transform, combine the
# outputs of two string-alias transforms with plain arithmetic. The values shown match
# those of g.transform(normalize).
normalized = (df['value'] - g.transform('mean')) / g.transform('std')
normalized

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64

# 12.3 Techniques for Method Chaining

In [1]:
#df = load_data()
#df2 = df[df['col2'] < 0]
#df2['col1_demeaned'] = df2['col1'] - df2['col1'].mean()
#result = df2.groupby('key').col1_demeaned.std()

While we’re not using any real data here, this example highlights some new methods.
First, the DataFrame.assign method is a functional alternative to column assignments
of the form df[k] = v. Rather than modifying the object in-place, it returns a
new DataFrame with the indicated modifications. So these statements are equivalent:

In [5]:
# Usual non-functional way
#df2 = df.copy()
#df2['k'] = v
# Functional assign way
#df2 = df.assign(k=v)

In [None]:
# Assigning in-place may execute faster than using assign, but assign enables easier
# method chaining.
# NOTE: df2 stands for the filtered frame from the illustrative snippet earlier in this
# section (df2 = df[df['col2'] < 0]); that snippet is commented out, so df2 must be
# defined before this cell can run.
# The outer parentheses make it convenient to add line breaks between the chained calls.
result = (
    df2.assign(col1_demeaned=df2.col1 - df2.col1.mean())  # demean col1 by its own mean
    .groupby('key')
    .col1_demeaned.std()
)

In [3]:
#To show callables in action, consider a fragment of the example from before:
#df = load_data()
#df2 = df[df['col2'] < 0]
#This can be rewritten as:
#df = (load_data()
#[lambda x: x['col2'] < 0])

In [None]:
#Here, the result of load_data is not assigned to a variable, so the function passed into
#[] is then bound to the object at that stage of the method chain.
#We can continue, then, and write the entire sequence as a single chained expression:

In [4]:
#result = (load_data()[lambda x: x.col2 < 0].assign(col1_demeaned=lambda x: x.col1 - x.col1.mean()).groupby('key').col1_demeaned.std())

Whether you prefer to write code in this style is a matter of taste, and splitting up the
expression into multiple steps may make your code more readable.