In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.DataFrame({'key': ['a', 'b', 'c']*4,
                  'value': np.arange(12.)})

In [3]:
df

Unnamed: 0,key,value
0,a,0.0
1,b,1.0
2,c,2.0
3,a,3.0
4,b,4.0
5,c,5.0
6,a,6.0
7,b,7.0
8,c,8.0
9,a,9.0


In [4]:
#group means by key
g = df.groupby('key', group_keys=False)['value']

In [5]:
g.mean()

key
a    4.5
b    5.5
c    6.5
Name: value, dtype: float64

In [19]:
%time
#What if we wanted a Series  identical in shape to df['value'] but with the values replaced by the
#weighted average grouped by 'key'. We can compute a single group mean to transform the data.
def get_mean(group):
    return group.mean()

g.transform(get_mean)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 9.3 µs


0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [20]:
%time
#built-in agg functions: We can use the GroupBy agg method
g.transform('mean')

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 9.3 µs


0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [21]:
%time
#transform is like apply, but the result and input must have identical size
def times_two(group):
    return group * 2

g.transform(times_two)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 8.11 µs


0      0.0
1      2.0
2      4.0
3      6.0
4      8.0
5     10.0
6     12.0
7     14.0
8     16.0
9     18.0
10    20.0
11    22.0
Name: value, dtype: float64

In [23]:
%time
#More complicated example: compute the rankings in descending order for each group
def get_ranks(group):
    return group.rank(ascending=False)

g.transform(get_ranks)

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 9.06 µs


0     4.0
1     4.0
2     4.0
3     3.0
4     3.0
5     3.0
6     2.0
7     2.0
8     2.0
9     1.0
10    1.0
11    1.0
Name: value, dtype: float64

In [24]:
#group transformation function composed from simple aggregations
def normalize(x):
    return (x - x.mean()) / x.std()

In [25]:
%time
#equivalent results can come here from "transform" or "apply"
g.transform(normalize)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.87 µs


0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64

In [26]:
%time
g.apply(normalize)

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 8.82 µs


0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64

In [27]:
%time
#built-in aggregate functions: e.g. "sum" or "mean" are typically faster than functions we apply with transform.
g.transform('mean')

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 7.87 µs


0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [29]:
normalized = (df['value'] - g.transform('mean')) / g.transform('std')

In [30]:
normalized

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64

In [31]:
def normalize_min_max(column: pd.Series) -> pd.Series:
    '''Normalizes a pandas series using the min/max method'''
    return (column - column.min()) / column.max()

In [32]:
g.transform(normalize_min_max)

0     0.000000
1     0.000000
2     0.000000
3     0.333333
4     0.300000
5     0.272727
6     0.666667
7     0.600000
8     0.545455
9     1.000000
10    0.900000
11    0.818182
Name: value, dtype: float64

In [37]:
df2 = pd.DataFrame({'key': ['a', 'b', 'c', 'd', 'e']*10,
                  'value': np.arange(50.)})

In [38]:
df2

Unnamed: 0,key,value
0,a,0.0
1,b,1.0
2,c,2.0
3,d,3.0
4,e,4.0
5,a,5.0
6,b,6.0
7,c,7.0
8,d,8.0
9,e,9.0


In [53]:
normalized_values_g = df2.groupby('key', group_keys=False)['value']

In [54]:
normalized_values_g

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fa19c0bc280>

In [55]:
normalized_values_g.transform(normalize_min_max)

0     0.000000
1     0.000000
2     0.000000
3     0.000000
4     0.000000
5     0.111111
6     0.108696
7     0.106383
8     0.104167
9     0.102041
10    0.222222
11    0.217391
12    0.212766
13    0.208333
14    0.204082
15    0.333333
16    0.326087
17    0.319149
18    0.312500
19    0.306122
20    0.444444
21    0.434783
22    0.425532
23    0.416667
24    0.408163
25    0.555556
26    0.543478
27    0.531915
28    0.520833
29    0.510204
30    0.666667
31    0.652174
32    0.638298
33    0.625000
34    0.612245
35    0.777778
36    0.760870
37    0.744681
38    0.729167
39    0.714286
40    0.888889
41    0.869565
42    0.851064
43    0.833333
44    0.816327
45    1.000000
46    0.978261
47    0.957447
48    0.937500
49    0.918367
Name: value, dtype: float64

In [56]:
normalized_values_g.apply(normalize_min_max)

0     0.000000
1     0.000000
2     0.000000
3     0.000000
4     0.000000
5     0.111111
6     0.108696
7     0.106383
8     0.104167
9     0.102041
10    0.222222
11    0.217391
12    0.212766
13    0.208333
14    0.204082
15    0.333333
16    0.326087
17    0.319149
18    0.312500
19    0.306122
20    0.444444
21    0.434783
22    0.425532
23    0.416667
24    0.408163
25    0.555556
26    0.543478
27    0.531915
28    0.520833
29    0.510204
30    0.666667
31    0.652174
32    0.638298
33    0.625000
34    0.612245
35    0.777778
36    0.760870
37    0.744681
38    0.729167
39    0.714286
40    0.888889
41    0.869565
42    0.851064
43    0.833333
44    0.816327
45    1.000000
46    0.978261
47    0.957447
48    0.937500
49    0.918367
Name: value, dtype: float64