# Aggregation Group By

In [1]:
import pandas as pd
import seaborn as sns

df = sns.load_dataset('titanic')

In [2]:
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [3]:
df['fare'].agg(['sum', 'mean'])

sum     28693.949300
mean       32.204208
Name: fare, dtype: float64

In [4]:
df.agg({
    'fare': ['sum', 'mean'],
    'sex': ['count']
})

Unnamed: 0,fare,sex
count,,891.0
mean,32.204208,
sum,28693.9493,


In [5]:
df.agg(
    fare_sum=('fare', 'sum'),
    fare_mean=('fare', 'mean'),
    sex_count=('sex', 'count')
)

Unnamed: 0,fare,sex
fare_sum,28693.9493,
fare_mean,32.204208,
sex_count,,891.0


In [6]:
def mad(x):
  """Mean absolute deviation of ``x`` around its mean.

  Stand-in for the ``'mad'`` string aggregation, which was deprecated in
  pandas 1.5 and removed in pandas 2.0. Missing values are skipped, as
  with the built-in reductions.
  """
  return (x - x.mean()).abs().mean()

# Common built-in reductions of fare per port of embarkation, plus the
# custom ``mad`` above; a callable's ``__name__`` becomes the column label.
df.groupby(['embark_town']).agg({
    'fare': ['sum', 'mean', 'median', 'min', 'max', 'std', 'var', mad, 'prod']
}).round(2)

Unnamed: 0_level_0,fare,fare,fare,fare,fare,fare,fare,fare,fare
Unnamed: 0_level_1,sum,mean,median,min,max,std,var,mad,prod
embark_town,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Cherbourg,10072.3,59.95,29.7,4.01,512.33,83.91,7041.39,53.02,6.193716e+250
Queenstown,1022.25,13.28,7.75,6.75,90.0,14.19,201.3,7.87,6.4586709999999994e+78
Southampton,17439.4,27.08,13.0,0.0,263.0,35.89,1287.95,21.3,0.0


In [7]:
df.groupby(['embark_town']).agg({
    'fare': ['describe']
})

Unnamed: 0_level_0,fare,fare,fare,fare,fare,fare,fare,fare
Unnamed: 0_level_1,describe,describe,describe,describe,describe,describe,describe,describe
Unnamed: 0_level_2,count,mean,std,min,25%,50%,75%,max
embark_town,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
Cherbourg,168.0,59.954144,83.912994,4.0125,13.69795,29.7,78.500025,512.3292
Queenstown,77.0,13.27603,14.188047,6.75,7.75,7.75,15.5,90.0
Southampton,644.0,27.079812,35.887993,0.0,8.05,13.0,27.9,263.0


In [8]:
df.groupby(['deck']).agg({
    'embark_town': ['count', 'nunique', 'size']
})

Unnamed: 0_level_0,embark_town,embark_town,embark_town
Unnamed: 0_level_1,count,nunique,size
deck,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
A,15,2,15
B,45,2,47
C,59,3,59
D,33,2,33
E,32,3,32
F,13,3,13
G,4,1,4


In [9]:
# After sorting by fare in descending order, each group's first row holds
# its highest fare and its last row the lowest.
(
    df.sort_values(by=['fare'], ascending=False)
      .groupby(['embark_town'])
      .agg({'fare': ['first', 'last']})
)

Unnamed: 0_level_0,fare,fare
Unnamed: 0_level_1,first,last
embark_town,Unnamed: 1_level_2,Unnamed: 2_level_2
Cherbourg,512.3292,4.0125
Queenstown,90.0,6.75
Southampton,263.0,0.0


In [10]:
# max/min do not depend on row order, so no prior sort is needed here; the
# result matches first/last taken from the fare-sorted frame.
df.groupby(['embark_town']) \
  .agg({
      'fare': ['max', 'min']
  })

Unnamed: 0_level_0,fare,fare
Unnamed: 0_level_1,max,min
embark_town,Unnamed: 1_level_2,Unnamed: 2_level_2
Cherbourg,512.3292,4.0125
Queenstown,90.0,6.75
Southampton,263.0,0.0


In [12]:
# idxmax/idxmin give the index label of the row holding each group's
# highest/lowest fare, which can then be fed to df.loc.
(
    df.sort_values(by=['fare'], ascending=False)
      .groupby(['embark_town'])
      .agg({'fare': ['idxmax', 'idxmin']})
)

Unnamed: 0_level_0,fare,fare
Unnamed: 0_level_1,idxmax,idxmin
embark_town,Unnamed: 1_level_2,Unnamed: 2_level_2
Cherbourg,258,378
Queenstown,412,654
Southampton,88,466


In [13]:
df.loc[[258, 378]]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
258,1,1,female,35.0,0,0,512.3292,C,First,woman,False,,Cherbourg,yes,True
378,0,3,male,20.0,0,0,4.0125,C,Third,man,True,,Cherbourg,no,True


In [14]:
# shortcut: pass the per-class idxmax labels straight to .loc to pull the
# full row of the highest-fare passenger in each class
df.loc[df.groupby('class')['fare'].idxmax()]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
258,1,1,female,35.0,0,0,512.3292,C,First,woman,False,,Cherbourg,yes,True
72,0,2,male,21.0,0,0,73.5,S,Second,man,True,,Southampton,no,True
159,0,3,male,,8,2,69.55,S,Third,man,True,,Southampton,no,False


In [15]:
from scipy.stats import skew, mode

# Callables can be mixed into the aggregation list. scipy's `mode` yields a
# (values, counts) pair per group, whereas `pd.Series.mode` yields only the
# modal value -- compare the two `mode` columns in the output.
df.groupby(['embark_town']).agg({
    'fare': [skew, mode, pd.Series.mode]
})

Unnamed: 0_level_0,fare,fare,fare
Unnamed: 0_level_1,skew,mode,mode
embark_town,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Cherbourg,3.305112,"([7.2292], [15])",7.2292
Queenstown,4.265111,"([7.75], [30])",7.75
Southampton,3.640276,"([8.05], [43])",8.05


In [16]:
df.groupby(['class']).agg({
    'deck': ['nunique', mode, set]
})

Unnamed: 0_level_0,deck,deck,deck
Unnamed: 0_level_1,nunique,mode,set
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
First,5,"([C], [59])","{nan, C, D, E, A, B}"
Second,3,"([F], [8])","{nan, E, D, F}"
Third,3,"([F], [5])","{nan, G, E, F}"


In [17]:
# custom agg functions

from functools import partial

# Bind q=0.25 on Series.quantile. A partial object has no __name__ of its
# own, so one is set explicitly; it is used as the output column label.
q_25 = partial(pd.Series.quantile, q=0.25)
q_25.__name__ = '25%'

# or

def percentile_25(x):
  """Return the 25th percentile (first quartile) of ``x``."""
  first_quartile = x.quantile(q=0.25)
  return first_quartile

# or

# A lambda bound to a name; its default __name__ is '<lambda>', so it is
# overwritten to give the output column a readable label.
lambda_25 = lambda x: x.quantile(.25)
lambda_25.__name__ = 'lambda_25%'

In [18]:
df.groupby(['embark_town']).agg({
    'fare': [q_25, percentile_25, lambda_25, lambda x: x.quantile(.25)]
})

Unnamed: 0_level_0,fare,fare,fare,fare
Unnamed: 0_level_1,25%,percentile_25,lambda_25%,<lambda_0>
embark_town,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Cherbourg,13.69795,13.69795,13.69795,13.69795
Queenstown,7.75,7.75,7.75,7.75
Southampton,8.05,8.05,8.05,8.05


In [19]:
def count_nulls(s):
  """Return how many entries of ``s`` are missing.

  ``size`` counts every entry while ``count`` counts only the non-missing
  ones, so their difference is the number of nulls.
  """
  total = s.size
  present = s.count()
  return total - present

df.groupby(['deck']).agg({
    'embark_town': ['count', 'nunique', 'size', count_nulls, set]
})

Unnamed: 0_level_0,embark_town,embark_town,embark_town,embark_town,embark_town
Unnamed: 0_level_1,count,nunique,size,count_nulls,set
deck,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
A,15,2,15,0,"{Southampton, Cherbourg}"
B,45,2,47,2,"{nan, Southampton, Cherbourg}"
C,59,3,59,0,"{Southampton, Cherbourg, Queenstown}"
D,33,2,33,0,"{Southampton, Cherbourg}"
E,32,3,32,0,"{Southampton, Cherbourg, Queenstown}"
F,13,3,13,0,"{Southampton, Cherbourg, Queenstown}"
G,4,1,4,0,{Southampton}


In [24]:
def top_10_sum(x):
  """Return the sum of the ten largest values in ``x``."""
  largest = x.nlargest(n=10)
  return largest.sum()

def bottom_10_sum(x):
  """Return the sum of the ten smallest values in ``x``."""
  smallest = x.nsmallest(n=10)
  return smallest.sum()

df.groupby(['class']).agg({
    'fare': [top_10_sum, bottom_10_sum]
})

Unnamed: 0_level_0,fare,fare
Unnamed: 0_level_1,top_10_sum,bottom_10_sum
class,Unnamed: 1_level_2,Unnamed: 2_level_2
First,3361.2584,108.3709
Second,622.2376,42.0
Third,656.3374,36.1291


If you need an aggregation that combines several columns or several functions at once, you can use `groupby` together with `apply`, as described in this Stack Overflow answer:
https://stackoverflow.com/questions/14529838/apply-multiple-functions-to-multiple-groupby-columns/47103408#47103408

In [25]:
def my_summary(x):
  """Summarise the ``fare`` column of one group.

  Returns a Series with the fare sum, mean and range (max - min), each
  rounded to zero decimals.
  """
  fare = x['fare']
  summary = pd.Series({
      'fare_sum': fare.sum(),
      'fare_mean': fare.mean(),
      'fare_range': fare.max() - fare.min(),
  })
  return summary.round(0)

df.groupby(['class']).apply(my_summary)

Unnamed: 0_level_0,fare_sum,fare_mean,fare_range
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
First,18177.0,84.0,512.0
Second,3802.0,21.0,74.0
Third,6715.0,14.0,70.0


Using `apply` with `groupby` gives maximum flexibility over all aspects of the result. The downside is that `apply` is slow, so this approach should be used sparingly.

In [27]:
# Total fare per (port, class) plus each cell's share of the grand total.
# The share is computed from the ('fare', 'sum') column explicitly so the
# expression keeps working if more aggregations are added to the list.
df.groupby(['embark_town', 'class']).agg({
    'fare': ['sum']
}).assign(
  pct_total=lambda x: x[('fare', 'sum')] / x[('fare', 'sum')].sum()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,fare,pct_total
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,Unnamed: 3_level_1
embark_town,class,Unnamed: 2_level_2,Unnamed: 3_level_2
Cherbourg,First,8901.075,0.311947
Cherbourg,Second,431.0917,0.015108
Cherbourg,Third,740.1295,0.025939
Queenstown,First,180.0,0.006308
Queenstown,Second,37.05,0.001298
Queenstown,Third,805.2043,0.028219
Southampton,First,8936.3375,0.313183
Southampton,Second,3333.7,0.116833
Southampton,Third,5169.3613,0.181165


In [39]:
df.groupby(['embark_town', 'class']) \
  .agg({'fare': 'sum'}) \
  .groupby(level=0) \
  .cumsum() # cumulative sum within each embark_town (index level 0)

Unnamed: 0_level_0,Unnamed: 1_level_0,fare
embark_town,class,Unnamed: 2_level_1
Cherbourg,First,8901.075
Cherbourg,Second,9332.1667
Cherbourg,Third,10072.2962
Queenstown,First,180.0
Queenstown,Second,217.05
Queenstown,Third,1022.2543
Southampton,First,8936.3375
Southampton,Second,12270.0375
Southampton,Third,17439.3988


In [40]:
df.groupby(['embark_town']).agg({'fare': 'sum'})

Unnamed: 0_level_0,fare
embark_town,Unnamed: 1_level_1
Cherbourg,10072.2962
Queenstown,1022.2543
Southampton,17439.3988


# Aggregation Crosstab

In [30]:
pd.crosstab(df['embark_town'], 
            df['class'], 
            values=df['fare'], 
            aggfunc='sum', 
            normalize=True)

class,First,Second,Third
embark_town,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cherbourg,0.311947,0.015108,0.025939
Queenstown,0.006308,0.001298,0.028219
Southampton,0.313183,0.116833,0.181165


# Aggregation Pivot Table

In [34]:
pd.pivot_table(data=df,
               index=['embark_town'],
               columns=['class'],
               aggfunc={'fare': ['mean', 'sum']})

Unnamed: 0_level_0,fare,fare,fare,fare,fare,fare
Unnamed: 0_level_1,mean,mean,mean,sum,sum,sum
class,First,Second,Third,First,Second,Third
embark_town,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
Cherbourg,104.718529,25.358335,11.214083,8901.075,431.0917,740.1295
Queenstown,90.0,12.35,11.183393,180.0,37.05,805.2043
Southampton,70.364862,20.327439,14.644083,8936.3375,3333.7,5169.3613


# Aggregation Grouper

# Style