# List of Aggregation Functions (aggfunc) for GroupBy in Pandas

In [1]:
import pandas as pd

# Load the earthquake catalogue straight from the zipped CSV
# (read_csv infers the compression from the file extension by default).
df = pd.read_csv('../data/earthquakes_1965_2016_database.csv.zip')

# Keep only the columns used in this notebook. The explicit .copy() makes
# `df` an independent frame, so adding or overwriting columns in later cells
# cannot trigger SettingWithCopyWarning.
cols = ['Date', 'Time', 'Latitude', 'Longitude', 'Depth', 'Magnitude Type', 'Type', 'ID']
df = df[cols].copy()

In [2]:
# Parse the date strings into timezone-aware (UTC) timestamps.
df['Date'] = pd.to_datetime(df['Date'], utc=True)

# Periods carry no timezone, so calling to_period() on tz-aware data emits a
# UserWarning. Dropping the UTC marker explicitly first keeps the same
# wall-clock times and therefore yields the same monthly periods, without
# the warning.
df['year_month'] = df['Date'].dt.tz_localize(None).dt.to_period('M')



## Step 2: Pandas describe DataFrame

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of Depth,
# computed separately for every month.
(
    df
    .groupby('year_month')['Depth']
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1965-01,13.0,101.115385,152.237697,15.000,20.000,35.000,95.0000,565.00
1965-02,54.0,47.712963,80.122216,10.000,20.075,25.100,34.3750,482.90
1965-03,38.0,62.055263,97.890288,10.000,25.000,31.300,47.8500,560.80
1965-04,33.0,112.163636,183.812083,10.000,25.000,30.700,64.7000,635.00
1965-05,22.0,80.972727,114.894839,10.000,25.875,35.000,105.0000,553.80
...,...,...,...,...,...,...,...,...
2016-08,35.0,107.359143,180.056549,3.250,10.000,12.000,73.5000,533.00
2016-09,46.0,40.801674,92.849418,5.557,10.000,10.195,30.3925,596.40
2016-10,36.0,82.571111,159.688648,5.630,10.000,13.230,42.7875,614.00
2016-11,44.0,72.086818,142.462439,2.090,10.000,14.905,44.7975,548.00


In [4]:
def mad(x):
    """Return the mean absolute deviation of ``x`` around its mean.

    Stand-in for the built-in 'mad' aggregation, which was deprecated in
    pandas 1.5 and removed in pandas 2.0. The function is deliberately named
    ``mad`` so the resulting column keeps that label.
    """
    return (x - x.mean()).abs().mean()


# Aggregations may be given as strings (built-in names) or as callables.
aggfuncs = ['count', 'sum', 'sem', 'skew', 'mean', 'min', 'max', 'std', 'quantile',
            'nunique', mad, 'size', pd.Series.mode, 'var', 'unique']

# Transpose the first five months so that each aggregation is shown as a row.
df.groupby('year_month', dropna=False)['Depth'].agg(aggfuncs).head().T

year_month,1965-01,1965-02,1965-03,1965-04,1965-05
count,13,54,38,33,22
sum,1314.5,2576.5,2358.1,3701.4,1781.4
sem,42.22314,10.903253,15.879902,31.997576,24.495662
skew,2.744523,4.256526,4.03662,2.054374,3.604639
mean,101.115385,47.712963,62.055263,112.163636,80.972727
min,15.0,10.0,10.0,10.0,10.0
max,565.0,482.9,560.8,635.0,553.8
std,152.237697,80.122216,97.890288,183.812083,114.894839
quantile,35.0,25.1,31.3,30.7,35.0
nunique,9,30,24,22,16


In [5]:
# Same list of aggregations as in the previous cell, this time for every month.
(
    df
    .groupby('year_month')['Depth']
    .agg(aggfuncs)
)

Unnamed: 0_level_0,count,sum,sem,skew,mean,min,max,std,quantile,nunique,mad,size,mode,var,unique
year_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1965-01,13,1314.500,42.223140,2.744523,101.115385,15.000,565.00,152.237697,35.000,9,95.562130,13,20.0,23176.316410,"[131.6, 80.0, 20.0, 15.0, 35.0, 95.0, 565.0, 2..."
1965-02,54,2576.500,10.903253,4.256526,47.712963,10.000,482.90,80.122216,25.100,30,39.163306,54,25.0,6419.569451,"[482.9, 15.0, 10.0, 30.3, 30.0, 25.0, 20.0, 24..."
1965-03,38,2358.100,15.879902,4.036620,62.055263,10.000,560.80,97.890288,31.300,24,53.121745,38,30.0,9582.508485,"[30.0, 40.0, 33.6, 105.2, 10.0, 15.0, 14.8, 25..."
1965-04,33,3701.400,31.997576,2.054374,112.163636,10.000,635.00,183.812083,30.700,22,129.843526,33,25.0,33786.881761,"[60.0, 10.0, 30.7, 25.0, 20.0, 39.2, 50.0, 65...."
1965-05,22,1781.400,24.495662,3.604639,80.972727,10.000,553.80,114.894839,35.000,16,66.826446,22,35.0,13200.823983,"[15.0, 22.5, 90.0, 110.0, 10.0, 50.0, 153.1, 2..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-08,35,3757.570,30.435112,1.662767,107.359143,3.250,533.00,180.056549,12.000,24,137.729535,35,10.0,32420.360949,"[10.0, 34.0, 270.0, 510.0, 16.37, 112.85, 11.0..."
2016-09,46,1876.877,13.689900,5.138923,40.801674,5.557,596.40,92.849418,10.195,29,43.799925,46,10.0,8621.014336,"[19.0, 10.39, 16.81, 10.0, 13.73, 8.88, 10.47,..."
2016-10,36,2972.560,26.614775,2.502732,82.571111,5.630,614.00,159.688648,13.230,23,103.017901,36,10.0,25500.464159,"[10.0, 35.0, 14.0, 128.0, 11.0, 442.0, 22.0, 4..."
2016-11,44,3171.820,21.477021,2.766644,72.086818,2.090,548.00,142.462439,14.905,30,86.027355,44,10.0,20295.546511,"[51.99, 90.0, 33.0, 10.0, 20.0, 448.76, 543.92..."


## Count, Nunique, Size, Unique

In [6]:
# count   -> number of non-null values in the group
# size    -> number of rows in the group
# nunique -> number of distinct values
# unique  -> the distinct values themselves
aggfuncs = [
    'count',
    'size',
    'nunique',
    'unique',
]
(
    df
    .groupby('year_month')['Depth']
    .agg(aggfuncs)
)

Unnamed: 0_level_0,count,size,nunique,unique
year_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1965-01,13,13,9,"[131.6, 80.0, 20.0, 15.0, 35.0, 95.0, 565.0, 2..."
1965-02,54,54,30,"[482.9, 15.0, 10.0, 30.3, 30.0, 25.0, 20.0, 24..."
1965-03,38,38,24,"[30.0, 40.0, 33.6, 105.2, 10.0, 15.0, 14.8, 25..."
1965-04,33,33,22,"[60.0, 10.0, 30.7, 25.0, 20.0, 39.2, 50.0, 65...."
1965-05,22,22,16,"[15.0, 22.5, 90.0, 110.0, 10.0, 50.0, 153.1, 2..."
...,...,...,...,...
2016-08,35,35,24,"[10.0, 34.0, 270.0, 510.0, 16.37, 112.85, 11.0..."
2016-09,46,46,29,"[19.0, 10.39, 16.81, 10.0, 13.73, 8.88, 10.47,..."
2016-10,36,36,23,"[10.0, 35.0, 14.0, 128.0, 11.0, 442.0, 22.0, 4..."
2016-11,44,44,30,"[51.99, 90.0, 33.0, 10.0, 20.0, 448.76, 543.92..."


## First, Last

In [7]:
# First and last Depth value of each month, in row order.
aggfuncs = [
    'first',
    'last',
]
(
    df
    .groupby('year_month')['Depth']
    .agg(aggfuncs)
)

Unnamed: 0_level_0,first,last
year_month,Unnamed: 1_level_1,Unnamed: 2_level_1
1965-01,131.60,55.00
1965-02,482.90,10.00
1965-03,30.00,75.00
1965-04,60.00,480.00
1965-05,15.00,150.00
...,...,...
2016-08,10.00,15.41
2016-09,19.00,10.00
2016-10,10.00,33.00
2016-11,51.99,24.00


## Sum, min, max

In [8]:
# Total, smallest and largest Depth per month.
aggfuncs = [
    'sum',
    'min',
    'max',
]
(
    df
    .groupby('year_month')['Depth']
    .agg(aggfuncs)
)

Unnamed: 0_level_0,sum,min,max
year_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1965-01,1314.500,15.000,565.00
1965-02,2576.500,10.000,482.90
1965-03,2358.100,10.000,560.80
1965-04,3701.400,10.000,635.00
1965-05,1781.400,10.000,553.80
...,...,...,...
2016-08,3757.570,3.250,533.00
2016-09,1876.877,5.557,596.40
2016-10,2972.560,5.630,614.00
2016-11,3171.820,2.090,548.00


## Mean, median, mode

In [9]:
# Measures of central tendency. The mode is passed as a callable
# (pd.Series.mode) rather than as a string name.
aggfuncs = [
    'mean',
    'median',
    pd.Series.mode,
]
(
    df
    .groupby('year_month')['Depth']
    .agg(aggfuncs)
)

Unnamed: 0_level_0,mean,median,mode
year_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1965-01,101.115385,35.000,20.0
1965-02,47.712963,25.100,25.0
1965-03,62.055263,31.300,30.0
1965-04,112.163636,30.700,25.0
1965-05,80.972727,35.000,35.0
...,...,...,...
2016-08,107.359143,12.000,10.0
2016-09,40.801674,10.195,10.0
2016-10,82.571111,13.230,10.0
2016-11,72.086818,14.905,10.0


In [10]:
# Frequency of each Depth value in January 1965, most frequent first.
in_jan_1965 = df['year_month'] == '1965-01'
df.loc[in_jan_1965, 'Depth'].value_counts()

20.0     3
15.0     2
35.0     2
131.6    1
80.0     1
95.0     1
565.0    1
227.9    1
55.0     1
Name: Depth, dtype: int64

## MAD, STD, var

In [11]:
def mad(x):
    """Return the mean absolute deviation of ``x`` around its mean.

    Stand-in for the built-in 'mad' aggregation, which was deprecated in
    pandas 1.5 and removed in pandas 2.0. Defined in this cell so the cell
    runs on its own; named ``mad`` so the output column keeps that label.
    """
    return (x - x.mean()).abs().mean()


# Measures of spread: mean absolute deviation, standard deviation, variance.
aggfuncs = [mad, 'std', 'var']
df.groupby('year_month')['Depth'].agg(aggfuncs)

Unnamed: 0_level_0,mad,std,var
year_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1965-01,95.562130,152.237697,23176.316410
1965-02,39.163306,80.122216,6419.569451
1965-03,53.121745,97.890288,9582.508485
1965-04,129.843526,183.812083,33786.881761
1965-05,66.826446,114.894839,13200.823983
...,...,...,...
2016-08,137.729535,180.056549,32420.360949
2016-09,43.799925,92.849418,8621.014336
2016-10,103.017901,159.688648,25500.464159
2016-11,86.027355,142.462439,20295.546511


## Skew, Sem, quantile

In [12]:
# Skewness, standard error of the mean, and quantile (called without
# arguments, so it uses its default q).
aggfuncs = [
    'skew',
    'sem',
    'quantile',
]
(
    df
    .groupby('year_month')['Depth']
    .agg(aggfuncs)
)

Unnamed: 0_level_0,skew,sem,quantile
year_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1965-01,2.744523,42.223140,35.000
1965-02,4.256526,10.903253,25.100
1965-03,4.036620,15.879902,31.300
1965-04,2.054374,31.997576,30.700
1965-05,3.604639,24.495662,35.000
...,...,...,...
2016-08,1.662767,30.435112,12.000
2016-09,5.138923,13.689900,10.195
2016-10,2.502732,26.614775,13.230
2016-11,2.766644,21.477021,14.905


## User defined function

In [13]:
def countna(x):
    """Return how many entries of ``x`` are missing (NaN/None)."""
    missing_mask = x.isna()
    return missing_mask.sum()

# A user-defined function can be passed to agg() like any built-in name;
# the output column is labelled with the function's name.
(
    df
    .groupby('year_month')['Depth']
    .agg([countna])
)

Unnamed: 0_level_0,countna
year_month,Unnamed: 1_level_1
1965-01,0
1965-02,0
1965-03,0
1965-04,0
1965-05,0
...,...
2016-08,0
2016-09,0
2016-10,0
2016-11,0


## NumPy, SciPy

In [14]:
from scipy import stats

# stats.mode returns a (mode, count) result; keep only the modal value.
(
    df
    .groupby('year_month')['Depth']
    .agg(lambda depths: stats.mode(depths).mode)
)

year_month
1965-01    20.0
1965-02    25.0
1965-03    30.0
1965-04    25.0
1965-05    35.0
           ... 
2016-08    10.0
2016-09    10.0
2016-10    10.0
2016-11    10.0
2016-12    10.0
Freq: M, Name: Depth, Length: 624, dtype: float64

In [15]:
import numpy as np

# NumPy reducers can be handed to agg() directly; count_nonzero tallies the
# entries of each group that are not equal to zero.
(
    df
    .groupby('year_month')['Depth']
    .agg(np.count_nonzero)
)

year_month
1965-01    13
1965-02    54
1965-03    38
1965-04    33
1965-05    22
           ..
2016-08    35
2016-09    46
2016-10    36
2016-11    44
2016-12    53
Freq: M, Name: Depth, Length: 624, dtype: int64