In [3]:
import pandas as pd
import numpy as np

In [4]:
# Use fully qualified option names: the bare patterns 'max_columns' and
# 'max_rows' match more than one registered option in newer pandas releases
# (e.g. the styler.render.* options) and then raise OptionError.
pd.set_option('display.max_columns', 8)
pd.set_option('display.max_rows', 20)
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [8]:
# Load the flights data set from CSV and preview its first rows.
flights = pd.read_csv('data/flights.csv')
flights.head()

Unnamed: 0,MONTH,DAY,WEEKDAY,AIRLINE,...,SCHED_ARR,ARR_DELAY,DIVERTED,CANCELLED
0,1,1,4,WN,...,1905,65.0,0,0
1,1,1,4,UA,...,1333,-13.0,0,0
2,1,1,4,MQ,...,1453,35.0,0,0
3,1,1,4,AA,...,1935,-7.0,0,0
4,1,1,4,WN,...,2225,39.0,0,0


### Grouping and aggregating on a single column

In [16]:
# Spelling 1: dict passed to agg (column -> aggregation); the result is a DataFrame.
flights.groupby('AIRLINE').agg({'ARR_DELAY':'mean'}).head()

Unnamed: 0_level_0,ARR_DELAY
AIRLINE,Unnamed: 1_level_1
AA,5.542661
AS,-0.833333
B6,8.692593
DL,0.339691
EV,7.03458


In [17]:
# Spelling 2: select the column first, then name the aggregation; the result is a Series.
flights.groupby('AIRLINE')['ARR_DELAY'].agg('mean').head()

AIRLINE
AA    5.542661
AS   -0.833333
B6    8.692593
DL    0.339691
EV    7.034580
Name: ARR_DELAY, dtype: float64

In [18]:
type(flights.groupby('AIRLINE')['ARR_DELAY'].agg('mean'))

pandas.core.series.Series

In [19]:
# Spelling 3: call the aggregation method directly; same values as agg('mean').
flights.groupby('AIRLINE')['ARR_DELAY'].mean().head()

AIRLINE
AA    5.542661
AS   -0.833333
B6    8.692593
DL    0.339691
EV    7.034580
Name: ARR_DELAY, dtype: float64

In [20]:
grouped = flights.groupby('AIRLINE')
type(grouped)

pandas.core.groupby.generic.DataFrameGroupBy

In [27]:
try:
    # np.sqrt works element-wise, so it does not reduce a group to a single
    # value; this call is expected to fail and the failure is the demonstration.
    flights.groupby('AIRLINE')['ARR_DELAY'].agg(np.sqrt)
except ValueError as error:
    print('Only aggregate methods are allowed for agg')
    error  # bare expression so the caught exception is shown as cell output

Only aggregate methods are allowed for agg


ValueError('Must produce aggregated value')

### Grouping by multiple columns

In [31]:
# Two grouping keys: the resulting Series is indexed by (AIRLINE, WEEKDAY).
flights.groupby(['AIRLINE', 'WEEKDAY'])['CANCELLED'].sum().head(20)

AIRLINE  WEEKDAY
AA       1          41
         2           9
         3          16
         4          20
         5          18
         6          21
         7          29
AS       1           0
         2           0
         3           0
         4           0
         5           0
         6           0
         7           0
B6       1           0
         2           1
         3           0
         4           0
         5           0
         6           0
Name: CANCELLED, dtype: int64

In [37]:
# Select several columns with a list (double brackets). Indexing a groupby
# with a bare tuple of names is deprecated (FutureWarning) and is an error
# in pandas 2.x.
(
    flights
    .groupby(['AIRLINE', 'WEEKDAY'])[['CANCELLED', 'DIVERTED']]
    .agg(['sum', 'mean'])
    .head()
)

  flights.groupby(['AIRLINE', 'WEEKDAY'])['CANCELLED', 'DIVERTED'].agg(['sum', 'mean']).head()


Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED,CANCELLED,DIVERTED,DIVERTED
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,sum,mean
AIRLINE,WEEKDAY,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AA,1,41,0.032106,6,0.004699
AA,2,9,0.007341,2,0.001631
AA,3,16,0.011949,2,0.001494
AA,4,20,0.015004,5,0.003751
AA,5,18,0.014151,1,0.000786


In [42]:
# Select several columns with a list (double brackets). Indexing a groupby
# with a bare tuple of names is deprecated (FutureWarning) and is an error
# in pandas 2.x.
(
    flights
    .groupby(['AIRLINE', 'WEEKDAY'])[['SCHED_ARR', 'DIVERTED']]
    .agg(['sum', 'min'])
    .head(10)
)

  flights.groupby(['AIRLINE', 'WEEKDAY'])['SCHED_ARR', 'DIVERTED'].agg(['sum', 'min']).head(10)


Unnamed: 0_level_0,Unnamed: 1_level_0,SCHED_ARR,SCHED_ARR,DIVERTED,DIVERTED
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,min,sum,min
AIRLINE,WEEKDAY,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AA,1,2018786,3,6,0
AA,2,1906451,4,2,0
AA,3,2082989,1,2,0
AA,4,2044815,3,5,0
AA,5,1965188,1,1,0
AA,6,1734700,6,9,0
AA,7,2015023,2,1,0
AS,1,185654,9,0,0
AS,2,155258,9,0,0
AS,3,180524,27,0,0


In [44]:
# Grouping keys and the per-column aggregations used by the cells that follow.
groupin_cols = ['ORG_AIR', 'DEST_AIR']
agg_dict = dict(
    CANCELLED=['sum', 'mean', 'size'],
    AIR_TIME=['mean', 'var'],
)

In [51]:
# Dict-based agg: each key is a column, each value the list of aggregations applied to it.
flights.groupby(groupin_cols).agg(agg_dict)

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED,CANCELLED,CANCELLED,AIR_TIME,AIR_TIME
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,size,mean,var
ORG_AIR,DEST_AIR,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
ATL,ABE,0,0.000000,31,96.387097,45.778495
ATL,ABQ,0,0.000000,16,170.500000,87.866667
ATL,ABY,0,0.000000,19,28.578947,6.590643
ATL,ACY,0,0.000000,6,91.333333,11.466667
ATL,AEX,0,0.000000,40,78.725000,47.332692
...,...,...,...,...,...,...
SFO,SNA,4,0.032787,122,64.059322,11.338331
SFO,STL,0,0.000000,20,198.900000,101.042105
SFO,SUN,0,0.000000,10,78.000000,25.777778
SFO,TUS,0,0.000000,20,100.200000,35.221053


In [53]:
# With no column selection, sum() is applied to every non-grouping column.
# numeric_only=True restricts it to numeric columns explicitly, so the result
# does not depend on the pandas version's default for that parameter.
flights.groupby(groupin_cols).sum(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,MONTH,DAY,WEEKDAY,SCHED_DEP,...,SCHED_ARR,ARR_DELAY,DIVERTED,CANCELLED
ORG_AIR,DEST_AIR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ATL,ABE,167,519,132,46567,...,52916,185.0,0,0
ATL,ABQ,98,189,62,25426,...,27960,4.0,0,0
ATL,ABY,136,345,87,31253,...,32758,391.0,0,0
ATL,ACY,40,72,11,8972,...,10096,64.0,0,0
ATL,AEX,275,627,160,59674,...,62361,301.0,0,0
...,...,...,...,...,...,...,...,...,...,...
SFO,SNA,780,1863,419,172954,...,191323,389.0,0,4
SFO,STL,144,307,72,24558,...,36147,362.0,0,0
SFO,SUN,59,141,47,14429,...,17203,139.0,0,0
SFO,TUS,117,254,79,30202,...,35549,311.0,0,0


### Removing the MultiIndex after groupby

In [55]:
# Per (AIRLINE, WEEKDAY) group: sum and mean of DIST, min and max of ARR_DELAY,
# with every aggregated value cast to int.
airline_info = (
    flights
    .groupby(['AIRLINE', 'WEEKDAY'])
    .agg({'DIST': ['sum', 'mean'], 'ARR_DELAY': ['min', 'max']})
    .astype(int)
)

In [56]:
airline_info.head(7)

Unnamed: 0_level_0,Unnamed: 1_level_0,DIST,DIST,ARR_DELAY,ARR_DELAY
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,min,max
AIRLINE,WEEKDAY,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AA,1,1455386,1139,-60,551
AA,2,1358256,1107,-52,725
AA,3,1496665,1117,-45,473
AA,4,1452394,1089,-46,349
AA,5,1427749,1122,-41,732
AA,6,1265340,1124,-50,858
AA,7,1461906,1100,-49,626


In [57]:
airline_info.columns

MultiIndex([(     'DIST',  'sum'),
            (     'DIST', 'mean'),
            ('ARR_DELAY',  'min'),
            ('ARR_DELAY',  'max')],
           )

In [58]:
level0 = airline_info.columns.get_level_values(0)

In [59]:
level1 = airline_info.columns.get_level_values(1)

In [60]:
level0

Index(['DIST', 'DIST', 'ARR_DELAY', 'ARR_DELAY'], dtype='object')

In [61]:
level1

Index(['sum', 'mean', 'min', 'max'], dtype='object')

In [62]:
# Element-wise string concatenation of the two level Indexes gives one flat
# name per column (e.g. 'DIST_sum'), replacing the two-level column index.
airline_info.columns = level0 + '_' + level1

In [63]:
airline_info.columns

Index(['DIST_sum', 'DIST_mean', 'ARR_DELAY_min', 'ARR_DELAY_max'], dtype='object')

In [64]:
airline_info.head(7)

Unnamed: 0_level_0,Unnamed: 1_level_0,DIST_sum,DIST_mean,ARR_DELAY_min,ARR_DELAY_max
AIRLINE,WEEKDAY,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AA,1,1455386,1139,-60,551
AA,2,1358256,1107,-52,725
AA,3,1496665,1117,-45,473
AA,4,1452394,1089,-46,349
AA,5,1427749,1122,-41,732
AA,6,1265340,1124,-50,858
AA,7,1461906,1100,-49,626


In [65]:
# reset_index turns the AIRLINE / WEEKDAY index levels back into ordinary columns.
airline_info.reset_index().head(7)

Unnamed: 0,AIRLINE,WEEKDAY,DIST_sum,DIST_mean,ARR_DELAY_min,ARR_DELAY_max
0,AA,1,1455386,1139,-60,551
1,AA,2,1358256,1107,-52,725
2,AA,3,1496665,1117,-45,473
3,AA,4,1452394,1089,-46,349
4,AA,5,1427749,1122,-41,732
5,AA,6,1265340,1124,-50,858
6,AA,7,1461906,1100,-49,626


In [71]:
# Mean DIST per airline, rounded to 0 decimals. as_index=False keeps the
# grouping key as a regular column instead of moving it into the index.
groupby_without_index = (
    flights
    .groupby(['AIRLINE'], as_index=False)['DIST']
    .agg('mean')
    .round(0)
)

In [72]:
groupby_without_index.index   # as_index=False: AIRLINE stays a column, so the index is the default integer range

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], dtype='int64')

In [73]:
groupby_without_index.head(7)

Unnamed: 0,AIRLINE,DIST
0,AA,1114.0
1,AS,1066.0
2,B6,1772.0
3,DL,866.0
4,EV,460.0
5,F9,970.0
6,HA,2615.0


### Using customized aggregate functions

In [74]:
college = pd.read_csv('data/college.csv')

In [79]:
college.groupby('STABBR')['UGDS'].agg(['mean', 'std']).round(0).head(10)

Unnamed: 0_level_0,mean,std
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1
AK,2493.0,4052.0
AL,2790.0,4658.0
AR,1644.0,3143.0
AS,1276.0,
AZ,4130.0,14894.0
CA,3518.0,6709.0
CO,2325.0,4670.0
CT,1874.0,2871.0
DC,2645.0,3225.0
DE,2491.0,4503.0


In [81]:
def max_deviation(s):
    """Return the largest absolute standard score found in ``s``.

    Every value is standardised by subtracting the mean of ``s`` and dividing
    by the standard deviation of ``s``; the maximum absolute result is
    returned.
    """
    centered = s - s.mean()
    z_scores = centered / s.std()
    return z_scores.abs().max()

In [82]:
college.groupby('STABBR')['UGDS'].agg(max_deviation).round(1).head(10)

STABBR
AK    2.6
AL    5.8
AR    6.3
AS    NaN
AZ    9.9
CA    6.1
CO    5.0
CT    5.6
DC    2.4
DE    3.5
Name: UGDS, dtype: float64

In [85]:
# Apply the custom aggregation to two columns at once, one value per STABBR group.
(
    college
    .groupby('STABBR')[['UGDS', 'SATVRMID']]
    .agg(max_deviation)
    .round(1)
    .head(5)
)

Unnamed: 0_level_0,UGDS,SATVRMID
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1
AK,2.6,
AL,5.8,1.6
AR,6.3,2.2
AS,,
AZ,9.9,1.9


In [87]:
# Custom and built-in aggregations can be mixed in one list. The columns are
# selected with a list: indexing a groupby with a bare tuple of names is
# deprecated (FutureWarning) and is an error in pandas 2.x.
(
    college
    .groupby(['STABBR', 'RELAFFIL'])[['UGDS', 'SATVRMID', 'SATMTMID']]
    .agg([max_deviation, 'mean', 'std'])
    .round(1)
    .head()
)

  college.groupby(['STABBR', 'RELAFFIL'])['UGDS', 'SATVRMID', 'SATMTMID'] \


Unnamed: 0_level_0,Unnamed: 1_level_0,UGDS,UGDS,UGDS,SATVRMID,SATVRMID,SATVRMID,SATMTMID,SATMTMID,SATMTMID
Unnamed: 0_level_1,Unnamed: 1_level_1,max_deviation,mean,std,max_deviation,...,std,max_deviation,mean,std
STABBR,RELAFFIL,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
AK,0,2.1,3508.9,4539.5,,...,,,,
AK,1,1.1,123.3,132.9,,...,,,503.0,
AL,0,5.2,3248.8,5102.4,1.6,...,56.5,1.7,515.8,56.7
AL,1,2.4,979.7,870.8,1.5,...,53.0,1.4,485.6,61.4
AR,0,5.8,1793.7,3401.6,1.9,...,37.9,2.0,503.6,39.0


In [88]:
max_deviation.__name__

'max_deviation'

In [89]:
# The function's __name__ is what appears as the column label in the agg
# result, so overriding it changes the label shown for this aggregation.
max_deviation.__name__ = 'Max deviation'

In [90]:
# Same aggregation as before; the custom function's column label now follows
# its current __name__. The columns are selected with a list: indexing a
# groupby with a bare tuple of names is deprecated (FutureWarning) and is an
# error in pandas 2.x.
(
    college
    .groupby(['STABBR', 'RELAFFIL'])[['UGDS', 'SATVRMID', 'SATMTMID']]
    .agg([max_deviation, 'mean', 'std'])
    .round(1)
    .head()
)

  college.groupby(['STABBR', 'RELAFFIL'])['UGDS', 'SATVRMID', 'SATMTMID'] \


Unnamed: 0_level_0,Unnamed: 1_level_0,UGDS,UGDS,UGDS,SATVRMID,SATVRMID,SATVRMID,SATMTMID,SATMTMID,SATMTMID
Unnamed: 0_level_1,Unnamed: 1_level_1,Max deviation,mean,std,Max deviation,...,std,Max deviation,mean,std
STABBR,RELAFFIL,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
AK,0,2.1,3508.9,4539.5,,...,,,,
AK,1,1.1,123.3,132.9,,...,,,503.0,
AL,0,5.2,3248.8,5102.4,1.6,...,56.5,1.7,515.8,56.7
AL,1,2.4,979.7,870.8,1.5,...,53.0,1.4,485.6,61.4
AR,0,5.8,1793.7,3401.6,1.9,...,37.9,2.0,503.6,39.0


### Passing extra arguments to a custom aggregation function

In [93]:
groupby_obj = college.groupby(['STABBR', 'RELAFFIL'])

In [94]:
import inspect
# Show which parameters agg accepts in addition to the aggregating function.
inspect.signature(groupby_obj.agg)

<Signature (func=None, *args, engine=None, engine_kwargs=None, **kwargs)>

In [95]:
def pct_between_1_3k(s):
    """Return the fraction of values in ``s`` lying between 1000 and 3000.

    Both bounds are inclusive (the default of ``Series.between``).
    """
    in_range = s.between(1000, 3000)
    return in_range.mean()

In [99]:
# Fraction of UGDS values between 1000 and 3000 in each (STABBR, RELAFFIL) group.
(
    college
    .groupby(['STABBR', 'RELAFFIL'])['UGDS']
    .agg(pct_between_1_3k)
    .head(5)
)

STABBR  RELAFFIL
AK      0           0.142857
        1           0.000000
AL      0           0.236111
        1           0.333333
AR      0           0.279412
Name: UGDS, dtype: float64

In [100]:
def pct_between(s, low, high):
    """Return the fraction of values in ``s`` lying between ``low`` and ``high``.

    Generalises the fixed 1000-3000 range to caller-supplied boundaries; both
    bounds are inclusive (the default of ``Series.between``).
    """
    in_range = s.between(low, high)
    return in_range.mean()

In [102]:
# Positional arguments after the function are handed on to pct_between as low and high.
(
    college
    .groupby(['STABBR', 'RELAFFIL'])['UGDS']
    .agg(pct_between, 1000, 3000)
    .head(5)
)

STABBR  RELAFFIL
AK      0           0.142857
        1           0.000000
AL      0           0.236111
        1           0.333333
AR      0           0.279412
Name: UGDS, dtype: float64

#### Combining a function that takes extra arguments with other aggregation functions

In [108]:
try:
    # When agg receives a list of functions, the extra keyword arguments do
    # not reach pct_between, so this call raises TypeError (shown on purpose).
    college.groupby(['STABBR', 'RELAFFIL'])['UGDS'] \
            .agg(['mean', pct_between], low=1000, high=3000)
except TypeError as error:
    error

TypeError("pct_between() missing 2 required positional arguments: 'low' and 'high'")

#### This problem can be solved with a function factory: a closure (the same mechanism Python decorators build on) that pre-binds the extra arguments

In [109]:
def make_agg_func(func, name, *args, **kwargs):
    """Build a one-argument aggregation function from ``func``.

    The returned callable takes a single argument and invokes
    ``func(argument, *args, **kwargs)`` with the extra arguments captured
    here. Its ``__name__`` is set to ``name``.
    """
    def wrapper(group):
        # Forward the incoming data together with the pre-bound arguments.
        return func(group, *args, **kwargs)

    wrapper.__name__ = name
    return wrapper

In [110]:
# Pre-bind the boundaries by keyword ...
my_agg1 = make_agg_func(pct_between, 'pct_1_3k', low=1000, high=3000)
# ... or positionally; the second argument becomes the label of the result column.
my_agg2 = make_agg_func(pct_between, 'pct_10_30k', 10000, 30000)

In [111]:
# The pre-bound functions take a single argument, so they can be listed
# alongside built-in aggregations.
(
    college
    .groupby(['STABBR', 'RELAFFIL'])['UGDS']
    .agg(['mean', my_agg1, my_agg2])
    .head()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,pct_1_3k,pct_10_30k
STABBR,RELAFFIL,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AK,0,3508.857143,0.142857,0.142857
AK,1,123.333333,0.0,0.0
AL,0,3248.774648,0.236111,0.083333
AL,1,979.722222,0.333333,0.0
AR,0,1793.691176,0.279412,0.014706
