# Aggregation

## 1. 종합

### 1.1. 단일열 기준 종합

In [1]:
import pandas as pd
import numpy as np

In [2]:
flights = pd.read_csv('data/flights.csv')
flights.head()

Unnamed: 0,MONTH,DAY,WEEKDAY,AIRLINE,ORG_AIR,DEST_AIR,SCHED_DEP,DEP_DELAY,AIR_TIME,DIST,SCHED_ARR,ARR_DELAY,DIVERTED,CANCELLED
0,1,1,4,WN,LAX,SLC,1625,58.0,94.0,590,1905,65.0,0,0
1,1,1,4,UA,DEN,IAD,823,7.0,154.0,1452,1333,-13.0,0,0
2,1,1,4,MQ,DFW,VPS,1305,36.0,85.0,641,1453,35.0,0,0
3,1,1,4,AA,DFW,DCA,1555,7.0,126.0,1192,1935,-7.0,0,0
4,1,1,4,WN,LAX,MCI,1720,48.0,166.0,1363,2225,39.0,0,0


In [3]:
flights.groupby('AIRLINE').agg({'ARR_DELAY':'mean'}).head() # delay의 평균을 구해줘
# AIRLINE 그룹화

Unnamed: 0_level_0,ARR_DELAY
AIRLINE,Unnamed: 1_level_1
AA,5.542661
AS,-0.833333
B6,8.692593
DL,0.339691
EV,7.03458


In [4]:
flights.groupby('AIRLINE')['ARR_DELAY'].agg('mean').head() # 데이터 프레임을 만들어

AIRLINE
AA    5.542661
AS   -0.833333
B6    8.692593
DL    0.339691
EV    7.034580
Name: ARR_DELAY, dtype: float64

In [5]:
flights.groupby('AIRLINE')['ARR_DELAY'].agg(np.mean).head()

AIRLINE
AA    5.542661
AS   -0.833333
B6    8.692593
DL    0.339691
EV    7.034580
Name: ARR_DELAY, dtype: float64

In [6]:
flights.groupby('AIRLINE')['ARR_DELAY'].mean().head()

AIRLINE
AA    5.542661
AS   -0.833333
B6    8.692593
DL    0.339691
EV    7.034580
Name: ARR_DELAY, dtype: float64

### 1.2. 복수열 기준 종합

In [7]:
flights.groupby(['AIRLINE','WEEKDAY'])['CANCELLED'].agg('sum').head()

AIRLINE  WEEKDAY
AA       1          41
         2           9
         3          16
         4          20
         5          18
Name: CANCELLED, dtype: int64

In [8]:
flights.groupby(['AIRLINE','WEEKDAY'])['CANCELLED','DIVERTED'].agg(['sum','mean']).head()

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED,CANCELLED,DIVERTED,DIVERTED
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,sum,mean
AIRLINE,WEEKDAY,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AA,1,41,0.032106,6,0.004699
AA,2,9,0.007341,2,0.001631
AA,3,16,0.011949,2,0.001494
AA,4,20,0.015004,5,0.003751
AA,5,18,0.014151,1,0.000786


In [9]:
group_cols = ['ORG_AIR','DEST_AIR'] # 출발지와 도착지를 기준으로 삼을 것
agg_dict = {'CANCELLED':['sum','mean','size'], 'AIR_TIME':['mean','var']}
flights.groupby(group_cols).agg(agg_dict).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED,CANCELLED,CANCELLED,AIR_TIME,AIR_TIME
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,size,mean,var
ORG_AIR,DEST_AIR,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
ATL,ABE,0,0.0,31,96.387097,45.778495
ATL,ABQ,0,0.0,16,170.5,87.866667
ATL,ABY,0,0.0,19,28.578947,6.590643
ATL,ACY,0,0.0,6,91.333333,11.466667
ATL,AEX,0,0.0,40,78.725,47.332692


## 2. 종합 후 처리

### 2.1. MultiIndex 제거

In [10]:
air_info = flights.groupby(group_cols).agg(agg_dict)
air_info.head() # as_index=False는 행를 만들지 않고 바로 열로 만든다

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED,CANCELLED,CANCELLED,AIR_TIME,AIR_TIME
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,size,mean,var
ORG_AIR,DEST_AIR,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
ATL,ABE,0,0.0,31,96.387097,45.778495
ATL,ABQ,0,0.0,16,170.5,87.866667
ATL,ABY,0,0.0,19,28.578947,6.590643
ATL,ACY,0,0.0,6,91.333333,11.466667
ATL,AEX,0,0.0,40,78.725,47.332692


In [11]:
# get_level_values : 요구 된 레벨의 인덱스를 돌려줍니다.
level0 = air_info.columns.get_level_values(0)
level0

Index(['CANCELLED', 'CANCELLED', 'CANCELLED', 'AIR_TIME', 'AIR_TIME'], dtype='object')

In [12]:
level1 = air_info.columns.get_level_values(1)
level1

Index(['sum', 'mean', 'size', 'mean', 'var'], dtype='object')

In [13]:
air_info.columns = level0+'_'+level1
air_info.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED_sum,CANCELLED_mean,CANCELLED_size,AIR_TIME_mean,AIR_TIME_var
ORG_AIR,DEST_AIR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ATL,ABE,0,0.0,31,96.387097,45.778495
ATL,ABQ,0,0.0,16,170.5,87.866667
ATL,ABY,0,0.0,19,28.578947,6.590643
ATL,ACY,0,0.0,6,91.333333,11.466667
ATL,AEX,0,0.0,40,78.725,47.332692


In [14]:
air_info.reset_index().head()

Unnamed: 0,ORG_AIR,DEST_AIR,CANCELLED_sum,CANCELLED_mean,CANCELLED_size,AIR_TIME_mean,AIR_TIME_var
0,ATL,ABE,0,0.0,31,96.387097,45.778495
1,ATL,ABQ,0,0.0,16,170.5,87.866667
2,ATL,ABY,0,0.0,19,28.578947,6.590643
3,ATL,ACY,0,0.0,6,91.333333,11.466667
4,ATL,AEX,0,0.0,40,78.725,47.332692


In [15]:
air_info = flights.groupby(group_cols, as_index=False).agg(agg_dict)
air_info.head()

Unnamed: 0_level_0,ORG_AIR,DEST_AIR,CANCELLED,CANCELLED,CANCELLED,AIR_TIME,AIR_TIME
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,mean,size,mean,var
0,ATL,ABE,0,0.0,31,96.387097,45.778495
1,ATL,ABQ,0,0.0,16,170.5,87.866667
2,ATL,ABY,0,0.0,19,28.578947,6.590643
3,ATL,ACY,0,0.0,6,91.333333,11.466667
4,ATL,AEX,0,0.0,40,78.725,47.332692


### 2.2. 사용자 정의함수 사용

In [16]:
def pct_between(s):
    return s.between(-100, 60).mean()

In [17]:
flights.groupby(['AIRLINE'])['DEP_DELAY','ARR_DELAY'].agg(pct_between)

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,DEP_DELAY,ARR_DELAY
AIRLINE,Unnamed: 1_level_1,Unnamed: 2_level_1
AA,0.916854,0.915281
AS,0.972656,0.971354
B6,0.915285,0.907919
DL,0.959249,0.958211
EV,0.91789,0.91294
F9,0.91344,0.900532
HA,0.982143,0.973214
MQ,0.897148,0.889369
NK,0.868734,0.852902
OO,0.911809,0.905282


In [18]:
flights.groupby(['AIRLINE'])['DEP_DELAY','ARR_DELAY'].agg(['mean', pct_between])

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,DEP_DELAY,DEP_DELAY,ARR_DELAY,ARR_DELAY
Unnamed: 0_level_1,mean,pct_between,mean,pct_between
AIRLINE,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
AA,11.274057,0.916854,5.542661,0.915281
AS,1.808594,0.972656,-0.833333,0.971354
B6,14.287823,0.915285,8.692593,0.907919
DL,7.24245,0.959249,0.339691,0.958211
EV,9.092625,0.91789,7.03458,0.91294
F9,14.310398,0.91344,13.630651,0.900532
HA,2.571429,0.982143,4.972973,0.973214
MQ,11.094323,0.897148,6.860591,0.889369
NK,19.514401,0.868734,18.43607,0.852902
OO,9.662279,0.911809,7.593463,0.905282


In [19]:
pct_between.__name__

'pct_between'

In [20]:
pct_between.__name__ = 'Non_Delay Ratio'

In [21]:
def pct_between(s, low, high):
    return s.between(low, high).mean()

In [22]:
flights.groupby(['AIRLINE'])['DEP_DELAY','ARR_DELAY'].agg(pct_between, -10, 10)

  """Entry point for launching an IPython kernel.


AttributeError: 'DataFrame' object has no attribute 'between'