## pandas 그룹연산

데이터를 그룹별로 파악해야 할 경우가 많이 있다. 예를 들어, 어떤 수업에서 남성과 여성 간의 학점 차이를 살펴보고 싶다면, 성별에 따라 데이터를 분리한 뒤 각각 평균과 표준편차를 구하여 두 집단 사이에 유의미한 차이가 있는지 확인하면 될 것이다. 만약 데이터에 속한 집단의 수가 많아진다면, 이러한 그룹별 연산을 수작업으로 하는 것은 매우 귀찮은 일이 될 것이다. pandas에서는 데이터프레임 객체에 속한 groupby라는 메서드를 이용하여 그룹별 데이터 분석을 수행할 수 있다.


In [1]:
#import the pandas library
import pandas as pd

# Sample records, one tuple per row: (Team, Rank, Year, Points).
_ipl_rows = [
    ('Riders', 1, 2014, 876),
    ('Riders', 2, 2015, 789),
    ('Devils', 2, 2014, 863),
    ('Devils', 3, 2015, 673),
    ('Kings', 3, 2014, 741),
    ('Kings', 4, 2015, 812),
    ('Kings', 1, 2016, 756),
    ('Kings', 1, 2017, 788),
    ('Riders', 2, 2016, 694),
    ('Royals', 4, 2014, 701),
    ('Royals', 1, 2015, 804),
    ('Riders', 2, 2017, 690),
]

# Transpose the rows into the column-name -> list-of-values mapping that
# pd.DataFrame consumes; the key order fixes the column order.
ipl_data = {
    column: list(values)
    for column, values in zip(('Team', 'Rank', 'Year', 'Points'), zip(*_ipl_rows))
}
df = pd.DataFrame(ipl_data)

df

Unnamed: 0,Team,Rank,Year,Points
0,Riders,1,2014,876
1,Riders,2,2015,789
2,Devils,2,2014,863
3,Devils,3,2015,673
4,Kings,3,2014,741
5,Kings,4,2015,812
6,Kings,1,2016,756
7,Kings,1,2017,788
8,Riders,2,2016,694
9,Royals,4,2014,701


## 1. 그룹으로 묶기

In [2]:
# Group the DataFrame rows by team.
# Evaluating the bare GroupBy object only shows its repr, not any data.
df.groupby(by='Team')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f4d142ed5f8>

### 그룹 살펴보기

In [3]:
# Keep the GroupBy object around so its contents can be inspected.
df_grouped = df.groupby(by='Team')
# Mapping of group key -> row index labels; same as df.groupby('Team').groups
df_grouped.groups

{'Devils': Int64Index([2, 3], dtype='int64'),
 'Kings': Int64Index([4, 5, 6, 7], dtype='int64'),
 'Riders': Int64Index([0, 1, 8, 11], dtype='int64'),
 'Royals': Int64Index([9, 10], dtype='int64')}

### 여러개의 기준으로 그룹 묶기

In [4]:
# Grouping by several columns yields one group per (Team, Year) combination.
df.groupby(by=['Team', 'Year']).groups

{('Devils', 2014): Int64Index([2], dtype='int64'),
 ('Devils', 2015): Int64Index([3], dtype='int64'),
 ('Kings', 2014): Int64Index([4], dtype='int64'),
 ('Kings', 2015): Int64Index([5], dtype='int64'),
 ('Kings', 2016): Int64Index([6], dtype='int64'),
 ('Kings', 2017): Int64Index([7], dtype='int64'),
 ('Riders', 2014): Int64Index([0], dtype='int64'),
 ('Riders', 2015): Int64Index([1], dtype='int64'),
 ('Riders', 2016): Int64Index([8], dtype='int64'),
 ('Riders', 2017): Int64Index([11], dtype='int64'),
 ('Royals', 2014): Int64Index([9], dtype='int64'),
 ('Royals', 2015): Int64Index([10], dtype='int64')}

### 그룹 순회하기

In [5]:
# Iterating a GroupBy yields one (group key, sub-DataFrame) tuple per group.
for key_and_frame in df.groupby(by='Year'):
    print(key_and_frame)

(2014,      Team  Rank  Year  Points
0  Riders     1  2014     876
2  Devils     2  2014     863
4   Kings     3  2014     741
9  Royals     4  2014     701)
(2015,       Team  Rank  Year  Points
1   Riders     2  2015     789
3   Devils     3  2015     673
5    Kings     4  2015     812
10  Royals     1  2015     804)
(2016,      Team  Rank  Year  Points
6   Kings     1  2016     756
8  Riders     2  2016     694)
(2017,       Team  Rank  Year  Points
7    Kings     1  2017     788
11  Riders     2  2017     690)


In [6]:
# Tuple unpacking demo: each element of x is a pair that is split in two.
x = [(1, 2), (1, 3), (2, 4)]
for pair in x:
    first, second = pair
    print(first)
    print(second)

1
2
1
3
2
4


In [7]:
# Unpack each (key, sub-DataFrame) pair and show only the group key.
for year, _year_rows in df.groupby(by='Year'):
    print(year)

2014
2015
2016
2017


In [8]:
# Show every group key followed by the rows that belong to that group.
for year, year_rows in df.groupby(by='Year'):
    print(year)
    print(year_rows)

2014
     Team  Rank  Year  Points
0  Riders     1  2014     876
2  Devils     2  2014     863
4   Kings     3  2014     741
9  Royals     4  2014     701
2015
      Team  Rank  Year  Points
1   Riders     2  2015     789
3   Devils     3  2015     673
5    Kings     4  2015     812
10  Royals     1  2015     804
2016
     Team  Rank  Year  Points
6   Kings     1  2016     756
8  Riders     2  2016     694
2017
      Team  Rank  Year  Points
7    Kings     1  2017     788
11  Riders     2  2017     690


### 그룹 선택하기

In [9]:
# Pull out the rows of a single group by its key.
by_year = df.groupby(by='Year')
by_year.get_group(2014)

Unnamed: 0,Team,Rank,Year,Points
0,Riders,1,2014,876
2,Devils,2,2014,863
4,Kings,3,2014,741
9,Royals,4,2014,701


## 2. 그룹별 연산: Aggregation
이전 예제에서 Aggregation은 다수의 데이터로부터 요약된 하나의 값을 계산하는 데 사용하였다. 마찬가지로 groupby와 함께 aggregation을 쓰면, 그룹 단위로 요약된 하나의 값을 계산한다. 예를 들면 다음과 같다:

- 그룹의 평균, 표준편차
- 그룹의 크기, 개수

In [10]:
import numpy as np
# Per-year mean of the numeric columns (Rank, Points).
# numeric_only=True is required on pandas >= 2.0: the string column 'Team'
# is no longer dropped silently, so agg(np.mean) raises TypeError there.
df.groupby('Year').mean(numeric_only=True)

Unnamed: 0_level_0,Rank,Points
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2014,2.5,795.25
2015,2.5,769.5
2016,1.5,725.0
2017,1.5,739.0


In [11]:
# Number of entries per team, reported separately for every remaining column.
by_team = df.groupby(by='Team')
by_team.agg(np.size)

Unnamed: 0_level_0,Rank,Year,Points
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Devils,2,2,2
Kings,4,4,4
Riders,4,4,4
Royals,2,2,2


In [12]:
# Restrict the aggregation to one column: the result is a Series, not a DataFrame.
points_by_team = df.groupby(by='Team')['Points']
points_by_team.agg(np.size)

Team
Devils    2
Kings     4
Riders    4
Royals    2
Name: Points, dtype: int64

In [13]:
# Several aggregations at once: one output column per function name.
# String names are used instead of np.size/np.mean/np.std because passing
# NumPy callables to agg is deprecated (FutureWarning on pandas >= 2.1).
# 'std' also pins the sample standard deviation (ddof=1) shown in the output,
# whereas a direct np.std call would use ddof=0.
df.groupby('Team')['Points'].agg(['size', 'mean', 'std'])

Unnamed: 0_level_0,size,mean,std
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Devils,2,768.0,134.350288
Kings,4,774.25,31.899582
Riders,4,762.25,88.567771
Royals,2,752.5,72.831998


## 3. 그룹단위연산: Transform
aggregation과 달리 원래 데이터프레임의 형태를 유지한다. 그룹별 대표값을 계산하고, 그룹에 속한 모든 데이터가 동일한 대표값을 갖도록 계산한다.

In [14]:
# Broadcast each team's mean back onto every row of that team, so the result
# keeps the original row index instead of collapsing to one row per group.
# The string 'mean' replaces np.mean: passing NumPy callables to transform is
# deprecated (FutureWarning on pandas >= 2.1).
df.groupby('Team').transform('mean')

Unnamed: 0,Rank,Year,Points
0,1.75,2015.5,762.25
1,1.75,2015.5,762.25
2,2.5,2014.5,768.0
3,2.5,2014.5,768.0
4,2.25,2015.5,774.25
5,2.25,2015.5,774.25
6,2.25,2015.5,774.25
7,2.25,2015.5,774.25
8,1.75,2015.5,762.25
9,2.5,2014.5,752.5


In [15]:
def standardize(x):
    """Return ``x`` centred on its mean and divided by ``x.std()``.

    ``x`` must support subtraction/division and provide a ``.std()`` method.
    The notebook passes this function to ``groupby(...).transform``, so ``x``
    is presumably a pandas Series holding one column of one group.
    """
    centered = x - np.mean(x)
    spread = x.std()
    return centered / spread

In [16]:
# Apply the user-defined standardize function within each team.
by_team = df.groupby(by='Team')
by_team.transform(standardize)

Unnamed: 0,Rank,Year,Points
0,-1.5,-1.161895,1.284327
1,0.5,-0.387298,0.302029
2,-0.707107,-0.707107,0.707107
3,0.707107,0.707107,-0.707107
4,0.5,-1.161895,-1.042333
5,1.166667,-0.387298,1.183401
6,-0.833333,0.387298,-0.572108
7,-0.833333,1.161895,0.43104
8,0.5,0.387298,-0.770596
9,0.707107,-0.707107,-0.707107


## 4. 그루핑 실전

In [17]:
import pandas as pd
import numpy as np
# Load the census table and keep only the rows whose SUMLEV equals 50.
df = pd.read_csv('census.csv')
is_level_50 = df['SUMLEV'] == 50
df = df[is_level_50]
df.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861
5,50,3,6,1,9,Alabama,Blount County,57322,57322,57373,...,1.807375,-1.177622,-1.748766,-2.062535,-1.36997,1.859511,-0.84858,-1.402476,-1.577232,-0.884411


### 그룹으로 묶을때에는 groupby를 써라

In [18]:
# Report the mean of CENSUS2010POP over the rows of each STNAME group.
for state_name, state_rows in df.groupby(by='STNAME'):
    mean_pop = np.average(state_rows['CENSUS2010POP'])
    message = ''.join((
        'Counties in state ', state_name,
        ' have an average population of ', str(mean_pop),
    ))
    print(message)

Counties in state Alabama have an average population of 71339.34328358209
Counties in state Alaska have an average population of 24490.724137931036
Counties in state Arizona have an average population of 426134.4666666667
Counties in state Arkansas have an average population of 38878.90666666667
Counties in state California have an average population of 642309.5862068966
Counties in state Colorado have an average population of 78581.1875
Counties in state Connecticut have an average population of 446762.125
Counties in state Delaware have an average population of 299311.3333333333
Counties in state District of Columbia have an average population of 601723.0
Counties in state Florida have an average population of 280616.5671641791
Counties in state Georgia have an average population of 60928.63522012578
Counties in state Hawaii have an average population of 272060.2
Counties in state Idaho have an average population of 35626.86363636364
Counties in state Illinois have an average populat

### 사용자 정의함수와 함께 그루핑도 가능

In [19]:
# Make STNAME the row index so that groupby can receive each index label.
df = df.set_index(keys='STNAME')

# A user-defined function can be applied to the index values to form groups.
def fun(item):
    """Bucket ``item`` by its first element.

    Returns 0 when ``item[0]`` sorts before 'M', 1 when it sorts before 'Q',
    and 2 otherwise. The notebook passes this to ``df.groupby`` after
    ``set_index('STNAME')``, so ``item`` is presumably a state-name string.
    """
    initial = item[0]
    # Upper bounds are checked in ascending order; the position of the first
    # bound that exceeds the initial is the bucket number.
    for bucket, upper_bound in enumerate(('M', 'Q')):
        if initial < upper_bound:
            return bucket
    return 2  # first element is 'Q' or later

# Group rows by the bucket number that fun returns for each index label.
for bucket, bucket_rows in df.groupby(by=fun):
    row_count = len(bucket_rows)
    print(''.join((
        'There are ', str(row_count),
        ' records in group ', str(bucket), ' for processing.',
    )))


There are 1177 records in group 0 for processing.
There are 1134 records in group 1 for processing.
There are 831 records in group 2 for processing.


## 5. 그룹단위 연산 실전

In [20]:
# Reload the census table (the previous section replaced its index) and
# again keep only the rows whose SUMLEV equals 50.
df = pd.read_csv('census.csv')
df = df.loc[df['SUMLEV'] == 50]

In [21]:
# A dict passed to agg maps a column name to the function applied to it.
population_spec = {'CENSUS2010POP': np.average}
df.groupby(by='STNAME').agg(population_spec)

Unnamed: 0_level_0,CENSUS2010POP
STNAME,Unnamed: 1_level_1
Alabama,71339.343284
Alaska,24490.724138
Arizona,426134.466667
Arkansas,38878.906667
California,642309.586207
Colorado,78581.1875
Connecticut,446762.125
Delaware,299311.333333
District of Columbia,601723.0
Florida,280616.567164


In [22]:
# Selecting several columns from a GroupBy gives a DataFrameGroupBy, while
# selecting a single column gives a SeriesGroupBy.
# The multi-column key must be a list: the tuple form gb['a', 'b'] was
# deprecated in pandas 1.0 and removed in pandas 2.0.
print(type(df.groupby(level=0)[['POPESTIMATE2010', 'POPESTIMATE2011']]))
print(type(df.groupby(level=0)['POPESTIMATE2010']))

<class 'pandas.core.groupby.generic.DataFrameGroupBy'>
<class 'pandas.core.groupby.generic.SeriesGroupBy'>


In [23]:
# Named aggregation: each keyword becomes an output column name.
# This replaces agg({'avg': ..., 'sum': ...}) on a SeriesGroupBy; that
# renaming-dict form was deprecated and raises SpecificationError on
# pandas >= 1.0.
(df.set_index('STNAME').groupby(level=0)['CENSUS2010POP']
    .agg(avg='mean', sum='sum'))

is deprecated and will be removed in a future version
  


Unnamed: 0_level_0,avg,sum
STNAME,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,71339.343284,4779736
Alaska,24490.724138,710231
Arizona,426134.466667,6392017
Arkansas,38878.906667,2915918
California,642309.586207,37253956
Colorado,78581.1875,5029196
Connecticut,446762.125,3574097
Delaware,299311.333333,897934
District of Columbia,601723.0,601723
Florida,280616.567164,18801310


In [24]:
# Apply mean and sum to both population columns, with columns laid out as
# (aggregation, source column).
# Two removed APIs are avoided here: tuple-style column selection
# (gb['a', 'b'], removed in pandas 2.0) and the renaming dict
# {'avg': ..., 'sum': ...} (removed in pandas 1.0).
(df.set_index('STNAME').groupby(level=0)[['POPESTIMATE2010', 'POPESTIMATE2011']]
    .agg(['mean', 'sum'])
    .rename(columns={'mean': 'avg'}, level=1)  # label the mean column 'avg'
    .swaplevel(axis=1)                         # put the aggregation name on top
    .sort_index(axis=1))                       # order: avg columns, then sum columns

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Unnamed: 0_level_0,avg,avg,sum,sum
Unnamed: 0_level_1,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2010,POPESTIMATE2011
STNAME,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Alabama,71420.313433,71658.328358,4785161,4801108
Alaska,24621.413793,24921.37931,714021,722720
Arizona,427213.866667,431248.8,6408208,6468732
Arkansas,38965.253333,39180.506667,2922394,2938538
California,643691.017241,650000.586207,37334079,37700034
Colorado,78878.96875,79991.875,5048254,5119480
Connecticut,447464.625,448719.875,3579717,3589759
Delaware,299930.333333,302638.666667,899791,907916
District of Columbia,605126.0,620472.0,605126,620472
Florida,281341.641791,285157.208955,18849890,19105533


In [25]:
# A dict keyed by existing column names applies a different aggregation to
# each column: mean for POPESTIMATE2010 and sum for POPESTIMATE2011.
# The column key is a list because tuple-style selection (gb['a', 'b']) was
# removed in pandas 2.0; string function names avoid the NumPy-callable
# deprecation warning on pandas >= 2.1.
(df.set_index('STNAME').groupby(level=0)[['POPESTIMATE2010', 'POPESTIMATE2011']]
    .agg({'POPESTIMATE2010': 'mean', 'POPESTIMATE2011': 'sum'}))

Unnamed: 0_level_0,POPESTIMATE2010,POPESTIMATE2011
STNAME,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,71420.313433,4801108
Alaska,24621.413793,722720
Arizona,427213.866667,6468732
Arkansas,38965.253333,2938538
California,643691.017241,37700034
Colorado,78878.96875,5119480
Connecticut,447464.625,3589759
Delaware,299930.333333,907916
District of Columbia,605126.0,620472
Florida,281341.641791,19105533
