# GroupBy 메카닉

In [1]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

In [2]:
# 그룹 연산-1. 분리: 객체에 들어 있는 데이터를 하나 이상의 키를 기준으로 분리
#           2. 적용: 함수를 각 그룹에 적용시켜 새로운 값을 얻어 냄 
#           3. 결합: 함수를 적용한 결과를 하나의 객체로 결합

In [3]:
# DataFrame으로 표현되는 간단한 표 형식의 데이터

df=pd.DataFrame({'key1':['a','a','b','b','a'],
                'key2':['one','two','one','two','one'],
                'data1':np.random.randn(5),
                'data2':np.random.randn(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.734206,1.883985
1,a,two,1.480034,-1.285723
2,b,one,-0.477649,0.80135
3,b,two,2.320956,-0.805204
4,a,one,0.116809,-1.322337


In [6]:
grouped=df['data1'].groupby(df['key1']) # grouped변수는 GroupBy객체, 그룹 연산을 위해 필요한 모든 정보를 가지고 있다.
grouped 

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001E6D20D6D90>

In [7]:
grouped.mean() # key1로 묶고 각 그룹에서 data1의 평균을 구하기
               # 데이터(Series객체)가 그룹 색인에 따라 수집되고 key1컬럼에 있는 유일한 값으로 색인되는 새로운 Series객체 생성

key1
a    0.287545
b    0.921653
Name: data1, dtype: float64

In [9]:
means=df['data1'].groupby([df['key1'],df['key2']]).mean() # 여러 개의 배열을 리스트로 넘겼을 때
means # 계층적 색인을 가지는 Series를 얻음

key1  key2
a     one    -0.308699
      two     1.480034
b     one    -0.477649
      two     2.320956
Name: data1, dtype: float64

In [10]:
means.unstack() 

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.308699,1.480034
b,-0.477649,2.320956


In [11]:
states=np.array(['Ohio','California','California','Ohio','Ohio'])
years=np.array([2005,2005,2006,2005,2006])

In [12]:
df['data1'].groupby([states,years]).mean()

California  2005    1.480034
            2006   -0.477649
Ohio        2005    0.793375
            2006    0.116809
Name: data1, dtype: float64

In [13]:
df.groupby('key1').mean() # 컬럼 이름을 넘겨 그룹의 색인으로 사용

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.287545,-0.241358
b,0.921653,-0.001927


In [14]:
df.groupby(['key1','key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.308699,0.280824
a,two,1.480034,-1.285723
b,one,-0.477649,0.80135
b,two,2.320956,-0.805204


In [15]:
df.groupby(['key1','key2']).size() # size메서드는 그룹의 크기를 담고있는 Series 반환, 누락된 값은 결과에 제외

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

* 그룹 간 순회하기

In [16]:
for name, group in df.groupby('key1'): # 그룹 이름과 그에 따른 데이터 묶음을 튜플로 반환
    print(name)
    print(group)

a
  key1 key2     data1     data2
0    a  one -0.734206  1.883985
1    a  two  1.480034 -1.285723
4    a  one  0.116809 -1.322337
b
  key1 key2     data1     data2
2    b  one -0.477649  0.801350
3    b  two  2.320956 -0.805204


In [17]:
for (k1,k2), group in df.groupby(['key1','key2']): # 색인이 여럿 존재하는 경우는 튜플의 첫번째 원소가 색인값
    print((k1,k2))
    print(group)

('a', 'one')
  key1 key2     data1     data2
0    a  one -0.734206  1.883985
4    a  one  0.116809 -1.322337
('a', 'two')
  key1 key2     data1     data2
1    a  two  1.480034 -1.285723
('b', 'one')
  key1 key2     data1    data2
2    b  one -0.477649  0.80135
('b', 'two')
  key1 key2     data1     data2
3    b  two  2.320956 -0.805204


In [18]:
pieces=dict(list(df.groupby('key1'))) # 원하는 데이터만 골라내기: 그룹별 사전형으로 쉽게 바꿔 사용
pieces['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,-0.477649,0.80135
3,b,two,2.320956,-0.805204


In [19]:
df.dtypes 

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [22]:
grouped=df.groupby(df.dtypes,axis=1) # df의 컬럼을 dtype에 따라 묶기

In [24]:
for dtype, group in grouped:
    print(dtype)
    print(group)

float64
      data1     data2
0 -0.734206  1.883985
1  1.480034 -1.285723
2 -0.477649  0.801350
3  2.320956 -0.805204
4  0.116809 -1.322337
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


* 컬럼이나 컬럼의 일부만 선택하기

In [26]:
df.groupby(['key1','key2'])[['data2']].mean() # data2 컬럼에 대해서만 평균을 구하고 결과를 DataFrame으로 반환

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.280824
a,two,-1.285723
b,one,0.80135
b,two,-0.805204


In [27]:
s_grouped=df.groupby(['key1','key2'])['data2']
s_grouped # 단일 값으로 하나의 컬럼 이름만 넘겼을 경우 SeriesGroupBy 객체가 됨

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001E6D21FF970>

In [28]:
s_grouped.mean()

key1  key2
a     one     0.280824
      two    -1.285723
b     one     0.801350
      two    -0.805204
Name: data2, dtype: float64

* 사전과 Series에서 그루핑하기

In [30]:
people =pd.DataFrame(np.random.randn(5,5),
                    columns=['a','b','c','d','e'],
                    index=['Joe','Steve','Wes','Jim','Travis'])

In [31]:
people.iloc[2:3,[1,2]] =np.nan

In [32]:
people

Unnamed: 0,a,b,c,d,e
Joe,0.244047,-0.646148,1.76449,-1.061129,1.235845
Steve,-0.6325,1.224451,-0.518803,0.280161,0.389693
Wes,0.904254,,,-1.293899,0.56402
Jim,-0.61192,-1.216418,0.618194,-0.732962,1.407721
Travis,0.635747,1.863093,-0.205991,1.917633,-1.214859


In [33]:
mapping={'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'} 

In [36]:
by_column=people.groupby(mapping,axis=1) # 사전을 groupby메서드로 넘기기
by_column.sum()

Unnamed: 0,blue,red
Joe,0.70336,0.833744
Steve,-0.238642,0.981644
Wes,-1.293899,1.468274
Jim,-0.114768,-0.420617
Travis,1.711642,1.283981


In [37]:
map_series=pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [38]:
people.groupby(map_series,axis=1).sum() # Series도 같은 기능 수행 가능, 고정된 크기의 맵이라고 볼 수 있다.

Unnamed: 0,blue,red
Joe,0.70336,0.833744
Steve,-0.238642,0.981644
Wes,-1.293899,1.468274
Jim,-0.114768,-0.420617
Travis,1.711642,1.283981


* 함수로 그룹핑하기

In [39]:
people.groupby(len).sum() # 이름의 길이별로 그룹을 묶은 후 sum

Unnamed: 0,a,b,c,d,e
3,0.536381,-1.862566,2.382684,-3.08799,3.207586
5,-0.6325,1.224451,-0.518803,0.280161,0.389693
6,0.635747,1.863093,-0.205991,1.917633,-1.214859


In [40]:
key_list=['one','one','one','two','two']
people.groupby([len,key_list]).min() # 함수를 배열, 사전 또는 Series와 섞어 쓰기 가능

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,0.244047,-0.646148,1.76449,-1.293899,0.56402
3,two,-0.61192,-1.216418,0.618194,-0.732962,1.407721
5,one,-0.6325,1.224451,-0.518803,0.280161,0.389693
6,two,0.635747,1.863093,-0.205991,1.917633,-1.214859


* 색인 단계로 그룹핑하기

In [42]:
columns=pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],[1,3,5,1,3]],
                                  names=['cty','tenor'])

In [43]:
hier_df=pd.DataFrame(np.random.randn(4,5), columns=columns)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-2.00903,0.361691,0.399129,0.429639,1.175837
1,-1.037055,-0.985857,-0.682482,1.065909,0.712283
2,-0.049134,-0.174243,1.381494,0.154226,2.666353
3,0.894803,1.548023,0.535767,-0.275265,-1.428094


In [44]:
hier_df.groupby(level='cty',axis=1).count() # level옵션으로 축 색인 단계 중 하나를 사용하여 편리하게 집계

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


# 데이터 집계

: 배열로부터 스칼라값을 만들어내는 모든 데이터 변환 작업

In [45]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.734206,1.883985
1,a,two,1.480034,-1.285723
2,b,one,-0.477649,0.80135
3,b,two,2.320956,-0.805204
4,a,one,0.116809,-1.322337


In [46]:
grouped=df.groupby('key1')

In [48]:
grouped['data1'].quantile(0.9) # quatile 메서드는 %분위수를 계산
                               # 내부적으로 GroupBy는 Series를 효과적으로 잘게 자르고 각 조각에 대해 piece.quantile(0.9) 호춯

key1
a    1.207389
b    2.041096
Name: data1, dtype: float64

In [49]:
# 자신만의 데이터 집계함수를 사용하려면 배열의 aggregate나 agg메서드에 해당 함수를 넘긴다.

def peak_to_peak(arr):
    return arr.max()-arr.min()

In [51]:
grouped.agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2.21424,3.206322
b,2.798606,1.606554


In [52]:
grouped.describe() # describe 같은 메서드는 데이터를 집계하지 않는데도 잘 작동

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,0.287545,1.11695,-0.734206,-0.308699,0.116809,0.798421,1.480034,3.0,-0.241358,1.840692,-1.322337,-1.30403,-1.285723,0.299131,1.883985
b,2.0,0.921653,1.978913,-0.477649,0.222002,0.921653,1.621305,2.320956,2.0,-0.001927,1.136005,-0.805204,-0.403566,-0.001927,0.399711,0.80135


* 컬럼에 여러가지 함수 적용

In [55]:
tips=pd.read_csv('examples/tips.csv') # read_csv함수로 데이터를 불러온 후 다음 팁의 비율을 담기 위한 컬럼인 tip_pct를 추가

In [56]:
tips['tip_pct']=tips['tip']/tips['total_bill'] # total_bill에서 팁의 비율 추가
tips[:6]

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808
5,25.29,4.71,No,Sun,Dinner,4,0.18624


In [57]:
# Series나 DF의 모든 컬럼을 집계하는 것은 mean같은 메서드나 원하는 함수에 aggregate사용
# 하지만 컬럼에 따라 다른 함수를 사용하여 집계를 수행하거나 여러 개의 함수를 한 번에 적용하기 원한다면?

grouped=tips.groupby(['day','smoker']) #tips를 day와 smoker별로 묶기

In [58]:
grouped_pct=grouped['tip_pct']

In [59]:
grouped_pct.agg('mean') # 기술 통계에서는 함수 이름을 문자열로 넘기기

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [60]:
grouped_pct.agg(['mean','std',peak_to_peak]) # 함수 목록이나 함수 이름을 넘기면 함수 이름을 컬럼 이름으로 하는 DataFrame 반환

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


In [62]:
grouped_pct.agg([('foo','mean'),('bar',np.std)])# 이름과 함수가 담긴 (name, func) 튜플 리스트를 넘기면 첫번째 원소가 컬럼 이름으로 사용

Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.028123
Fri,Yes,0.174783,0.051293
Sat,No,0.158048,0.039767
Sat,Yes,0.147906,0.061375
Sun,No,0.160113,0.042347
Sun,Yes,0.18725,0.154134
Thur,No,0.160298,0.038774
Thur,Yes,0.163863,0.039389


In [67]:
# DataFrame은 컬럼마다 다른 함수를 적용하거나 여러 개의 함수를 모든 컬럼에 적용할 수 있다.

functions=['count','mean','max']

In [70]:
result=grouped['tip_pct','total_bill'].agg(functions)
result # 각 컬럼을 따로 계산-> concat메서드로 keys인자로 컬럼 이름을 넘겨-> 이어붙이는 것과 동일

  result=grouped['tip_pct','total_bill'].agg(functions)


Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


In [71]:
result['tip_pct']

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,4,0.15165,0.187735
Fri,Yes,15,0.174783,0.26348
Sat,No,45,0.158048,0.29199
Sat,Yes,42,0.147906,0.325733
Sun,No,57,0.160113,0.252672
Sun,Yes,19,0.18725,0.710345
Thur,No,45,0.160298,0.266312
Thur,Yes,17,0.163863,0.241255


In [80]:
ftuples=[('Durchschnitt','mean'),('Abweichung',np.var)] # 튜플로 넘기기 가능

In [81]:
grouped['tip_pct','total_bill'].agg(ftuples)

  grouped['tip_pct','total_bill'].agg(ftuples)


Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Durchschnitt,Abweichung,Durchschnitt,Abweichung
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,0.15165,0.000791,18.42,25.596333
Fri,Yes,0.174783,0.002631,16.813333,82.562438
Sat,No,0.158048,0.001581,19.661778,79.908965
Sat,Yes,0.147906,0.003767,21.276667,101.387535
Sun,No,0.160113,0.001793,20.506667,66.09998
Sun,Yes,0.18725,0.023757,24.12,109.046044
Thur,No,0.160298,0.001503,17.113111,59.625081
Thur,Yes,0.163863,0.001551,19.190588,69.808518


In [82]:
grouped.agg({'tip':np.max, 'size':'sum'}) # agg메서드에 컬럼 이름에 대응하는 함수가 들어있는 사전을 넘기면 컬럼마다 다른 함수를 적용

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,3.5,9
Fri,Yes,4.73,31
Sat,No,9.0,115
Sat,Yes,10.0,104
Sun,No,6.0,167
Sun,Yes,6.5,49
Thur,No,6.7,112
Thur,Yes,5.0,40


In [83]:
grouped.agg({'tip_pct':['min','max','mean','std'], 'size':'sum'}) # 단 하나의 컬럼에라도 여러 개의 함수가 적용디면 DF는 계층적인 컬럼

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Fri,No,0.120385,0.187735,0.15165,0.028123,9
Fri,Yes,0.103555,0.26348,0.174783,0.051293,31
Sat,No,0.056797,0.29199,0.158048,0.039767,115
Sat,Yes,0.035638,0.325733,0.147906,0.061375,104
Sun,No,0.059447,0.252672,0.160113,0.042347,167
Sun,Yes,0.06566,0.710345,0.18725,0.154134,49
Thur,No,0.072961,0.266312,0.160298,0.038774,112
Thur,Yes,0.090014,0.241255,0.163863,0.039389,40


* 색인되지 않은 형태로 집계된 데이터 반환

In [84]:
tips.groupby(['day','smoker'], as_index=False).mean() # as_index=False를 넘겨 유일한 그룹키 조합으로 색인되어 반환되지 않게 함

Unnamed: 0,day,smoker,total_bill,tip,size,tip_pct
0,Fri,No,18.42,2.8125,2.25,0.15165
1,Fri,Yes,16.813333,2.714,2.066667,0.174783
2,Sat,No,19.661778,3.102889,2.555556,0.158048
3,Sat,Yes,21.276667,2.875476,2.47619,0.147906
4,Sun,No,20.506667,3.167895,2.929825,0.160113
5,Sun,Yes,24.12,3.516842,2.578947,0.18725
6,Thur,No,17.113111,2.673778,2.488889,0.160298
7,Thur,Yes,19.190588,3.03,2.352941,0.163863


# Apply: 일반적인 분리-적용-병합

: apply메서드는 객체를 여러 조각으로 나누고, 전달된 함수를 각 조각에 일괄 적용한 후 이를 다시 합친다.

In [85]:
def top(df, n=5, column='tip_pct'): 
    return df.sort_values(by=column)[-n:] # 특정 컬럼에서 가장 큰 값을 가지는 로우를 선택하는 함수

In [86]:
top(tips, n=6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [87]:
tips.groupby('smoker').apply(top) # 흡연자 그룹에 대해 top을 apply함

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [88]:
tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill') # apply로 넘길 함수가 추가적인 인자를 받는다면 함수 이름 뒤에

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Yes,Thur,Lunch,4,0.115982


In [90]:
result=tips.groupby('smoker')['tip_pct'].describe()
result

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


In [91]:
result.unstack()

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

* 그룹 색인 생략

In [95]:
tips.groupby('smoker',group_keys=False).apply(top)
# groupby메서드에 group_keys=False를 넘겨 반환된 객체에 원본 객체의 각 조각에 대한 색인과 그룹 키가 계층적 색인으로 사용되지 않게 함

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
88,24.71,5.85,No,Thur,Lunch,2,0.236746
185,20.69,5.0,No,Sun,Dinner,5,0.241663
51,10.29,2.6,No,Sun,Dinner,2,0.252672
149,7.51,2.0,No,Thur,Lunch,2,0.266312
232,11.61,3.39,No,Sat,Dinner,2,0.29199
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


* 변위치 분석과 버킷 분석

In [96]:
# pandas의 cut과 qcut을 사용하면 선택한 크기만큼 혹은 표본 변위치에 따라 데이터를 나눌 수 있다.

frame=pd.DataFrame({'data1':np.random.randn(1000),
                   'data2':np.random.randn(1000)})

In [97]:
quartiles=pd.cut(frame.data1, 4)
quartiles[:10] 

0    (-3.086, -1.387]
1      (0.305, 1.997]
2     (-1.387, 0.305]
3      (0.305, 1.997]
4     (-1.387, 0.305]
5     (-1.387, 0.305]
6     (-1.387, 0.305]
7     (-1.387, 0.305]
8     (-1.387, 0.305]
9      (0.305, 1.997]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.086, -1.387] < (-1.387, 0.305] < (0.305, 1.997] < (1.997, 3.69]]

In [98]:
def get_stats(group):
    return{'min':group.min(),
          'max':group.max(),
          'count':group.count(),
          'mean':group.mean()}

In [99]:
grouped=frame.data2.groupby(quartiles) # cut에서 반환된 Categorical객체는 groupby로 넘길 수 있다.

In [100]:
grouped.apply(get_stats).unstack() # 등간격 버킷

Unnamed: 0_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.086, -1.387]",-2.102911,2.458016,78.0,0.165879
"(-1.387, 0.305]",-2.925143,4.060299,523.0,0.073939
"(0.305, 1.997]",-2.928194,2.424154,370.0,-0.034382
"(1.997, 3.69]",-2.377398,1.552067,29.0,0.102081


In [101]:
# 표본 변위치에 기반하여 크기가 같은 버킷을 계산하려면 qcut사용

grouping=pd.qcut(frame.data1, 10, labels=False) # labels=False를 넘겨 변위치 숫자 구하기

In [102]:
grouped=frame.data2.groupby(grouping)

In [103]:
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,-2.102911,2.458016,100.0,0.139355
1,-1.733515,3.52626,100.0,0.110105
2,-2.120488,3.105238,100.0,0.168498
3,-2.925143,4.060299,100.0,-0.032686
4,-1.800074,2.542062,100.0,0.011366
5,-2.500974,3.289953,100.0,0.115747
6,-1.975285,2.424154,100.0,-0.075893
7,-2.910824,2.347966,100.0,0.094431
8,-2.928194,1.813454,100.0,-0.115511
9,-2.687388,2.026199,100.0,0.003064


* 예제: 그룹에 따른 값으로 결측치 채우기

In [104]:
# 누락된 값을 평균값으로 대체하는 예제

s=pd.Series(np.random.randn(6))

In [105]:
s[::2]=np.nan
s

0         NaN
1    0.558256
2         NaN
3    1.124158
4         NaN
5   -0.499001
dtype: float64

In [106]:
s.fillna(s.mean()) # fillna메서드를 사용하여 누락된 값을 고정된 값이나 혹은 데이터로부터 도출된 값으로 채움

0    0.394471
1    0.558256
2    0.394471
3    1.124158
4    0.394471
5   -0.499001
dtype: float64

In [107]:
# 그룹별로 채워 넣고 싶은 값이 다를 때

states=['Ohio','New York','Vermont','Florida','Oregan','Nevada','California','Idaho']

In [108]:
group_key=['East']*4 + ['West']*4 # [East]리스트 안에 있는 네 벌의 원소를 이어붙인다. 

In [109]:
data=pd.Series(np.random.randn(8),index=states)
data

Ohio         -0.030587
New York     -1.325420
Vermont      -0.530526
Florida      -1.115701
Oregan        1.138898
Nevada        2.511588
California   -1.145290
Idaho         0.300700
dtype: float64

In [110]:
data[['Vermont','Nevada','Idaho']]=np.nan
data

Ohio         -0.030587
New York     -1.325420
Vermont            NaN
Florida      -1.115701
Oregan        1.138898
Nevada             NaN
California   -1.145290
Idaho              NaN
dtype: float64

In [111]:
data.groupby(group_key).mean()

East   -0.823903
West   -0.003196
dtype: float64

In [112]:
fill_mean=lambda g:g.fillna(g.mean()) 

In [113]:
data.groupby(group_key).apply(fill_mean) # 데이터를 그룹으로 나누고 apply함수를 사용해 각 그룹에 대해 fillna적용

Ohio         -0.030587
New York     -1.325420
Vermont      -0.823903
Florida      -1.115701
Oregan        1.138898
Nevada       -0.003196
California   -1.145290
Idaho        -0.003196
dtype: float64

In [114]:
# 그룹에 따라 미리 정의된 다른 값으로 채워 넣어야 할 경우

fill_values={'East':0.5, 'West':-1}

In [115]:
fill_func = lambda g: g.fillna(fill_values[g.name]) # name속성을 이용

In [116]:
data.groupby(group_key).apply(fill_func)

Ohio         -0.030587
New York     -1.325420
Vermont       0.500000
Florida      -1.115701
Oregan        1.138898
Nevada       -1.000000
California   -1.145290
Idaho        -1.000000
dtype: float64

* 예제: 랜덤 표본과 순열

In [119]:
# 대용량의 데이터를 몬테카를로 시뮬레이션이나 다른 애플리케이션에 사용하기 위해 랜덤 표본을 뽑아낸다고 생각해보자
# Series의 sample메서드를 사용

suits=['H','S','C','D'] # 하트, 스페이스, 클럽, 다이아몬드
card_val=(list(range(1,11)) + [10]*3)*4
base_names=['A']+list(range(2,11))+['J','K','Q']
cards=[]
for suit in ['H','S','C','D']:
    cards.extend(str(num) + suit for num in base_names)
    
deck=pd.Series(card_val, index=cards)

In [120]:
deck[:13]

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
dtype: int64

In [121]:
def draw(deck, n=5): # 5장의 카드 뽑기
    return deck.sample(n)

draw(deck)

9C    9
9S    9
AD    1
4S    4
8H    8
dtype: int64

In [122]:
# 각 세트 별로 2장의 카드를 무작위로 뽑고 싶다고 가정

get_suit=lambda card: card[-1] # 마지막 글자가 세트

In [123]:
deck.groupby(get_suit).apply(draw, n=2) 

C  5C     5
   AC     1
D  JD    10
   AD     1
H  6H     6
   5H     5
S  6S     6
   9S     9
dtype: int64

In [124]:
deck.groupby(get_suit, group_keys=False).apply(draw, n=2) 

8C     8
QC    10
6D     6
9D     9
5H     5
JH    10
4S     4
6S     6
dtype: int64

* 예제: 그룹 가중 평균과 상관관계

In [129]:
# groupby의 나누고 적용하고 합치는 패러다임에서 DF의 컬럼 간 연산이나 두 Series 간의 연산은 일상적인 일
# 그룹 키와 값 그리고 어떤 가중치를 갖는 데이터 묶음을 살펴보자

df=pd.DataFrame({'category':['a','a','a','a','b','b','b','b'],
                'data':np.random.randn(8),
                'weights':np.random.randn(8)})
df

Unnamed: 0,category,data,weights
0,a,-0.386522,0.10223
1,a,0.209458,0.865827
2,a,-0.589541,0.464323
3,a,-1.321659,-1.123712
4,b,-0.77854,0.486872
5,b,-2.540201,-0.399019
6,b,0.36599,0.006149
7,b,-0.618029,-0.554968


In [130]:
grouped=df.groupby('category')

In [131]:
get_wavg=lambda g:np.average(g['data'], weights=g['weights']) 

In [132]:
grouped.apply(get_wavg) # category별 그룹 가중 평균

category
a    4.384219
b   -2.125482
dtype: float64

In [133]:
# 파이낸스에서 가져온 몇몇 주식과 S&P 500 지수(종목코드 SPX)의 종가 데이터 살펴보기

close_px=pd.read_csv('examples/stock_px_2.csv',
                    parse_dates=True,
                    index_col=0)
close_px.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2214 entries, 2003-01-02 to 2011-10-14
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AAPL    2214 non-null   float64
 1   MSFT    2214 non-null   float64
 2   XOM     2214 non-null   float64
 3   SPX     2214 non-null   float64
dtypes: float64(4)
memory usage: 86.5 KB


In [134]:
close_px[-4:]

Unnamed: 0,AAPL,MSFT,XOM,SPX
2011-10-11,400.29,27.0,76.27,1195.54
2011-10-12,402.19,26.96,77.16,1207.25
2011-10-13,408.43,27.18,76.37,1203.66
2011-10-14,422.0,27.27,78.11,1224.58


In [135]:
spx_corr=lambda x: x.corrwith(x['SPX']) # 우선 'SPX'컬럼과 다른 컬럼의 상관관계를 계산하는 함수를 만든다.

In [136]:
rets=close_px.pct_change().dropna() # pct_change함수를 이용해서 close_px의 퍼센트 변화율을 계산

In [137]:
get_year=lambda x:x.year # datetime에서 연도 속성만 반환하는 함수를 만든다.

In [138]:
by_year=rets.groupby(get_year) # 연도별 퍼센트 변화율을 구한다.

In [139]:
by_year.apply(spx_corr)

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003,0.541124,0.745174,0.661265,1.0
2004,0.374283,0.588531,0.557742,1.0
2005,0.46754,0.562374,0.63101,1.0
2006,0.428267,0.406126,0.518514,1.0
2007,0.508118,0.65877,0.786264,1.0
2008,0.681434,0.804626,0.828303,1.0
2009,0.707103,0.654902,0.797921,1.0
2010,0.710105,0.730118,0.839057,1.0
2011,0.691931,0.800996,0.859975,1.0


In [140]:
# 두 컬럼 간의 상관관계를 계산

by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))

2003    0.480868
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
dtype: float64

* 예제: 그룹상의 선형회귀

In [141]:
# 계량경제 라이브러리인 statsmodels를 사용하여 regress함수를 작성하고 각 데이터 묶음마다 최소제곱으로 회귀 수행

import statsmodels.api as sm

def regress(data, yvar, xvars):
    Y = data[yvar]
    X = data[xvars]
    X['intercept']=1
    result=sm.OLS(Y,X).fit()
    return result.params

In [142]:
by_year.apply(regress,'AAPL',['SPX']) # SPX수익률에 대한 애플 주식의 연간 선형회귀

Unnamed: 0,SPX,intercept
2003,1.195406,0.00071
2004,1.363463,0.004201
2005,1.766415,0.003246
2006,1.645496,8e-05
2007,1.198761,0.003438
2008,0.968016,-0.00111
2009,0.879103,0.002954
2010,1.052608,0.001261
2011,0.806605,0.001514


# 피벗테이블과 교차일람표 

: 피벗테이블은 스프레드시트 프로그램과 그 외 다른 데이터 분석 소프트웨어에서 볼 수 있는 데이터 요약화 도구

 데이터를 하나 이상의 키로 수집해서 어떤 키는 로우에, 어떤 키는 컬럼에 나열해서 데이터 정렬

In [145]:
# groupby를 이용하여 피벗테이블은 계층적 색인을 활용한 재형성 연산을 가능하게 해줌
# 요일(day)과 흡연자(smoker)집단에서 평균(pivot_table의 기본연산)을 구해보자

tips.pivot_table(index=['day','smoker']) # pivot_table메서드는 마진이라고 하는 부분합을 추가할 수 있는 기능 제공

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,tip_pct,total_bill
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fri,No,2.25,2.8125,0.15165,18.42
Fri,Yes,2.066667,2.714,0.174783,16.813333
Sat,No,2.555556,3.102889,0.158048,19.661778
Sat,Yes,2.47619,2.875476,0.147906,21.276667
Sun,No,2.929825,3.167895,0.160113,20.506667
Sun,Yes,2.578947,3.516842,0.18725,24.12
Thur,No,2.488889,2.673778,0.160298,17.113111
Thur,Yes,2.352941,3.03,0.163863,19.190588


In [147]:
tips.pivot_table(['tip_pct','size'], index=['time','day'], columns='smoker')
# tip_pct와 size에 대해 집계를 하고, 날짜(time)별로 그룹지어보기 위해 day로우와 smoker컬럼 추가

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,No,Yes
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Dinner,Fri,2.0,2.222222,0.139622,0.165347
Dinner,Sat,2.555556,2.47619,0.158048,0.147906
Dinner,Sun,2.929825,2.578947,0.160113,0.18725
Dinner,Thur,2.0,,0.159744,
Lunch,Fri,3.0,1.833333,0.187735,0.188937
Lunch,Thur,2.5,2.352941,0.160311,0.163863


In [148]:
tips.pivot_table(['tip_pct','size'], index=['time','day'], columns='smoker',margins=True)
# margins=True를 넘겨 부분합을 포함하도록 확장, All컬럼과 All로우가 추가되어 단일 줄 안에 그룹 통계를 얻을 수 있음
# All 컬럼은 흡연자와 비흡연자를 구분하지 않은 평균값, All로우는 로우에서 두 단계를 묶은 그룹의 평균값

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,All,No,Yes,All
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Dinner,Fri,2.0,2.222222,2.166667,0.139622,0.165347,0.158916
Dinner,Sat,2.555556,2.47619,2.517241,0.158048,0.147906,0.153152
Dinner,Sun,2.929825,2.578947,2.842105,0.160113,0.18725,0.166897
Dinner,Thur,2.0,,2.0,0.159744,,0.159744
Lunch,Fri,3.0,1.833333,2.0,0.187735,0.188937,0.188765
Lunch,Thur,2.5,2.352941,2.459016,0.160311,0.163863,0.161301
All,,2.668874,2.408602,2.569672,0.159328,0.163196,0.160803


In [149]:
# 다른 집계함수를 사용하려면 aggfunc로 넘기면 되지만 'count'나 len함수는 그룹 크기의 교차일람표(총 개수나 빈도)를 반환

tips.pivot_table('tip_pct', index=['time','smoker'],columns='day',
                aggfunc=len, margins=True)

Unnamed: 0_level_0,day,Fri,Sat,Sun,Thur,All
time,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dinner,No,3.0,45.0,57.0,1.0,106.0
Dinner,Yes,9.0,42.0,19.0,,70.0
Lunch,No,1.0,,,44.0,45.0
Lunch,Yes,6.0,,,17.0,23.0
All,,19.0,87.0,76.0,62.0,244.0


In [150]:
tips.pivot_table('tip_pct', index=['time','size','smoker'],
                columns='day', aggfunc='mean', fill_value=0) # 어떤 조합이 비어있다면 fill_value를 넘길 수도 있다.

Unnamed: 0_level_0,Unnamed: 1_level_0,day,Fri,Sat,Sun,Thur
time,size,smoker,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dinner,1,No,0.0,0.137931,0.0,0.0
Dinner,1,Yes,0.0,0.325733,0.0,0.0
Dinner,2,No,0.139622,0.162705,0.168859,0.159744
Dinner,2,Yes,0.171297,0.148668,0.207893,0.0
Dinner,3,No,0.0,0.154661,0.152663,0.0
Dinner,3,Yes,0.0,0.144995,0.15266,0.0
Dinner,4,No,0.0,0.150096,0.148143,0.0
Dinner,4,Yes,0.11775,0.124515,0.19337,0.0
Dinner,5,No,0.0,0.0,0.206928,0.0
Dinner,5,Yes,0.0,0.106572,0.06566,0.0


* 교차일람표: 그룹 빈도를 계산하기 위한 피벗테이블의 특수한 경우

In [155]:
data=pd.DataFrame({'sample':np.arange(1,11),
                  'Nationality': ['USA','Japan','USA','Japan','Japan','Japan','USA','USA','Japan','USA'],
                  'Handeness':['R','L','R','R','L','R','R','L','R','R']}) # 'R' is Right-Handeness, 'L' is Left-Handeness.
data

Unnamed: 0,sample,Nationality,Handeness
0,1,USA,R
1,2,Japan,L
2,3,USA,R
3,4,Japan,R
4,5,Japan,L
5,6,Japan,R
6,7,USA,R
7,8,USA,L
8,9,Japan,R
9,10,USA,R


In [156]:
pd.crosstab(data.Nationality, data.Handeness, margins=True)
# 설문 분석의 일부로서 데이터를 국적과 잘 쓰는 손에 따라 요약하기 위해 pandas.crosstab함수 사용

Handeness,L,R,All
Nationality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Japan,2,3,5
USA,1,4,5
All,3,7,10


In [157]:
# crosstab함수의 처음 두 인자는 배열이나 Series 혹은 배열의 리스트가 될 수 있다

pd.crosstab([tips.time, tips.day], tips.smoker, margins=True)

Unnamed: 0_level_0,smoker,No,Yes,All
time,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dinner,Fri,3,9,12
Dinner,Sat,45,42,87
Dinner,Sun,57,19,76
Dinner,Thur,1,0,1
Lunch,Fri,1,6,7
Lunch,Thur,44,17,61
All,,151,93,244
