- 출처 : https://rfriend.tistory.com/383?category=675917
https://rfriend.tistory.com/384?category=675917

- data : 전복에 대한 공개데이터
- 실습 : groupby()

In [1]:
import pandas as pd
import numpy as np

In [2]:
# The abalone file has no header row, so the column labels are supplied here
# in the order the fields appear in the file.
abalone_columns = [
    'sex', 'length', 'diameter', 'height',
    'whole_weight', 'shucked_weight',
    'viscera_weight', 'shell_weight', 'rings',
]
data = pd.read_csv(
    'data/abalone.txt',
    sep=',',
    header=None,
    names=abalone_columns,
)

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [5]:
# Count the missing values in each column.
data.isnull().sum()

sex               0
length            0
diameter          0
height            0
whole_weight      0
shucked_weight    0
viscera_weight    0
shell_weight      0
rings             0
dtype: int64

## 1. groupby()
### Example 1

In [13]:
# 1. Whole weight (whole_weight) of abalone by sex
grouped = data['whole_weight'].groupby(by=data['sex'])

# Print the group sizes, then the per-group sum and mean, in that order.
for summarize in (grouped.size, grouped.sum, grouped.mean):
    print(summarize())

SyntaxError: invalid syntax (<ipython-input-13-be4e5dc31d0d>, line 2)

In [14]:
# When groupby is combined with an aggregation such as mean(), passing just
# the column name as the key is enough.
# numeric_only=True restricts the aggregation to numeric columns, so the cell
# also works when it is re-run after a non-numeric column (e.g. length_cat,
# added further down in this notebook) has been attached to `data`.
data.groupby('sex').mean(numeric_only=True)

Unnamed: 0_level_0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,0.579093,0.454732,0.158011,1.046532,0.446188,0.230689,0.30201,11.129304
I,0.427746,0.326494,0.107996,0.431363,0.191035,0.09201,0.128182,7.890462
M,0.561391,0.439287,0.151381,0.991459,0.432946,0.215545,0.281969,10.705497


In [25]:
# 2. Aggregate by sex and by length category (length_cat)
# Create a new variable that marks whether each length is above the median
# or not (np.where plays the role of mutate in R).
median_length = np.median(data.length)
is_long = data.length > median_length  # strictly greater; ties count as short
data['length_cat'] = np.where(is_long, 'length_long', 'length_short')

print(data.length.median())
data[['length', 'length_cat']].head(10)

0.545


Unnamed: 0,length,length_cat
0,0.455,length_short
1,0.35,length_short
2,0.53,length_short
3,0.44,length_short
4,0.33,length_short
5,0.425,length_short
6,0.53,length_short
7,0.545,length_short
8,0.475,length_short
9,0.55,length_long


In [26]:
# Mean whole_weight for every (sex, length_cat) combination.
(
    data
    .groupby(['sex', 'length_cat'])['whole_weight']
    .mean()
)

sex  length_cat  
F    length_long     1.261330
     length_short    0.589702
I    length_long     0.923215
     length_short    0.351234
M    length_long     1.255182
     length_short    0.538157
Name: whole_weight, dtype: float64

In [35]:
# 3. Build a dict of sub-frames keyed by sex
# Only the 'sex', 'length_cat', 'whole_weight' and 'rings' columns are used.
selected_columns = ['sex', 'length_cat', 'whole_weight', 'rings']
data_sex_group = {
    sex: frame
    for sex, frame in data[selected_columns].groupby('sex')
}

data_sex_group['M'][:5]

# Same rows as data[data['sex'] == 'M'] (restricted to the selected columns).
# Author's note: once the dict is built, indexing into the dataset is faster.

Unnamed: 0,sex,length_cat,whole_weight,rings
0,M,length_short,0.514,15
1,M,length_short,0.2255,7
3,M,length_short,0.516,10
8,M,length_short,0.5095,9
11,M,length_short,0.406,10


### Example 2
- https://rfriend.tistory.com/397

In [4]:
import pandas as pd

In [6]:
# Toy frame: two groups ('a' and 'b') of five rows each, values 1..10, and the
# same five weights repeated for both groups.
group_labels = ['a'] * 5 + ['b'] * 5
values = np.arange(1, 11)
weights = [0.0, 0.1, 0.2, 0.3, 0.4] * 2

df = pd.DataFrame({
    'grp_col': group_labels,
    'val': values,
    'weight': weights,
})

df

Unnamed: 0,grp_col,val,weight
0,a,1,0.0
1,a,2,0.1
2,a,3,0.2
3,a,4,0.3
4,a,5,0.4
5,b,6,0.0
6,b,7,0.1
7,b,8,0.2
8,b,9,0.3
9,b,10,0.4


In [8]:
# 1. Weighted average per group
# Sol 1) Handle everything in one go with groupby
def weighted_avg_func(x):
    """Return the average of x['val'] weighted by x['weight']."""
    return np.average(x['val'], weights=x['weight'])


grouped = df.groupby('grp_col')

grouped.apply(weighted_avg_func)

grp_col
a    4.0
b    9.0
dtype: float64

In [13]:
# Sol 2) Split -> Apply -> Combine
# Split: one sub-frame per group label
is_group_a = df['grp_col'] == 'a'
is_group_b = df['grp_col'] == 'b'
df_a = df[is_group_a]
df_b = df[is_group_b]

# Apply: weighted average = sum(val * weight) / sum(weight)
weighted_sum_a = sum(df_a['val'] * df_a['weight'])
weighted_sum_b = sum(df_b['val'] * df_b['weight'])
weighted_avg_a = weighted_sum_a / sum(df_a['weight'])
weighted_avg_b = weighted_sum_b / sum(df_b['weight'])

print(weighted_avg_a, weighted_avg_b)

# Combine: gather the per-group results back into a single frame
pd.DataFrame([
    {'grp_col': 'a', 'weighted_average': weighted_avg_a},
    {'grp_col': 'b', 'weighted_average': weighted_avg_b},
])

4.0 9.0


Unnamed: 0,grp_col,weighted_average
0,a,4.0
1,b,9.0
