# **CH10** apply 메서드 활용

## 1. 간단한 함수 만들기

In [312]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [313]:
def my_sq(x):
    return x**2

In [314]:
def my_exp(x,n):
    return x ** n

In [315]:
my_sq(4)

16

In [316]:
my_exp(2,4)

16

## 2. apply 메서드 사용하기 - 기초

### 시리즈와 데이터프레임에 apply 메서드 사용하기

{}.apply(func) => 데이터 하나씩 요소별 출력 적용 = 브로드캐스팅 

In [317]:
df = pd.DataFrame({
    'a':[10,20,30]
    ,'b':[20,30,40]     
})

df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [318]:
# 브로드캐스팅 : 요소별 연산 <= ndarray의 기능
df['a'] ** 2

0    100
1    400
2    900
Name: a, dtype: int64

In [319]:
#Series.apply
df['a'].apply(my_sq)

0    100
1    400
2    900
Name: a, dtype: int64

In [320]:
df['a'].apply(my_exp, n=2)

0    100
1    400
2    900
Name: a, dtype: int64

In [321]:
df['a'].apply(my_exp,n=3)

0     1000
1     8000
2    27000
Name: a, dtype: int64

데이터프레임 apply 적용 축 방향 정하기

In [322]:
def print_me(x):
    print(x)

In [323]:
df.apply(print_me,axis=0)

0    10
1    20
2    30
Name: a, dtype: int64
0    20
1    30
2    40
Name: b, dtype: int64


a    None
b    None
dtype: object

In [324]:
df.apply(print_me,axis=1)

a    10
b    20
Name: 0, dtype: int64
a    20
b    30
Name: 1, dtype: int64
a    30
b    40
Name: 2, dtype: int64


0    None
1    None
2    None
dtype: object

In [325]:
# def avg_3_apply(col):
#     x = col[0]
#     y = col[1]
#     z = col[2]
#     return (x + y + z) / 3

def avg_3_apply(col):
    sum = 0
    for item in col:
        sum += item
    return sum / df.shape[1]

In [326]:
# 요소 = 시리즈
df.apply(avg_3_apply)

a    30.0
b    45.0
dtype: float64

In [327]:
# axis=1

def avg_2_apply(col):
    sum = 0
    for item in col:
        sum += item
    return sum / df.shape[1]

In [328]:
df.apply(avg_2_apply, axis=1)

0    15.0
1    25.0
2    35.0
dtype: float64

## 3. apply 메서드 사용하기 - 응용

### 데이터프레임의 누락값 처리하기 - 열 방향

In [329]:
titanic = sns.load_dataset('titanic')

In [330]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [331]:
titanic.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [332]:
# 누락값 개수 세기
def count_missing(vec):
    null_count = np.sum(pd.isnull(vec))
    return null_count

In [333]:
cmis_col = titanic.apply(count_missing, axis=0)
cmis_col

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [334]:
titanic.isnull().sum() / titanic.shape[0]

survived       0.000000
pclass         0.000000
sex            0.000000
age            0.198653
sibsp          0.000000
parch          0.000000
fare           0.000000
embarked       0.002245
class          0.000000
who            0.000000
adult_male     0.000000
deck           0.772166
embark_town    0.002245
alive          0.000000
alone          0.000000
dtype: float64

In [335]:
# 누락값의 비율
def prop_missing(vec):
    num = count_missing(vec)
    dem = vec.size
    return num / dem

In [336]:
pmis_col = titanic.apply(prop_missing)
pmis_col

survived       0.000000
pclass         0.000000
sex            0.000000
age            0.198653
sibsp          0.000000
parch          0.000000
fare           0.000000
embarked       0.002245
class          0.000000
who            0.000000
adult_male     0.000000
deck           0.772166
embark_town    0.002245
alive          0.000000
alone          0.000000
dtype: float64

In [337]:
# 누락값이 아닌 것의 비율
def prop_complete(vec):
    return 1 - prop_missing(vec)

### 데이터프레임의 누락값 처리하기 - 행 방향

In [338]:
titanic.apply(count_missing, axis=1)

0      1
1      0
2      1
3      0
4      1
      ..
886    1
887    0
888    2
889    0
890    1
Length: 891, dtype: int64

In [339]:
titanic.apply(prop_missing, axis=1)

0      0.066667
1      0.000000
2      0.066667
3      0.000000
4      0.066667
         ...   
886    0.066667
887    0.000000
888    0.133333
889    0.000000
890    0.066667
Length: 891, dtype: float64

In [340]:
titanic.apply(prop_complete, axis=1)

0      0.933333
1      1.000000
2      0.933333
3      1.000000
4      0.933333
         ...   
886    0.933333
887    1.000000
888    0.866667
889    1.000000
890    0.933333
Length: 891, dtype: float64

In [341]:
titanic['num_missing'] = titanic.apply(count_missing, axis=1)
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,num_missing
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,1
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,1
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,1


In [342]:
titanic.loc[titanic.num_missing > 1, :].sample(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,num_missing
306,1,1,female,,0,0,110.8833,C,First,woman,False,,Cherbourg,yes,True,2
766,0,1,male,,0,0,39.6,C,First,man,True,,Cherbourg,no,True,2
295,0,1,male,,0,0,27.7208,C,First,man,True,,Cherbourg,no,True,2
324,0,3,male,,8,2,69.55,S,Third,man,True,,Southampton,no,False,2
19,1,3,female,,0,0,7.225,C,Third,woman,False,,Cherbourg,yes,True,2
354,0,3,male,,0,0,7.225,C,Third,man,True,,Cherbourg,no,True,2
490,0,3,male,,1,0,19.9667,S,Third,man,True,,Southampton,no,False,2
563,0,3,male,,0,0,8.05,S,Third,man,True,,Southampton,no,True,2
517,0,3,male,,0,0,24.15,Q,Third,man,True,,Queenstown,no,True,2
629,0,3,male,,0,0,7.7333,Q,Third,man,True,,Queenstown,no,True,2


In [343]:
titanic.loc[titanic.num_missing == 0, :].sample(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,num_missing
345,1,2,female,24.0,0,0,13.0,S,Second,woman,False,F,Southampton,yes,True,0
429,1,3,male,32.0,0,0,8.05,S,Third,man,True,E,Southampton,yes,True,0
741,0,1,male,36.0,1,0,78.85,S,First,man,True,C,Southampton,no,False,0
550,1,1,male,17.0,0,2,110.8833,C,First,man,True,C,Cherbourg,yes,False,0
587,1,1,male,60.0,1,1,79.2,C,First,man,True,B,Cherbourg,yes,False,0
291,1,1,female,19.0,1,0,91.0792,C,First,woman,False,B,Cherbourg,yes,False,0
717,1,2,female,27.0,0,0,10.5,S,Second,woman,False,E,Southampton,yes,True,0
487,0,1,male,58.0,0,0,29.7,C,First,man,True,B,Cherbourg,no,True,0
311,1,1,female,18.0,2,2,262.375,C,First,woman,False,B,Cherbourg,yes,False,0
867,0,1,male,31.0,0,0,50.4958,S,First,man,True,A,Southampton,no,True,0


In [344]:
# display option : 전체 행렬 표시

# 전체 컬럼 표시
#pd.set_option('display.max_columns', None)

# 전체 행 표시
#pd.set_option('display.max_rows',None)

---
# **CH11** 그룹 연산

분할 - 반영 - 결합 (split - apply - combine)

## 1. 데이터 집계

In [345]:
df = pd.read_csv('../data/gapminder.tsv',sep='\t')
df.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [346]:
# 연도별 평균 기대수명

#                      df.groupby(['year'])['lifeExp'].mean()
avg_life_exp_by_year = df.groupby('year').lifeExp.mean()
avg_life_exp_by_year

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

groupby 메서드의 상세 과정

In [347]:
# 분할
df.year.unique()

array([1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, 2002,
       2007], dtype=int64)

In [384]:
# 반영

y1952_mean = df.loc[df.year == 1952, 'lifeExp'].mean()
# ...
y2007_mean = df.loc[df.year == 2007, 'lifeExp'].mean()
y2002_mean = df.loc[df.year == 2002]['lifeExp'].mean()
y2002_mean

65.69492253521126

In [349]:
for year in df.year.unique():
    print(df.loc[df.year==year,:].lifeExp.mean())

49.057619718309866
51.50740112676056
53.609249014084504
55.678289577464795
57.64738647887324
59.57015746478874
61.53319718309859
63.21261267605633
64.16033802816901
65.01467605633802
65.69492253521126
67.00742253521126


In [350]:
#df.loc[df.year==year, :].lifeExp.mean()

In [351]:
# 결합
df2 = pd.DataFrame({
    'year':[1952,2007]
    ,'lifeExp_mean':[y1952_mean, y2007_mean]
})
df2

Unnamed: 0,year,lifeExp_mean
0,1952,49.05762
1,2007,67.007423


- size : 누락값 포함 데이터 수
- count : 누락값 제외 데이터 수

agg 메서드로 사용자 함수와 groupby 메서드 조합하기

In [352]:
# 사용자 함수
def my_mean(values):
    n = len(values)
    sum = 0
    for value in values:
        sum  += value
    return sum / n

In [353]:
df.groupby('year').lifeExp.agg(my_mean)

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

2개 인잣값을 받아 처리하는 사용자 함수와 groupby 메서드

In [354]:
def my_mean_diff(values, diff_value):
    n = len(values)
    sum = 0
    for value in values:
        sum += value
    mean = sum / n
    return mean - diff_value

여러개의 집계 메서드 한 번에 사용하기

In [355]:
df.groupby('year').lifeExp.agg([np.count_nonzero, np.mean, np.std])

Unnamed: 0_level_0,count_nonzero,mean,std
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,142,49.05762,12.225956
1957,142,51.507401,12.231286
1962,142,53.609249,12.097245
1967,142,55.67829,11.718858
1972,142,57.647386,11.381953
1977,142,59.570157,11.227229
1982,142,61.533197,10.770618
1987,142,63.212613,10.556285
1992,142,64.160338,11.22738
1997,142,65.014676,11.559439


In [356]:
df.groupby('year').agg({'lifeExp'  : 'mean',
                        'pop'      : 'median',
                        'gdpPercap': 'median'
})

Unnamed: 0_level_0,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,49.05762,3943953.0,1968.528344
1957,51.507401,4282942.0,2173.220291
1962,53.609249,4686039.5,2335.439533
1967,55.67829,5170175.5,2678.33474
1972,57.647386,5877996.5,3339.129407
1977,59.570157,6404036.5,3798.609244
1982,61.533197,7007320.0,4216.228428
1987,63.212613,7774861.5,4280.300366
1992,64.160338,8688686.5,4386.085502
1997,65.014676,9735063.5,4781.825478


## 2. 데이터 변환

### 표준 점수 계산하기 : 표준화

표준점수 : (x1 - 평균) / 표준편차   : Z-score -> 평균:0, 표준편차:1

In [357]:
def my_zscore(x):
    return (x - x.mean()) / x.std()

In [358]:
transform_z = df.groupby('year').lifeExp.transform(my_zscore)
transform_z.head()

0   -1.656854
1   -1.731249
2   -1.786543
3   -1.848157
4   -1.894173
Name: lifeExp, dtype: float64

In [359]:
df.groupby('year').lifeExp.apply(my_zscore)

year      
1952  0      -1.656854
      12      0.504859
      24     -0.489174
      36     -1.557557
      48      1.098268
                ...   
2007  1655    0.599815
      1667    0.531315
      1679   -0.356947
      1691   -2.039541
      1703   -1.948180
Name: lifeExp, Length: 1704, dtype: float64

In [360]:
df.shape

(1704, 6)

In [361]:
transform_z.shape

(1704,)

누락값을 평균값으로 처리하기

In [362]:
np.random.seed(42)
tips_10 = sns.load_dataset('tips').sample(10)
tips_10


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
24,19.82,3.18,Male,No,Sat,Dinner,2
6,8.77,2.0,Male,No,Sun,Dinner,2
153,24.55,2.0,Male,No,Sun,Dinner,4
211,25.89,5.16,Male,Yes,Sat,Dinner,4
198,13.0,2.0,Female,Yes,Thur,Lunch,2
176,17.89,2.0,Male,Yes,Sun,Dinner,2
192,28.44,2.56,Male,Yes,Thur,Lunch,2
124,12.48,2.52,Female,No,Thur,Lunch,2
9,14.78,3.23,Male,No,Sun,Dinner,2
101,15.38,3.0,Female,Yes,Fri,Dinner,2


In [363]:
np.random.permutation([1,2,3,4,5,6,7,8])[:5]

array([6, 4, 5, 3, 7])

In [364]:
tips_10.loc[np.random.permutation(tips_10.index)[:4],'total_bill'] = np.NaN
tips_10

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
24,,3.18,Male,No,Sat,Dinner,2
6,8.77,2.0,Male,No,Sun,Dinner,2
153,24.55,2.0,Male,No,Sun,Dinner,4
211,,5.16,Male,Yes,Sat,Dinner,4
198,,2.0,Female,Yes,Thur,Lunch,2
176,17.89,2.0,Male,Yes,Sun,Dinner,2
192,28.44,2.56,Male,Yes,Thur,Lunch,2
124,,2.52,Female,No,Thur,Lunch,2
9,14.78,3.23,Male,No,Sun,Dinner,2
101,15.38,3.0,Female,Yes,Fri,Dinner,2


In [365]:
tips_10.groupby('sex').count()

Unnamed: 0_level_0,total_bill,tip,smoker,day,time,size
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Male,5,7,7,7,7,7
Female,1,3,3,3,3,3


In [366]:
def fill_na_mean(x):
    avg = x.mean()
    return x.fillna(avg)

In [367]:
tips_10['fill_total_bill'] = tips_10.groupby('sex').total_bill.transform(fill_na_mean)
tips_10

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,fill_total_bill
24,,3.18,Male,No,Sat,Dinner,2,18.886
6,8.77,2.0,Male,No,Sun,Dinner,2,8.77
153,24.55,2.0,Male,No,Sun,Dinner,4,24.55
211,,5.16,Male,Yes,Sat,Dinner,4,18.886
198,,2.0,Female,Yes,Thur,Lunch,2,15.38
176,17.89,2.0,Male,Yes,Sun,Dinner,2,17.89
192,28.44,2.56,Male,Yes,Thur,Lunch,2,28.44
124,,2.52,Female,No,Thur,Lunch,2,15.38
9,14.78,3.23,Male,No,Sun,Dinner,2,14.78
101,15.38,3.0,Female,Yes,Fri,Dinner,2,15.38


In [368]:
tips = sns.load_dataset('tips')
tips.groupby('sex').total_bill.mean()

sex
Male      20.744076
Female    18.056897
Name: total_bill, dtype: float64

## 3. 데이터 필터링

In [369]:
tips.shape

(244, 7)

In [370]:
tips['size'].value_counts()

size
2    156
3     38
4     37
5      5
1      4
6      4
Name: count, dtype: int64

In [371]:
# \ : 다음줄
tips_filtered = tips.\
        groupby('size').\
        filter(lambda x:x['size'].count() >= 30)

In [372]:
tips_filtered.shape

(231, 7)

In [373]:
tips_filtered['size'].value_counts()

size
2    156
3     38
4     37
Name: count, dtype: int64

## 4. 그룹 오브젝트

In [374]:
tips_10 = sns.load_dataset('tips').sample(10, random_state=42)
tips_10

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
24,19.82,3.18,Male,No,Sat,Dinner,2
6,8.77,2.0,Male,No,Sun,Dinner,2
153,24.55,2.0,Male,No,Sun,Dinner,4
211,25.89,5.16,Male,Yes,Sat,Dinner,4
198,13.0,2.0,Female,Yes,Thur,Lunch,2
176,17.89,2.0,Male,Yes,Sun,Dinner,2
192,28.44,2.56,Male,Yes,Thur,Lunch,2
124,12.48,2.52,Female,No,Thur,Lunch,2
9,14.78,3.23,Male,No,Sun,Dinner,2
101,15.38,3.0,Female,Yes,Fri,Dinner,2


In [375]:
grouped = tips_10.groupby('sex')
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000174D6F6A4A0>

In [376]:
grouped.groups

{'Male': [24, 6, 153, 211, 176, 192, 9], 'Female': [198, 124, 101]}

한번에 그룹 오브젝트 계산하기

In [377]:
grouped.get_group('Male').total_bill.mean()

20.02

In [378]:
grouped[['total_bill','tip','size']].mean()

Unnamed: 0_level_0,total_bill,tip,size
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,20.02,2.875714,2.571429
Female,13.62,2.506667,2.0


In [379]:
grouped.get_group('Male')[['total_bill','tip','size']].mean()

total_bill    20.020000
tip            2.875714
size           2.571429
dtype: float64

In [380]:
type(grouped)

pandas.core.groupby.generic.DataFrameGroupBy

In [381]:
# 평균

avgs = grouped.mean()
avgs

TypeError: 'Categorical' with dtype category does not support reduction 'mean'

In [None]:
tips_10.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

### 그룹오브젝트 활용하기

In [None]:
female = grouped.get_group('Female')
female

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
198,13.0,2.0,Female,Yes,Thur,Lunch,2
124,12.48,2.52,Female,No,Thur,Lunch,2
101,15.38,3.0,Female,Yes,Fri,Dinner,2


In [None]:
for sex_group in grouped:
    print(type(sex_group))
    print(len(sex_group))

    first_elelment = sex_group[0]

    print(type(sex_group[0]))
    print(len(sex_group[1]))

<class 'tuple'>
2
<class 'str'>
7
<class 'tuple'>
2
<class 'str'>
3
