### 4.2.4. 고급 데이터 처리
#### 4.2.4.1. 데이터프레임 집단별 연산

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
tips = sns.load_dataset('tips')

In [8]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [7]:
# 요일별 평균 식사 금액
tips.groupby('day')['total_bill'].mean()

day
Thur    17.682742
Fri     17.151579
Sat     20.441379
Sun     21.410000
Name: total_bill, dtype: float64

In [6]:
# 요일별 평균 식사 금액
# observed = Fals: 모든 범주 포함, 실제 데이터가 없으면 NaN
tips.groupby('day', observed = False)['total_bill'].mean()

day
Thur    17.682742
Fri     17.151579
Sat     20.441379
Sun     21.410000
Name: total_bill, dtype: float64

In [9]:
# 식사 시간별 식사 금액, 금액의 평균 및 표준편차
tips.groupby('time', observed = False)[['total_bill', 'tip']].agg(['mean', 'std'])

Unnamed: 0_level_0,total_bill,total_bill,tip,tip
Unnamed: 0_level_1,mean,std,mean,std
time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Lunch,17.168676,7.713882,2.728088,1.205345
Dinner,20.797159,9.142029,3.10267,1.436243


In [11]:
# 식사 시간별 식사 금액의 최대값, 팁 금액의 최소값: 딕셔너리 사용
# .reset_index() 하면 결과 더 예쁘게 나옴
tips.groupby('time', observed = False).agg({'total_bill': 'max', 'tip': 'min'}).reset_index()

Unnamed: 0,time,total_bill,tip
0,Lunch,43.11,1.25
1,Dinner,50.81,1.0


In [13]:
# 식사 시간별 식사 금액의 최대값, 팁 금액의 최소값: 튜플 사용, named aggregation 문법
tips.groupby('time', observed=False).agg(total_bill_max = ('total_bill', 'max'), tip_min = ('tip', 'min')).reset_index()

Unnamed: 0,time,total_bill_max,tip_min
0,Lunch,43.11,1.25
1,Dinner,50.81,1.0


In [14]:
# 람다함수
(lambda x, y : x + y)(1, 4)

5

In [17]:
# 팁 금액 평균이 3달러 이상인 요일 데이터 필터링
# 팁 금액 평균이 3달러 이상이면 True 반환
def filter_tip_avg(x):
    return x['tip'].mean() >= 3

tips.groupby('day', observed = False).filter(filter_tip_avg)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
186,20.90,3.50,Female,Yes,Sun,Dinner,3
187,30.46,2.00,Male,Yes,Sun,Dinner,5
188,18.15,3.50,Female,Yes,Sun,Dinner,3
189,23.10,4.00,Male,Yes,Sun,Dinner,3


In [19]:
# 람다 함수 사용
tips.groupby('day', observed = False).filter(lambda x: x['tip'].mean() >= 3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
186,20.90,3.50,Female,Yes,Sun,Dinner,3
187,30.46,2.00,Male,Yes,Sun,Dinner,5
188,18.15,3.50,Female,Yes,Sun,Dinner,3
189,23.10,4.00,Male,Yes,Sun,Dinner,3


In [24]:
# 성별에 따른 팁 금액 편차(데이터 - 평균)

tips.groupby('sex', observed = False)['tip'].transform(lambda x: x - x.mean())

0     -1.823448
1     -1.429618
2      0.410382
3      0.220382
4      0.776552
         ...   
239    2.830382
240   -0.833448
241   -1.089618
242   -1.339618
243    0.166552
Name: tip, Length: 244, dtype: float64

In [31]:
tips[['sex', 'tip', 'tip_dev']].head()

KeyError: "['tip_dev'] not in index"

In [29]:
# 식사 시간별 팁 금액의 사분위수 범위

tips.groupby('time', observed = False)['tip'].apply(lambda x: x.quantile(0.25))

time
Lunch     2.0
Dinner    2.0
Name: tip, dtype: float64

#### 4.2.4.2. 수식 및 조건식 기반 데이터 처리

In [33]:
# 기존 열 연산: 식사 금액에서 세금 10%를 제외한 실수령 금액
# tips['total_bill'] = tips['total_bill'] * 0.9
# eval()은 기존 데이터 프레임을 변경하지 않음
tips.eval('total_bill = total_bill * 0.9')

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,15.291,1.01,Female,No,Sun,Dinner,2
1,9.306,1.66,Male,No,Sun,Dinner,3
2,18.909,3.50,Male,No,Sun,Dinner,3
3,21.312,3.31,Male,No,Sun,Dinner,2
4,22.131,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,26.127,5.92,Male,No,Sat,Dinner,3
240,24.462,2.00,Female,Yes,Sat,Dinner,2
241,20.403,2.00,Male,Yes,Sat,Dinner,2
242,16.038,1.75,Male,No,Sat,Dinner,2


In [36]:
# 새로운 열 생성: 전체 식사 금액 대비 팁 금액 비율
# inplace = True : 기존 데이터프레임에 직접 적용(새 객체를 반환하지 않음)
tips.eval('tip_rate = tip / total_bill', inplace = True)

In [37]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_rate
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [47]:
# 외부 변수 사용: 전체 식사 금액에 따른 기본(최소) 팁 금액
tip_percentage =  0.15
tips.eval('min_tip = total_bill * @tip_percentage')

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_rate,min_tip
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,2.5485
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,1.5510
2,21.01,3.50,Male,No,Sun,Dinner,3,0.166587,3.1515
3,23.68,3.31,Male,No,Sun,Dinner,2,0.139780,3.5520
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,3.6885
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0.203927,4.3545
240,27.18,2.00,Female,Yes,Sat,Dinner,2,0.073584,4.0770
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.088222,3.4005
242,17.82,1.75,Male,No,Sat,Dinner,2,0.098204,2.6730


In [48]:
# 기본(최소) 팁 금액보다 적게 준 경우 'Below', 많거나 같은 경우 'Above'로 구분
# 기본(최소) 팁 금액보다 적게 준 경우는 108팀임
tips['tip_group'] = np.where(tips['tip'] < tips['min_tip'], 'Below', 'Above')
print(tips['tip_group'].value_counts())

KeyError: 'min_tip'