In [41]:
import pandas as pd
import numpy as np

In [42]:
# Load the reservation table (reserve.csv) that the following cells aggregate.
reserve_tb = pd.read_csv(
    './data/reserve.csv',
    encoding='UTF-8',
)

## 3. 집약
데이터의 손실을 최소화하여 압축해 처리함
- `groupby`를 사용함
- window 함수를 사용함

### 3.1. 데이터 종류와 개수 산출

In [30]:
# Per-hotel reservation count and distinct-customer count.

# Number of reservation rows for each hotel.
rsv_cnt_tb = (
    reserve_tb
    .groupby('hotel_id')
    .size()
    .reset_index(name='res_cnt')
)

# Number of distinct customers that booked each hotel.
cus_cnt_tb = (
    reserve_tb
    .groupby('hotel_id')['customer_id']
    .nunique()
    .reset_index(name='cus_cnt')
)

# Join the two summaries on the hotel key; this last expression is the cell output.
pd.merge(rsv_cnt_tb, cus_cnt_tb, on='hotel_id')

Unnamed: 0,hotel_id,res_cnt,cus_cnt
0,h_1,10,10
1,h_10,3,3
2,h_100,20,19
3,h_101,17,17
4,h_102,13,13
...,...,...,...
295,h_95,13,13
296,h_96,13,13
297,h_97,16,16
298,h_98,17,16


In [31]:
# Same per-hotel summary computed in a single pass with agg():
# 'count' tallies non-null reserve_id values, 'nunique' counts distinct customers.
result = (
    reserve_tb
    .groupby('hotel_id')
    .agg({'reserve_id': 'count', 'customer_id': 'nunique'})
    .rename(columns={'reserve_id': 'rsv_cnt', 'customer_id': 'cus_cnt'})
    .reset_index()
)
result

Unnamed: 0,hotel_id,rsv_cnt,cus_cnt
0,h_1,10,10
1,h_10,3,3
2,h_100,20,19
3,h_101,17,17
4,h_102,13,13
...,...,...,...
295,h_95,13,13
296,h_96,13,13
297,h_97,16,16
298,h_98,17,16


### 3.2. 합계 계산

In [38]:
# Total booking amount for every (hotel, party size) combination.
result = (
    reserve_tb
    .groupby(['hotel_id', 'people_num'])['total_price']
    .sum()
    .reset_index(name='price_sum')
)
result

Unnamed: 0,hotel_id,people_num,price_sum
0,h_1,1,156600
1,h_1,2,156600
2,h_1,3,391500
3,h_1,4,417600
4,h_10,1,11200
...,...,...,...
1154,h_98,3,793800
1155,h_98,4,453600
1156,h_99,1,179200
1157,h_99,2,448000


### 3.3. 최댓값, 최솟값, 대표값 산출

In [43]:
# Per-hotel summary statistics of total_price:
# maximum, minimum, mean, median and the 20th percentile.
result = (
    reserve_tb
    .groupby('hotel_id')
    .agg({
        'total_price': ['max', 'min', 'mean', 'median',
                        lambda x: np.percentile(x, q=20)],
    })
    .reset_index()
)

# The list-style agg yields a two-level column header; replace it with flat names.
result.columns = ['hotel_id', 'price_max', 'price_min', 'price_mean', 'price_median', 'price_20per']
result

Unnamed: 0,hotel_id,price_max,price_min,price_mean,price_median,price_20per
0,h_1,208800,26100,112230.000000,104400,73080
1,h_10,67200,11200,42933.333333,50400,26880
2,h_100,57600,4800,27600.000000,28800,9600
3,h_101,168000,14000,75764.705882,56000,30800
4,h_102,72000,12000,32769.230769,24000,18000
...,...,...,...,...,...,...
295,h_95,518400,43200,275815.384615,259200,146880
296,h_96,66600,7400,33015.384615,29600,17760
297,h_97,250800,20900,83600.000000,62700,20900
298,h_98,226800,18900,96723.529412,75600,56700


### 3.4. 분포 계산

In [45]:
# Spread of total_price within each hotel.
#   var: variance
#   std: standard deviation
result = (
    reserve_tb
    .groupby('hotel_id')
    .agg({'total_price': ['var', 'std']})
    .reset_index()
)
result.columns = ['hotel_id', 'price_var', 'price_std']

# Replace missing statistics with 0. pandas' var/std default to ddof=1,
# so a hotel with a single reservation yields NaN.
result = result.fillna(0)
result

Unnamed: 0,hotel_id,price_var,price_std
0,h_1,3.186549e+09,56449.526127
1,h_10,8.258133e+08,28736.968061
2,h_100,3.198316e+08,17883.835689
3,h_101,2.402441e+09,49014.703676
4,h_102,3.576923e+08,18912.755159
...,...,...,...
295,h_95,3.313772e+10,182037.696857
296,h_96,3.159231e+08,17774.225072
297,h_97,5.474685e+09,73991.116584
298,h_98,3.432893e+09,58590.896578


### 3.5. 최빈값

In [53]:
# Most frequent total_price after rounding to the nearest thousand.
# mode() returns every tied value, so the result can hold more than one row.
(
    reserve_tb['total_price']
    .round(decimals=-3)
    .mode()
)

0    10000
1    20000
2    40000
dtype: int64

### 3.6. 순위 계산

In [70]:
# Convert the reservation timestamp column to datetime values.
reserve_tb['reserve_datetime'] = pd.to_datetime(
    reserve_tb['reserve_datetime'],
    format='%Y-%m-%d %H:%M:%S',
)

# Sequence number of each reservation within its customer's history, oldest first.
# method='first' breaks ties by order of appearance, so ranks are unique per customer.
reserve_tb['log_no'] = (
    reserve_tb
    .groupby('customer_id')['reserve_datetime']
    .rank(method='first', ascending=True)
)

reserve_tb.loc[:, ['customer_id', 'reserve_datetime', 'log_no']]

Unnamed: 0,customer_id,reserve_datetime,log_no
0,c_1,2016-03-06 13:09:42,1.0
1,c_1,2016-07-16 23:39:55,2.0
2,c_1,2016-09-24 10:03:17,3.0
3,c_1,2017-03-08 03:20:10,4.0
4,c_1,2017-09-05 19:50:37,5.0
...,...,...,...
4025,c_999,2017-06-27 23:00:02,4.0
4026,c_999,2017-09-29 05:24:57,5.0
4027,c_999,2018-03-14 05:01:45,6.0
4028,c_1000,2016-04-16 15:20:17,1.0


In [74]:
# Rank hotels by reservation volume (rank 1 = most reservations).
# method='min' gives tied hotels the smallest rank of their tie group.
# The raw count is dropped afterwards so only the rank remains.
rsv_cnt_tb = (
    reserve_tb
    .groupby('hotel_id')
    .size()
    .reset_index(name='rsv_cnt')
    .assign(rsv_cnt_rank=lambda tb: tb['rsv_cnt'].rank(ascending=False, method='min'))
    .drop('rsv_cnt', axis=1)
)
rsv_cnt_tb

Unnamed: 0,hotel_id,rsv_cnt_rank
0,h_1,235.0
1,h_10,300.0
2,h_100,12.0
3,h_101,43.0
4,h_102,139.0
...,...,...
295,h_95,139.0
296,h_96,139.0
297,h_97,60.0
298,h_98,43.0
