In [41]:
import pandas as pd
import numpy as np

In [42]:
reserve_tb = pd.read_csv('./data/reserve.csv', encoding='UTF-8')

## 2. 추출
### 2.1. 데이터 열을 지정한 추출

필요한 열만 지정해서 추출

- 이름 등의 고유 문자열은 필요가 없을 수 있음

In [6]:
# 필요한 열만 지정
reserve_tb[[
    'reserve_id',
    'hotel_id',
    'customer_id',
    'reserve_datetime',
    'checkin_date',
    'checkin_time',
    'checkout_date'
]]

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23
...,...,...,...,...,...,...,...
4025,r4026,h_129,c_999,2017-06-27 23:00:02,2017-07-10,09:30:00,2017-07-11
4026,r4027,h_97,c_999,2017-09-29 05:24:57,2017-10-09,10:30:00,2017-10-10
4027,r4028,h_27,c_999,2018-03-14 05:01:45,2018-04-02,11:30:00,2018-04-04
4028,r4029,h_48,c_1000,2016-04-16 15:20:17,2016-05-10,09:30:00,2016-05-13


In [7]:
# 불필요한 열을 제거
# axis=1: 열을 제거함을 지정
# inplace=True: reserve_tb를 직접 갱신함
reserve_tb.drop(['people_num', 'total_price'], axis=1, inplace=True)

In [8]:
reserve_tb

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23
...,...,...,...,...,...,...,...
4025,r4026,h_129,c_999,2017-06-27 23:00:02,2017-07-10,09:30:00,2017-07-11
4026,r4027,h_97,c_999,2017-09-29 05:24:57,2017-10-09,10:30:00,2017-10-10
4027,r4028,h_27,c_999,2018-03-14 05:01:45,2018-04-02,11:30:00,2018-04-04
4028,r4029,h_48,c_1000,2016-04-16 15:20:17,2016-05-10,09:30:00,2016-05-13


### 2.2. 조건에 따른 데이터 행 추출
인덱싱이 가능하다면 인덱싱을 해두어라 (RDB)

In [11]:
# 가독성이 좋지 않다.
# 코드 반복이 많다.
reserve_tb[
    (reserve_tb['checkout_date'] >= '2016-10-13') &
    (reserve_tb['checkout_date'] <= '2016-10-14')
]

# 가독성이 좋지 않다.
# 코드 반복이 많다.
reserve_tb.loc[
    (reserve_tb['checkout_date'] >= '2016-10-13') &
    (reserve_tb['checkout_date'] <= '2016-10-14'),
    :
]

# good
reserve_tb.query('"2016-10-13" <= checkout_date <= "2016-10-14"')

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date
284,r285,h_121,c_67,2016-09-27 06:13:19,2016-10-12,12:00:00,2016-10-14
513,r514,h_74,c_120,2016-10-06 03:12:04,2016-10-11,12:30:00,2016-10-14
1065,r1066,h_205,c_261,2016-09-14 02:57:59,2016-10-11,10:00:00,2016-10-14
1480,r1481,h_116,c_364,2016-09-17 17:45:39,2016-10-11,11:30:00,2016-10-13
1546,r1547,h_149,c_377,2016-09-27 08:19:24,2016-10-10,11:00:00,2016-10-13
1709,r1710,h_59,c_422,2016-09-19 04:17:25,2016-10-10,12:00:00,2016-10-13
1932,r1933,h_113,c_477,2016-09-24 09:04:26,2016-10-12,11:30:00,2016-10-13
2058,r2059,h_9,c_517,2016-09-19 15:32:35,2016-10-11,12:30:00,2016-10-13
2115,r2116,h_77,c_527,2016-10-05 00:44:09,2016-10-11,09:00:00,2016-10-13
2170,r2171,h_177,c_540,2016-09-28 01:21:26,2016-10-11,10:00:00,2016-10-13


### 2.3. 데이터 값을 고려하지 않는 샘플링

In [14]:
# 랜덤 샘플링

reserve_tb.sample(frac=0.5)

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date
3763,r3764,h_73,c_937,2016-04-15 14:54:07,2016-04-26,10:30:00,2016-04-27
3149,r3150,h_74,c_794,2017-05-17 15:44:02,2017-06-01,09:00:00,2017-06-03
146,r147,h_175,c_31,2016-01-16 11:31:19,2016-02-14,11:30:00,2016-02-17
2064,r2065,h_54,c_518,2016-08-02 21:13:19,2016-08-04,09:30:00,2016-08-06
1874,r1875,h_207,c_464,2016-08-16 22:56:58,2016-08-20,12:00:00,2016-08-22
...,...,...,...,...,...,...,...
3400,r3401,h_234,c_849,2016-06-05 22:14:49,2016-06-28,12:00:00,2016-07-01
778,r779,h_271,c_190,2017-05-12 15:14:16,2017-06-10,09:30:00,2017-06-12
2452,r2453,h_95,c_613,2016-06-27 18:48:14,2016-07-16,11:30:00,2016-07-19
3257,r3258,h_20,c_820,2016-12-05 01:00:16,2016-12-09,12:30:00,2016-12-11


### 2.4 집약 ID 기반 샘플링

In [17]:
# 유니크한 customer_id 목록을 가져옴
target = pd.Series(reserve_tb['customer_id'].unique()).sample(frac=0.5)
# 위에서 가져온 customer_id의 reserve_tb 레코드만 가저옴
reserve_tb[reserve_tb['customer_id'].isin(target)]

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date
24,r25,h_277,c_4,2016-03-28 07:17:34,2016-04-07,10:30:00,2016-04-10
25,r26,h_132,c_4,2016-05-11 17:48:07,2016-06-05,11:30:00,2016-06-06
26,r27,h_97,c_4,2016-07-08 14:10:06,2016-07-16,10:30:00,2016-07-17
27,r28,h_119,c_4,2016-10-07 04:38:54,2016-11-04,10:00:00,2016-11-06
28,r29,h_222,c_4,2016-11-10 21:59:02,2016-11-13,12:30:00,2016-11-16
...,...,...,...,...,...,...,...
4010,r4011,h_137,c_995,2017-08-13 19:23:48,2017-09-07,09:30:00,2017-09-09
4011,r4012,h_54,c_995,2018-03-08 03:54:07,2018-03-20,10:30:00,2018-03-22
4012,r4013,h_13,c_995,2018-07-04 18:54:55,2018-07-04,11:00:00,2018-07-05
4028,r4029,h_48,c_1000,2016-04-16 15:20:17,2016-05-10,09:30:00,2016-05-13


## 3. 집약
데이터의 손실을 최소화하여 압축해 처리함
- group_by를 사용함
- window 함수를 사용함

### 3.1 데이터 종류와 개수 산출

In [30]:
# 호텔별 예약건수, 고객수

rsv_cnt_tb = reserve_tb.groupby('hotel_id').size().reset_index()
rsv_cnt_tb.columns = ['hotel_id', 'res_cnt']
rsv_cnt_tb

cus_cnt_tb = reserve_tb.groupby('hotel_id')['customer_id'].nunique().reset_index()
cus_cnt_tb.columns = ['hotel_id', 'cus_cnt']
cus_cnt_tb

pd.merge(rsv_cnt_tb, cus_cnt_tb, on='hotel_id')

Unnamed: 0,hotel_id,res_cnt,cus_cnt
0,h_1,10,10
1,h_10,3,3
2,h_100,20,19
3,h_101,17,17
4,h_102,13,13
...,...,...,...
295,h_95,13,13
296,h_96,13,13
297,h_97,16,16
298,h_98,17,16


In [31]:
# agg 사용
result = reserve_tb.groupby('hotel_id').agg({'reserve_id': 'count', 'customer_id': 'nunique'})
result.reset_index(inplace=True)
result.columns = ['hotel_id', 'rsv_cnt', 'cus_cnt']
result

Unnamed: 0,hotel_id,rsv_cnt,cus_cnt
0,h_1,10,10
1,h_10,3,3
2,h_100,20,19
3,h_101,17,17
4,h_102,13,13
...,...,...,...
295,h_95,13,13
296,h_96,13,13
297,h_97,16,16
298,h_98,17,16


### 3.2. 합계 계산

In [38]:
result = reserve_tb.groupby(['hotel_id', 'people_num'])['total_price'].sum().reset_index()
result.rename(columns={'total_price': 'price_sum'}, inplace=True)
result

Unnamed: 0,hotel_id,people_num,price_sum
0,h_1,1,156600
1,h_1,2,156600
2,h_1,3,391500
3,h_1,4,417600
4,h_10,1,11200
...,...,...,...
1154,h_98,3,793800
1155,h_98,4,453600
1156,h_99,1,179200
1157,h_99,2,448000


### 3.3. 최대 최소 대표 값 선출

In [43]:
result = reserve_tb \
    .groupby('hotel_id') \
    .agg({
        'total_price': [
            'max',
            'min',
            'mean',
            'median',
            lambda x: np.percentile(x, q=20)
        ]
    }) \
    .reset_index()

result.columns = [
    'hotel_id',
    'price_max',
    'price_min',
    'price_mean',
    'price_median',
    'price_20per'
]
result

Unnamed: 0,hotel_id,price_max,price_min,price_mean,price_median,price_20per
0,h_1,208800,26100,112230.000000,104400,73080
1,h_10,67200,11200,42933.333333,50400,26880
2,h_100,57600,4800,27600.000000,28800,9600
3,h_101,168000,14000,75764.705882,56000,30800
4,h_102,72000,12000,32769.230769,24000,18000
...,...,...,...,...,...,...
295,h_95,518400,43200,275815.384615,259200,146880
296,h_96,66600,7400,33015.384615,29600,17760
297,h_97,250800,20900,83600.000000,62700,20900
298,h_98,226800,18900,96723.529412,75600,56700


### 3.4. 분포 계산

In [45]:
# var: 분산
# std: 표준편차
result = reserve_tb \
    .groupby('hotel_id') \
    .agg({
        'total_price': ['var', 'std']
    }) \
    .reset_index()
result.columns = ['hotel_id', 'price_var', 'price_std']

result.fillna(0, inplace=True)
result

Unnamed: 0,hotel_id,price_var,price_std
0,h_1,3.186549e+09,56449.526127
1,h_10,8.258133e+08,28736.968061
2,h_100,3.198316e+08,17883.835689
3,h_101,2.402441e+09,49014.703676
4,h_102,3.576923e+08,18912.755159
...,...,...,...
295,h_95,3.313772e+10,182037.696857
296,h_96,3.159231e+08,17774.225072
297,h_97,5.474685e+09,73991.116584
298,h_98,3.432893e+09,58590.896578


### 3.5. 최빈값

In [53]:
reserve_tb['total_price'].round(-3).mode()

0    10000
1    20000
2    40000
dtype: int64

### 3.6. 순위 계산

In [70]:
reserve_tb['reserve_datetime'] = pd.to_datetime(
    reserve_tb['reserve_datetime'], format='%Y-%m-%d %H:%M:%S'
)

reserve_tb['log_no'] = reserve_tb \
    .groupby('customer_id')['reserve_datetime'] \
    .rank(ascending=True, method='first')

reserve_tb[['customer_id', 'reserve_datetime', 'log_no']]

Unnamed: 0,customer_id,reserve_datetime,log_no
0,c_1,2016-03-06 13:09:42,1.0
1,c_1,2016-07-16 23:39:55,2.0
2,c_1,2016-09-24 10:03:17,3.0
3,c_1,2017-03-08 03:20:10,4.0
4,c_1,2017-09-05 19:50:37,5.0
...,...,...,...
4025,c_999,2017-06-27 23:00:02,4.0
4026,c_999,2017-09-29 05:24:57,5.0
4027,c_999,2018-03-14 05:01:45,6.0
4028,c_1000,2016-04-16 15:20:17,1.0


In [74]:
rsv_cnt_tb = reserve_tb.groupby('hotel_id').size().reset_index()
rsv_cnt_tb.columns = ['hotel_id', 'rsv_cnt']

rsv_cnt_tb['rsv_cnt_rank'] = rsv_cnt_tb['rsv_cnt'].rank(ascending=False, method='min')

rsv_cnt_tb.drop('rsv_cnt', axis=1, inplace=True)
rsv_cnt_tb

Unnamed: 0,hotel_id,rsv_cnt_rank
0,h_1,235.0
1,h_10,300.0
2,h_100,12.0
3,h_101,43.0
4,h_102,139.0
...,...,...
295,h_95,139.0
296,h_96,139.0
297,h_97,60.0
298,h_98,43.0
