# Import Libraries

In [1]:
# stat
import pandas as pd
import numpy as np
import itertools

#visualise
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# data 
import datetime
import json 

# etc
import warnings
warnings.filterwarnings("ignore")

# Load Data

In [31]:
train = pd.read_csv("../data/00/2019sales.csv", skiprows = 1)

In [32]:
train.rename(columns={' 취급액 ': '취급액'}, inplace = True)
train['exposed']  = train['노출(분)']

In [33]:
train

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,exposed
0,2019/01/01 6:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,2099000,20.0
1,2019/01/01 6:00,,100346,201079,테이트 여성 셀린니트3종,의류,39900,4371000,
2,2019/01/01 6:20,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,3262000,20.0
3,2019/01/01 6:20,,100346,201079,테이트 여성 셀린니트3종,의류,39900,6955000,
4,2019/01/01 6:40,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,6672000,20.0
...,...,...,...,...,...,...,...,...,...
38304,2020/01/01 0:20,20.0,100073,200196,삼성화재 행복한파트너 주택화재보험(1912),무형,-,,20.0
38305,2020/01/01 0:40,20.0,100073,200196,삼성화재 행복한파트너 주택화재보험(1912),무형,-,,20.0
38306,2020/01/01 1:00,20.0,100073,200196,삼성화재 행복한파트너 주택화재보험(1912),무형,-,,20.0
38307,2020/01/01 1:20,20.0,100490,201478,더케이 예다함 상조서비스(티포트),무형,-,,20.0


In [34]:
# define data types
train.마더코드 = train.마더코드.astype(int).astype(str).str.zfill(6)
train.상품코드 = train.상품코드.astype(int).astype(str).str.zfill(6)

In [35]:
train.취급액 = train.취급액.str.replace(",","").astype(float)
train.판매단가 = train.판매단가.str.replace(",","").replace(' - ', np.nan).astype(float)

In [36]:
train.방송일시 = pd.to_datetime(train.방송일시, format="%Y/%m/%d %H:%M")

In [37]:
train.sort_values(['방송일시', '상품코드'], ascending=[True, True], inplace = True)

In [38]:
train.head(15)

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,exposed
0,2019-01-01 06:00:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900.0,2099000.0,20.0
1,2019-01-01 06:00:00,,100346,201079,테이트 여성 셀린니트3종,의류,39900.0,4371000.0,
2,2019-01-01 06:20:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900.0,3262000.0,20.0
3,2019-01-01 06:20:00,,100346,201079,테이트 여성 셀린니트3종,의류,39900.0,6955000.0,
4,2019-01-01 06:40:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900.0,6672000.0,20.0
5,2019-01-01 06:40:00,,100346,201079,테이트 여성 셀린니트3종,의류,39900.0,9337000.0,
6,2019-01-01 07:00:00,20.0,100305,200974,오모떼 레이스 파운데이션 브라,속옷,59000.0,6819000.0,20.0
7,2019-01-01 07:20:00,20.0,100305,200974,오모떼 레이스 파운데이션 브라,속옷,59000.0,15689000.0,20.0
8,2019-01-01 07:40:00,20.0,100305,200974,오모떼 레이스 파운데이션 브라,속옷,59000.0,25370000.0,20.0
9,2019-01-01 08:00:00,20.0,100808,202377,CERINI by PAT 남성 소프트 기모 릴렉스팬츠,의류,59900.0,16133000.0,20.0


In [39]:
train.shape

(38309, 9)

# japping time

In [40]:
def filter_jappingt(x):
    """
    :objective: round up 방송일시
    :param x: train row - pd.Dataframe
    :return: int
    """
    time = x['방송일시']
    if (time.minute < 30) & (time.hour == 0):
        rtn = time.hour
    elif time.minute >= 30:
        if time.hour == 23: rtn = 0
        else: rtn = time.hour + 1
    else:
        if time.hour == 0: rtn = 23
        else: rtn = time.hour
    return rtn

In [41]:
train['japp'] = train.apply(filter_jappingt, axis=1)

In [42]:
train.head()

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,exposed,japp
0,2019-01-01 06:00:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900.0,2099000.0,20.0,6
1,2019-01-01 06:00:00,,100346,201079,테이트 여성 셀린니트3종,의류,39900.0,4371000.0,,6
2,2019-01-01 06:20:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900.0,3262000.0,20.0,6
3,2019-01-01 06:20:00,,100346,201079,테이트 여성 셀린니트3종,의류,39900.0,6955000.0,,6
4,2019-01-01 06:40:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900.0,6672000.0,20.0,7


In [43]:
train.shape

(38309, 10)

# exposed

fill NaN values

In [44]:
train.exposed.isna().sum()

16784

In [45]:
train.exposed.unique()

array([20., nan, 30., 17., 27., 16., 26., 60., 12.,  2., 15., 25.,  3.,
       14., 22., 10., 13., 23., 18.,  5.,  9., 19.,  7., 11., 40.])

In [46]:
# fill out NaN values if there exists exposed 
def fill_exposed_na():
    """
    :objective: fill out NA values on 'exposed' with mean(exposed)
    :return:pd.Dataframe with adjusted 'exposed' column
    """
    train["exposed"].fillna(train.groupby('방송일시')['exposed'].transform('mean'), inplace = True)

In [47]:
fill_exposed_na()

In [50]:
train.loc[(train.마더코드 == "100148")& (train.exposed != 20)].head()#.iloc[30000:30010]

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,exposed,japp
17646,2019-06-15 10:00:00,,100148,200410,(삼성카드 6월 5%)무이자 LG 울트라HD TV 75UK6200KNB,가전,2990000.0,50000.0,30.0,10
17640,2019-06-15 10:00:00,,100148,200431,무이자 LG 울트라HD TV 55UK6800HNC,가전,1330000.0,3014000.0,30.0,10
17642,2019-06-15 10:00:00,,100148,200440,무이자 LG 울트라HD TV 65UK6800HNC,가전,1920000.0,50000.0,30.0,10
17644,2019-06-15 10:00:00,,100148,200450,무이자 LG 울트라HD TV 70UK7400KNA,가전,2690000.0,50000.0,30.0,10
17639,2019-06-15 10:00:00,30.0,100148,200498,일시불 LG 울트라HD TV 55UK6800HNC,가전,1200000.0,10762000.0,30.0,10


In [49]:
# check no Nan
train.exposed.isna().sum()

0

# ymd

In [51]:
train['ymd'] = [d.date() for d in train["방송일시"]]
ts_schedule = train.copy().groupby('방송일시').first()
ts_schedule.reset_index(inplace = True)

In [52]:

def get_ymd():
    """
    :objective: add 'ymd' variable to train dataset
    :return: pandas dataframe 
    """
    t = 1
    while t < 9:
        for i in ts_schedule.ymd.unique():
            if i == datetime.date(2019,1,1): continue
            time_idx = ts_schedule[ts_schedule.ymd == i].index[0]
            first_show = ts_schedule.iloc[time_idx]
            last_show = ts_schedule.iloc[time_idx - 1]
            if (first_show['마더코드'] == last_show['마더코드']) & (first_show['방송일시'] <= last_show['방송일시'] + datetime.timedelta(minutes=last_show['exposed'])):
                ts_schedule.ymd.iloc[time_idx] = ts_schedule.ymd.iloc[time_idx - 1]

        t = t + 1


In [53]:
get_ymd()

In [54]:
ts_schedule[ts_schedule.방송일시.dt.date != ts_schedule.ymd]

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,exposed,japp,ymd
3989,2019-03-11,20.0,100664,201997,(ARS10%)크로커다일 풀커버 레이스 브라팬티 4세트,속옷,79900.0,28283000.0,20.0,0,2019-03-10
4343,2019-03-17,20.0,100695,202058,남영비비안 소노르 베르사유 노와이어 컬렉션 브라,속옷,99000.0,90607000.0,20.0,0,2019-03-16
4402,2019-03-18,20.0,100808,202378,CERINI by PAT 남성 스프링 릴렉스 팬츠 3종,의류,69900.0,38793000.0,20.0,0,2019-03-17
4745,2019-03-24,20.0,100577,201704,무이자 삼성 노트북 9 메탈 고급형 NT900X5J-K28,가전,1349000.0,36545000.0,20.0,0,2019-03-23
4798,2019-03-25,20.0,100346,201069,테이트 남성 SS 트렌치재킷,의류,69900.0,10695000.0,20.0,0,2019-03-24
...,...,...,...,...,...,...,...,...,...,...,...
21279,2019-12-28,20.0,100324,201098,온라인투어 북경 (191227) 상담예약,무형,,,20.0,0,2019-12-27
21340,2019-12-29,20.0,100324,201098,온라인투어 북경 (191227) 상담예약,무형,,,20.0,0,2019-12-28
21398,2019-12-30,20.0,100182,200612,무이자 선일금고 이볼브 시리즈 EV-020,생활용품,440000.0,3116000.0,20.0,0,2019-12-29
21461,2019-12-31,20.0,100610,201883,푸마 스트레치 심리스 드로즈 11종,속옷,89000.0,45063000.0,20.0,0,2019-12-30


# timeslot

In [55]:
def timeslot():
    """
    :objective: get timeslot of each show
    """
    show_counts = [len(list(y)) for x, y in itertools.groupby(ts_schedule.상품코드)] # count repeated 상품코드
    ts_schedule['parttime']  =  "" # define empty column
    j = 0
    for i in range(0,len(show_counts)):
        first_idx = j
        ts_schedule.parttime[first_idx] = 1
        j += show_counts[i]
        if show_counts[i] == 1: next
        ts_schedule.parttime[(first_idx+1):j] = np.arange(2,show_counts[i]+1)
    
    train['parttime'] = "" #define empty column
    #add timeslot variable to train dataset
    for i in range(0, len(ts_schedule)):
        train.parttime[train.방송일시 == ts_schedule.방송일시[i]] = ts_schedule.parttime[i]

In [56]:
timeslot()

# show_id

In [57]:
def get_show_id():
    """
    :objective: get show id for each day
    :return: pandas dataframe
    """
    ts_schedule['show_counts'] = ""
    for i in ts_schedule.ymd.unique():
        rtn = ts_schedule[ts_schedule.ymd == i]
        slot_count = 0 #number of shows for each day
        for j in range(0,len(rtn)):
            if rtn['parttime'].iloc[j] ==  1:
                slot_count += 1
                idx = ts_schedule[ts_schedule.ymd == i].index[j]
                ts_schedule.show_counts.iloc[idx] = str(i) + " "+ str(slot_count)

In [58]:
get_show_id()

In [59]:
ts_schedule.head()

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,exposed,japp,ymd,parttime,show_counts
0,2019-01-01 06:00:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900.0,2099000.0,20.0,6,2019-01-01,1,2019-01-01 1
1,2019-01-01 06:20:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900.0,3262000.0,20.0,6,2019-01-01,2,
2,2019-01-01 06:40:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900.0,6672000.0,20.0,7,2019-01-01,3,
3,2019-01-01 07:00:00,20.0,100305,200974,오모떼 레이스 파운데이션 브라,속옷,59000.0,6819000.0,20.0,7,2019-01-01,1,2019-01-01 2
4,2019-01-01 07:20:00,20.0,100305,200974,오모떼 레이스 파운데이션 브라,속옷,59000.0,15689000.0,20.0,7,2019-01-01,2,


# min_range

In [60]:
def get_min_range():
    """
    :objective: get minutes aired for each show
    :return: pandas dataframe
    """
    ts_schedule['min_range'] = ""
    for i in range(0,len(ts_schedule)):
        if ts_schedule.parttime.iloc[i] == 1:
            min_dur = ts_schedule.exposed.iloc[i]
            j = i + 1
            if j == (len(ts_schedule)): break
            while ts_schedule.parttime.iloc[j] != 1:
                min_dur += ts_schedule.exposed.iloc[j]
                j += 1
                if j == (len(ts_schedule)): break
        ts_schedule.min_range.iloc[i:j] = min_dur

In [61]:
get_min_range()

In [62]:
sum(ts_schedule.min_range.isna())

0

### apply show_id & min_range to train data

In [63]:
def add_showid_minran_to_train():
    """
    :objective: add show_id and min_range column to train data
    :return: pandas dataframe
    """
    train['min_range'] = ""
    train['show_id'] = ""
    for i in ts_schedule[ts_schedule['show_counts'] != ""].index:
        show_id = ts_schedule.show_counts.iloc[i]
        time_slot = ts_schedule.방송일시.iloc[i]
        minrange =  ts_schedule.min_range.iloc[i]
        idx = train[(train.방송일시 >= time_slot) & (train.방송일시 < time_slot + datetime.timedelta(minutes=minrange))].index
        train.show_id.iloc[idx] = show_id
        train.min_range.iloc[idx] = minrange


In [64]:
add_showid_minran_to_train()

In [65]:
train.tail(10)

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,exposed,japp,ymd,parttime,min_range,show_id
38299,2019-12-31 23:40:00,,100448,201391,일시불쿠첸압력밥솥 6인용,주방,148000.0,10157000.0,20.0,0,2019-12-31,2,60,2019-12-31 20
38300,2020-01-01 00:00:00,20.0,100448,201383,무이자쿠첸압력밥솥 10인용,주방,178000.0,50929000.0,20.0,0,2020-01-01,3,60,2019-12-31 20
38302,2020-01-01 00:00:00,,100448,201384,무이자쿠첸압력밥솥 6인용,주방,158000.0,13765000.0,20.0,0,2020-01-01,3,60,2019-12-31 20
38301,2020-01-01 00:00:00,,100448,201390,일시불쿠첸압력밥솥 10인용,주방,168000.0,104392000.0,20.0,0,2020-01-01,3,60,2019-12-31 20
38303,2020-01-01 00:00:00,,100448,201391,일시불쿠첸압력밥솥 6인용,주방,148000.0,46608000.0,20.0,0,2020-01-01,3,60,2019-12-31 20
38304,2020-01-01 00:20:00,20.0,100073,200196,삼성화재 행복한파트너 주택화재보험(1912),무형,,,20.0,0,2020-01-01,1,60,2020-01-01 1
38305,2020-01-01 00:40:00,20.0,100073,200196,삼성화재 행복한파트너 주택화재보험(1912),무형,,,20.0,1,2020-01-01,2,60,2020-01-01 1
38306,2020-01-01 01:00:00,20.0,100073,200196,삼성화재 행복한파트너 주택화재보험(1912),무형,,,20.0,1,2020-01-01,3,60,2020-01-01 1
38307,2020-01-01 01:20:00,20.0,100490,201478,더케이 예다함 상조서비스(티포트),무형,,,20.0,1,2020-01-01,1,37,2020-01-01 2
38308,2020-01-01 01:40:00,17.0,100490,201478,더케이 예다함 상조서비스(티포트),무형,,,17.0,2,2020-01-01,2,37,2020-01-01 2


# volume

In [66]:
train['volume'] = train['취급액']/train['판매단가']

# sales.power

In [67]:
def get_sales_power():
    """
    :objective: get sales power of each product, sum(exposed time)/sum(sales volume)
    """
    train['sales_power'] = ""
    bp = train.groupby('상품코드').exposed.sum()/train.groupby('상품코드').volume.sum()
    for i in bp.index:
        train.sales_power.loc[train.상품코드 == i] = bp.loc[i]

In [68]:
get_sales_power()

In [69]:
train.head()

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,exposed,japp,ymd,parttime,min_range,show_id,volume,sales_power
0,2019-01-01 06:00:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900.0,2099000.0,20.0,6,2019-01-01,1,60,2019-01-01 1,52.606516,0.1337
1,2019-01-01 06:00:00,,100346,201079,테이트 여성 셀린니트3종,의류,39900.0,4371000.0,20.0,6,2019-01-01,1,60,2019-01-01 1,109.548872,0.0722394
2,2019-01-01 06:20:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900.0,3262000.0,20.0,6,2019-01-01,2,60,2019-01-01 1,81.754386,0.1337
3,2019-01-01 06:20:00,,100346,201079,테이트 여성 셀린니트3종,의류,39900.0,6955000.0,20.0,6,2019-01-01,2,60,2019-01-01 1,174.310777,0.0722394
4,2019-01-01 06:40:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900.0,6672000.0,20.0,7,2019-01-01,3,60,2019-01-01 1,167.218045,0.1337


# men

In [70]:
def check_men_items():
    """
    :objective: create a dummy variable to identify products for men
    """
    mens_category = ["의류", "이미용", "잡화", "속옷"] #  only for these categories
    train['men'] = ""
    train.men[train['상품군'].isin(mens_category)] = 0
    train.men[train['상품군'].isin(mens_category) & train['상품명'].str.contains("남성")] = 1

In [71]:
check_men_items()

In [72]:
train[train.men  ==  1].head()

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,exposed,japp,ymd,parttime,min_range,show_id,volume,sales_power,men
0,2019-01-01 06:00:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900.0,2099000.0,20.0,6,2019-01-01,1,60,2019-01-01 1,52.606516,0.1337,1
2,2019-01-01 06:20:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900.0,3262000.0,20.0,6,2019-01-01,2,60,2019-01-01 1,81.754386,0.1337,1
4,2019-01-01 06:40:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900.0,6672000.0,20.0,7,2019-01-01,3,60,2019-01-01 1,167.218045,0.1337,1
9,2019-01-01 08:00:00,20.0,100808,202377,CERINI by PAT 남성 소프트 기모 릴렉스팬츠,의류,59900.0,16133000.0,20.0,8,2019-01-01,1,60,2019-01-01 3,269.33222,0.0277619,1
10,2019-01-01 08:20:00,20.0,100808,202377,CERINI by PAT 남성 소프트 기모 릴렉스팬츠,의류,59900.0,30061000.0,20.0,8,2019-01-01,2,60,2019-01-01 3,501.853088,0.0277619,1


# hours

In [73]:
def get_hour():
    """
    :objective: get hour
    """
    train['hours'] = train.방송일시.dt.hour

In [74]:
get_hour()

# freq

In [75]:
def freq_items():
    """
    :objective: identify frequently sold items by dummy variable "freq"
    """
    # define top ten frequently sold items list
    freq_list = train.groupby('상품코드').show_id.nunique().sort_values(ascending=False).index[1:10]
    train['freq'] = 0
    train.freq.loc[train.상품코드.isin(freq_list)] = 1

In [76]:
freq_items()

# drop na

In [77]:
def drop_na():
    """
    :objective: drop na rows and 취급액 == 50000
    """
    rtn = train[train['취급액'].notna()]
    rtn = rtn[rtn['취급액']!= 50000]
    return rtn

In [78]:
drop_na().shape

(35379, 19)

In [79]:
train = drop_na()

# category added

In [84]:
def add_categories():
    """
    :objective: add category columns
    :return: pandas dataframe
    """
    categories = pd.read_excel("../data/01/2019sales_added.xlsx")
    categories.상품코드 = categories.상품코드.dropna().astype(int).astype(str).str.zfill(6)
    categories.방송일시 = pd.to_datetime(categories.방송일시, format="%Y/%m/%d %H:%M")
    categories.sort_values(['방송일시', '상품코드'], ascending=[True, True], inplace=True)
    categories.rename(columns={' 취급액 ': '취급액'}, inplace=True)
    categories = categories[categories['취급액'].notna()]
    categories = categories[categories['취급액']!= 50000]
    output = pd.merge(left=train,
                      right=categories[['방송일시', '상품코드', 'brand', 'original_c', 'small_c' 
                                        ,'small_c_code','middle_c', 'middle_c_code', 'big_c']],
                      how='left', on=['방송일시', '상품코드'], sort=False)
    return output

In [85]:
train = add_categories()

# weekday

In [87]:
def get_weekday():
    """
    :objective: get weekday
    """
    train['weekday'] = train.방송일시.dt.weekday_name

In [88]:
get_weekday()

# steady sellers

In [90]:
def check_steady_sellers():
    """
    :objective: check if it is included in top 40(by total sales)
    """
    steady_list = train.groupby('상품코드')\
        .apply(lambda x: sum(x.취급액)/x.show_id.nunique()).sort_values(ascending = False).index[1:40]
    train['steady'] = 0
    train.steady.loc[train.상품코드.isin(steady_list)] = 1

In [91]:
check_steady_sellers()

In [57]:
round(train.취급액[train.steady ==  1].sum(skipna=True)/train.취급액.sum(skipna=True)*100,3)

8.519

# brand power

In [93]:
def check_brand_power():
    """
    :objective: identify items with low sales power(+) & high price
    """
    bpower_list = train.마더코드.loc[(train.sales_power > train.sales_power.quantile(0.7)) &
         (train.판매단가 > train.판매단가.quantile(0.7))].unique()
    train['bpower'] = 0
    train.bpower.loc[train.마더코드.isin(bpower_list)] = 1

In [94]:
check_brand_power()

# dup_times

In [95]:
def get_dup_times():
    """
    :objective: get # of shows within the same category in a day
    """
    train['dup_times'] = ""
    dup_times_list = train.groupby(['ymd','상품군'])\
        .show_id.nunique()
    for ymd_idx, cate_idx in dup_times_list.index:
        val = dup_times_list.loc[([(ymd_idx, cate_idx)])].values[0]
        train.dup_times.loc[(train.ymd == ymd_idx) & (train.상품군 == cate_idx)] = val

In [96]:
get_dup_times()

## alternative

eda에서 확인해봤을 때, 가구/침구는 방송횟수 한 번만 더해도 매출 감소.
다른 카테고리는 오히려 더 방송할 수록 매출 많?

# dup_times_smallc

In [97]:
def get_dup_times_smallc():
    """
    :objective: get # of shows within the same small_c in a day
    """
    train['dup_times'] = ""
    dup_times_small_list = train.groupby(['ymd','small_c'])\
        .show_id.nunique()
    for ymd_idx, cate_idx in dup_times_list.index:
        val = dup_times_small_list.loc[([(ymd_idx, cate_idx)])].values[0]
        train.dup_times.loc[(train.ymd == ymd_idx) & (train.상품군 == cate_idx)] = val

# primetime(by original_c)

In [98]:
def check_originalc_primet():
    """
    :objective: return 1 if its hour is within its original c's primetime
    """
    train['prime_origin'] = ""
    hours_originalc = train.groupby(['hours', 'original_c'])\
        ['취급액'].sum().rename("tot_sales").groupby(level=0, group_keys=False)
    hours_originalc_list = hours_originalc.nlargest(2)
    for hr, original_c_nm in hours_originalc_list.index:
        train.prime_origin.loc[(train.hours == hr) & (train.original_c == original_c_nm)] = 1

In [99]:
check_originalc_primet()

# primetime(by small_c)

In [100]:
def check_smallc_primet():
    """
    :objective: return 1 if its hour is within its small c's primetime
    """
    train['prime_smallc'] = ""
    hours_smallc = train.groupby(['hours', 'small_c'])\
        ['취급액'].sum().rename("tot_sales").groupby(level=0, group_keys=False)
    hours_smallc_list = hours_smallc.nlargest(2)
    for hr, small_c_nm in hours_smallc_list.index:
        train.prime_smallc.loc[(train.hours == hr) & (train.small_c == small_c_nm)] = 1

In [101]:
check_smallc_primet()

# spring/summer/fall/winter

In [102]:
def get_season_items():
    """
    :objective: create dummy vars(spring,summer,fall,winter) for seasonal items
    """
    with open("../data/11/seasonal.json") as json_file:
        seasonal_items = json.load(json_file)
    train['spring'] = 0
    train['summer'] = 0
    train['fall'] = 0
    train['winter'] = 0
    train.spring.loc[train['original_c'].isin(seasonal_items['spring'])] = 1
    train.summer.loc[train['original_c'].isin(seasonal_items['summer'])] = 1
    train.fall.loc[train['original_c'].isin(seasonal_items['fall'])] = 1
    train.winter.loc[train['original_c'].isin(seasonal_items['winter'])] = 1

In [103]:
get_season_items()

In [114]:
train.columns

Index(['방송일시', '노출(분)', '마더코드', '상품코드', '상품명', '상품군', '판매단가', '취급액', 'exposed',
       'japp', 'ymd', 'parttime', 'min_range', 'show_id', 'volume',
       'sales_power', 'men', 'hours', 'freq', 'brand', 'original_c', 'small_c',
       'small_c_code', 'middle_c', 'middle_c_code', 'big_c', 'weekday',
       'steady', 'bpower', 'dup_times', 'prime_origin', 'prime_smallc',
       'spring', 'summer', 'fall', 'winter'],
      dtype='object')

In [123]:
train.loc[(train.마더코드 == "100148")][220:240]

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,exposed,japp,...,weekday,steady,bpower,dup_times,prime_origin,prime_smallc,spring,summer,fall,winter
3955,2019-02-13 19:00:00,,100148,200439,무이자 LG 울트라HD TV 65UK6800HNC,가전,2130000.0,5499000.0,20.0,19,...,Wednesday,0,1,2,,,0,0,0,1
3956,2019-02-13 19:00:00,,100148,200448,무이자 LG 울트라HD TV 70UK6800HNC,가전,2990000.0,7710000.0,20.0,19,...,Wednesday,0,1,2,,,0,0,0,1
3957,2019-02-13 19:00:00,20.0,100148,200497,일시불 LG 울트라HD TV 55UK6800HNC,가전,1300000.0,6682000.0,20.0,19,...,Wednesday,0,1,2,,,0,0,0,1
3958,2019-02-13 19:00:00,,100148,200506,일시불 LG 울트라HD TV 65UK6800HNC,가전,1900000.0,4908000.0,20.0,19,...,Wednesday,0,1,2,,,0,0,0,1
3959,2019-02-13 19:20:00,,100148,200439,무이자 LG 울트라HD TV 65UK6800HNC,가전,2130000.0,11000000.0,20.0,19,...,Wednesday,0,1,2,,,0,0,0,1
3960,2019-02-13 19:20:00,20.0,100148,200497,일시불 LG 울트라HD TV 55UK6800HNC,가전,1300000.0,10023000.0,20.0,19,...,Wednesday,0,1,2,,,0,0,0,1
3961,2019-02-13 19:20:00,,100148,200506,일시불 LG 울트라HD TV 65UK6800HNC,가전,1900000.0,4934000.0,20.0,19,...,Wednesday,0,1,2,,,0,0,0,1
3962,2019-02-13 19:20:00,,100148,200515,일시불 LG 울트라HD TV 70UK6800HNC,가전,2700000.0,6964000.0,20.0,19,...,Wednesday,0,1,2,,,0,0,0,1
3963,2019-02-13 19:40:00,,100148,200430,무이자 LG 울트라HD TV 55UK6800HNC,가전,1440000.0,11103000.0,20.0,20,...,Wednesday,0,1,2,,,0,0,0,1
3964,2019-02-13 19:40:00,,100148,200439,무이자 LG 울트라HD TV 65UK6800HNC,가전,2130000.0,10948000.0,20.0,20,...,Wednesday,0,1,2,,,0,0,0,1


# prev_time_sales

train['prev_time_sales'] = ""
dup_times_list = train.groupby(['ymd','상품군'])\
    .show_id.nunique()
for ymd_idx, cate_idx in dup_times_list.index:
    val = dup_times_list.loc[([(ymd_idx, cate_idx)])].values[0]
    train.dup_times.loc[(train.ymd == ymd_idx) & (train.상품군 == cate_idx)] = val

In [235]:
train.groupby('show_id').취급액.sum().index

Index(['', '2019-01-01 1', '2019-01-01 10', '2019-01-01 11', '2019-01-01 12',
       '2019-01-01 13', '2019-01-01 14', '2019-01-01 15', '2019-01-01 16',
       '2019-01-01 17',
       ...
       '2019-12-31 18', '2019-12-31 2', '2019-12-31 20', '2019-12-31 3',
       '2019-12-31 4', '2019-12-31 5', '2019-12-31 6', '2019-12-31 7',
       '2019-12-31 8', '2019-12-31 9'],
      dtype='object', name='show_id', length=6971)

In [247]:
train.groupby('show_id').취급액.sum().index

Index(['', '2019-01-01 1', '2019-01-01 10', '2019-01-01 11', '2019-01-01 12',
       '2019-01-01 13', '2019-01-01 14', '2019-01-01 15', '2019-01-01 16',
       '2019-01-01 17',
       ...
       '2019-12-31 18', '2019-12-31 2', '2019-12-31 20', '2019-12-31 3',
       '2019-12-31 4', '2019-12-31 5', '2019-12-31 6', '2019-12-31 7',
       '2019-12-31 8', '2019-12-31 9'],
      dtype='object', name='show_id', length=6971)

In [250]:
train.show_id.tail(10)

35369    2019-12-31 20
35370    2019-12-31 20
35371    2019-12-31 20
35372    2019-12-31 20
35373    2019-12-31 20
35374    2019-12-31 20
35375    2019-12-31 20
35376    2019-12-31 20
35377    2019-12-31 20
35378    2019-12-31 20
Name: show_id, dtype: object

# prev_time_originalc

# tv ratings

업데이트 예정

In [857]:
aud_use_week = pd.read_csv("../data/aud_use_week.csv")
aud_use_weekend  = pd.read_csv("../data/aud_use_weekend.csv")

In [146]:
# import libraries
# stat
import pandas as pd
import numpy as np
import math
import random

# data
import datetime
import itertools
import json

class Features:
    def __init__(self):
        ## load data
        self.train = pd.read_csv("../data/00/2019sales.csv", skiprows = 1)
        self.train.rename(columns={' 취급액 ': '취급액'}, inplace = True)
        self.train['exposed']  = self.train['노출(분)']
        # define data types
        self.train.마더코드 = self.train.마더코드.astype(int).astype(str).str.zfill(6)
        self.train.상품코드 = self.train.상품코드.astype(int).astype(str).str.zfill(6)
        self.train.취급액 = self.train.취급액.str.replace(",","").astype(float)
        self.train.판매단가 = self.train.판매단가.str.replace(",","").replace(' - ', np.nan).astype(float)
        self.train.방송일시 = pd.to_datetime(self.train.방송일시, format="%Y/%m/%d %H:%M")
        self.train.sort_values(['방송일시', '상품코드'], ascending=[True, True], inplace = True)
        self.train['ymd'] = [d.date() for d in self.train["방송일시"]]
        self.train['volume'] = self.train['취급액'] / self.train['판매단가']
        # define ts_schedule, one row for each timeslot
        self.ts_schedule = self.train.copy().groupby('방송일시').first()
        self.ts_schedule.reset_index(inplace = True)

    ##################################
    ## onair time/order info variables
    ##################################

    def get_time(self):
        """
        :** objective: get year, month, day, hours
        """
        self.train['years'] = self.train.방송일시.dt.year
        self.train['months'] = self.train.방송일시.dt.month
        self.train['days'] = self.train.방송일시.dt.day
        self.train['hours'] = self.train.방송일시.dt.hour

    def get_weekday(self):
        """
        :** objective: get weekday
        """
        self.train['weekdays'] = self.train.방송일시.dt.day_name()

    def get_hours_inweek(self):
        """
        :** objective: get hours by week (1~168)
        """
        hours_inweek = []
        for i in range(0, len(self.train)):
            hr = self.train['hours'].iloc[i]
            dy = self.train['weekdays'].iloc[i]
            if dy == 'Tuesday' :
                hours_inweek.append(hr+24)
            elif dy == 'Wednesday' :
                hours_inweek.append(hr+24*2)
            elif dy == 'Thursday' :
                hours_inweek.append(hr+24*3)
            elif dy == 'Friday' :
                hours_inweek.append(hr+24*4)
            elif dy == 'Saturday' :
                hours_inweek.append(hr+24*5)
            elif dy == 'Sunday' :
                hours_inweek.append(hr+24*6)
            else :
                hours_inweek.append(hr)
        self.train['hours_inweek'] = hours_inweek

    def get_holidays(self):
        """
        :** objective: create a dummy variable for holidays (weekends + red)
        """
        holidays = []
        holiday_dates = ['2019-01-01', '2019-02-04','2019-02-05','2019-02-06',
                          '2019-03-01','2019-05-06','2019-06-06','2019-08-15',
                          '2019-09-12','2019-09-13','2019-10-03','2019-10-09',
                          '2019-12-25']
        for i in range(0, len(self.train)):
            dt = str(self.train['ymd'].iloc[i])
            dy = self.train['weekdays'].iloc[i]
            if dt in holiday_dates or dy == 'Saturday' or dy == 'Sunday':
                holidays.append(1)
            else: holidays.append(0)
        self.train['holidays'] = holidays

    def get_red_days(self):
        """
        :** objective: create a dummy variable for just red
        """
        red = []
        holiday_dates = ['2019-01-01', '2019-02-04','2019-02-05','2019-02-06',
                          '2019-03-01','2019-05-06','2019-06-06','2019-08-15',
                          '2019-09-12','2019-09-13','2019-10-03','2019-10-09',
                          '2019-12-25']
        for i in range(0, len(self.train)):
            dt = str(self.train['ymd'].iloc[i])
            if dt in holiday_dates:
                red.append(1)
            else: red.append(0)
        self.train['red'] = red

    def get_weekends(self):
        """
        :** objective: create a dummy variable for just weekends
        """
        self.train['weekends'] = 0
        self.train.loc[(self.train['red']==0) & (self.train['holidays']==1),'weekends'] =1


    def get_min_start(self):
        """
        :** objective: get startig time (min)
        """
        self.train['min_start'] = self.train.방송일시.dt.minute
        #list(set(train.방송일시.dt.minute)) #unique

    def filter_jappingt(self):
        """
        :objective: round up 방송일시
        """
        japp = []
        for i in range(0, len(self.train)):
            time = self.train['방송일시'].iloc[i]
            if (time.minute < 30) & (time.hour == 0):
                rtn = time.hour
            elif time.minute >= 30:
                if time.hour == 23: rtn = 0
                else: rtn = time.hour + 1
            else:
                if time.hour == 0: rtn = 23
                else: rtn = time.hour
            japp.append(rtn)
        self.train['japp'] = japp

    def fill_exposed_na(self):
        """
        :objective: fill out NA values on 'exposed' with mean(exposed)
        :return:pd.Dataframe with adjusted 'exposed' column
        """
        self.train["exposed"].fillna(self.train.groupby('방송일시')['exposed'].transform('mean'), inplace = True)

    def get_ymd(self):
        """
        :objective: add 'ymd' variable to train dataset
        :return: pandas dataframe
        """
        t = 1
        while t < 9:
            for i in self.ts_schedule.ymd.unique():
                if i == datetime.date(2019,1,1): continue
                time_idx = self.ts_schedule[self.ts_schedule.ymd == i].index[0]
                first_show = self.ts_schedule.iloc[time_idx]
                last_show = self.ts_schedule.iloc[time_idx - 1]
                if (first_show['마더코드'] == last_show['마더코드']) & (first_show['방송일시'] <= last_show['방송일시'] + datetime.timedelta(minutes=last_show['exposed'])):
                    self.ts_schedule.ymd.iloc[time_idx] = self.ts_schedule.ymd.iloc[time_idx - 1]

            t = t + 1

    def timeslot(self):
        """
        :objective: get timeslot of each show
        """
        show_counts = [len(list(y)) for x, y in itertools.groupby(self.ts_schedule.상품코드)]  # count repeated 상품코드
        self.ts_schedule['parttime'] = ""  # define empty column
        j = 0
        for i in range(0, len(show_counts)):
            first_idx = j
            self.ts_schedule.parttime[first_idx] = 1
            j += show_counts[i]
            if show_counts[i] == 1:
                next
            self.ts_schedule.parttime[(first_idx + 1):j] = np.arange(2, show_counts[i] + 1)

        self.train['parttime'] = ""  # define empty column
        # add timeslot variable to train dataset
        for i in range(0, len(self.ts_schedule)):
            self.train.parttime[self.train.방송일시 == self.ts_schedule.방송일시[i]] = self.ts_schedule.parttime[i]

    def get_show_id(self):
        """
        :objective: get show id for each day
        :return: pandas dataframe
        """
        self.ts_schedule['show_counts'] = ""
        for i in self.ts_schedule.ymd.unique():
            rtn = self.ts_schedule[self.ts_schedule.ymd == i]
            slot_count = 0 #number of shows for each day
            for j in range(0,len(rtn)):
                if rtn['parttime'].iloc[j] ==  1:
                    slot_count += 1
                    idx = self.ts_schedule[self.ts_schedule.ymd == i].index[j]
                    self.ts_schedule.show_counts.iloc[idx] = str(i) + " "+ str(slot_count)

    def get_min_range(self):
        """
        :objective: get minutes aired for each show
        :return: pandas dataframe
        """
        self.ts_schedule['min_range'] = ""
        for i in range(0,len(self.ts_schedule)):
            if self.ts_schedule.parttime.iloc[i] == 1:
                min_dur = self.ts_schedule.exposed.iloc[i]
                j = i + 1
                if j == (len(self.ts_schedule)): break
                while self.ts_schedule.parttime.iloc[j] != 1:
                    min_dur += self.ts_schedule.exposed.iloc[j]
                    j += 1
                    if j == (len(self.ts_schedule)): break
            self.ts_schedule.min_range.iloc[i:j] = min_dur

    def add_showid_minran_to_train(self):
        """
        :objective: add show_id and min_range column to train data
        :return: pandas dataframe
        """
        self.train['min_range'] = ""
        self.train['show_id'] = ""
        for i in self.ts_schedule[self.ts_schedule['show_counts'] != ""].index:
            show_id = self.ts_schedule.show_counts.iloc[i]
            time_slot = self.ts_schedule.방송일시.iloc[i]
            minrange = self.ts_schedule.min_range.iloc[i]
            idx = self.train[(self.train.방송일시 >= time_slot) & (self.train.방송일시 < time_slot + datetime.timedelta(minutes=minrange))].index
            self.train.show_id.iloc[idx] = show_id
            self.train.min_range.iloc[idx] = minrange


    ############################
    ## primetime
    ############################
    def get_primetime(self):
        """
        :**objective: get primetime for week and weekends respectively
        """
        self.train['primetime'] = 0
        prime_week = [9,10,11]
        prime_week2 = [16,17,18]
        prime_weekend = [7,8,9]
        prime_weekend2 = [13,15,16,17]

        self.train.loc[(self.train['red']==0) & (self.train['holidays']==1) & (self.train['hours'].isin(prime_weekend)),'primetime'] =1
        self.train.loc[(self.train['red']==0) & (self.train['holidays']==1) & (self.train['hours'].isin(prime_weekend2)),'primetime'] = 2

        self.train.loc[(self.train['holidays']==0) & (self.train['hours'].isin(prime_week)),'primetime'] = 1
        self.train.loc[(self.train['holidays']==0) & (self.train['hours'].isin(prime_week2)),'primetime'] = 2

    def check_originalc_primet(self):
        """
        :objective: return 1 if its hour is within its original c's primetime
        """
        self.train['prime_origin'] = ""
        hours_originalc = self.train.groupby(['hours', 'original_c']) \
            ['취급액'].sum().rename("tot_sales").groupby(level=0, group_keys=False)
        hours_originalc_list = hours_originalc.nlargest(2)
        for hr, original_c_nm in hours_originalc_list.index:
            self.train.prime_origin.loc[(self.train.hours == hr) & (self.train.original_c == original_c_nm)] = 1

    def check_smallc_primet(self):
        """
        :objective: return 1 if its hour is within its small c's primetime
        """
        self.train['prime_smallc'] = ""
        hours_smallc = self.train.groupby(['hours', 'small_c']) \
            ['취급액'].sum().rename("tot_sales").groupby(level=0, group_keys=False)
        hours_smallc_list = hours_smallc.nlargest(2)
        for hr, small_c_nm in hours_smallc_list.index:
            self.train.prime_smallc.loc[(self.train.hours == hr) & (self.train.small_c == small_c_nm)] = 1

    ############################
    ## sales/volume power variables
    ############################
    def get_sales_power(self):
        """
        :objective: get sales power of each product, sum(exposed time)/sum(sales volume)
        """
        self.train['sales_power'] = ""
        bp = self.train.groupby('상품코드').exposed.sum()/self.train.groupby('상품코드').volume.sum()
        for i in bp.index:
            self.train.sales_power.loc[self.train.상품코드 == i] = bp.loc[i]

    def freq_items(self):
        """
        :objective: identify frequently sold items by dummy variable "freq"
        """
        # define top ten frequently sold items list
        freq_list = self.train.groupby('상품코드').show_id.nunique().sort_values(ascending=False).index[1:10]
        self.train['freq'] = 0
        self.train.freq.loc[self.train.상품코드.isin(freq_list)] = 1

    def check_steady_sellers(self):
        """
        :objective: check if it is included in top 40(by total sales)
        """
        steady_list = self.train.groupby('상품코드') \
                          .apply(lambda x: sum(x.취급액) / x.show_id.nunique()).sort_values(ascending=False).index[1:40]
        self.train['steady'] = 0
        self.train.steady.loc[self.train.상품코드.isin(steady_list)] = 1

    def check_brand_power(self):
        """
        :objective: identify items with low sales power(+) & high price
        """
        bpower_list = self.train.마더코드.loc[(self.train.sales_power > self.train.sales_power.quantile(0.7)) &
                                     (self.train.판매단가 > self.train.판매단가.quantile(0.7))].unique()
        self.train['bpower'] = 0
        self.train.bpower.loc[self.train.마더코드.isin(bpower_list)] = 1

    ############################
    ## Other characteristics
    ############################
    def check_men_items(self):
        """
        :objective: create a dummy variable to identify products for men
        """
        mens_category = ["의류", "이미용", "잡화", "속옷"]  # only for these categories
        self.train['men'] = ""
        self.train.men[self.train['상품군'].isin(mens_category)] = 0
        self.train.men[self.train['상품군'].isin(mens_category) & self.train['상품명'].str.contains("남성")] = 1

    def check_luxury_items(self):
        """
        :**objective: create a dummy variable to identify products with selling price >= 490,000
        """
        self.train['luxury'] = 0
        self.train.loc[self.train['판매단가']>=490000, 'luxury'] = 1

    def check_pay(self):
        """
        :**objective: create 3 factor variable to identify payment methods ('ilsibul','muiza','none')
        """
        pay = []
        for i in range(0,len(self.train)) :
            word = self.train['상품명'].iloc[i]
            if '(일)' in word or '일시불' in word :
                pay.append('ilsibul')
            elif '(무)' in word or '무이자' in word :
                pay.append('muiza')
            else :
                pay.append('none')
        self.train['pay'] = pay

    def get_dup_times(self):
        """
        :objective: get # of shows within the same category in a day
        """
        self.train['dup_times'] = ""
        dup_times_list = self.train.groupby(['ymd', '상품군']) \
            .show_id.nunique()
        for ymd_idx, cate_idx in dup_times_list.index:
            val = dup_times_list.loc[([(ymd_idx, cate_idx)])].values[0]
            self.train.dup_times.loc[(self.train.ymd == ymd_idx) & (self.train.상품군 == cate_idx)] = val

    def get_dup_times_smallc(self):
        """
        :objective: get # of shows within the same small_c in a day
        """
        self.train['dup_times'] = ""
        dup_times_small_list = self.train.groupby(['ymd', 'small_c']) \
            .show_id.nunique()
        for ymd_idx, cate_idx in dup_times_small_list.index:
            val = dup_times_small_list.loc[([(ymd_idx, cate_idx)])].values[0]
            self.train.dup_times.loc[(self.train.ymd == ymd_idx) & (self.train.상품군 == cate_idx)] = val


    ############################
    ## External information
    ############################
    def add_categories(self):
        """
        :objective: add category columns
        :return: pandas dataframe
        """
        categories = pd.read_excel("../data/01/2019sales_added.xlsx")
        categories.상품코드 = categories.상품코드.dropna().astype(int).astype(str).str.zfill(6)
        categories.방송일시 = pd.to_datetime(categories.방송일시, format="%Y/%m/%d %H:%M")
        categories.sort_values(['방송일시', '상품코드'], ascending=[True, True], inplace=True)
        categories.rename(columns={' 취급액 ': '취급액'}, inplace=True)
        self.train = pd.merge(left=self.train,
                          right=categories[['방송일시', '상품코드', 'brand', 'original_c', 'small_c', 'small_c_code','middle_c','middle_c_code','big_c']],
                          how='inner', on=['방송일시', '상품코드'], sort=False)

    def add_vratings(self):
        """
        :**objective: add vratings by rate mean
        """
        onair = pd.read_csv("../data/vrating_defined.csv")
        onair.상품코드 = onair.상품코드.dropna().astype(int).astype(str).str.zfill(6)
        onair['방송일시'] = onair[['DATE','TIME']].agg(' '.join, axis=1)
        onair['방송일시'] = pd.to_datetime(onair.방송일시, format="%Y/%m/%d %H:%M")
        onair.sort_values(['방송일시', '상품코드'], ascending=[True, True], inplace=True)

        #impute rate mean nan
        random.seed(100)
        for i in range(0,len(self.train)):
            if math.isnan(onair.iloc[i,1]):
                onair['rate_mean'].iloc[i] = onair['rate_mean'].iloc[i-1]
            else:
                continue

        # add noise to zero values
        for i in range(0,len(self.train)):
            val = onair['rate_mean'].iloc[i]
            if val == 0 :
                onair['rate_mean'].iloc[i] = np.random.uniform(0,1,1)[0]/1000000
        rate_mean = onair['rate_mean']
        self.train['vratings'] = rate_mean

    def get_season_items(self):
        """
        :objective: create dummy vars(spring,summer,fall,winter) for seasonal items
        """
        with open("../data/11/seasonal.json") as json_file:
            seasonal_items = json.load(json_file)
        self.train['spring'] = 0
        self.train['summer'] = 0
        self.train['fall'] = 0
        self.train['winter'] = 0
        self.train.spring.loc[self.train['original_c'].isin(seasonal_items['spring'])] = 1
        self.train.summer.loc[self.train['original_c'].isin(seasonal_items['summer'])] = 1
        self.train.fall.loc[self.train['original_c'].isin(seasonal_items['fall'])] = 1
        self.train.winter.loc[self.train['original_c'].isin(seasonal_items['winter'])] = 1

    def drop_na(self):
        """
        :objective: drop na rows and 취급액 == 50000
        """
        self.train = self.train[self.train['취급액'].notna()]
        self.train = self.train[self.train['취급액']!= 50000]

    def run_all(self):

        self.get_time()
        self.get_weekday()
        self.get_hours_inweek()
        self.get_holidays()
        self.get_red_days()
        self.get_weekends()
        self.get_min_start()

        self.filter_jappingt()
        self.fill_exposed_na()

        self.get_ymd()
        self.timeslot()
        self.get_show_id()
        self.get_min_range()
        self.add_showid_minran_to_train()

        self.drop_na()
        self.add_categories()

        self.get_primetime()
        self.check_originalc_primet()
        self.check_smallc_primet()

        self.get_sales_power()
        self.freq_items()
        self.get_dup_times()
        self.get_dup_times_smallc()

        self.check_brand_power()
        self.check_steady_sellers()
        self.check_men_items()
        self.check_luxury_items()
        self.check_pay()

        #self.add_vratings()
        self.get_season_items()

        return self.train


   

In [147]:
t = Features()

In [148]:
train = t.run_all()

In [151]:
train.isna().sum()

0

In [150]:
train.columns

Index(['방송일시', '노출(분)', '마더코드', '상품코드', '상품명', '상품군', '판매단가', '취급액', 'exposed',
       'ymd', 'volume', 'years', 'months', 'days', 'hours', 'weekdays',
       'hours_inweek', 'holidays', 'red', 'weekends', 'min_start', 'japp',
       'parttime', 'min_range', 'show_id', 'brand', 'original_c', 'small_c',
       'small_c_code', 'middle_c', 'middle_c_code', 'big_c'],
      dtype='object')

In [152]:
train.to_excel("../data/01/2019sales_v2.xlsx")