In [1]:
import pandas as pd
import utils
import numpy as np
import os
import load_data


import matplotlib 
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
import seaborn as sns
import scipy.stats as stats
from collections import Counter

In [2]:
# Load the training table from the directory configured in the project's
# `utils` module.
# NOTE(review): the file name is appended by plain string concatenation, so
# `utils.train_path` presumably already ends with a path separator - confirm.
train_path = utils.train_path
train_csv = train_path + 'train.csv'
df = pd.read_csv(train_csv)

In [20]:
# Pairwise correlation between the existing `time_w` feature and the
# `취급액` column.
baseline_cols = ['time_w', '취급액']
df[baseline_cols].corr()

Unnamed: 0,time_w,취급액
time_w,1.0,0.158561
취급액,0.158561,1.0


In [8]:
def processing_time_w_c(df_):
    """Add the per-product-group hourly weight column ``시간_w_c`` to ``df_``.

    Every product group (``상품군``) carries a hand-assigned "sine class" for
    each of the 21 broadcast hours (06..23, 00, 01, 02).  A class is an index
    into a half-period sine table, so hours near a group's local peaks get a
    weight close to 1 and hours far from them get a smaller weight.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Must contain the columns ``상품군`` and ``시간대``.  The frame is
        modified in place: its index is reset to 0..n-1 and ``시간_w_c`` is
        added (or overwritten).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    KeyError
        If a row's ``상품군`` is not one of the groups below, or its
        ``시간대`` is not one of the 21 broadcast hours.
    """
    df_.reset_index(drop=True, inplace=True)

    # Half-period sine table: hour_sin[i] = sin(pi * i / 22) for i = 0..21.
    # NOTE(review): the table has 22 entries although the broadcast day has
    # 21 hourly slots.  Its maximum (1.0) is at index 11, not index 10 as the
    # original comment claimed; class 10, the largest class used below, maps
    # to ~0.990.  The values are kept as-is so existing features do not shift.
    hour = list(range(0, 22, 1))
    hour_in_day = 22
    hour_sin = np.sin([np.pi * i / hour_in_day for i in hour]).tolist()

    # Sine class per broadcast hour for each product group, ordered as in
    # `slot_hours` below.
    peak_dict = {
        '가전': [6,7,8,9,10,9,8,9,10,9,8,7,7,8,9,10,9,8,7,6,5],  # local peaks: 10h, 14h, 21h; prime time: 21h
        '농수축': [6,7,8,9,10,9,8,7,7,8,9,10,9,8,8,9,10,9,8,7,6],  # local peaks: 10h, 17h, 22h; prime time: 22h
        '잡화': [9,10,9,8,9,10,9,8,8,9,10,9,9,10,9,8,9,10,9,8,7],  # local peaks: 7h, 11h, 16h, 19h, 23h; prime time: 19h
        '생활용품': [6,7,8,9,10,9,8,7,7,8,9,10,9,8,9,10,9,8,7,6,5],  # local peaks: 10h, 17h, 21h; prime time: 21h
        '주방': [5,6,7,8,9,10,9,8,9,10,9,8,7,8,9,10,9,8,7,6,5],  # local peaks: 11h, 15h, 21h; prime time: 21h
        '이미용': [6,7,8,9,10,9,8,7,8,9,10,9,8,8,9,10,9,8,7,6,5],  # local peaks: 10h, 16h, 21h; prime time: 21h
        '속옷': [6,7,8,9,10,9,8,7,6,5,4,5,6,7,8,9,10,9,8,7,6],  # local peaks: 10h, 22h; prime time: 22h
        '의류': [6,7,8,9,10,9,8,8,9,10,9,10,9,8,9,10,9,8,7,6,5],  # local peaks: 10h, 15h, 17h, 21h; prime time: 10h
        '건강기능': [5,6,7,8,9,10,9,8,7,6,7,8,9,10,9,8,7,6,5,4,3],  # local peaks: 11h, 19h; prime time: 19h
        '침구': [6,7,8,9,10,9,8,7,6,5,4,5,6,7,8,9,10,9,8,7,6],  # local peaks: 10h, 22h; prime time: 22h
        '가구': [9,10,9,8,7,6,5,4,3,4,5,6,7,8,9,10,9,8,7,6,5],  # local peaks: 7h, 21h; prime time: 21h
    }

    # Broadcast hours in schedule order; position k pairs with class list[k].
    slot_hours = [6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2]
    weight_by_group = {
        group: {h: hour_sin[c] for h, c in zip(slot_hours, classes)}
        for group, classes in peak_dict.items()
    }

    # Resolve every row with plain dict lookups and assign the column once.
    # The previous version seeded the column with integer 0 and wrote floats
    # row by row through `.loc`, which is slow and relies on an implicit
    # int -> float upcast that pandas >= 2.1 deprecates.  A missing group or
    # hour still raises KeyError, exactly as before.
    weights = [
        weight_by_group[group][slot]
        for group, slot in zip(df_['상품군'], df_['시간대'])
    ]
    df_['시간_w_c'] = np.array(weights, dtype=float)

    return df_

In [9]:
def processing_prime_time_w_c(df_):
    """Add the per-product-group prime-time weight column ``프라임시간_w_c``.

    Every product group (``상품군``) carries a hand-assigned "sine class" for
    each of the 21 broadcast hours (06..23, 00, 01, 02).  Only five classes
    are used: 11 (summit, weight 1.0) around the group's prime time, 6 and 16
    (second tier, both ~0.756), 1 (third tier, ~0.142) and 0 (weight 0.0).

    Parameters
    ----------
    df_ : pandas.DataFrame
        Must contain the columns ``상품군`` and ``시간대``.  The frame is
        modified in place: its index is reset to 0..n-1 and
        ``프라임시간_w_c`` is added (or overwritten).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    KeyError
        If a row's ``상품군`` is not one of the groups below, or its
        ``시간대`` is not one of the 21 broadcast hours.
    """
    df_.reset_index(drop=True, inplace=True)

    # Half-period sine table: hour_sin[i] = sin(pi * i / 22) for i = 0..21.
    # The maximum (1.0) is at index 11, which is the "summit" class used
    # below; the original comment naming index 10 as the peak was stale.
    hour = list(range(0, 22, 1))  # classes used: third 1, second 6, summit 11, second 16, third 0
    hour_in_day = 22
    hour_sin = np.sin([np.pi * i / hour_in_day for i in hour]).tolist()

    # Sine class per broadcast hour for each product group, ordered as in
    # `slot_hours` below.
    prime_peak_dict = {
        '가전': [1,1, 6,6,6,6,6,6,6,6,6,6,6,6,6, 11,11, 16, 0,0,0],  # local peaks: 10h, 14h, 21h; prime time: 21h
        '농수축': [1,1,1, 6,6,6,6,6,6,6,6,6,6,6,6, 11,11, 16,16, 0,0],  # local peaks: 10h, 17h, 22h; prime time: 22h
        '잡화': [1, 6,6,6,6,6,6,6,6,6,6,6,6, 11, 16,16,16,16,16, 0,0],  # local peaks: 7h, 11h, 16h, 19h, 23h; prime time: 19h
        '생활용품': [1,1, 6,6,6,6,6,6,6,6,6,6,6,6,6, 11, 16,16,16, 0,0],  # local peaks: 10h, 17h, 21h; prime time: 21h
        '주방': [1, 6,6,6,6,6,6,6,6,6,6,6,6,6,6, 11,11, 16, 0,0,0],  # local peaks: 11h, 15h, 21h; prime time: 21h
        '이미용': [1, 6,6,6,6,6,6,6,6,6,6,6,6,6, 11,11, 16,16,16, 0,0],  # local peaks: 10h, 16h, 21h; prime time: 21h
        '속옷': [1, 6,6,6,6,6,6,6,6,6,6,6,6,6,6, 11,11, 16,16, 0,0],  # local peaks: 10h, 22h; prime time: 22h
        '의류': [1, 6,6, 11,11,11,11,11,11,11,11,11,11,11,11,11, 16,16,16, 0,0],  # local peaks: 10h, 15h, 17h, 21h; prime time: 10h
        '건강기능': [1, 6,6,6,6,6,6,6,6,6,6,6,6, 11, 16,16,16,16,16, 0,0],  # local peaks: 11h, 19h; prime time: 19h
        '침구': [1,1,1, 6,6,6,6,6,6,6,6,6,6,6,6,6, 11, 16,16, 0,0],  # local peaks: 10h, 22h; prime time: 22h
        '가구': [1, 6,6,6,6,6,6,6,6,6,6,6,6,6,6, 11,11, 16, 0,0,0],  # local peaks: 7h, 21h; prime time: 21h
    }

    # Broadcast hours in schedule order; position k pairs with class list[k].
    slot_hours = [6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,0,1,2]
    weight_by_group = {
        group: {h: hour_sin[c] for h, c in zip(slot_hours, classes)}
        for group, classes in prime_peak_dict.items()
    }

    # Resolve every row with plain dict lookups and assign the column once.
    # The previous version seeded the column with integer 0 and wrote floats
    # row by row through `.loc`, which is slow and relies on an implicit
    # int -> float upcast that pandas >= 2.1 deprecates.  A missing group or
    # hour still raises KeyError, exactly as before.
    weights = [
        weight_by_group[group][slot]
        for group, slot in zip(df_['상품군'], df_['시간대'])
    ]
    df_['프라임시간_w_c'] = np.array(weights, dtype=float)

    return df_

In [14]:
# Add the `시간_w_c` weight column; the function modifies `df` in place and
# returns the same object.
# NOTE(review): this cell shows execution count 14 while the following cell
# shows 10, so the two were run out of order - confirm the notebook still
# reproduces under Restart Kernel -> Run All.
df = processing_time_w_c(df)

In [10]:
# Add the `프라임시간_w_c` weight column; the function modifies `df` in place
# and returns the same object.
df = processing_prime_time_w_c(df)

In [18]:
# Correlation matrix of `취급액` against the existing time weights and the
# two per-group weight columns added above.
weight_cols = ['취급액', 'time_w', 'prime_time_w', '시간_w_c', '프라임시간_w_c']
df[weight_cols].corr()

Unnamed: 0,취급액,time_w,prime_time_w,시간_w_c,프라임시간_w_c
취급액,1.0,0.158561,-0.038599,0.083067,0.115074
time_w,0.158561,1.0,0.381827,0.692447,0.788918
prime_time_w,-0.038599,0.381827,1.0,0.369615,0.641241
시간_w_c,0.083067,0.692447,0.369615,1.0,0.612603
프라임시간_w_c,0.115074,0.788918,0.641241,0.612603,1.0


In [17]:
# List every column currently on `df`, to confirm the new weight columns exist.
df.columns

Index(['방송일시', '노출(분)', '마더코드', '상품명', '상품군', '판매단가', '취급액', '노출(분)_w',
       '상품명_n', '일시불_할인율', '무이자_일시불', '방송상품idx', '상품별_idx', '방송순서_비율',
       '방송순서_w', '시간대', '기온', '강수량', '풍속', '습도', '방송일시n', '요일', '계절', '휴일',
       '주말', '월', '일', '주차', '방송상품set_idx', 'word_key', '분당취급액', '브랜드_w',
       'kospi_일', 'kospi_7일_이동평균', 'kospi_28일_이동평균', 'kospi_84일_이동평균',
       'kospi_월평균', 'time_w', 'prime_time_w', '주중휴일별_시간별_시청률', '주중별_시간별_시청률',
       '주중휴일별_시간별_10분별_시청률', '주중휴일별_시간별_10분별__시청률', '방송set', '프라임시간_w_c',
       '시간_w_c'],
      dtype='object')