# 시계열로 끊어서 데이터 만들기

In [1]:
# 모듈 불러오기 및 설정
import warnings
warnings.filterwarnings(action='ignore')

import pandas as pd
pd.set_option('display.max_rows', 500) 

from sklearn.preprocessing import StandardScaler, FunctionTransformer

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): 'Malgun Gothic' is presumably chosen so Korean labels render in plots — confirm the font is installed where this runs.
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False) # minus-sign rendering setting that goes with the font above
%config InlineBackend.figure_format = 'retina'

In [2]:
# Path to the raw Excel workbook (the file name indicates 2019 performance data).
RAW_DATA = './data/2020 빅콘테스트 데이터분석분야-챔피언리그_2019년 실적데이터_v1_200818.xlsx'

In [3]:
# Load the raw workbook; header=1 takes the column names from the second row of the sheet.
data_raw = pd.read_excel(RAW_DATA, header=1)
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38309 entries, 0 to 38308
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   방송일시    38309 non-null  datetime64[ns]
 1   노출(분)   21525 non-null  float64       
 2   마더코드    38309 non-null  int64         
 3   상품코드    38309 non-null  int64         
 4   상품명     38309 non-null  object        
 5   상품군     38309 non-null  object        
 6   판매단가    38309 non-null  int64         
 7   취급액     35379 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(3), object(2)
memory usage: 2.3+ MB


In [4]:
# Drop rows whose product group (상품군) is '무형' and renumber the index from zero.
is_intangible = data_raw['상품군'].eq('무형')
data = data_raw.loc[~is_intangible].reset_index(drop=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37372 entries, 0 to 37371
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   방송일시    37372 non-null  datetime64[ns]
 1   노출(분)   20588 non-null  float64       
 2   마더코드    37372 non-null  int64         
 3   상품코드    37372 non-null  int64         
 4   상품명     37372 non-null  object        
 5   상품군     37372 non-null  object        
 6   판매단가    37372 non-null  int64         
 7   취급액     35379 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(3), object(2)
memory usage: 2.3+ MB


In [5]:
# Missing-value handling.
# Before filling: 노출(분) has 16784 missing values and 취급액 has 1993.
# A row without 취급액 is treated as "nothing sold", so revenue becomes 0.
data['취급액'] = data['취급액'].fillna(0)
# A row without 노출(분) is treated as a continuation of the preceding broadcast,
# so the last known value is carried forward. `.ffill()` is the supported
# spelling; `fillna(method='ffill')` is deprecated in recent pandas.
data['노출(분)'] = data['노출(분)'].ffill()
data.isnull().sum()

방송일시     0
노출(분)    0
마더코드     0
상품코드     0
상품명      0
상품군      0
판매단가     0
취급액      0
dtype: int64

## 집계 시 필요한 컬럼

In [6]:
# Derive the calendar columns and the sales volume used for aggregation.
broadcast_ts = data['방송일시']
data['연도'] = broadcast_ts.dt.year
data['월'] = broadcast_ts.dt.month
data['일'] = broadcast_ts.dt.day
data['방송시간대'] = broadcast_ts.dt.hour
data['요일'] = broadcast_ts.dt.dayofweek  # 0 = Monday ... 6 = Sunday
# ISO week number. `Series.dt.week` is deprecated/removed in pandas;
# `isocalendar().week` yields the same numbers, cast back to a plain int column.
data['주차'] = broadcast_ts.dt.isocalendar().week.astype('int64')
# Sales volume = revenue / unit price.
data['판매량'] = data['취급액'] / data['판매단가']
data

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,연도,월,일,방송시간대,요일,주차,판매량
0,2019-01-01 06:00:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,2099000.0,2019,1,1,6,1,1,52.606516
1,2019-01-01 06:00:00,20.0,100346,201079,테이트 여성 셀린니트3종,의류,39900,4371000.0,2019,1,1,6,1,1,109.548872
2,2019-01-01 06:20:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,3262000.0,2019,1,1,6,1,1,81.754386
3,2019-01-01 06:20:00,20.0,100346,201079,테이트 여성 셀린니트3종,의류,39900,6955000.0,2019,1,1,6,1,1,174.310777
4,2019-01-01 06:40:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,6672000.0,2019,1,1,6,1,1,167.218045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37367,2019-12-31 23:40:00,20.0,100448,201391,일시불쿠첸압력밥솥 6인용,주방,148000,10157000.0,2019,12,31,23,1,1,68.628378
37368,2020-01-01 00:00:00,20.0,100448,201383,무이자쿠첸압력밥솥 10인용,주방,178000,50929000.0,2020,1,1,0,2,1,286.117978
37369,2020-01-01 00:00:00,20.0,100448,201390,일시불쿠첸압력밥솥 10인용,주방,168000,104392000.0,2020,1,1,0,2,1,621.380952
37370,2020-01-01 00:00:00,20.0,100448,201384,무이자쿠첸압력밥솥 6인용,주방,158000,13765000.0,2020,1,1,0,2,1,87.120253


In [7]:
# Replace the numeric weekday (0 = Monday) with its one-character Korean name.
day_mapping_dict = dict(enumerate(['월', '화', '수', '목', '금', '토', '일']))
weekday_names = data['요일'].map(day_mapping_dict)
data['요일'] = weekday_names

## 1. Weekly Data

- 방송편성대 = 20시간
- 52주 × 140시간(7일 × 20시간) = 7280개 슬롯. 단, 2019년 1월 1일이 화요일이라 1주차의 월요일 20시간이 없으므로 7280 - 20 = 7260개
- 제외한 데이터: `df_out_time`, `df_out_2020`, `df_out_53week`
    - 새벽 2시에 시작하는 방송
    - 2020년
    - 53주차, 1주차
- 행 방향(→) : 시간(요일별 06시부터 25시까지, 7일 × 20시간 = 140개 슬롯)
- 열 방향(↓) : 주차(2주차 ~ 52주차)
- 변환 format: standard scaling

In [29]:
# Keep only the columns needed for the weekly data set, in display order.
weekly_columns = [
    '연도', '월', '일', '요일', '방송시간대', '주차', '노출(분)',
    '상품군', '마더코드', '상품코드', '상품명', '판매단가', '판매량', '취급액',
]
df = data[weekly_columns]
df

Unnamed: 0,연도,월,일,요일,방송시간대,주차,노출(분),상품군,마더코드,상품코드,상품명,판매단가,판매량,취급액
0,2019,1,1,화,6,1,20.0,의류,100346,201072,테이트 남성 셀린니트3종,39900,52.606516,2099000.0
1,2019,1,1,화,6,1,20.0,의류,100346,201079,테이트 여성 셀린니트3종,39900,109.548872,4371000.0
2,2019,1,1,화,6,1,20.0,의류,100346,201072,테이트 남성 셀린니트3종,39900,81.754386,3262000.0
3,2019,1,1,화,6,1,20.0,의류,100346,201079,테이트 여성 셀린니트3종,39900,174.310777,6955000.0
4,2019,1,1,화,6,1,20.0,의류,100346,201072,테이트 남성 셀린니트3종,39900,167.218045,6672000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37367,2019,12,31,화,23,1,20.0,주방,100448,201391,일시불쿠첸압력밥솥 6인용,148000,68.628378,10157000.0
37368,2020,1,1,수,0,1,20.0,주방,100448,201383,무이자쿠첸압력밥솥 10인용,178000,286.117978,50929000.0
37369,2020,1,1,수,0,1,20.0,주방,100448,201390,일시불쿠첸압력밥솥 10인용,168000,621.380952,104392000.0
37370,2020,1,1,수,0,1,20.0,주방,100448,201384,무이자쿠첸압력밥솥 6인용,158000,87.120253,13765000.0


In [32]:
# Set aside broadcasts whose hour falls in 02-05 and remove them from the working frame.
# One mask drives both steps, so the rows kept aside and the rows removed always match.
off_hours = df['방송시간대'].isin([2, 3, 4, 5])
df_out_time = df[off_hours]
df = df[~off_hours]

In [33]:
# Set aside the rows dated 2020 and keep only the remaining rows in the working frame.
in_2020 = df['연도'].eq(2020)
df_out_2020 = df.loc[in_2020]
df = df.loc[~in_2020]

In [34]:
# Set aside the year-end remainder ("week 53") and keep everything up to Dec 29.
# The split uses the calendar columns rather than a fixed row label, so it does
# not depend on row order or on how many rows earlier filters removed.
# NOTE(review): assumes the excluded "week 53" rows are exactly the Dec 30-31
# broadcasts, matching the dates dropped from the weekly template — confirm.
is_year_end = (df['월'] == 12) & (df['일'] >= 30)
df_out_53week = df.loc[is_year_end]
df = df.loc[~is_year_end]
len(df) + len(df_out_53week)

37298

In [35]:
# Renumber the index from zero after the row filtering done in the preceding cells.
df = df.reset_index(drop=True)
df

Unnamed: 0,연도,월,일,요일,방송시간대,주차,노출(분),상품군,마더코드,상품코드,상품명,판매단가,판매량,취급액
0,2019,1,1,화,6,1,20.0,의류,100346,201072,테이트 남성 셀린니트3종,39900,52.606516,2099000.0
1,2019,1,1,화,6,1,20.0,의류,100346,201079,테이트 여성 셀린니트3종,39900,109.548872,4371000.0
2,2019,1,1,화,6,1,20.0,의류,100346,201072,테이트 남성 셀린니트3종,39900,81.754386,3262000.0
3,2019,1,1,화,6,1,20.0,의류,100346,201079,테이트 여성 셀린니트3종,39900,174.310777,6955000.0
4,2019,1,1,화,6,1,20.0,의류,100346,201072,테이트 남성 셀린니트3종,39900,167.218045,6672000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37127,2019,12,29,일,23,52,20.0,생활용품,100182,200612,무이자 선일금고 이볼브 시리즈 EV-020,440000,0.000000,0.0
37128,2019,12,29,일,23,52,20.0,생활용품,100182,200615,일시불 선일금고 이볼브 시리즈 EV-040,450000,147.553333,66399000.0
37129,2019,12,29,일,23,52,20.0,생활용품,100182,200613,무이자 선일금고 이볼브 시리즈 EV-040,490000,34.891837,17097000.0
37130,2019,12,29,일,23,52,20.0,생활용품,100182,200614,일시불 선일금고 이볼브 시리즈 EV-020,400000,3.690000,1476000.0


In [36]:
# Sum sales volume and revenue per (year, month, day, hour, week) slot.
slot_keys = ['연도', '월', '일', '방송시간대', '주차']
df_grouped = df.groupby(slot_keys, as_index=False)[['판매량', '취급액']].sum()
df_grouped

Unnamed: 0,연도,월,일,방송시간대,주차,판매량,취급액
0,2019,1,1,6,1,819.448622,32696000.0
1,2019,1,1,7,1,811.491525,47878000.0
2,2019,1,1,8,1,1665.041736,99736000.0
3,2019,1,1,9,1,1151.556962,90973000.0
4,2019,1,1,10,1,3250.037547,259678000.0
...,...,...,...,...,...,...,...
7020,2019,12,29,18,52,1047.477639,58554000.0
7021,2019,12,29,20,52,1561.617978,138984000.0
7022,2019,12,29,21,52,2721.769187,194270000.0
7023,2019,12,29,22,52,1166.225995,135301000.0


In [37]:
# Template of every (year, month, day, hour) slot so each week ends up with 140 hourly slots.
years = df_grouped['연도'].unique()
months = df_grouped['월'].unique()
days = df_grouped['일'].unique()
# Keep first-appearance order (6, 7, ..., 23, 0, 1): hours 0 and 1 must stay after 23
# because they are relabelled 24 and 25 in a later cell. Sorting would put them first.
times = df_grouped['방송시간대'].unique()

# (month, day) pairs to leave out: dates that do not exist in 2019, plus Dec 30-31,
# which are excluded from the weekly data.
removal_list = [(2,29), (2,30), (2,31), (4, 31), (6, 31), (9, 31), (11, 31), (12, 30), (12, 31)]
excluded_days = set(removal_list)

# Skip excluded dates while generating the rows instead of filtering the frame row by row.
template = [
    [year, month, day, hour]
    for year in years
    for month in months
    for day in days
    if (month, day) not in excluded_days
    for hour in times
]

template_arr = np.array(template)
template_df = pd.DataFrame(data=template_arr, columns=['연도', '월', '일', '방송시간대'])
template_df # expected: 7260 rows

Unnamed: 0,연도,월,일,방송시간대
0,2019,1,1,6
1,2019,1,1,7
2,2019,1,1,8
3,2019,1,1,9
4,2019,1,1,10
...,...,...,...,...
7255,2019,12,29,21
7256,2019,12,29,22
7257,2019,12,29,23
7258,2019,12,29,0


In [38]:
# Left-join the aggregated values onto the template so slots without a broadcast are kept.
df_merged = template_df.merge(df_grouped, how='left', on=['연도', '월', '일', '방송시간대'])

# Slots with no broadcast have no week number after the join. Derive the ISO week
# from the slot's own date instead of forward-filling, so a missing first slot of a
# week cannot inherit the previous week's number.
slot_dates = pd.to_datetime(
    df_merged[['연도', '월', '일']].rename(columns={'연도': 'year', '월': 'month', '일': 'day'})
)
df_merged['주차'] = slot_dates.dt.isocalendar().week.astype(int)

# No broadcast in a slot means zero volume and zero revenue.
df_merged[['판매량', '취급액']] = df_merged[['판매량', '취급액']].fillna(0)
df_merged

Unnamed: 0,연도,월,일,방송시간대,주차,판매량,취급액
0,2019,1,1,6,1,819.448622,32696000.0
1,2019,1,1,7,1,811.491525,47878000.0
2,2019,1,1,8,1,1665.041736,99736000.0
3,2019,1,1,9,1,1151.556962,90973000.0
4,2019,1,1,10,1,3250.037547,259678000.0
...,...,...,...,...,...,...,...
7255,2019,12,29,21,52,2721.769187,194270000.0
7256,2019,12,29,22,52,1166.225995,135301000.0
7257,2019,12,29,23,52,881.585866,263222000.0
7258,2019,12,29,0,52,0.000000,0.0


In [39]:
# Relabel hours 0 and 1 as 24 and 25.
def change_time(x):
    """Return 24 for hour 0, 25 for hour 1, and any other value unchanged."""
    after_midnight = {0: 24, 1: 25}
    return after_midnight.get(x, x)
    
# Apply the hour relabelling element-wise to the hour column.
relabelled_hours = df_merged['방송시간대'].map(change_time)
df_merged['방송시간대'] = relabelled_hours
df_merged

Unnamed: 0,연도,월,일,방송시간대,주차,판매량,취급액
0,2019,1,1,6,1,819.448622,32696000.0
1,2019,1,1,7,1,811.491525,47878000.0
2,2019,1,1,8,1,1665.041736,99736000.0
3,2019,1,1,9,1,1151.556962,90973000.0
4,2019,1,1,10,1,3250.037547,259678000.0
...,...,...,...,...,...,...,...
7255,2019,12,29,21,52,2721.769187,194270000.0
7256,2019,12,29,22,52,1166.225995,135301000.0
7257,2019,12,29,23,52,881.585866,263222000.0
7258,2019,12,29,24,52,0.000000,0.0


In [40]:
# Split the merged frame into one DataFrame per week number.
# Group by the bare column name: grouping by a one-element list makes newer pandas
# yield 1-tuple keys such as (1,), which would break comparisons of the key with a
# plain week number (for example the week-1 exclusion in the following cells).
df_groups = {week: frame for week, frame in df_merged.groupby(by='주차')}

In [41]:
# Build the standardized weekly sales-volume matrix.
sales = {}

for week, week_frame in df_groups.items():
    if week == 1:  # week 1 is excluded
        continue

    # Standardize within the week. Fitting and transforming on the same plain array
    # avoids mixing a named DataFrame (fit) with an unnamed ndarray (transform).
    volumes = week_frame[['판매량']].to_numpy()
    sales[week] = StandardScaler().fit_transform(volumes).ravel()

# One row per week, one column per hourly slot within the week.
sales_df = pd.DataFrame(sales).T
sales_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,130,131,132,133,134,135,136,137,138,139
2,-0.759168,-0.663886,-0.832838,0.590958,-0.829561,-0.87453,-0.78521,0.492374,-0.448146,0.653373,...,4.036517,1.971292,-0.612913,0.420313,-0.501942,-0.60478,-0.662728,-1.160934,-0.267397,-0.968037
3,-0.71681,-0.318614,-0.760334,-0.979391,-0.4729,0.368097,0.444649,2.026833,-0.445678,0.631722,...,2.742545,2.569627,0.06784,-0.297052,-0.864901,-1.10127,-1.135857,-0.566349,-0.027926,-0.717389
4,-0.788975,-0.664699,-0.372493,-0.454469,-0.076522,0.745464,0.404171,0.46408,-0.385358,1.251082,...,-0.494904,1.894846,0.346784,-0.336913,-1.166666,-0.947413,-1.124866,-0.919776,-0.318678,-0.801817
5,0.08449,-0.289506,-0.616129,-0.709482,-0.210942,1.012127,0.730185,1.173891,1.28083,-0.212824,...,-0.097964,0.560325,0.080327,-0.856968,-0.535774,-1.105549,-1.066501,-0.922287,-0.661442,-0.414638
6,-0.719102,-0.134147,-0.413037,-0.352782,-0.858008,1.571464,-0.339917,0.553901,0.077609,0.212674,...,4.353374,1.235357,-0.463885,-0.272934,-0.943642,-0.980141,-0.958064,0.097084,-0.464565,-0.622494
7,-0.911891,0.559215,-0.118128,-0.342818,0.128657,-0.85598,0.612666,-0.264279,-0.08024,0.057729,...,3.314849,-0.240421,0.572746,-0.761255,-0.909681,-0.970947,-0.968233,0.088012,-0.919075,-0.559384
8,-0.429971,0.083277,-0.387118,0.511785,-0.648058,1.770025,-0.824898,-0.405408,0.957296,0.006391,...,2.90777,0.271496,1.108378,-1.11173,-0.22951,1.58275,-0.687994,-1.019931,-0.577979,-0.468011
9,-0.602921,-0.470502,-0.523997,0.251166,-0.95795,-0.566256,-0.908877,-0.631396,0.91134,0.110724,...,1.507872,1.968119,2.048064,-0.925003,-1.095496,-0.988337,-1.18265,-0.139908,-0.432248,-1.112055
10,-0.824766,0.24162,-0.680029,0.355462,0.739934,0.254283,0.546531,-0.01641,0.230906,-0.615303,...,0.226476,2.410771,-0.576363,-1.23246,-0.956493,-0.601805,-0.520109,0.32306,-1.248164,-0.735704
11,-1.280825,-0.042773,0.455527,-0.539284,-0.640723,0.304753,-1.071184,0.054658,-0.291791,-0.530014,...,1.604094,-0.051836,-0.496497,-0.585476,-0.591432,-0.558625,-0.482619,0.344971,-0.443294,-1.121384


In [42]:
# Build the standardized weekly revenue matrix.
revenues = {}

for week, week_frame in df_groups.items():
    if week == 1:  # week 1 is excluded
        continue

    # Standardize within the week. Fitting and transforming on the same plain array
    # avoids mixing a named DataFrame (fit) with an unnamed ndarray (transform).
    amounts = week_frame[['취급액']].to_numpy()
    revenues[week] = StandardScaler().fit_transform(amounts).ravel()

# One row per week, one column per hourly slot within the week.
revenue_df = pd.DataFrame(revenues).T
revenue_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,130,131,132,133,134,135,136,137,138,139
2,-1.858257,-1.161755,-1.49233,-0.517179,-1.155669,-0.115297,0.192553,-0.128742,-0.10962,0.556364,...,2.892405,0.843473,0.232566,3.311446,-0.653666,-0.548841,0.505948,-0.397614,0.611291,-1.352348
3,-1.610283,-0.927586,-0.403815,-0.991441,-0.27173,0.033633,0.222587,0.206811,-0.565078,0.449942,...,2.555257,1.525194,-0.060309,-0.896331,-0.492235,0.856512,0.801187,0.38951,-0.50623,-1.037265
4,-1.475594,-0.649904,-1.16377,-0.258047,-0.47791,0.851377,0.357989,-0.306834,-0.14331,-0.242623,...,0.787486,1.848402,0.182558,0.516678,0.390354,1.050183,-1.324157,0.903921,-0.379136,-1.489079
5,-1.34976,-0.700597,-1.322811,-0.104201,-0.03115,1.245112,0.782332,0.876293,-0.174967,0.122461,...,0.242746,0.405933,-0.335339,1.497232,-0.328337,0.538345,0.887073,0.266928,-1.272016,-1.366122
6,-1.560801,-1.301299,-0.978414,-0.404154,-0.816914,0.209886,-0.862491,-0.334623,-0.485204,-0.397346,...,2.690073,0.477999,0.364598,1.537343,-1.387683,0.098152,0.083839,0.184052,-1.07019,-1.360123
7,-1.50659,-0.991263,-0.795367,-0.93259,-0.236862,-0.569564,0.40603,-0.580281,-1.229507,-0.793291,...,1.487193,0.115322,0.300273,1.475729,0.293834,0.486137,1.159309,0.423953,-1.058867,-1.070456
8,-1.643825,-1.07988,-1.144471,-0.625282,-0.750782,0.317073,-0.149905,-1.155355,1.017195,-0.497273,...,1.916589,-0.075051,0.070146,1.881659,-0.089134,0.510892,0.600587,0.776297,-0.544422,-1.47732
9,-1.577805,-0.920522,-1.134224,-1.088677,-1.01615,-0.624509,-0.344345,-0.47449,-0.019839,-0.843562,...,1.307574,0.31833,1.149692,-0.095757,-0.311053,1.862632,0.683828,1.760264,1.992757,-1.584525
10,-1.360251,-0.949793,-0.668616,0.133466,-0.265727,-0.369581,0.406522,-0.368585,-0.715,-0.663482,...,0.972926,0.539144,-1.198197,-1.607946,-0.823611,2.266061,2.003473,3.223227,-2.158262,-1.671657
11,-1.659877,-1.062423,-0.362079,-0.956159,-0.071259,-0.177584,-0.745706,-0.434879,-0.613704,-0.611408,...,0.184658,0.217724,-1.057372,-0.704894,-0.413035,1.28039,1.501365,0.527552,-0.282491,-1.662533


In [None]:
# Write the weekly matrices to CSV without the index, encoded as UTF-8 with BOM.
# NOTE(review): the file names say "logscaled", but the frames in this section are
# produced with StandardScaler — confirm the intended names before renaming, since
# other code may read these paths.
weekly_outputs = {
    "./data/weekly_sales_logscaled.csv": sales_df,
    "./data/weekly_revenue_logscaled.csv": revenue_df,
}
for csv_path, frame in weekly_outputs.items():
    frame.to_csv(csv_path, index=False, encoding='utf-8-sig')

## 2. Daily Data

- 방송편성대 = 20시간
- 365*20 = 7300
- 제외한 데이터: `df_out_time`, `df_out_2020`
    - 새벽 2시에 시작하는 방송
    - 2020년 데이터
- 행 방향(→) : 시간(06시부터 25시까지)
- 열 방향(↓) : 일자(0일차~364일차)    
- 변환 format: standard scaling

In [8]:
# Keep only the columns needed for the daily data set, in display order.
daily_columns = [
    '연도', '월', '일', '요일', '방송시간대', '노출(분)',
    '상품군', '마더코드', '상품코드', '상품명', '판매단가', '판매량', '취급액',
]
df = data[daily_columns]
df

Unnamed: 0,연도,월,일,요일,방송시간대,노출(분),상품군,마더코드,상품코드,상품명,판매단가,판매량,취급액
0,2019,1,1,화,6,20.0,의류,100346,201072,테이트 남성 셀린니트3종,39900,52.606516,2099000.0
1,2019,1,1,화,6,20.0,의류,100346,201079,테이트 여성 셀린니트3종,39900,109.548872,4371000.0
2,2019,1,1,화,6,20.0,의류,100346,201072,테이트 남성 셀린니트3종,39900,81.754386,3262000.0
3,2019,1,1,화,6,20.0,의류,100346,201079,테이트 여성 셀린니트3종,39900,174.310777,6955000.0
4,2019,1,1,화,6,20.0,의류,100346,201072,테이트 남성 셀린니트3종,39900,167.218045,6672000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
37367,2019,12,31,화,23,20.0,주방,100448,201391,일시불쿠첸압력밥솥 6인용,148000,68.628378,10157000.0
37368,2020,1,1,수,0,20.0,주방,100448,201383,무이자쿠첸압력밥솥 10인용,178000,286.117978,50929000.0
37369,2020,1,1,수,0,20.0,주방,100448,201390,일시불쿠첸압력밥솥 10인용,168000,621.380952,104392000.0
37370,2020,1,1,수,0,20.0,주방,100448,201384,무이자쿠첸압력밥솥 6인용,158000,87.120253,13765000.0


In [9]:
# Set aside broadcasts whose hour falls in 02-05 and remove them from the working frame.
# One mask drives both steps, so the rows kept aside and the rows removed always match.
off_hours = df['방송시간대'].isin([2, 3, 4, 5])
df_out_time = df[off_hours]
df = df[~off_hours]

In [10]:
# Set aside the rows dated 2020 and keep only the remaining rows in the working frame.
in_2020 = df['연도'].eq(2020)
df_out_2020 = df.loc[in_2020]
df = df.loc[~in_2020]

In [11]:
# Renumber the index from zero after the row filtering done in the preceding cells.
df = df.reset_index(drop=True)
df

Unnamed: 0,연도,월,일,요일,방송시간대,노출(분),상품군,마더코드,상품코드,상품명,판매단가,판매량,취급액
0,2019,1,1,화,6,20.0,의류,100346,201072,테이트 남성 셀린니트3종,39900,52.606516,2099000.0
1,2019,1,1,화,6,20.0,의류,100346,201079,테이트 여성 셀린니트3종,39900,109.548872,4371000.0
2,2019,1,1,화,6,20.0,의류,100346,201072,테이트 남성 셀린니트3종,39900,81.754386,3262000.0
3,2019,1,1,화,6,20.0,의류,100346,201079,테이트 여성 셀린니트3종,39900,174.310777,6955000.0
4,2019,1,1,화,6,20.0,의류,100346,201072,테이트 남성 셀린니트3종,39900,167.218045,6672000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
37293,2019,12,31,화,23,20.0,주방,100448,201391,일시불쿠첸압력밥솥 6인용,148000,11.243243,1664000.0
37294,2019,12,31,화,23,20.0,주방,100448,201383,무이자쿠첸압력밥솥 10인용,178000,51.398876,9149000.0
37295,2019,12,31,화,23,20.0,주방,100448,201390,일시불쿠첸압력밥솥 10인용,168000,90.964286,15282000.0
37296,2019,12,31,화,23,20.0,주방,100448,201384,무이자쿠첸압력밥솥 6인용,158000,14.734177,2328000.0


In [12]:
# Sum sales volume and revenue per (year, month, day, hour) slot.
slot_keys = ['연도', '월', '일', '방송시간대']
df_grouped = df.groupby(slot_keys, as_index=False)[['판매량', '취급액']].sum()
df_grouped

Unnamed: 0,연도,월,일,방송시간대,판매량,취급액
0,2019,1,1,6,819.448622,32696000.0
1,2019,1,1,7,811.491525,47878000.0
2,2019,1,1,8,1665.041736,99736000.0
3,2019,1,1,9,1151.556962,90973000.0
4,2019,1,1,10,3250.037547,259678000.0
...,...,...,...,...,...,...
7060,2019,12,31,19,1716.880753,177729000.0
7061,2019,12,31,20,432.121912,143410000.0
7062,2019,12,31,21,496.086853,219941000.0
7063,2019,12,31,22,74.989813,104917000.0


In [13]:
# Template of every (year, month, day, hour) slot so each day has the same set of hourly slots.
years = df_grouped['연도'].unique()
months = df_grouped['월'].unique()
days = df_grouped['일'].unique()
# First-appearance order is kept on purpose (hours 0 and 1 come after 23).
times = df_grouped['방송시간대'].unique()

# (month, day) pairs that do not exist in 2019 and must not appear in the template.
removal_list = [(2,29), (2,30), (2,31), (4, 31), (6, 31), (9, 31), (11, 31)]
invalid_days = set(removal_list)

template = [
    [year, month, day, hour]
    for year in years
    for month in months
    for day in days
    if (month, day) not in invalid_days
    for hour in times
]

template_arr = np.array(template)
template_df = pd.DataFrame(data=template_arr, columns=['연도', '월', '일', '방송시간대'])
template_df # expected: 7300 rows

Unnamed: 0,연도,월,일,방송시간대
0,2019,1,1,6
1,2019,1,1,7
2,2019,1,1,8
3,2019,1,1,9
4,2019,1,1,10
...,...,...,...,...
7295,2019,12,31,21
7296,2019,12,31,22
7297,2019,12,31,23
7298,2019,12,31,0


In [14]:
# Left-join the aggregated values onto the template; slots without a broadcast get zeros.
merge_keys = ['연도', '월', '일', '방송시간대']
df_merged = pd.merge(template_df, df_grouped, on=merge_keys, how='left')
df_merged = df_merged.fillna({'판매량': 0, '취급액': 0})
df_merged

Unnamed: 0,연도,월,일,방송시간대,판매량,취급액
0,2019,1,1,6,819.448622,32696000.0
1,2019,1,1,7,811.491525,47878000.0
2,2019,1,1,8,1665.041736,99736000.0
3,2019,1,1,9,1151.556962,90973000.0
4,2019,1,1,10,3250.037547,259678000.0
...,...,...,...,...,...,...
7295,2019,12,31,21,496.086853,219941000.0
7296,2019,12,31,22,74.989813,104917000.0
7297,2019,12,31,23,320.923309,52902000.0
7298,2019,12,31,0,1862.885367,108945000.0


In [15]:
# Relabel hours 0 and 1 as 24 and 25.
def change_time(x):
    """Return 24 for hour 0, 25 for hour 1, and any other value unchanged."""
    after_midnight = {0: 24, 1: 25}
    return after_midnight.get(x, x)
    
# Apply the hour relabelling element-wise to the hour column.
relabelled_hours = df_merged['방송시간대'].map(change_time)
df_merged['방송시간대'] = relabelled_hours
df_merged

Unnamed: 0,연도,월,일,방송시간대,판매량,취급액
0,2019,1,1,6,819.448622,32696000.0
1,2019,1,1,7,811.491525,47878000.0
2,2019,1,1,8,1665.041736,99736000.0
3,2019,1,1,9,1151.556962,90973000.0
4,2019,1,1,10,3250.037547,259678000.0
...,...,...,...,...,...,...
7295,2019,12,31,21,496.086853,219941000.0
7296,2019,12,31,22,74.989813,104917000.0
7297,2019,12,31,23,320.923309,52902000.0
7298,2019,12,31,24,1862.885367,108945000.0


In [16]:
# Build a 'YYYYMMDD' text key for each row from the year, month and day columns.
year_text = df_merged['연도'].astype(str)
month_text = df_merged['월'].astype(str).str.zfill(2)
day_text = df_merged['일'].astype(str).str.zfill(2)
df_merged['방송날짜'] = year_text + month_text + day_text
df_merged

Unnamed: 0,연도,월,일,방송시간대,판매량,취급액,방송날짜
0,2019,1,1,6,819.448622,32696000.0,20190101
1,2019,1,1,7,811.491525,47878000.0,20190101
2,2019,1,1,8,1665.041736,99736000.0,20190101
3,2019,1,1,9,1151.556962,90973000.0,20190101
4,2019,1,1,10,3250.037547,259678000.0,20190101
...,...,...,...,...,...,...,...
7295,2019,12,31,21,496.086853,219941000.0,20191231
7296,2019,12,31,22,74.989813,104917000.0,20191231
7297,2019,12,31,23,320.923309,52902000.0,20191231
7298,2019,12,31,24,1862.885367,108945000.0,20191231


In [17]:
# Split the merged frame into one DataFrame per broadcast date.
df_groups = {date_key: frame for date_key, frame in df_merged.groupby(by='방송날짜')}

In [18]:
# Build the standardized daily sales-volume matrix.
sales = {}

for date_key, day_frame in df_groups.items():
    # Standardize within the day. Fitting and transforming on the same plain array
    # avoids mixing a named DataFrame (fit) with an unnamed ndarray (transform).
    volumes = day_frame[['판매량']].to_numpy()
    sales[date_key] = StandardScaler().fit_transform(volumes).ravel()

# One row per broadcast date, one column per hourly slot.
sales_df = pd.DataFrame(sales).T
sales_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
20190101,-0.558182,-0.562637,-0.084719,-0.372228,0.802748,-0.421362,0.749549,-0.444009,-0.452712,-0.22469,1.730645,2.117915,2.441504,-0.120295,-0.670204,-0.843403,-0.800096,-0.253817,-1.017005,-1.017005
20190102,-0.213012,-0.346151,-0.245761,0.368964,-0.393492,-0.95589,-0.761909,1.728712,-0.558498,3.100559,0.873803,0.642012,-0.262427,0.617367,0.005135,-1.169037,-1.164379,-0.082547,-0.491452,-0.691998
20190103,-0.86968,0.871526,-0.510426,0.630176,-0.543497,-0.38119,-0.11772,1.954528,-0.357669,-0.846287,1.085284,-0.1484,2.843755,-0.382076,-0.183493,-1.607612,-0.510095,-0.297812,-0.080347,-0.548964
20190104,-0.479885,0.041707,-0.69468,-0.952801,-0.746212,-0.6255,2.17986,-0.574205,-0.027896,0.771735,1.706309,-0.235234,0.719825,-0.979067,2.174583,-0.087354,-0.043041,-1.064166,-1.193844,0.109866
20190105,-0.747634,-0.797698,1.030358,0.457934,-0.48083,0.719357,0.062945,0.939764,-1.042723,-0.611608,1.004629,3.292061,-0.282357,-0.394599,-0.496391,-1.1749,-0.16385,-0.229322,-0.370975,-0.714158
20190106,-0.968722,-0.017209,-0.247795,-0.635352,0.053421,-1.056837,-0.583672,1.10788,1.824395,0.476621,1.525028,2.148267,-0.403068,-0.535641,-0.730014,-1.082463,-0.567751,-1.147302,1.215842,-0.375629
20190107,-0.60478,-0.518526,-0.67147,0.617423,-0.668504,-0.709212,-0.628354,0.528179,-0.323228,0.673924,2.299553,0.566145,2.927538,-0.408623,-0.116123,-0.751055,-0.192543,-0.813861,-0.592649,-0.613835
20190108,-0.833352,-0.30257,-0.639746,-0.457893,-0.724033,0.286294,1.797589,1.231454,0.712332,0.49723,0.047487,0.426625,-0.633859,-0.386222,2.804021,-1.314125,-1.246544,-0.25921,-0.370141,-0.635337
20190109,-1.061694,-0.555606,0.030383,-1.074708,-0.297293,-0.704363,0.656778,0.076259,1.844526,-0.45513,1.02349,0.914026,2.452729,-1.104663,-0.564571,-0.057854,-1.231899,0.676523,0.491097,-1.05803
20190110,-1.217927,-0.405097,0.170412,-0.892191,-0.906387,0.131526,0.187641,-0.454012,2.043208,-0.278065,1.447662,1.05077,0.185661,-0.65262,-0.373525,1.864279,-1.255972,-1.147131,1.22576,-0.723992


In [19]:
# Build the standardized daily revenue matrix.
revenues = {}

for date_key, day_frame in df_groups.items():
    # Standardize within the day. Fitting and transforming on the same plain array
    # avoids mixing a named DataFrame (fit) with an unnamed ndarray (transform).
    amounts = day_frame[['취급액']].to_numpy()
    revenues[date_key] = StandardScaler().fit_transform(amounts).ravel()

# One row per broadcast date, one column per hourly slot.
revenue_df = pd.DataFrame(revenues).T
revenue_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
20190101,-1.206935,-1.028937,-0.420937,-0.523678,1.454273,-0.718454,0.662448,-0.282473,-0.184177,-0.579907,1.459654,1.022325,1.299242,0.644134,1.746161,0.223654,-0.232235,-0.153611,-1.590273,-1.590273
20190102,-1.86951,-0.847307,-0.945148,-0.757224,1.546495,-0.12584,0.695929,0.354832,-0.437825,1.564644,-0.718122,0.848587,0.014366,0.799609,0.778627,0.252355,1.453864,0.17229,-1.157775,-1.622847
20190103,-1.600036,-0.447079,-0.577038,-0.651282,-0.004346,-0.399657,-0.318037,0.297276,0.366631,0.986877,0.679122,-0.5773,0.999556,-0.1668,2.826008,1.590362,-0.345687,-0.511546,-1.106934,-1.040089
20190104,-0.963338,-0.762117,-1.219965,-0.349518,0.995123,-0.537472,0.709283,-0.315443,-0.468089,0.62885,0.398295,0.263655,1.185785,2.055063,1.814503,0.312352,-0.009964,-0.699507,-1.387849,-1.649649
20190105,-1.574757,-1.335667,-0.185116,0.038847,-0.800615,0.407108,0.429081,0.736881,0.256748,-0.588709,0.826212,1.743735,-0.875403,0.863882,-0.65802,2.353641,0.69442,-0.601883,-1.184539,-0.545846
20190106,-1.776913,-1.248806,-0.00454,-1.163255,-0.444712,-0.33504,0.589658,-0.027595,0.74151,0.234536,1.284498,2.052412,-0.407381,1.477694,0.857892,0.605209,-0.998904,0.01331,0.082621,-1.532194
20190107,-1.693193,-1.004195,-1.331208,-0.366565,-0.998174,0.030988,0.33552,0.017687,0.036603,0.695412,0.633541,0.513043,2.132674,1.202361,0.114593,1.812535,-0.048091,-0.105415,-0.399059,-1.579057
20190108,-1.634901,-1.310954,-0.922246,-0.621977,0.924494,-0.079299,0.035631,0.033933,0.566641,0.232036,0.603125,0.139108,0.438905,-0.270784,2.340371,2.15965,-0.405503,-0.254175,-0.699971,-1.274083
20190109,-1.954276,-1.091273,-1.181263,-0.532077,-0.287994,0.401954,0.17675,-0.26549,0.220487,0.193141,1.300119,1.190221,0.558563,1.338798,1.744332,0.645469,-1.146398,0.726641,-0.637863,-1.39984
20190110,-1.525131,-1.423211,-0.822521,-1.087049,0.282594,0.271852,0.049379,-0.292394,0.331125,-0.95117,0.584045,0.789892,0.001759,0.50208,-0.358645,1.788754,2.491957,0.351833,0.27224,-1.257388


In [20]:
# Write the daily matrices to CSV without the index, encoded as UTF-8 with BOM.
daily_outputs = {
    "./data/daily_sales_stdscaled.csv": sales_df,
    "./data/daily_revenue_stdscaled.csv": revenue_df,
}
for csv_path, frame in daily_outputs.items():
    frame.to_csv(csv_path, index=False, encoding='utf-8-sig')

## 3. 일주일-시간대별로
- 2019년 1월 1일: 화요일
- 2020년 1월 1일: 수요일

In [8]:
# Relabel the after-midnight hours 0, 1 and 2 as 24, 25 and 26.
def change_time(x):
    """Return 24, 25 or 26 for hours 0, 1 or 2, and any other value unchanged."""
    after_midnight = {0: 24, 1: 25, 2: 26}
    return after_midnight.get(x, x)
    
# Apply the hour relabelling element-wise to the hour column.
relabelled_hours = data['방송시간대'].map(change_time)
data['방송시간대'] = relabelled_hours
data

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,연도,월,일,방송시간대,요일,주차,판매량
0,2019-01-01 06:00:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,2099000.0,2019,1,1,6,화,1,52.606516
1,2019-01-01 06:00:00,20.0,100346,201079,테이트 여성 셀린니트3종,의류,39900,4371000.0,2019,1,1,6,화,1,109.548872
2,2019-01-01 06:20:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,3262000.0,2019,1,1,6,화,1,81.754386
3,2019-01-01 06:20:00,20.0,100346,201079,테이트 여성 셀린니트3종,의류,39900,6955000.0,2019,1,1,6,화,1,174.310777
4,2019-01-01 06:40:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,6672000.0,2019,1,1,6,화,1,167.218045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37367,2019-12-31 23:40:00,20.0,100448,201391,일시불쿠첸압력밥솥 6인용,주방,148000,10157000.0,2019,12,31,23,화,1,68.628378
37368,2020-01-01 00:00:00,20.0,100448,201383,무이자쿠첸압력밥솥 10인용,주방,178000,50929000.0,2020,1,1,24,수,1,286.117978
37369,2020-01-01 00:00:00,20.0,100448,201390,일시불쿠첸압력밥솥 10인용,주방,168000,104392000.0,2020,1,1,24,수,1,621.380952
37370,2020-01-01 00:00:00,20.0,100448,201384,무이자쿠첸압력밥솥 6인용,주방,158000,13765000.0,2020,1,1,24,수,1,87.120253


In [9]:
# Drop the rows relabelled as hour 26 and renumber the index from zero.
keep_rows = data['방송시간대'].ne(26)
data = data.loc[keep_rows].reset_index(drop=True)
data

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,연도,월,일,방송시간대,요일,주차,판매량
0,2019-01-01 06:00:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,2099000.0,2019,1,1,6,화,1,52.606516
1,2019-01-01 06:00:00,20.0,100346,201079,테이트 여성 셀린니트3종,의류,39900,4371000.0,2019,1,1,6,화,1,109.548872
2,2019-01-01 06:20:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,3262000.0,2019,1,1,6,화,1,81.754386
3,2019-01-01 06:20:00,20.0,100346,201079,테이트 여성 셀린니트3종,의류,39900,6955000.0,2019,1,1,6,화,1,174.310777
4,2019-01-01 06:40:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,6672000.0,2019,1,1,6,화,1,167.218045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37297,2019-12-31 23:40:00,20.0,100448,201391,일시불쿠첸압력밥솥 6인용,주방,148000,10157000.0,2019,12,31,23,화,1,68.628378
37298,2020-01-01 00:00:00,20.0,100448,201383,무이자쿠첸압력밥솥 10인용,주방,178000,50929000.0,2020,1,1,24,수,1,286.117978
37299,2020-01-01 00:00:00,20.0,100448,201390,일시불쿠첸압력밥솥 10인용,주방,168000,104392000.0,2020,1,1,24,수,1,621.380952
37300,2020-01-01 00:00:00,20.0,100448,201384,무이자쿠첸압력밥솥 6인용,주방,158000,13765000.0,2020,1,1,24,수,1,87.120253


In [10]:
# Add a (weekday, two-digit hour) label and a 'YYYYMMDD' date key to every row.
hour_text = data['방송시간대'].astype(str).str.zfill(2)
data['일별방송시간대'] = list(zip(data['요일'], hour_text))

year_text = data['연도'].astype(str)
month_text = data['월'].astype(str).str.zfill(2)
day_text = data['일'].astype(str).str.zfill(2)
data['방송일자'] = year_text + month_text + day_text
data

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액,연도,월,일,방송시간대,요일,주차,판매량,일별방송시간대,방송일자
0,2019-01-01 06:00:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,2099000.0,2019,1,1,6,화,1,52.606516,"(화, 06)",20190101
1,2019-01-01 06:00:00,20.0,100346,201079,테이트 여성 셀린니트3종,의류,39900,4371000.0,2019,1,1,6,화,1,109.548872,"(화, 06)",20190101
2,2019-01-01 06:20:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,3262000.0,2019,1,1,6,화,1,81.754386,"(화, 06)",20190101
3,2019-01-01 06:20:00,20.0,100346,201079,테이트 여성 셀린니트3종,의류,39900,6955000.0,2019,1,1,6,화,1,174.310777,"(화, 06)",20190101
4,2019-01-01 06:40:00,20.0,100346,201072,테이트 남성 셀린니트3종,의류,39900,6672000.0,2019,1,1,6,화,1,167.218045,"(화, 06)",20190101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37297,2019-12-31 23:40:00,20.0,100448,201391,일시불쿠첸압력밥솥 6인용,주방,148000,10157000.0,2019,12,31,23,화,1,68.628378,"(화, 23)",20191231
37298,2020-01-01 00:00:00,20.0,100448,201383,무이자쿠첸압력밥솥 10인용,주방,178000,50929000.0,2020,1,1,24,수,1,286.117978,"(수, 24)",20200101
37299,2020-01-01 00:00:00,20.0,100448,201390,일시불쿠첸압력밥솥 10인용,주방,168000,104392000.0,2020,1,1,24,수,1,621.380952,"(수, 24)",20200101
37300,2020-01-01 00:00:00,20.0,100448,201384,무이자쿠첸압력밥솥 6인용,주방,158000,13765000.0,2020,1,1,24,수,1,87.120253,"(수, 24)",20200101


In [11]:
# Keep only the date key, hour, sales volume and revenue columns.
used_columns = ['방송일자', '방송시간대', '판매량', '취급액']
df = data.loc[:, used_columns]
df

Unnamed: 0,방송일자,방송시간대,판매량,취급액
0,20190101,6,52.606516,2099000.0
1,20190101,6,109.548872,4371000.0
2,20190101,6,81.754386,3262000.0
3,20190101,6,174.310777,6955000.0
4,20190101,6,167.218045,6672000.0
...,...,...,...,...
37297,20191231,23,68.628378,10157000.0
37298,20200101,24,286.117978,50929000.0
37299,20200101,24,621.380952,104392000.0
37300,20200101,24,87.120253,13765000.0


In [12]:
# Sum sales volume and revenue per (date, hour) slot.
slot_keys = ['방송일자', '방송시간대']
df_grouped = df.groupby(slot_keys, as_index=False)[['판매량', '취급액']].sum()
df_grouped

Unnamed: 0,방송일자,방송시간대,판매량,취급액
0,20190101,6,819.448622,32696000.0
1,20190101,7,811.491525,47878000.0
2,20190101,8,1665.041736,99736000.0
3,20190101,9,1151.556962,90973000.0
4,20190101,10,3250.037547,259678000.0
...,...,...,...,...
7061,20191231,22,74.989813,104917000.0
7062,20191231,23,320.923309,52902000.0
7063,20191231,24,1862.885367,108945000.0
7064,20191231,25,932.486457,55471000.0


In [13]:
# Template of every (date, hour) slot so each week has 140 hourly slots; the partial
# first week and the year-end days are left out.
dates = df_grouped['방송일자'].unique()
times = df['방송시간대'].unique() # 6, 7, ..., 23, 24, 25

# Dates to leave out: Jan 1-5, Dec 29-31 and 2020-01-01.
removal_list = ['20190101', '20190102', '20190103', '20190104', '20190105', '20191229', '20191230', '20191231', '20200101']

template = [[date, hour] for date in dates for hour in times]

# np.array stores the mixed text/number rows as text, so 방송시간대 is a string
# column in the template; the merge in the next cell relies on that.
template_arr = np.array(template)
template_df = pd.DataFrame(data=template_arr, columns=['방송일자', '방송시간대'])

keep_rows = ~template_df['방송일자'].isin(removal_list)
template_df = template_df.loc[keep_rows].reset_index(drop=True)
template_df # expected: 7140 rows

Unnamed: 0,방송일자,방송시간대
0,20190106,6
1,20190106,7
2,20190106,8
3,20190106,9
4,20190106,10
...,...,...
7135,20191228,21
7136,20191228,22
7137,20191228,23
7138,20191228,24


In [31]:
# Left-join the aggregated figures onto the template; slots with no
# broadcast get 0 for both sales volume and revenue.
merge_keys = ['방송일자', '방송시간대']
for key in merge_keys:
    # The template stores its keys as strings, so cast to match before merging.
    df_grouped[key] = df_grouped[key].astype(str)

df_merged = template_df.merge(df_grouped, how='left', on=merge_keys)
df_merged = df_merged.fillna({'판매량': 0, '취급액': 0})
df_merged

Unnamed: 0,방송일자,방송시간대,판매량,취급액
0,20190106,6,491.775281,43768000.0
1,20190106,7,1730.175439,69034000.0
2,20190106,8,1430.066741,128563000.0
3,20190106,9,925.658228,73127000.0
4,20190106,10,1822.101695,107504000.0
...,...,...,...,...
7135,20191228,21,1074.370884,187574000.0
7136,20191228,22,3424.625392,218523000.0
7137,20191228,23,2173.607214,108463000.0
7138,20191228,24,0.000000,0.0


In [32]:
# Tag every row with its weekday and a (weekday, zero-padded hour) slot key.
# 방송일자 holds compact 'YYYYMMDD' strings (e.g. '20190106'), so the format
# must not contain separators; '%Y-%m-%d' only parsed through lenient ISO
# handling and is rejected by stricter pandas versions.
df_merged['방송일자'] = pd.to_datetime(df_merged['방송일자'], format='%Y%m%d')
# day_mapping_dict is defined in an earlier cell; presumably it maps the
# 0-6 weekday number to a Korean day name -- verify against its definition.
df_merged['요일'] = df_merged['방송일자'].dt.weekday.map(day_mapping_dict)
df_merged['일별 방송시간대'] = list(zip(df_merged['요일'], df_merged['방송시간대'].astype(str).str.zfill(2)))
df_merged

Unnamed: 0,방송일자,방송시간대,판매량,취급액,요일,일별 방송시간대
0,2019-01-06,6,491.775281,43768000.0,일,"(일, 06)"
1,2019-01-06,7,1730.175439,69034000.0,일,"(일, 07)"
2,2019-01-06,8,1430.066741,128563000.0,일,"(일, 08)"
3,2019-01-06,9,925.658228,73127000.0,일,"(일, 09)"
4,2019-01-06,10,1822.101695,107504000.0,일,"(일, 10)"
...,...,...,...,...,...,...
7135,2019-12-28,21,1074.370884,187574000.0,토,"(토, 21)"
7136,2019-12-28,22,3424.625392,218523000.0,토,"(토, 22)"
7137,2019-12-28,23,2173.607214,108463000.0,토,"(토, 23)"
7138,2019-12-28,24,0.000000,0.0,토,"(토, 24)"


In [33]:
# Split the merged frame into one sub-frame per (weekday, hour) slot.
# Group by the scalar column name rather than a one-element list: with a
# length-1 list, pandas >= 2.0 yields 1-tuple keys such as (('월', '06'),),
# which would break the two-level (weekday, hour) index built from these
# keys in the following cells.
df_groups = {slot: slot_df for slot, slot_df in df_merged.groupby(by='일별 방송시간대')}

In [53]:
# Build the standardized sales-volume table: one row per (weekday, hour)
# slot, one column per occurrence of that slot (in df_merged row order).
sales = {}

for slot, slot_df in df_groups.items():
    # Standardize each slot independently (zero mean, unit variance).
    # fit_transform on the same DataFrame avoids fitting with feature names
    # and then transforming a bare ndarray, which triggers scikit-learn's
    # feature-name mismatch warning.
    sales[slot] = StandardScaler().fit_transform(slot_df[['판매량']]).ravel()

# The tuple keys become a two-level index after transposing; expose the
# levels as the weekday and hour columns.
sales_df = (
    pd.DataFrame(sales).T
    .reset_index()
    .rename(columns={'level_0':'요일', 'level_1':'시간'})
)
# Temporary numeric key so weekdays sort Monday -> Sunday instead of
# by string order.
sales_df['정렬용'] = sales_df['요일'].map({'월':0, '화':1, '수':2, '목':3, '금':4, '토':5, '일':6})
sales_df = (
    sales_df.sort_values(by=['정렬용', '시간'])
    .drop('정렬용', axis=1)
    .reset_index(drop=True)
)
sales_df

Unnamed: 0,요일,시간,0,1,2,3,4,5,6,7,...,41,42,43,44,45,46,47,48,49,50
0,월,6,-0.501494,-0.498026,-0.552724,1.548855,-1.015856,-1.469222,0.364886,0.091875,...,1.253368,1.133067,-0.735596,-1.814096,-0.877551,0.250926,-1.394221,-0.104398,-0.842608,-0.708178
1,월,7,-0.905677,-0.435702,-0.862788,-0.544783,-0.463684,0.75524,0.126222,-0.538162,...,0.005236,-0.561854,-0.654896,-1.11628,-0.299117,0.407116,0.009082,1.53333,-0.623362,-1.197115
2,월,8,-1.111913,-1.063972,-0.210624,-0.943639,-0.775507,-0.105609,-0.405546,-0.49282,...,-0.905323,1.045554,0.125351,0.009159,-0.05413,0.307653,-0.478871,1.775539,0.454512,-1.442079
3,월,9,0.751986,-1.479577,-0.593859,-1.18799,-0.842017,-0.713728,0.616996,0.474184,...,0.001791,0.664309,0.14471,-0.358999,1.66761,-0.854077,-0.368623,2.090713,-0.355218,-0.117251
4,월,10,-0.989211,-0.634619,-0.053929,-0.457798,-1.255416,-0.065025,-0.824285,-1.109483,...,0.445944,1.257533,0.468697,0.310618,0.088864,0.492092,-0.366601,1.641463,-0.718739,0.588499
5,월,11,-1.320053,-0.063759,0.523721,0.40672,0.881211,-1.480641,1.28061,-0.961174,...,-0.429573,-0.602931,0.166989,1.071745,1.536132,0.566753,-0.822674,-0.086872,0.94891,0.889797
6,월,12,-0.959382,0.552757,0.695971,0.692264,-0.68948,0.650372,-1.040993,-1.082293,...,-0.915367,-1.543899,0.66762,1.638824,1.165758,-0.587452,0.854983,-1.202653,0.296408,0.012165
7,월,13,0.743122,3.052472,0.985944,1.490923,0.526808,-0.479766,-0.531946,-0.741342,...,-0.781037,1.749295,1.379389,-0.084403,0.593623,-0.82337,0.016561,-0.640489,0.779235,1.226478
8,월,14,-0.706255,-0.733133,-0.531147,1.329051,-0.312916,-0.379188,1.074606,1.296992,...,0.957112,1.368717,-0.155497,-1.029803,2.385256,0.539402,-0.526736,0.35218,0.445674,-0.067264
9,월,15,0.359242,0.384641,1.286089,-0.633337,-0.303191,-0.340193,-0.313631,-0.062343,...,1.106422,-0.204728,-0.732719,-0.217263,0.208295,-0.842099,-0.509572,-0.5629,-0.410003,-1.36998


In [54]:
# Build the standardized revenue table: one row per (weekday, hour) slot,
# one column per occurrence of that slot (in df_merged row order).
revenues = {}

for slot, slot_df in df_groups.items():
    # Standardize each slot independently (zero mean, unit variance).
    # fit_transform on the same DataFrame avoids fitting with feature names
    # and then transforming a bare ndarray, which triggers scikit-learn's
    # feature-name mismatch warning.
    revenues[slot] = StandardScaler().fit_transform(slot_df[['취급액']]).ravel()

# The tuple keys become a two-level index after transposing; expose the
# levels as the weekday and hour columns.
revenues_df = (
    pd.DataFrame(revenues).T
    .reset_index()
    .rename(columns={'level_0':'요일', 'level_1':'시간'})
)
# Temporary numeric key so weekdays sort Monday -> Sunday instead of
# by string order.
revenues_df['정렬용'] = revenues_df['요일'].map({'월':0, '화':1, '수':2, '목':3, '금':4, '토':5, '일':6})
revenues_df = (
    revenues_df.sort_values(by=['정렬용', '시간'])
    .drop('정렬용', axis=1)
    .reset_index(drop=True)
)
revenues_df

Unnamed: 0,요일,시간,0,1,2,3,4,5,6,7,...,41,42,43,44,45,46,47,48,49,50
0,월,6,-1.340532,-0.873348,-0.267955,-0.073766,-0.963535,-0.477866,-0.447409,-0.679751,...,2.173548,1.985937,-0.928276,1.37515,-0.860389,-0.512007,-1.344611,0.056088,-1.347637,-0.885517
1,월,7,-0.592194,-0.166277,0.56683,0.04595,-1.193642,-0.333811,-0.326544,-0.137304,...,0.944397,0.408436,-0.146764,-1.668091,-0.027077,0.694925,0.791944,2.43995,-0.795693,-1.865603
2,월,8,-1.682284,0.791315,-0.958183,-1.516629,-0.772003,-0.210508,-0.795287,-0.965445,...,-1.09978,-0.171732,0.686703,0.037975,0.328477,0.633163,-0.005496,1.962133,0.950193,-1.864253
3,월,9,-0.424669,-1.428098,0.213132,-0.049671,-0.288329,-1.327426,-0.606048,-1.612685,...,-0.254899,1.205502,0.069447,-0.652438,1.707554,-0.20222,0.488724,0.828055,-0.490476,0.292867
4,월,10,-1.691458,-0.376467,-0.64098,-0.397296,-1.331872,-0.39134,-1.073033,-1.535862,...,0.345481,0.198865,0.390383,0.791191,0.16045,2.070417,-0.763431,1.091782,-1.26954,0.517666
5,월,11,-0.216526,0.072177,1.382186,1.207625,0.283463,-0.908902,0.337737,-0.955339,...,-0.662026,-0.667036,-0.666913,0.365335,2.387205,1.796831,-1.091238,1.778269,0.247998,0.866339
6,월,12,0.891387,1.036668,1.297156,1.290414,-0.764218,1.207675,0.378024,0.127479,...,-1.483727,-2.776874,-0.485524,0.6986,1.294142,0.372525,0.458463,-0.82847,0.565747,-0.601351
7,월,13,0.272741,0.864752,0.104596,1.270729,-0.080048,-0.434604,-1.157314,-0.234281,...,-0.670134,-0.010393,-0.171061,-0.499213,0.340977,1.818889,-1.190785,-1.266624,1.34354,0.444893
8,월,14,0.22837,-0.383521,0.285426,-0.146629,-0.346343,-1.352851,1.625032,0.382009,...,-0.181605,-0.364773,-0.237938,-1.384842,4.439596,-0.180794,-0.541942,0.883814,0.420926,0.222255
9,월,15,0.841508,0.82184,-0.116708,-0.048167,-0.464819,-0.981594,-0.530332,-1.023024,...,-0.391608,1.894217,0.465268,1.136218,3.360294,-0.149649,0.812134,0.622839,-0.041743,-1.445392


In [56]:
# Persist both standardized tables as CSV (UTF-8 with BOM, no index column).
csv_options = {'index': False, 'encoding': 'utf-8-sig'}
sales_df.to_csv("./data/hourly_sales_stdscaled.csv", **csv_options)
revenues_df.to_csv("./data/hourly_revenue_stdscaled.csv", **csv_options)