In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rc('font', family='malgun gothic')
plt.rc('axes', unicode_minus=False)
import seaborn as sns
import plotly.express as px
import os
import missingno as msno
import pickle
from glob import glob
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
import matplotlib

import tensorflow as tf
from keras.layers import *
from keras import *
import kerastuner as kt

from sklearn.metrics import mean_squared_error, mean_absolute_error

from fbprophet import Prophet

In [None]:
# Raw input tables, read from CSV files in the notebook's working directory.
orders = pd.read_csv(filepath_or_buffer='orders.csv')
deliveries = pd.read_csv(filepath_or_buffer='deliveries.csv')

def beg_end_month(x):
    """Label a day-of-month value as start, middle or end of the month.

    Args:
        x: Day of month (any value comparable with numbers).

    Returns:
        '월초' for x <= 10, '월중' for 10 < x <= 20, '월말' for 20 < x <= 31,
        and None when no bound matches (x > 31, or NaN, for which every
        comparison is False).
    """
    # Upper bounds are checked in ascending order, so the first match wins.
    for upper_bound, label in ((10, '월초'), (20, '월중'), (31, '월말')):
        if x <= upper_bound:
            return label
    return None
# Parse the booking / insertion timestamps of `orders` into datetime columns.
# Bracket assignment is used so the columns are always written as columns
# (attribute assignment only works when the column already exists).
orders['BKG_DATE'] = pd.to_datetime(orders['BKG_DATE'], format='%Y-%m-%d')
orders['INS_DATE'] = pd.to_datetime(orders['INS_DATE'], format='%Y-%m-%d')
orders['BKG_TIME'] = pd.to_datetime(orders['BKG_TIME'], format='%Y-%m-%d %H:%M:%S')

# ISO week number of the booking date. `Series.dt.week` is deprecated and was
# removed in pandas 2.0; `dt.isocalendar().week` is its documented replacement
# (requires pandas >= 1.1; the result has the nullable UInt32 dtype).
orders['BKG_WEEK'] = orders['BKG_DATE'].dt.isocalendar().week

# Start / middle / end-of-month label derived from the booking day via beg_end_month.
orders['BKG_MONTH2'] = orders['BKG_DATE'].dt.day.map(beg_end_month)

In [None]:
# Booked quantity per (date, hour) for booking type 7 of corporation 'KX007'.
# Both filters go into ONE boolean mask: chaining two [] lookups makes pandas
# re-align the second (full-length) mask against the already filtered frame.
selected_orders = orders.loc[(orders.BKG_TYP == 7) & (orders.CORP_ID == 'KX007')]
data = selected_orders.groupby(['BKG_DATE', 'BKG_HOUR'])['ITEM_QTY'].sum().reset_index()

# Insert explicit zero-quantity rows for 04:00 and 05:00 on 2021-06-28.
# NOTE(review): presumably these are the only hours without any booking, so
# the hourly series becomes gap-free -- confirm against the raw data.
# `pd.concat` replaces `DataFrame.append`, which is deprecated and removed in pandas 2.0.
missing_hours = pd.DataFrame({
    'BKG_DATE': pd.to_datetime(['2021-06-28', '2021-06-28']),
    'BKG_HOUR': [4, 5],
    'ITEM_QTY': [0, 0],
})
data = pd.concat([data, missing_hours], ignore_index=True)
data = data.sort_values(['BKG_DATE', 'BKG_HOUR'])

# Combine date and hour into one timestamp (vectorised; BKG_DATE was parsed
# from '%Y-%m-%d', so it carries no time-of-day component of its own).
data['DATE'] = data['BKG_DATE'] + pd.to_timedelta(data['BKG_HOUR'], unit='h')
data = data.set_index('DATE').drop(columns=['BKG_DATE', 'BKG_HOUR'])

# Hold out the last 720 rows as the test set; both arrays have shape (n, 1).
train = data.iloc[:-720].values.reshape(-1,1)
test = data.iloc[-720:].values.reshape(-1,1)

In [None]:
# Overview of the series with the train / test split highlighted.
# The ITEM_QTY *Series* is plotted rather than the DataFrame: DataFrame.plot
# replaces a `label=` argument with the column name, so both legend entries
# would read "ITEM_QTY" instead of "train" / "test".
fig, ax = plt.subplots(figsize=(20,4))
data['ITEM_QTY'].iloc[:-720].plot(label='train', ax=ax)
data['ITEM_QTY'].iloc[-720:].plot(label='test', ax=ax)
ax.legend()
plt.show()

## Preprocessing

window_size 결정

In [None]:
# Distribution of the series: box plot on the left, kernel density estimate on the right.
fig, ax = plt.subplots(1, 2, figsize=(10,4))
box_ax, kde_ax = ax
data.plot.box(ax=box_ax)
sns.kdeplot(data.values.flatten(), ax=kde_ax)
fig.tight_layout()
plt.show()

In [None]:
# Search over the look-back length: for every std_day in 1..29 an LSTM is
# trained on windows of 24*std_day steps taken from `train`, and its RMSE on
# `test` is stored as (std_day, score) in input_window_list.
input_window_list = []
for std_day in tqdm(range(1,30)):
    input_window = 24*std_day
    output_window = 1
    window = 1
    num_features = 1
    # Extra steps between the step that directly follows an input window and its target.
    horizon_gap = 24

    # One sample per start offset: `input_window` consecutive values as input,
    # the value at `end + horizon_gap` as target.
    n_windows = ((train.shape[0]-(input_window+output_window))//window)-horizon_gap
    train_window_x = np.zeros((n_windows, input_window, num_features)) # batch_size, input, feature
    train_window_y = np.zeros((n_windows, output_window, num_features))
    for start in range(0,(train.shape[0]-(input_window+output_window))-horizon_gap, window):
        end = start+input_window
        train_window_x[start//window,:] = train[start:end,:]
        train_window_y[start//window,:] = train[end+horizon_gap:end+output_window+horizon_gap,:]

    tf.random.set_seed(0)

    model = Sequential()
    model.add(LSTM(64))
    model.add(Dense(1))

    model.compile(optimizer='adam', loss='mse', metrics=['mse'])

    early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=50)
    model.fit(train_window_x, train_window_y, epochs=2000, batch_size=128, validation_split = 0.2, callbacks=[early_stop], verbose=0)

    # Evaluation windows must end `horizon_gap` steps before their target,
    # exactly like the training pairs above. The history prefix is therefore
    # `input_window + horizon_gap` long, which puts test[i] at position
    # i + input_window + horizon_gap of `eval_series`. (Taking only the last
    # `input_window` training steps would score the model on a target 24
    # steps earlier than the one it was trained to predict.)
    # The concatenation does not depend on `start`, so it is built once.
    eval_series = np.concatenate([train[-(input_window+horizon_gap):], test])
    val_window_x = np.zeros((test.shape[0], input_window, num_features)) # batch_size, input, feature
    for start in range(0, test.shape[0], window):
        end = start+input_window
        val_window_x[start//window,:] = eval_series[start:end,:]

    prediction = model.predict(val_window_x)

    # RMSE on the hold-out period; scikit-learn's argument order is (y_true, y_pred).
    score = np.sqrt(mean_squared_error(test, prediction))

    input_window_list.append((std_day, score))

In [None]:
# Display the (std_day, RMSE) pairs collected by the window-length search.
input_window_list

In [None]:
# Hard-coded (std_day, score) pairs; this assignment replaces whatever
# `input_window_list` held before.
# NOTE(review): presumably the pasted output of an earlier run of the search
# loop, kept so the cells below work without re-training -- confirm, and
# refresh these numbers whenever the search is re-run.
input_window_list = [(1, 630.3565918627247),
 (2, 618.2917720901166),
 (3, 644.753539938296),
 (4, 664.6092032459469),
 (5, 609.4356420500078),
 (6, 668.0701105989732),
 (7, 641.4371841531075),
 (8, 615.6381031873751),
 (9, 603.866006126268),
 (10, 592.8493443090897),
 (11, 635.9251034120703),
 (12, 591.6505388969802),
 (13, 603.0845910478097),
 (14, 628.9154092555275),
 (15, 661.7995279027178),
 (16, 713.4078110345258),
 (17, 678.7557748251144),
 (18, 599.0965027009323),
 (19, 614.0482555371441),
 (20, 758.2445137053238),
 (21, 583.3052620637433),
 (22, 603.5257562730226),
 (23, 607.9563339463683),
 (24, 628.5450493109005),
 (25, 600.843458871021),
 (26, 630.531406379209),
 (27, 697.6566132646157),
 (28, 613.8002691149866),
 (29, 583.7368838752573)]

In [None]:
# Split the (std_day, score) pairs into two parallel lists for plotting.
idx = [pair[0] for pair in input_window_list]
loss = [pair[1] for pair in input_window_list]

In [None]:
# Score per look-back length (bars plus connecting line); the entry with the
# lowest score is annotated with its value.
scores_by_day = [score for i, score in input_window_list]
best_day, best_score = input_window_list[np.argmin(scores_by_day)]

plt.figure(figsize=(10,5))
plt.bar(idx, loss, color='skyblue')
plt.plot(idx, loss, color='k')
plt.xticks(idx)
plt.ylim((500,800))
# x = best std_day, y = its score, text = the score itself.
plt.text(best_day, best_score, best_score, ha='center', va='top')
plt.show()

In [None]:
# Show the (std_day, score) pair with the lowest score.
best_position = np.argmin([pair[1] for pair in input_window_list])
input_window_list[best_position]

이상치 처리

In [None]:
# Outlier handling: values outside mean +/- 3 standard deviations are replaced
# by the (rounded-up) bound they exceed.
# NOTE(review): mean and std are taken over the whole series, i.e. including
# the last 720 rows that serve as the test set.
data2 = data.copy()
qty_mean = data.ITEM_QTY.mean()
qty_std = data.ITEM_QTY.std()
upper_bound = qty_mean + 3*qty_std
lower_bound = qty_mean - 3*qty_std
data2.loc[data2.ITEM_QTY > upper_bound, 'ITEM_QTY'] = np.ceil(upper_bound)
data2.loc[data2.ITEM_QTY < lower_bound, 'ITEM_QTY'] = np.ceil(lower_bound)

# Train on the clipped series, evaluate against the unclipped last 720 rows.
train = data2.iloc[:-720].values.reshape(-1,1)
test = data.iloc[-720:].values.reshape(-1,1)

In [None]:
# Distribution of the transformed series `data2`: box plot (left) and KDE (right).
fig, ax = plt.subplots(1, 2, figsize=(10,4))
box_ax, kde_ax = ax
data2.plot.box(ax=box_ax)
sns.kdeplot(data2.ITEM_QTY, ax=kde_ax)
fig.tight_layout()
plt.show()

In [None]:
# Train the LSTM on the current `train` array with a 21-day look-back.
std_day = 21
input_window = 24*std_day
output_window = 1
window = 1
num_features = 1

# Each sample pairs `input_window` consecutive values with the value located
# 24 steps after the step that directly follows the window.
usable_steps = (train.shape[0]-(input_window+output_window))-24
sample_count = ((train.shape[0]-(input_window+output_window))//window)-24
train_window_x = np.zeros((sample_count, input_window, num_features)) # batch_size, input, feature
train_window_y = np.zeros((sample_count, output_window, num_features))
for row, start in enumerate(range(0, usable_steps, window)):
    target_start = start+input_window+24
    train_window_x[row,:] = train[start:start+input_window,:]
    train_window_y[row,:] = train[target_start:target_start+output_window,:]

tf.random.set_seed(0)

model = Sequential([LSTM(64), Dense(1)])
model.compile(optimizer='adam', loss='mse', metrics=['mse'])

# Stop once val_loss has not improved for 50 epochs; the last 20 % of the
# samples are used for validation.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=50)
model.fit(
    train_window_x,
    train_window_y,
    validation_split=0.2,
    batch_size=128,
    epochs=3000,
    callbacks=[early_stop],
)

In [None]:
# Evaluate the model on the hold-out period.
#
# The training cell pairs every input window with the value 24 steps after
# the step that directly follows the window (`train[end+24]`). Evaluation
# windows therefore also have to end `horizon_gap` steps before their target,
# otherwise the model is scored on a different forecast horizon than the one
# it was trained for.
horizon_gap = 24  # must match the +24 target offset used to build train_window_y

# Unclipped history right before the test period: long enough for the first
# window plus the gap, and bounded above so it does not overlap the test rows.
history = data.iloc[-test.shape[0]-input_window-horizon_gap:-test.shape[0]].values.reshape(-1, num_features)
# test[i] sits at position i + input_window + horizon_gap of `eval_series`.
# Built once: the concatenation does not depend on the loop variable.
eval_series = np.concatenate([history, test])

val_window_x = np.zeros((test.shape[0], input_window, num_features)) # batch_size, input, feature
for start in range(0, test.shape[0], window):
    end = start+input_window
    val_window_x[start//window,:] = eval_series[start:end,:]

prediction = model.predict(val_window_x)

# RMSE; scikit-learn's argument order is (y_true, y_pred).
score = np.sqrt(mean_squared_error(test, prediction))
score

In [None]:
# Hold-out values against the model's predictions.
for series, series_label, series_color in ((test, 'true', 'pink'), (prediction, 'pred', 'k')):
    plt.plot(series, label=series_label, color=series_color)
plt.legend()
plt.show()

로그 변환

In [None]:
# Log experiment: model log(1 + x) of the series; train and test are both in log space.
data2 = np.log1p(data.copy())

log_train_part = data2.iloc[:-720]
log_test_part = data2.iloc[-720:]
train = log_train_part.values.reshape(-1,1)
test = log_test_part.values.reshape(-1,1)

In [None]:
# Train the LSTM on the current `train` array with a 21-day look-back.
std_day = 21
input_window = 24*std_day
output_window = 1
window = 1
num_features = 1

# Each sample pairs `input_window` consecutive values with the value located
# 24 steps after the step that directly follows the window.
usable_steps = (train.shape[0]-(input_window+output_window))-24
sample_count = ((train.shape[0]-(input_window+output_window))//window)-24
train_window_x = np.zeros((sample_count, input_window, num_features)) # batch_size, input, feature
train_window_y = np.zeros((sample_count, output_window, num_features))
for row, start in enumerate(range(0, usable_steps, window)):
    target_start = start+input_window+24
    train_window_x[row,:] = train[start:start+input_window,:]
    train_window_y[row,:] = train[target_start:target_start+output_window,:]

tf.random.set_seed(0)

model = Sequential([LSTM(64), Dense(1)])
model.compile(optimizer='adam', loss='mse', metrics=['mse'])

# Stop once val_loss has not improved for 50 epochs; the last 20 % of the
# samples are used for validation.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=50)
model.fit(
    train_window_x,
    train_window_y,
    validation_split=0.2,
    batch_size=128,
    epochs=2000,
    callbacks=[early_stop],
)

In [None]:
# Evaluate the model on the hold-out period.
#
# The training cell pairs every input window with the value 24 steps after
# the step that directly follows the window (`train[end+24]`). Evaluation
# windows therefore also have to end `horizon_gap` steps before their target,
# otherwise the model is scored on a different forecast horizon than the one
# it was trained for.
horizon_gap = 24  # must match the +24 target offset used to build train_window_y

# Last `input_window + horizon_gap` training steps followed by the test
# period: test[i] sits at position i + input_window + horizon_gap.
# Built once: the concatenation does not depend on the loop variable.
eval_series = np.concatenate([train[-(input_window+horizon_gap):], test])

val_window_x = np.zeros((test.shape[0], input_window, num_features)) # batch_size, input, feature
for start in range(0, test.shape[0], window):
    end = start+input_window
    val_window_x[start//window,:] = eval_series[start:end,:]

prediction = model.predict(val_window_x)

# `test` and `prediction` are in log1p space here; expm1 undoes the transform
# so the RMSE is in original units. Argument order is (y_true, y_pred).
score = np.sqrt(mean_squared_error(np.expm1(test), np.expm1(prediction)))
score

이상치 처리 + 로그변환

In [None]:
# Combined experiment: clip values outside mean +/- 3 standard deviations to
# the (rounded-up) bound they exceed, then apply log(1 + x).
# NOTE(review): mean and std are taken over the whole series, i.e. including
# the last 720 rows that serve as the test set.
data2 = data.copy()
qty_mean = data.ITEM_QTY.mean()
qty_std = data.ITEM_QTY.std()
upper_bound = qty_mean + 3*qty_std
lower_bound = qty_mean - 3*qty_std
data2.loc[data2.ITEM_QTY > upper_bound, 'ITEM_QTY'] = np.ceil(upper_bound)
data2.loc[data2.ITEM_QTY < lower_bound, 'ITEM_QTY'] = np.ceil(lower_bound)
data2 = np.log1p(data2)

# Train on the clipped, log-transformed series; the test targets are the
# unclipped last 720 rows, log-transformed.
train = data2.iloc[:-720].values.reshape(-1,1)
raw_test_values = data.iloc[-720:].values
test = np.log1p(raw_test_values).reshape(-1,1)

In [None]:
# Train the LSTM on the current `train` array with a 21-day look-back.
std_day = 21
input_window = 24*std_day
output_window = 1
window = 1
num_features = 1

# Each sample pairs `input_window` consecutive values with the value located
# 24 steps after the step that directly follows the window.
usable_steps = (train.shape[0]-(input_window+output_window))-24
sample_count = ((train.shape[0]-(input_window+output_window))//window)-24
train_window_x = np.zeros((sample_count, input_window, num_features)) # batch_size, input, feature
train_window_y = np.zeros((sample_count, output_window, num_features))
for row, start in enumerate(range(0, usable_steps, window)):
    target_start = start+input_window+24
    train_window_x[row,:] = train[start:start+input_window,:]
    train_window_y[row,:] = train[target_start:target_start+output_window,:]

tf.random.set_seed(0)

model = Sequential([LSTM(64), Dense(1)])
model.compile(optimizer='adam', loss='mse', metrics=['mse'])

# Stop once val_loss has not improved for 50 epochs; the last 20 % of the
# samples are used for validation.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=50)
model.fit(
    train_window_x,
    train_window_y,
    validation_split=0.2,
    batch_size=128,
    epochs=2000,
    callbacks=[early_stop],
)

In [None]:
# Evaluate the model on the hold-out period.
#
# The training cell pairs every input window with the value 24 steps after
# the step that directly follows the window (`train[end+24]`). Evaluation
# windows therefore also have to end `horizon_gap` steps before their target,
# otherwise the model is scored on a different forecast horizon than the one
# it was trained for.
horizon_gap = 24  # must match the +24 target offset used to build train_window_y

# Last `input_window + horizon_gap` training steps followed by the test
# period: test[i] sits at position i + input_window + horizon_gap.
# Built once: the concatenation does not depend on the loop variable.
eval_series = np.concatenate([train[-(input_window+horizon_gap):], test])

val_window_x = np.zeros((test.shape[0], input_window, num_features)) # batch_size, input, feature
for start in range(0, test.shape[0], window):
    end = start+input_window
    val_window_x[start//window,:] = eval_series[start:end,:]

prediction = model.predict(val_window_x)

# `test` and `prediction` are in log1p space here; expm1 undoes the transform
# so the RMSE is in original units. Argument order is (y_true, y_pred).
score = np.sqrt(mean_squared_error(np.expm1(test), np.expm1(prediction)))
score

스케일링

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, PowerTransformer

In [None]:
# Compare input scalers: for each scaler an LSTM with a 21-day look-back is
# trained on scaled inputs and its RMSE on the hold-out period is stored as
# (scaler, score) in scaler_results.
ss = StandardScaler()
mm = MinMaxScaler()
ma = MaxAbsScaler()
rb = RobustScaler()
pt = PowerTransformer()
scalers = [ss, mm, ma, rb, pt]
scaler_results = []
for scaler in tqdm(scalers):
    data2 = data.copy()

    train = data2.iloc[:-720].values.reshape(-1,1)
    test = data2.iloc[-720:].values.reshape(-1,1)

    # The scaler is fitted on the training rows only and then applied to the test rows.
    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.transform(test)

    # Distribution of the scaled training values: box plot (left) and KDE (right).
    fig, ax = plt.subplots(1,2, figsize=(10,4))
    pd.DataFrame(train_scaled).plot.box(ax=ax[0])
    sns.kdeplot(train_scaled.flatten(), ax=ax[1])
    plt.tight_layout()
    plt.show()

    std_day = 21
    input_window = 24*std_day
    output_window = 1
    window = 1
    num_features = 1
    # Extra steps between the step that directly follows an input window and its target.
    horizon_gap = 24

    # Inputs come from the scaled series; targets are taken from the unscaled
    # `train`, so predictions and the RMSE below are in original units.
    # NOTE(review): presumably intentional (only the inputs are scaled) -- confirm.
    n_windows = ((train.shape[0]-(input_window+output_window))//window)-horizon_gap
    train_window_x = np.zeros((n_windows, input_window, num_features)) # batch_size, input, feature
    train_window_y = np.zeros((n_windows, output_window, num_features))
    for start in range(0,(train.shape[0]-(input_window+output_window))-horizon_gap, window):
        end = start+input_window
        train_window_x[start//window,:] = train_scaled[start:end,:]
        train_window_y[start//window,:] = train[end+horizon_gap:end+output_window+horizon_gap,:]

    tf.random.set_seed(0)

    model = Sequential()
    model.add(LSTM(64))
    model.add(Dense(1))

    model.compile(optimizer='adam', loss='mse', metrics=['mse'])

    early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=50)
    model.fit(train_window_x, train_window_y, epochs=2000, batch_size=128, validation_split = 0.2, callbacks=[early_stop], verbose=0)

    # Evaluation windows must end `horizon_gap` steps before their target,
    # exactly like the training pairs above. The scaled history prefix is
    # therefore `input_window + horizon_gap` long, which puts test[i] at
    # position i + input_window + horizon_gap of `eval_series`. (Taking only
    # the last `input_window` steps would score the model on a target 24
    # steps earlier than the one it was trained to predict.)
    # The concatenation does not depend on `start`, so it is built once.
    eval_series = np.concatenate([train_scaled[-(input_window+horizon_gap):], test_scaled])
    val_window_x = np.zeros((test.shape[0], input_window, num_features)) # batch_size, input, feature
    for start in range(0, test.shape[0], window):
        end = start+input_window
        val_window_x[start//window,:] = eval_series[start:end,:]

    prediction = model.predict(val_window_x)

    # RMSE against the unscaled test values; argument order is (y_true, y_pred).
    score = np.sqrt(mean_squared_error(test, prediction))

    scaler_results.append((scaler,score))

In [None]:
# Display the (scaler, RMSE) pairs collected by the scaler comparison loop.
scaler_results

최종적으로 input_window_size는 $21 \times 24 = 504$로 설정한다. 이상치 처리와 로그 변환 등의 전처리는 시행하지 않고, 스케일링만 RobustScaler를 사용해 시행한다.