In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import glob
import os
from tqdm import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sns

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [3]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from tensorflow.keras import layers, models, optimizers
from keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=2, verbose=1)

In [4]:
# gpu 설정
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

In [5]:
plt.rcParams["font.family"] = 'NanumGothic'
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data/Dacon_data/public_data/train.csv')

In [6]:
# 평가식
def nmae(y_true, y_pred):
    score = np.mean(np.abs(y_true - y_pred) / y_true)
    return score

In [7]:
# 정규화
def normalize(data, col):
    
    # 요일은 원핫 인코딩을 적용함, 요일에 대한 의미가 없을 것이라고 판단
    data = pd.concat([data, pd.get_dummies(data['요일'])], axis = 1)
    data = data.drop(['요일'], axis = 1)

    col1 = data.columns[-7:].to_list()
    col2 = data.columns[1:-7].to_list()

    new_col = ['date'] + col1 + col2
    data = data[new_col]
    data = data.drop(['date'], axis = 1)
    # 0 ~ 1 값으로 정규화 진행
    norm = data.iloc[:,8:].max(0)
    data.iloc[:,8:] = data.iloc[:,8:]/norm
    
    train = data.iloc[:, col:col + 2]
    
    return train, norm

In [8]:
# train & test 분리
def load_data(data, window_size = 28, future_size = 28, train_size = 0.9):
        x = []; y = []
        for i in range(len(data) - window_size - future_size):
            x.append(data.iloc[i: i+window_size])
            y.append(data.iloc[i+window_size:i+window_size+future_size, 1]) # 가격만

        x = np.array(x)
        y = np.array(y)

        train_idx = round(len(x) * train_size)

        train_x = x[:train_idx]
        train_y = y[:train_idx]

        valid_x = x[train_idx:]
        valid_y = y[train_idx:]
        
        return train_x, train_y, valid_x, valid_y

In [37]:
# 모델 구축
def build_model():
    model = Sequential()
    for i in range(2):
        model.add(LSTM(28,stateful=True, return_sequences = True, batch_input_shape = (1,28, 2)))
        model.add(Dropout(0.3))
    model.add(LSTM(28,stateful=True, return_sequences = False))
    model.add(Dropout(0.3))
    model.add(Dense(28))
        
    model.compile(loss='mse', optimizer='adam')

    return model



In [38]:
window_size = 28
submission = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data/Dacon_data/sample_submission.csv')
public_date_list = submission[submission['예측대상일자'].str.contains('2020')]['예측대상일자'].str.split('+').str[0].unique()

In [39]:
for n, col in enumerate(list(range(7, 49, 2))):

    data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data/Dacon_data/public_data/train.csv')
    train, norm = normalize(data, col)
    
    train_x, train_y, valid_x, valid_y = load_data(train)
    
    
    model = build_model()
    model.fit(train_x, train_y, batch_size=1, epochs=200, validation_data=(valid_x, valid_y), verbose=1, callbacks = [early_stop])
    
    y_pred = model.predict(valid_x)
    y_true = valid_y

    
    target_idx = np.where(y_true != 0)
    y_pred = y_pred[target_idx]
    y_true = y_true[target_idx]
    
    print(train.columns[1], '의 NMAE: ', nmae(y_true, y_pred))
    
    ## 실제 test데이터 생성
    for date in public_date_list:
        test_df = pd.read_csv(f'/content/drive/MyDrive/Colab Notebooks/Data/Dacon_data/public_data/test_files/test_{date}.csv')
        data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data/Dacon_data/public_data/train.csv')
        data = pd.concat([data, test_df]).iloc[-window_size:]

        test, norm = normalize(data, col)
        sub_output = model.predict(test.to_numpy().reshape(1,28,2)) * norm[n*2]

        idx = submission[submission['예측대상일자'].str.contains(date)].index
        submission.loc[idx, train.columns[1]] = sub_output[0,[6,13,27]]
    print(submission.iloc[:20,n+1])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 00009: early stopping


InvalidArgumentError: ignored

In [None]:
submission.to_csv('/content/drive/MyDrive/Colab Notebooks/Data/Dacon_data/result/Time-series Numerical Input / Numerical Prediction - Stateful Stack RNN Model.csv', index = False)


In [None]:
submission