In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
%matplotlib inline

from xgboost import XGBRegressor
import xgboost as xgb

from keras.layers import Dense, InputLayer
from keras.layers import SimpleRNN, LSTM
from keras.models import Sequential
import keras.backend as K

from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root_dir, name) for name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the four competition CSV files in one pass; the unpacking order matches
# the order of the paths below.
train, test, submission, store = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/rossmann-store-sales/train.csv',
        '../input/rossmann-store-sales/test.csv',
        '../input/rossmann-store-sales/sample_submission.csv',
        '../input/rossmann-store-sales/store.csv',
    )
)

In [None]:
# Show the column dtypes and non-null counts of the training data.
train.info()

In [None]:
# Parse the Date strings, then derive the Year / Month / Day feature columns
# from the parsed timestamps.
train['Date'] = pd.to_datetime(train['Date'])
for feature_name, date_attr in (('Year', 'year'), ('Month', 'month'), ('Day', 'day')):
    train[feature_name] = getattr(train['Date'].dt, date_attr)

In [None]:
# train['StateHoliday'] holds the string '0' and the integer 0 as separate values;
# according to the data description they mean the same thing.
train['StateHoliday'].unique()

In [None]:
# Encode StateHoliday as integers ('0' and 0 both become 0).
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `train.StateHoliday`: an in-place call on a
# column selection is chained assignment, which warns in recent pandas and does
# not modify `train` at all under Copy-on-Write.
state_holiday_codes = {'0': 0, 'a': 1, 'b': 2, 'c': 3}
# pd.to_numeric gives the column a numeric dtype and raises if an unmapped
# string is still present.
train['StateHoliday'] = pd.to_numeric(train['StateHoliday'].replace(state_holiday_codes))

In [None]:
# Count the missing values in each training column.
train.isna().sum()

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Show the column dtypes and non-null counts of the test data.
test.info()

In [None]:
# Same date handling as for train: parse Date, then add Year / Month / Day.
test['Date'] = pd.to_datetime(test['Date'])
for feature_name, date_attr in (('Year', 'year'), ('Month', 'month'), ('Day', 'day')):
    test[feature_name] = getattr(test['Date'].dt, date_attr)

# Display the distinct StateHoliday values present in the test data.
test['StateHoliday'].unique()

In [None]:
# Encode StateHoliday with the same codes used for train, so both frames agree
# even if a 'b' or 'c' value shows up in test.
# The result is assigned back to the column: `replace(..., inplace=True)` on
# `test.StateHoliday` is chained assignment and does not modify `test` under
# pandas Copy-on-Write.
state_holiday_codes = {'0': 0, 'a': 1, 'b': 2, 'c': 3}
# pd.to_numeric gives the column a numeric dtype and raises if an unmapped
# string is still present.
test['StateHoliday'] = pd.to_numeric(test['StateHoliday'].replace(state_holiday_codes))

In [None]:
# Count the missing values in each test column.
test.isna().sum()

In [None]:
# Show the test rows whose Open value is missing.
test[test.Open.isna()]

In [None]:
# The rows with a missing Open value fall on weekdays that are not holidays,
# so all of them are filled with 1 (open).
# Assign the result back: `test.Open.fillna(..., inplace=True)` is chained
# assignment and does not modify `test` under pandas Copy-on-Write.
test['Open'] = test['Open'].fillna(1)

In [None]:
# Identify the columns that contain object values; they are label-encoded below.
store.head()

In [None]:
# StoreType, Assortment and PromoInterval have dtype object.

store.info()

In [None]:
# StoreType and Assortment have no NaN values, so they are simply label-encoded
# with LabelEncoder (the encoder is re-fitted for each column).

Labeling = LabelEncoder()
for categorical_col in ('StoreType', 'Assortment'):
    store[categorical_col] = Labeling.fit_transform(store[categorical_col])

# PromoInterval lists four months that look like one per quarter, so it is split
# into four per-quarter columns and the original column is removed.
# NOTE: 'FourtQquarter' is misspelt but kept as is, because other cells use it.

quarter_cols = ['FirstQuarter', 'SecondQuarter', 'ThirdQuarter', 'FourtQquarter']
promo_months = store.pop('PromoInterval').str.split(',', expand=True)
store[quarter_cols] = promo_months

In [None]:
# After the split, each column turns out to hold the month of that quarter.

for quarter_col in ('FirstQuarter', 'SecondQuarter', 'ThirdQuarter', 'FourtQquarter'):
    print(store[quarter_col].unique())

In [None]:
# Encode each quarter's month name as its position within that quarter (1-3).
# Each result is assigned back to its column: calling
# `store.<col>.replace(..., inplace=True)` is chained assignment, which warns in
# recent pandas and does not modify `store` under Copy-on-Write.

month_position_by_quarter = {
    'FirstQuarter': {'Jan': 1, 'Feb': 2, 'Mar': 3},
    'SecondQuarter': {'Apr': 1, 'May': 2, 'Jun': 3},
    'ThirdQuarter': {'Jul': 1, 'Aug': 2, 'Sept': 3},
    'FourtQquarter': {'Oct': 1, 'Nov': 2, 'Dec': 3},
}

for quarter_col, month_codes in month_position_by_quarter.items():
    # pd.to_numeric yields a numeric column (missing values stay NaN) and raises
    # if an unmapped month string is still present.
    store[quarter_col] = pd.to_numeric(store[quarter_col].replace(month_codes))

In [None]:
# The preview shows that the values have been split and converted.

store.head()

In [None]:
# Percentage of missing values in each column of the store table.

nan_counts = store.isna().sum()
print(100 * nan_counts / len(store))

In [None]:
# Filling with the median seemed inappropriate here, so for now every NaN in
# the store table is replaced with 0.

store = store.fillna(value=0)

In [None]:
# Attach the per-store attributes to every train/test row, order the rows by
# store and date, then move Date into the index.
train = (
    pd.merge(train, store, on='Store')
    .sort_values(by=['Store', 'Date'])
    .set_index(['Date'])
)
test = (
    pd.merge(test, store, on='Store')
    .sort_values(by=['Store', 'Date'])
    .set_index(['Date'])
)

In [None]:
# Shared MinMaxScaler instance; it is fitted on the training features in a later cell.
scaler = MinMaxScaler()
# Earlier experiment that scaled only CompetitionDistance (kept for reference, disabled):
#train['CompetitionDistance'] = scaler.fit_transform(train['CompetitionDistance'].values.reshape(-1, 1))
#test['CompetitionDistance'] = scaler.transform(test['CompetitionDistance'].values.reshape(-1, 1))

In [None]:
# Feature columns: every train column except the target `Sales` and `Customers`.
feature_cols = train.columns.drop(['Sales', 'Customers'])

# Put the test rows back in ascending Id order. An earlier cell sorted them by
# Store/Date, but the predictions are later written into `submission` row by row.
# Assumes sample_submission.csv is ordered by ascending Id - TODO confirm.
if 'Id' in test.columns:
    test = test.sort_values('Id')

# Fit the scaler on train only and reuse it for test. Selecting `feature_cols`
# gives both matrices the same columns in the same order and keeps `Id` out of
# the model input (previously `test.values` carried `Id` as an extra column).
train_x = scaler.fit_transform(train[feature_cols])
train_y = train[['Sales']].values
test = scaler.transform(test[feature_cols])

# Hold out 20% of the training rows for validation; the fixed seed makes the
# split reproducible across runs.
x_train, x_val, y_train, y_val = train_test_split(
    train_x, train_y, test_size=.2, random_state=42
)

# Add a trailing axis, (samples, features) -> (samples, features, 1), to match
# the model's `(n_features, 1)` input shape.
x_train = x_train[..., np.newaxis]
x_val = x_val[..., np.newaxis]
test = test[..., np.newaxis]

#test.drop(['Id'], axis=1, inplace=True)
#dtrain = xgb.DMatrix(data=x_train[train_x_col], label = y_train[train_y_col])
#dval = xgb.DMatrix(data=x_val[train_x_col], label=y_val[train_y_col])
#dtest = xgb.DMatrix(data=test[train_x_col])

In [None]:
my_batch: int = 64      # mini-batch size passed to model.fit
my_epoch: int = 150     # number of training epochs
my_neuron: int = 1000   # layer width used by the RNN / LSTM (and Dense) layers

In [None]:
# Reset Keras global state so re-running this cell starts from a clean slate.
# Use the tf.keras backend, since the model below is built with tf.keras
# (the standalone `keras` backend may track separate state).
tf.keras.backend.clear_session()

model = tf.keras.models.Sequential()
# Each sample is fed to the recurrent layer as `n_features` steps of one value each.
model.add(tf.keras.layers.InputLayer(input_shape=(train_x.shape[1], 1)))
# tf.keras.layers.LSTM replaces tf.compat.v1.keras.layers.CuDNNLSTM: with its
# default arguments it picks the cuDNN kernel on GPU automatically and still
# runs on CPU, whereas CuDNNLSTM is GPU-only and deprecated.
model.add(tf.keras.layers.LSTM(my_neuron, go_backwards=True))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dense(my_neuron, activation='relu'))
model.add(tf.keras.layers.Dropout(.3))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dense(my_neuron, activation='relu'))
model.add(tf.keras.layers.Dropout(.3))
# Single linear unit: the model regresses one continuous value per sample.
model.add(tf.keras.layers.Dense(1, activation='linear'))

model.summary()

In [None]:
# The target is continuous, so report mean absolute error next to the MSE loss;
# classification accuracy ('acc') carries no meaning for regression.
model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])

# `use_multiprocessing` is omitted: the inputs are in-memory arrays, and that
# option is intended for generator / Sequence inputs.
# A further 20% of x_train is held out by Keras for per-epoch validation.
history = model.fit(
    x_train,
    y_train,
    batch_size=my_batch,
    validation_split=.2,
    epochs=my_epoch,
    verbose=1,
)

In [None]:
# Evaluate the recurrent model on the held-out validation split and report
# the loss (first entry of the returned scores).
score = model.evaluate(x_val, y_val, verbose=1)
val_loss = score[0]
print(f'Loss:{val_loss:1.3f}')

In [None]:
# Predict sales for the prepared test inputs and display the raw predictions.
pred = model.predict(test)
pred

In [None]:
# The model's output layer has a single unit, so flatten the predictions to
# 1-D before storing them as a column.
flat_pred = np.asarray(pred).reshape(-1)
# Guard against a silent mismatch between prediction rows and submission rows.
assert len(flat_pred) == len(submission), (
    f"got {len(flat_pred)} predictions for {len(submission)} submission rows"
)
submission['Sales'] = flat_pred

In [None]:
# Write the submission file to the working directory without the index column.
submission.to_csv(path_or_buf='./sample.csv', index=False)