In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under /kaggle/input.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Setup notebook
from pathlib import Path

# import necessary package
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.sequence import TimeseriesGenerator

import sklearn
from sklearn import preprocessing

# for plotting charts
import matplotlib.pyplot as plt

from tensorflow.keras.callbacks import CSVLogger, EarlyStopping
import time

In [None]:
# Read the competition CSV files.
comp_dir = Path('../input/store-sales-time-series-forecasting')

# NOTE: `infer_datetime_format` is deliberately not passed to read_csv.
# It is deprecated since pandas 2.0 (format inference is the default there),
# so passing it only produces a warning.

# Training data: one row per (date, store_nbr, family).
# Categorical / narrow numeric dtypes keep memory usage down.
store_sales = pd.read_csv(
    comp_dir / 'train.csv',
    usecols=['store_nbr', 'family', 'date', 'sales', 'onpromotion'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'sales': 'float32',
        'onpromotion': 'uint32',
    },
    parse_dates=['date'],
)

# Test data: same keys as the training data, read without a 'sales' dtype.
test = pd.read_csv(
    comp_dir / 'test.csv',
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'onpromotion': 'uint32',
    },
    parse_dates=['date'],
)

# Oil prices file, with its 'date' column parsed as datetimes.
oil = pd.read_csv(
    comp_dir / 'oil.csv',
    parse_dates=['date'],
)

In [None]:
# Keep only the rows dated strictly after 2015-06-01, then display the result.
store_sales = store_sales.loc[store_sales['date'] > '2015-06-01']
store_sales

In [None]:
# Largest single sales value in the filtered training data.
store_sales.loc[:, 'sales'].max()

In [None]:
#store_sales = store_sales[(store_sales["store_nbr"] == '1') & ( (store_sales["family"] == 'AUTOMOTIVE') | (store_sales["family"] == 'BEVERAGES') ) ]

In [None]:
# Remove the promotion column.
# A non-inplace drop with errors='ignore' keeps this cell idempotent:
# re-running it no longer raises KeyError once the column is gone, and it
# avoids mutating a frame that was produced by boolean filtering.
store_sales = store_sales.drop(columns=['onpromotion'], errors='ignore')
store_sales

In [None]:
# Wide layout: one row per date, one column per (store_nbr, family) pair.
# Missing cells in the pivot are filled with 0.
dataset = store_sales.pivot_table(
    index=['date'],
    values=['sales'],
    columns=['store_nbr', 'family'],
    fill_value=0,
)

# Flatten the 3-level column index into single "sales_<store>_<family>" labels.
# to_flat_index() yields the level tuples directly, replacing Index.ravel().
dataset.columns = ['_'.join(levels) for levels in dataset.columns.to_flat_index()]
dataset

In [None]:
from pandas import DataFrame
from pandas import concat

def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.

    Each output row corresponds to a time step ``t`` and holds the ``n_in``
    preceding observations (t-n_in ... t-1) followed by ``n_out`` observations
    starting at ``t`` (t ... t+n_out-1).

    Arguments:
        data: Sequence of observations. Anything accepted by ``DataFrame(...)``:
            a flat list, a nested list, a 1-D or 2-D NumPy array, or a DataFrame.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values
            (the rows at both ends that lack a full window).
    Returns:
        Pandas DataFrame of series framed for supervised learning, with columns
        named ``var<j>(t-<i>)``, ``var<j>(t)`` and ``var<j>(t+<i>)``.
    """
    df = DataFrame(data)
    # Take the variable count from the constructed frame rather than from the
    # input type, so 1-D arrays and nested lists get the right column names.
    n_vars = df.shape[1]
    cols, names = [], []

    # input sequence (t-n, ... t-1)
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names.extend('var%d(t-%d)' % (j + 1, lag) for j in range(n_vars))

    # forecast sequence (t, t+1, ... t+n)
    for step in range(n_out):
        cols.append(df.shift(-step))
        suffix = 't' if step == 0 else 't+%d' % step
        names.extend('var%d(%s)' % (j + 1, suffix) for j in range(n_vars))

    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names

    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg


关于归一化[0,1]，一般指的是Min-Max Normalization

关于归一化，按单独的列进行归一化用的比较多

对全部的列进行归一化，使用sklearn 的MinMaxScaler，使用时MinMaxScaler()函数在进行计算时取的是每列的最大最小值

$$x' = \frac{x - X_{\min}}{X_{\max} - X_{\min}}$$

因此我们可以先将 array 的 data reshape 为向量，将所有的数据看作一列进行计算，此时取到的最大最小值是全部数据的最大最小值，计算完成后再 reshape 为原 array 的大小。

In [None]:
# Scale with one global min/max instead of per-column values:
# flatten every value into a single column, fit and apply the scaler on it,
# then restore the original 2-D layout.
min_max_scaler = preprocessing.MinMaxScaler()
data_reshape = dataset.to_numpy().reshape(-1, 1)

data_reshape_norm = min_max_scaler.fit(data_reshape).transform(data_reshape)
data_norm = data_reshape_norm.reshape(dataset.shape)

In [None]:
# Rebind `dataset` to the min-max-scaled array (`data_norm`), so the cells
# below operate on normalised values instead of the pivoted DataFrame.
dataset = data_norm

In [None]:
# Show the shape of `dataset`.
dataset.shape

In [None]:
 # True when `dataset` contains at least one NaN value.
 bool(np.isnan(dataset).any())

In [None]:
# Window sizes: 90 past time steps as model input, 16 future steps as output.
timesteps_in, timesteps_out = 90, 16
# One feature per column of `dataset`.
n_features = dataset.shape[1]

In [None]:
# Turn the series into supervised samples: each row holds `timesteps_in`
# lagged steps followed by `timesteps_out` forecast steps, all features per step.
data = series_to_supervised(dataset, n_in=timesteps_in, n_out=timesteps_out)

# The trailing timesteps_out * n_features columns are the targets;
# every column before them belongs to the lagged input.
n_target_cols = timesteps_out * n_features
data_X = data.iloc[:, :-n_target_cols]
data_y = data.iloc[:, -n_target_cols:]
data_X.shape, data_y.shape

In [None]:
# Display the frame returned by series_to_supervised.
data

In [None]:
# Reshape the inputs from [samples, timesteps * features]
# into [samples, timesteps, features], the layout the LSTM input expects.
# np.asarray accepts both the DataFrame (first run) and the already reshaped
# ndarray (re-run), so executing this cell twice no longer fails on `.values`.
data_X = np.asarray(data_X).reshape((-1, timesteps_in, n_features))
data_X.shape, data_y.shape

In [None]:
from sklearn.model_selection import train_test_split
#x_train, x_val, y_train, y_val = train_test_split(data_X,data_y, random_state=11, test_size=0.1)

In [None]:
#x_train.shape,x_val.shape,y_train.shape,y_val.shape

In [None]:
from numpy import array
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.preprocessing.sequence import TimeseriesGenerator


In [None]:
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
# Multiply the learning rate by 0.4 when the training loss has not improved
# for 8 epochs, never going below 1e-5.
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.4, patience=8, verbose=1, min_lr=0.00001)

# Save the model to ./best.h5 whenever the training loss reaches a new minimum.
# The deprecated `period=1` argument is omitted: the default
# save_freq='epoch' already checks after every epoch, and newer Keras
# releases reject `period`.
checkpoint = ModelCheckpoint(filepath='./best.h5', monitor='loss', verbose=1, save_best_only=True, mode='min')

- 学习率比较大的时候，可能出现 loss: nan 的情况，所以得适当降低学习率。
- loss 太大，也可能出现 loss: nan 的情况；可以通过对输入数据做正规化把 loss 降低。
- 使用 LSTM(100, ...) 规模大小的网络时，loss 在 60000 左右降不下来。试试加大网络？

In [None]:
# Model: two stacked 300-unit LSTM layers followed by a dense layer that
# emits every feature for every forecast step in a single flat vector.
model = Sequential([
    LSTM(300, activation='relu', return_sequences=True, input_shape=(timesteps_in, n_features)),
    LSTM(300, activation='relu'),
    Dense(timesteps_out * n_features),
])

# Mean-squared-error loss, Adam optimiser with an explicit learning rate.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.02),
    loss='mse',
)

# Train on all samples (no validation split); the callbacks adjust the
# learning rate and checkpoint the best weights by training loss.
model.fit(data_X, data_y, epochs=200, verbose=1, callbacks=[reduce_lr, checkpoint])


In [None]:
from keras.models import load_model
# Replace the in-memory model with the checkpoint stored at ./best.h5.
model = load_model(filepath='./best.h5')

In [None]:
# Build the forecasting input from the LAST `timesteps_in` rows of the scaled
# series, so the prediction covers the `timesteps_out` steps that follow the
# end of the data.
# The previous input, data_X[-1:], was the final *training* window. Its
# targets are the last `timesteps_out` rows of the data, so the model was
# re-predicting a period it had already been trained on instead of
# forecasting forward.
x_input = np.asarray(dataset)[-timesteps_in:].reshape((1, timesteps_in, n_features))
yhat = model.predict(x_input, verbose=0)


In [None]:
# Print the first 20 (still scaled) predicted values without scientific notation.
np.set_printoptions(suppress=True)
print(yhat[0, :20])

In [None]:
np.set_printoptions(suppress=True)

# Map the predictions back to the original sales scale with the fitted scaler.
# inverse_transform applies both the scale and the offset (`min_`); dividing
# by `scale_[0]` alone is only correct when the data minimum is exactly 0.
# The scaler was fitted on a single column, hence the (-1, 1) reshape;
# ravel() returns the flat vector.
predicts = min_max_scaler.inverse_transform(yhat.reshape(-1, 1)).ravel()

In [None]:
# Sales cannot be negative: raise every value below zero up to zero.
predicts = np.clip(predicts, 0, None)

In [None]:
# Start from the sample submission, overwrite its 'sales' column with the
# predictions, and write the result to result.csv.
submission = pd.read_csv(comp_dir / 'sample_submission.csv').assign(sales=predicts)
submission.to_csv('result.csv', index=False)

In [None]:
# Show the first ten final predictions.
predicts[:10]