参考notebook:<br>
・https://www.kaggle.com/takahiro1127/starter-data-exploration-lstm<br>
・https://www.kaggle.com/robikscube/m5-forecasting-starter-data-exploration

In [None]:
#ライブラリインポート
import os
import re
import time
import warnings
from tqdm import tqdm
import numpy as np
from numpy import array
from numpy import newaxis
import pandas as pd 
from sklearn import preprocessing, metrics
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.layers import RepeatVector,TimeDistributed
from keras.models import Sequential

# Environment setup
# Raise TensorFlow's native (C++) log threshold. The variable must be spelled
# TF_CPP_MIN_LOG_LEVEL; a misspelled name is silently ignored.
# NOTE(review): TensorFlow may read this only once, at import/first log time;
# if the log noise persists, set it before the keras imports -- confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
warnings.filterwarnings('ignore')

# Load the competition data
cal = pd.read_csv("/kaggle/input/m5-forecasting-accuracy/calendar.csv")
ss = pd.read_csv("/kaggle/input/m5-forecasting-accuracy/sample_submission.csv")
sellp = pd.read_csv("/kaggle/input/m5-forecasting-accuracy/sell_prices.csv")
stv = pd.read_csv("/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv")

In [None]:
# Model factory
def build_model(n_features, n_out_seq_length, num_y, n_in_seq_length=28):
    """Build and compile an LSTM encoder-decoder sequence model.

    Layer stack: LSTM(128) encoder -> RepeatVector -> LSTM(32) decoder that
    returns the full sequence -> TimeDistributed(Dense(num_y)).
    Compiled with the Adam optimizer and mean-squared-error loss.

    Args:
        n_features: number of input features per time step.
        n_out_seq_length: number of time steps the model emits.
        num_y: number of output values per emitted time step.
        n_in_seq_length: number of input time steps (defaults to 28, the
            value that used to be hard-coded).

    Returns:
        The compiled keras ``Sequential`` model.
    """
    # Start the clock before any work so the printed figure is the real
    # build + compile time rather than always ~0.
    start = time.time()

    model = Sequential()

    # Encoder: collapse the input window into one fixed-size vector.
    model.add(LSTM(128, activation='relu',
                   input_shape=(n_in_seq_length, n_features),
                   return_sequences=False))
    # Repeat the encoded vector once per output step.
    model.add(RepeatVector(n_out_seq_length))

    # Decoder: one hidden state per output step.
    model.add(LSTM(32, activation='relu', return_sequences=True))

    # Same dense head applied independently at every output step.
    model.add(TimeDistributed(Dense(num_y)))
    model.compile(optimizer='adam', loss='mse')

    print('実行時間：　', time.time()-start)
    return model

In [None]:
# Min-max normalisation (maps the minimum to 0 and the maximum to 1)
def Normalize(list):
    """Scale all values of ``list`` into [0, 1] using its global min and max.

    The parameter name shadows the builtin ``list``; it is kept unchanged so
    existing callers (including keyword callers) keep working.

    Args:
        list: array-like of numbers, any number of dimensions.

    Returns:
        A tuple ``(scaled, low, high)`` where ``scaled`` is a new floating
        point array (the input is not modified) and ``low`` / ``high`` are
        the global minimum / maximum used for scaling. When all values are
        equal (``high == low``) the values are returned unscaled.
    """
    values = np.array(list)  # np.array copies, so the caller's data is untouched
    # Integer (or bool) storage would truncate the scaled fractions to 0/1,
    # so promote anything that is not already floating point.
    if not np.issubdtype(values.dtype, np.floating):
        values = values.astype(np.float64)

    # 0th / 100th percentile over the flattened data == global min / max.
    low, high = np.percentile(values, [0, 100])
    delta = high - low
    if delta != 0:
        # Vectorised over the whole array; cast back so a float32 input
        # stays float32.
        values = ((values - low) / delta).astype(values.dtype, copy=False)
    return values, low, high

# Inverse of the min-max normalisation
def FNoramlize(list, low, high):
    """Map values scaled to [0, 1] back to the original ``[low, high]`` range.

    Computes ``value * (high - low) + low`` element-wise. The parameter name
    shadows the builtin ``list``; it is kept unchanged for compatibility.

    Args:
        list: array-like of scaled values, any number of dimensions.
        low: minimum that was used when scaling.
        high: maximum that was used when scaling.

    Returns:
        A new floating point array with the rescaled values; the input is
        not modified. When ``high == low`` the values are returned as-is.
    """
    values = np.array(list)  # copy, so the caller's array is not mutated
    # Integer storage would truncate the rescaled values; promote to float.
    if not np.issubdtype(values.dtype, np.floating):
        values = values.astype(np.float64)

    delta = high - low
    if delta != 0:
        # Cast back so a float32 input stays float32.
        values = (values * delta + low).astype(values.dtype, copy=False)
    return values

In [None]:
### Helper used to turn the calendar's text columns into numbers
def transform(data):
    """Fill missing event values and label-encode the categorical columns.

    The four event columns have their NaN entries replaced with the string
    ``'unknown'``; then the event and SNAP columns are each replaced by
    integer codes from a fresh ``LabelEncoder``.

    Args:
        data: DataFrame containing the event_name/event_type/snap columns.

    Returns:
        The same DataFrame object, with those columns overwritten.
    """
    # Replace NaN with "unknown". Assign the result back instead of calling
    # fillna(inplace=True) on the column selection: the chained in-place form
    # operates on a temporary under pandas Copy-on-Write and would leave
    # ``data`` unchanged.
    nan_features = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in nan_features:
        data[feature] = data[feature].fillna('unknown')

    # Encode the categorical columns as integers, one encoder per column.
    cat = ['event_name_1','event_type_1','event_name_2','event_type_2','snap_CA','snap_TX','snap_WI']
    for feature in cat:
        encoder = preprocessing.LabelEncoder()
        data[feature] = encoder.fit_transform(data[feature])

    return data

In [None]:
### Reshape sell_prices.csv ###
# Attach a calendar date to every weekly price row (prices are keyed by week).
price_fea = cal[['wm_yr_wk', 'date']].merge(sellp, on=['wm_yr_wk'], how='left')
# Build the same "<item>_<store>_validation" id used by the sales file.
price_fea['id'] = price_fea['item_id'] + '_' + price_fea['store_id'] + '_validation'
# Pivot to one row per id and one column per date, holding the sell price.
# pivot() takes keyword arguments; positional use is rejected by pandas 2.x.
df = price_fea.pivot(index='id', columns='date', values='sell_price')
# Align the price history to the row order of stv and keep only the last
# 145 columns (the most recent 145 dates of the pivot).
price_df = stv.merge(df, on=['id'], how='left').iloc[:, -145:]
price_df.index = stv.id

In [None]:
### Reshape sales_train_validation.csv ###
# Day numbers covered by the sales file (d_1 .. d_1913)
days_val = range(1, 1914)
# Keep only the daily sales columns, dropping the id/category columns
time_series_columns = ['d_' + str(day) for day in days_val]
time_series_data = stv.loc[:, time_series_columns]

In [None]:
### Reshape calendar.csv ###
# Constants
days_cal = range(1, 1970)
x_label = ['event_name_1','event_type_1','event_name_2','event_type_2','snap_CA','snap_TX','snap_WI','wday_bool']
# Binary flag derived from wday: 1 when wday < 3, else 0
# (presumably wday 1/2 are the weekend days in this calendar -- confirm).
cal.loc[cal['wday'] < 3, 'wday_bool'] = 1
cal.loc[cal['wday'] >= 3, 'wday_bool'] = 0
# Column names d_1 .. d_1969 for the calendar days.
time_series_columns = [f'd_{i}' for i in days_cal]
# NOTE: an earlier full-length transposed copy of the calendar used to be
# built here and was overwritten below without ever being read; it has been
# dropped.
# Drop old rows: keep only dates from 2016-01-27 onwards.
cal['date'] = pd.to_datetime(cal['date'])
# .copy() so the column writes inside transform() hit an independent frame
# rather than a slice of the original.
cal = cal[cal['date'] >= '2016-1-27'].copy()
# Convert the text columns to integer codes.
cal = transform(cal)
# One row per feature, one column per remaining calendar day.
transfer_cal = pd.DataFrame(cal[x_label].values.T, index=x_label)

In [None]:
### Build the master array used for both training and prediction ###
# Constants: history length per series and forecast horizon.
data_len, step_len = 100, 28

# Calendar rows that become features, in feature-axis order.
cal_rows = ['event_type_1', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'wday_bool']

# Sales: the last data_len days of every series -> (n_series, data_len).
sales_part = time_series_data.iloc[:, -data_len:].to_numpy(dtype=np.float32)
# Calendar and price: the data_len columns that end step_len columns before
# the end of their tables. The calendar slice is identical for every series,
# so it is taken once instead of once per series.
cal_part = (transfer_cal.loc[cal_rows]
            .iloc[:, -(data_len + step_len):-step_len]
            .to_numpy(dtype=np.float32))            # (len(cal_rows), data_len)
price_part = (price_df
              .iloc[:, -(data_len + step_len):-step_len]
              .to_numpy(dtype=np.float32))          # (n_series, data_len)

# Feature axis layout: [sales, <cal_rows...>, price].
X_data = np.empty((sales_part.shape[0], data_len, len(cal_rows) + 2), dtype=np.float32)
X_data[:, :, 0] = sales_part
X_data[:, :, 1:-1] = cal_part.T   # (data_len, n_cal) broadcast over all series
X_data[:, :, -1] = price_part

In [None]:
# Constants
n_steps = 28            # input window length
n_features = 8          # size of the feature axis of X_data
n_out_seq_length = 28   # forecast horizon
num_y = 1               # outputs per forecast step
n_items = X_data.shape[0]  # number of series (30490 on the full training data)

### Build the training data ###

# Take the last 2 * n_steps rows of every series and min-max normalise them
# together (one global low/high shared by all features).
train_n, train_low, train_high = Normalize(X_data[:, -(n_steps * 2):, :])

# Model input: the first n_steps rows of that window.
X_train = train_n[:, -2 * n_steps:-n_steps, :]

# Target: the sales feature (index 0) of the last n_out_seq_length rows.
y = train_n[:, -n_out_seq_length:, 0]
y = y.reshape((y.shape[0], y.shape[1], 1))

### Build and fit the model ###

model = build_model(n_features, n_out_seq_length, num_y)
model.fit(X_train, y, epochs=15, batch_size=1000)


### Build the prediction input ###

# Use the most recent n_steps rows. Reusing X_train here would only
# re-predict the period the model was just trained on.
x_input = train_n[:, -n_steps:, :]

### Predict ###

# Forecast the next n_out_seq_length steps and append them to the (still
# normalised) sales history of the input window.
y_predict = model.predict(x_input, verbose=0)
x_input = np.concatenate(
    (x_input[:, :, 0],
     y_predict.astype(np.float32).reshape(n_items, n_out_seq_length)),
    axis=1,
)

### Prepare the submission ###

# Undo the normalisation and round to whole units.
x_input = FNoramlize(x_input, train_low, train_high)
x_input = np.rint(x_input)

# Keep only the forecast columns, name them F1..F28, and floor negatives at 0.
forecast = pd.DataFrame(x_input).iloc[:, -n_out_seq_length:]
forecast.columns = [f'F{i}' for i in range(1, forecast.shape[1] + 1)]
forecast = forecast.clip(lower=0)

# The submission needs every id twice: "..._validation" and "..._evaluation".
# The same forecast is written for both.
validation_ids = stv['id'].values
evaluation_ids = [i.replace('validation', 'evaluation') for i in validation_ids]
ids = np.concatenate([validation_ids, evaluation_ids])

predictions = pd.DataFrame(ids, columns=['id'])
forecast = pd.concat([forecast]*2).reset_index(drop=True)
predictions = pd.concat([predictions, forecast], axis=1)
predictions.to_csv('submission.csv', index=False)  #Generate the csv file.