# LSTMで予測

1. データをある店舗で売っているある商品(商品id×店舗id)に分類する
2. 2013/1～2015/9のデータを使って2015/10の売り上げを予測する
3. 説明変数は月売り上げ、
4. train/val/testの比率は

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
sales = pd.read_csv('../data/sales_train_v2.csv')

In [3]:
sales.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [4]:
# データのバイト数を落としてメモリを節約する

def downcast_dtypes(df):
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype in ["int64", "int32"]]
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols] = df[int_cols].astype(np.int16)
    return df

sales = downcast_dtypes(sales)
print(sales.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
date              object
date_block_num    int16
shop_id           int16
item_id           int16
item_price        float32
item_cnt_day      float32
dtypes: float32(2), int16(3), object(1)
memory usage: 61.6+ MB
None


In [5]:
# データをピボットテーブルで集計　※https://note.nkmk.me/python-pandas-pivot-table/
# 列：[item_id, shop_id]、行：date_block_num(月)、値：item_cnt_day(売れた商品の数)

sales_by_item_id_shop_id = sales.pivot_table(index=['item_id', 'shop_id'], values=['item_cnt_day'],
                                        columns='date_block_num', aggfunc=np.sum, fill_value=0).reset_index()
sales_by_item_id_shop_id.columns = sales_by_item_id_shop_id.columns.droplevel().map(str)
sales_by_item_id_shop_id = sales_by_item_id_shop_id.reset_index(drop=True).rename_axis(None, axis=1)
sales_by_item_id_shop_id.columns.values[0] = 'item_id'
sales_by_item_id_shop_id.columns.values[1] = 'shop_id'

display(sales_by_item_id_shop_id.head())

Unnamed: 0,item_id,shop_id,0,1,2,3,4,5,6,7,...,24,25,26,27,28,29,30,31,32,33
0,0,54,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,55,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,54,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,54,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,54,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(sales_by_item_id_shop_id, test_size=0.2, random_state=10)

In [7]:
# データを説明変数と目的変数に分割し、標準化する
def encord_data(data):    
    x_ori = data.iloc[:,2:-1].values
    x = (x_ori-x_ori.mean()/x_ori.std(ddof=1))  # 不偏標準偏差で標準化している
    x = x.reshape((-1,33,1))
    y_ori = data.iloc[:,-1].values
    y_mean = np.mean(y_ori)
    y_std = np.std(y_ori, ddof=1)
    y = (y_ori-y_ori.mean()/y_ori.std(ddof=1))  # 不偏標準偏差で標準化している
    y = y.reshape((-1,1))
    y_label = np.concatenate([data.iloc[:,0:2].values, y_ori.reshape([-1,1])], axis=1)
    return x, y, y_label, y_mean, y_std

In [8]:
train_x ,train_y, _, _, _ = encord_data(train)
test_x ,test_y, test_y_label, test_y_mean, test_y_std = encord_data(test)

(339299, 2)
(339299,)
(84825, 2)
(84825,)


In [9]:
print(train_y.shape)
print(train_x.shape)

(339299, 1)
(339299, 33, 1)


In [10]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, TimeDistributed
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [11]:
hidden_depth = 1
hidden_layers = 16
batch_size = 16
hidden_activation = "relu"
train_x.shape

(339299, 33, 1)

In [12]:
model = Sequential()
model.add(LSTM(hidden_layers, input_shape=(train_x.shape[1], train_x.shape[2]), activation=hidden_activation, recurrent_dropout=0.5, return_sequences=False))
while hidden_depth>1:
    model.add(lstm(hidden_layers, activation=hidden_activation, recurrent_dropout=0.5, return_sequences=False))
    hidden_depth -= 1
model.add(Dense(1))
# model.add(TimeDistributed(Dense(1)))

model.compile(loss="mse", optimizer="adam")

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 16)                1152      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 1,169
Trainable params: 1,169
Non-trainable params: 0
_________________________________________________________________


In [13]:
callbacks = []
callbacks.append(ModelCheckpoint('.model/simple_LSTM.hdf5', save_best_only=True))

In [14]:
history = model.fit(x=train_x, y=train_y, batch_size=batch_size, validation_split=0.2, callbacks=callbacks)

Train on 271439 samples, validate on 67860 samples
Epoch 1/1


In [15]:
test_y_pre = model.predict(test_x)
test_y_pre_fix = (test_y_pre + test_y_mean) * test_y_std

In [16]:
result_check = pd.DataFrame(np.concatenate([test_y_label, test_y_pre_fix.reshape(-1,1)], axis=1))
result_check.columns = ["item_id","shop_id","正解値","予測値"]

In [17]:
result_check.head()

Unnamed: 0,item_id,shop_id,正解値,予測値
0,13646.0,54.0,0.0,0.518371
1,16594.0,1.0,0.0,0.480425
2,12363.0,38.0,0.0,0.625778
3,18481.0,45.0,0.0,0.480634
4,5509.0,51.0,0.0,0.544735


In [18]:
result_check.to_csv("./result/simple_LSTM.csv", encoding='cp932')

In [19]:
from sklearn.metrics import mean_squared_error
from numpy import sqrt
rmse = sqrt(mean_squared_error(test_y,test_y_pre))
print('Val RMSE: %.3f' % rmse)

Val RMSE: 2.060
