In [None]:
import os 
# List the files in the input data directory so the available CSVs are visible.
print(os.listdir("../input/competitive-data-science-predict-future-sales/"))

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from matplotlib import pylab as plt
import matplotlib.dates as mdates

import seaborn as sns

In [None]:
# All CSVs are read from the same input directory. Column dtypes are pinned at
# read time so the numeric columns are stored as 32-bit values.
DATA_DIR = '../input/competitive-data-science-predict-future-sales/'

test = pd.read_csv(
    DATA_DIR + 'test.csv',
    dtype={'ID': 'int32', 'shop_id': 'int32', 'item_id': 'int32'},
)
item_categories = pd.read_csv(
    DATA_DIR + 'item_categories.csv',
    dtype={'item_category_name': 'str', 'item_category_id': 'int32'},
)
items = pd.read_csv(
    DATA_DIR + 'items.csv',
    dtype={'item_name': 'str', 'item_id': 'int32', 'item_category_id': 'int32'},
)
shops = pd.read_csv(
    DATA_DIR + 'shops.csv',
    dtype={'shop_name': 'str', 'shop_id': 'int32'},
)
sales = pd.read_csv(
    DATA_DIR + 'sales_train.csv',
    dtype={
        'date': 'str',
        'date_block_num': 'int32',
        'shop_id': 'int32',
        'item_id': 'int32',
        'item_price': 'float32',
        'item_cnt_day': 'int32',
    },
)

### MLP & TSM
這次的目的是基於上次的 EDA 以及 xgboost 的結果，改用不同的方式進行預測。
首先沿用上次的離群值過濾條件（單日銷售數量 item_cnt_day 須小於 750），並額外加上單價 item_price 須小於 300000 的限制（排除少數固定的高單價品項造成的干擾）。


In [None]:
# Remove outliers: keep only rows with a unit price below 300000 and a daily
# count below 750.
sales = sales.loc[lambda df: (df.item_cnt_day < 750) & (df.item_price < 300000)]
# The date string and the price are not used by the count-only models below.
train = sales.drop(columns=['date', 'item_price'])
train.head()

In [None]:
# Attach each item's category, then sum the daily counts into one monthly total
# per (item, shop, month, category) combination.
train_clean = (
    train.merge(items.drop(columns=['item_name']), on=['item_id'])
    .groupby(["item_id", "shop_id", "date_block_num", 'item_category_id'])
    .sum()
    .reset_index()
    # index=str turns the row labels into strings; columns= renames the summed count.
    .rename(index=str, columns={"item_cnt_day": "item_cnt_month"})
    .loc[:, ["item_id", "shop_id", "date_block_num", 'item_category_id', "item_cnt_month"]]
)
train_clean.sample(10)

### 建立每個（商店、商品）組合的每月銷售量樞紐表

In [None]:
# Build one row per (shop_id, item_id) and one column per date_block_num holding
# the summed monthly count; combinations with no sales in a month are filled with 0.
#
# `values` is passed as a scalar rather than a one-element list so the result has
# flat column labels (shop_id, item_id, 0, 1, ...). A list would add an outer
# 'item_cnt_month' column level, and merging such a two-level frame with a
# single-level frame is not supported by recent pandas releases.
train_data = train_clean.pivot_table(
    index=['shop_id', 'item_id'],
    columns=['date_block_num'],
    values='item_cnt_month',
    aggfunc='sum',
    fill_value=0,
).reset_index()
train_data.head(10)

In [None]:
# Left-join the monthly history onto the test pairs. Pairs with no match in
# train_data come back as NaN and are replaced with 0.
all_data = test.merge(train_data, how='left', on=['item_id', 'shop_id']).fillna(0)
all_data.head()

In [None]:
def split_sequences(sequences, n_steps):
    """Cut an array into overlapping windows of consecutive rows.

    For every run of ``n_steps`` consecutive rows (axis 0), the input sample is
    those rows with the last column (axis 1) removed, and the target is the
    last-column value of the final row in the window.

    Parameters
    ----------
    sequences : np.ndarray
        Array with at least two dimensions; windows slide along axis 0.
    n_steps : int
        Number of consecutive rows per window. Must be at least 1.

    Returns
    -------
    tuple of np.ndarray
        ``(X, y)`` with one entry per window. Both are empty when ``sequences``
        has fewer than ``n_steps`` rows.

    Raises
    ------
    ValueError
        If ``n_steps`` is smaller than 1.
    """
    if n_steps < 1:
        # A zero/negative window would index row -1 and silently wrap around.
        raise ValueError("n_steps must be a positive integer")

    X, y = [], []
    # The last valid window starts at len(sequences) - n_steps; the range is
    # empty when the input is shorter than one window.
    for start in range(len(sequences) - n_steps + 1):
        end = start + n_steps
        X.append(sequences[start:end, :-1])
        y.append(sequences[end - 1, -1])
    # np.array is used directly so the function does not depend on a bare
    # `array` name being imported elsewhere before it is called.
    return np.array(X), np.array(y)

### 和上一份預測走相反方向，僅使用歷史消費數量進行預測

In [None]:
# Two all-zero rows (34 monthly values each) that are placed ahead of the real rows.
dummy_samples = [[0] * 34 for _ in range(2)]
# Keep only the 34 monthly count columns and relabel them 0..33.
all_data = all_data.drop(columns=['ID', 'shop_id', 'item_id'])
all_data.columns = list(range(34))
# create dummy_samples dataframe and concatenate with all_data
all_data = pd.concat([pd.DataFrame(dummy_samples), all_data], ignore_index=True)

In [None]:
# Three 32-month views of the 34 monthly columns, each shifted one month later
# than the previous one, with a trailing length-1 axis appended.
train_data = all_data.to_numpy()[:, :-2, np.newaxis]
validation_data = all_data.to_numpy()[:, 1:-1, np.newaxis]
test_data = all_data.to_numpy()[:, 2:, np.newaxis]

In [None]:
from numpy import array
n_steps = 5  # number of consecutive rows per window
# Apply the same windowing to the train, validation and test views.
(X_train, y), (X_val, y_val), (X_test, y_test) = (
    split_sequences(part, n_steps)
    for part in (train_data, validation_data, test_data)
)
print(X_train.shape, y.shape)

In [None]:
# Number of features per sample once axes 1 and 2 are flattened together.
n_input = X_train.shape[1] * X_train.shape[2]
# reshape to 2 dimensional input: (samples, n_input)
X = X_train.reshape(len(X_train), n_input)
X_val = X_val.reshape(len(X_val), n_input)
X_test = X_test.reshape(len(X_test), n_input)

### 建立基本的3層MLP

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import  tensorflow.keras.optimizers as optimizers
# define model: a small fully connected regressor over the flattened window
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=n_input))
model.add(Dense(64, activation='relu'))
# One output unit: split_sequences yields a single target value per sample, so a
# wider output layer would only be broadcast against that one value by the loss.
model.add(Dense(1))
# `learning_rate` is the supported keyword; the older `lr` alias is deprecated
# and rejected by recent Keras releases.
model.compile(
    optimizer=optimizers.Adam(learning_rate=.0001),
    loss='mse',
    metrics=['mean_squared_error'],
)
model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
# Stop training once the monitored metric has not improved for 5 epochs.
early_stopping = EarlyStopping(patience=5, verbose=1)
# Multiply the learning rate by 0.25 after 2 stagnant epochs, never going below 1e-6.
reduce_lr = ReduceLROnPlateau(factor=0.25, patience=2, min_lr=0.000001, verbose=1)
# Write only the weights of the best epoch so far to model.h5.
checkpoint = ModelCheckpoint('model.h5', verbose=1, save_best_only=True, save_weights_only=True)

callbacks = [early_stopping, reduce_lr, checkpoint]

In [None]:
# Train for up to 15 epochs, evaluating on the validation windows after each
# epoch; the callbacks may stop training early or lower the learning rate.
model.fit(
    x=X,
    y=y,
    validation_data=(X_val, y_val),
    epochs=15,
    callbacks=callbacks,
)

In [None]:
# ALPHA weights the newest observation in the smoothed level; BETA weights the
# newest level change in the momentum term (0.0 keeps the momentum at zero).
ALPHA, BETA = 0.5, 0.0
# Number of monthly time steps fed to the smoothing filter.
N_SESSIONS = 33

def create_filtered_prediction(train_ts, alpha, beta):
    """One-step-ahead forecasts from exponential smoothing with a momentum term.

    Each row of ``train_ts`` is treated as an independent series and each column
    as a time step. A smoothed level and a momentum (level change) are updated
    step by step, and the forecast for step ``i + 1`` is ``level[i] + momentum[i]``.

    Parameters
    ----------
    train_ts : array-like, shape (n_series, n_steps)
        Observed values; converted to a float array.
    alpha : float
        Weight of the newest observation when updating the level.
    beta : float
        Weight of the newest level change when updating the momentum.

    Returns
    -------
    np.ndarray, shape (n_series, n_steps + 1)
        Column ``j`` (for ``j >= 1``) is the forecast for step ``j`` made from
        steps ``0..j-1``. Column 0 has no history and stays at 0. The last
        column is the forecast for the step following the input.
    """
    train_ts = np.asarray(train_ts, dtype=float)
    n_series, n_sessions = train_ts.shape

    # Built-in float is used for dtypes; the np.float alias no longer exists in
    # current NumPy releases.
    prediction_ts = np.zeros((n_series, n_sessions + 1), dtype=float)
    if n_sessions == 0:
        return prediction_ts

    train_time_filtered_ts = np.zeros((n_series, n_sessions), dtype=float)
    train_momentum_ts = np.zeros((n_series, n_sessions), dtype=float)

    # Seed the level of EVERY series with its first observation (column 0),
    # not the whole history of the first series (row 0).
    train_time_filtered_ts[:, 0] = train_ts[:, 0]
    # Forecast for step 1 uses the same formula as the loop, evaluated at step 0.
    prediction_ts[:, 1] = train_time_filtered_ts[:, 0] + train_momentum_ts[:, 0]

    for i in range(1, n_sessions):
        # Level: blend the previous forecast with the new observation.
        train_time_filtered_ts[:, i] = (
            (1 - alpha) * (train_time_filtered_ts[:, i - 1] + train_momentum_ts[:, i - 1])
            + alpha * train_ts[:, i]
        )
        # Momentum: blend the previous momentum with the newest level change.
        train_momentum_ts[:, i] = (
            (1 - beta) * train_momentum_ts[:, i - 1]
            + beta * (train_time_filtered_ts[:, i] - train_time_filtered_ts[:, i - 1])
        )
        prediction_ts[:, i + 1] = train_time_filtered_ts[:, i] + train_momentum_ts[:, i]
    return prediction_ts

# Hold out the final month as the target; every earlier month is the history.
all_data_x = all_data.iloc[:, :-1]
all_data_y = all_data.iloc[:, -1:]

# The history is clipped to [0, 20] before smoothing.
# NOTE(review): the held-out target below is NOT clipped — confirm this is intended.
train_time_series_df = np.clip(all_data_x.values, 0, 20).astype(float)
predictions = create_filtered_prediction(train_time_series_df, ALPHA, BETA)

# validate prediction by predicting 34th month
# Only the last forecast column corresponds to the held-out month. Subtracting
# the (n, 1) target from the full forecast matrix would broadcast it against
# every column and mix forecasts for earlier months into the score.
actual_last_month = all_data_y.to_numpy().ravel()
rmse = np.sqrt(np.mean((predictions[:, -1] - actual_last_month) ** 2))
print("rmse = ", rmse)

### 結論

這裡僅利用連續月份的銷售數量測試了兩種方式：
1. 基本的 NN（MLP）測試
2. 利用 time series momentum（帶動量項的指數平滑）進行預測

以結果來說，RMSE 似乎不太令人滿意，未來希望將每一個 item 的類別等詳細資訊帶入以優化表現。
以資料分布來說，針對特定類別的銷售數量進行標準化，或許有機會得到更好的結果。

