In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

import collections
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


%matplotlib inline
sns.set_style("whitegrid")

import os
# List every file available under the Kaggle input mount.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# Any results you write to the current directory are saved as output.

# Note

* calendar.csv

 * event_type_1 and event_type_2 contain the same set of categorical values.
 
 * snap_XX: Supplemental Nutrition Assistance Program (SNAP), apparently the US food-stamp program

* sales_train_validation.csv

 * id: concatenation of item_id and store_id
 
 * d_? columns: each corresponds to a value of calendar.d; the cell holds the item's sales on day d_?.
 
 
 
 ## Mapping
 
 * price per date: 
 
  * calendar.wm_yr_wk = sell_prices.wm_yr_wk
  
  * sales_train_validation.store_id = sell_prices.store_id
  
  * sales_train_validation.item_id = sell_prices.item_id
  
 * sale per date:
 
  * calendar.d = sales_train_validation.d_? (the column name)

## TODO

* Add moving-average features over 1-week, 1-month, 6-month, and 12-month windows

# Load Data

In [None]:
# Directory that holds the M5 competition CSV files (Kaggle input mount).
# Change this single constant to run the notebook against another location.
DATA_DIR = '/kaggle/input/m5-forecasting-accuracy'

# All three tables are loaded with the default integer index; the merges
# further down join on regular columns ('d', 'wm_yr_wk', 'store_id', 'item_id').
calendar_data = pd.read_csv(f'{DATA_DIR}/calendar.csv')
sales_train_validation_data = pd.read_csv(f'{DATA_DIR}/sales_train_validation.csv')
sell_prices_data = pd.read_csv(f'{DATA_DIR}/sell_prices.csv')

In [None]:
# Show the first rows of each raw table, in load order.
display(
    calendar_data.head(),
    sales_train_validation_data.head(),
    sell_prices_data.head(),
)

# Functions

In [None]:
def get_salse_data_without_date(sales_data):
    """Return the first five columns of ``sales_data``, selected by position.

    Args:
        sales_data: DataFrame whose leading columns hold the non-date fields.

    Returns:
        DataFrame made of the columns at positions 0-4, with all rows kept.
    """
    n_leading_columns = 5
    return sales_data.iloc[:, :n_leading_columns]

In [None]:
def get_sale_data_date(sales_data):
    """Return every column of ``sales_data`` from position 6 onwards.

    The first six columns (positions 0-5) are skipped; selection is purely
    positional, so the caller must pass a frame with that column layout.

    Args:
        sales_data: DataFrame whose columns from position 6 on are the
            per-day columns.

    Returns:
        DataFrame with the columns at positions 6.., with all rows kept.
    """
    first_day_position = 6
    return sales_data.iloc[:, first_day_position:]

In [None]:
def get_format_changed_sale_data(sale_data, index):
    """Reshape a single-row sales frame from wide to long format.

    The per-day columns returned by ``get_sale_data_date`` are transposed so
    that each day becomes one row.

    Args:
        sale_data: DataFrame holding exactly one row (assigning the single
            column name ``'sale_count'`` fails otherwise).
        index: Index label of that row, used to read its identifier fields.

    Returns:
        New DataFrame indexed by the day-column names with the columns
        ``sale_count``, ``d``, ``id``, ``item_id``, ``dept_id``, ``cat_id``
        and ``store_id``; the identifier values are repeated on every row.
    """
    identifier_columns = ('id', 'item_id', 'dept_id', 'cat_id', 'store_id')

    # One row per day column, holding that day's value.
    long_sales = get_sale_data_date(sale_data).T.copy()
    long_sales.columns = ['sale_count']

    # Expose the day label as a regular column so it can be used as a merge key.
    long_sales['d'] = long_sales.index

    for column in identifier_columns:
        long_sales[column] = sale_data.at[index, column]

    return long_sales

# Fill Missing Data


Check missing data

## Missing Values

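* event_name_1: contains missing values
* event_type_1: contains missing values
* event_name_2: contains missing values
* event_type_2: contains missing values
* sell_price: contains missing values; filled with the mean sell_price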

### sell_price

In [None]:
def get_sell_price_filled_dataset(dataset):
    """Fill missing ``sell_price`` values with the column mean, in place.

    Args:
        dataset: DataFrame with a numeric ``sell_price`` column. It is
            modified in place.

    Returns:
        The same ``dataset`` object, with every null ``sell_price`` replaced
        by the mean of the non-null prices.
    """
    mean_price = dataset.mean(numeric_only=True)['sell_price']
    price_is_missing = dataset['sell_price'].isnull()
    dataset.loc[price_is_missing, 'sell_price'] = mean_price
    return dataset

---

# Training & Predict Data

## Feature

In [None]:
# Model input columns. Only numeric columns are listed; sale_count is the
# prediction target and is therefore not part of this list.
# Earlier candidate lists, kept for reference (they include non-numeric columns):
# FEATURES = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id','date','sale_count']
# FEATURES = ['date','id','wm_yr_wk','weekday','wday','month','year','d','snap_CA','snap_TX','snap_WI','item_id','dept_id','cat_id','store_id','sell_price']
FEATURES = ['wm_yr_wk','wday','month','year','snap_CA','snap_TX','snap_WI','sell_price']


# Train All Data

In [None]:
# Number of rows (item/store series) of sales_train_validation_data to use.
# Kept small for fast iteration; raise it to train on more series.
N_SERIES = 10

# Build one long-format frame per series and concatenate them once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all accumulated rows on every iteration.
series_frames = []
for index in sales_train_validation_data.index[:N_SERIES]:
    # Select the row by label so it matches the label-based lookup done
    # inside get_format_changed_sale_data.
    dataset = sales_train_validation_data.loc[[index]]

    # Wide (one column per day) -> long (one row per day).
    dataset = get_format_changed_sale_data(dataset, index)

    # Attach the calendar attributes of each day, then the weekly price.
    dataset = pd.merge(calendar_data, dataset, on=['d'])
    dataset = pd.merge(dataset, sell_prices_data, how='left',
                       on=['wm_yr_wk', 'store_id', 'item_id'])

    # Weeks without a price row are filled with this series' mean price.
    dataset = get_sell_price_filled_dataset(dataset)

    series_frames.append(dataset)

dataset_all = pd.concat(series_frames, ignore_index=True)


In [None]:
# Random 80/20 split; the fixed seed keeps the split reproducible.
train_dataset = dataset_all.sample(frac=0.8, random_state=0)
test_dataset = dataset_all.drop(index=train_dataset.index)

# Features and target (sale_count) for each side of the split.
train_X, train_y = train_dataset[FEATURES], train_dataset['sale_count']
val_X, val_y = test_dataset[FEATURES], test_dataset['sale_count']

In [None]:
# Per-feature summary statistics of the training set, one row per feature.
train_stats = train_X.describe().T
train_stats

In [None]:
def norm(x):
    """Standardize ``x`` column-wise with the training-set mean and std.

    Reads the module-level ``train_stats`` frame (one row per feature, with
    ``mean`` and ``std`` columns).

    Args:
        x: DataFrame whose columns match the index of ``train_stats``.

    Returns:
        DataFrame of the same shape with ``(x - mean) / std`` per column.
        A feature that is constant in the training set (std == 0) is only
        centred instead of being divided by zero.
    """
    # Dividing by a zero std would turn the whole column into NaN/inf.
    safe_std = train_stats['std'].replace(0, 1)
    return (x - train_stats['mean']) / safe_std

normed_train_data = norm(train_X)
normed_test_data = norm(val_X)

In [None]:
# Inspect the standardized training features.
normed_train_data

## Train Models

In [None]:
# Fully connected regression network: two ReLU hidden layers of 64 units
# followed by a single linear output unit.
model = keras.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=[len(FEATURES)]))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1))

optimizer = tf.keras.optimizers.RMSprop(0.001)

# Optimize mean squared error; also report MAE and MSE as metrics.
model.compile(
    optimizer=optimizer,
    loss='mse',
    metrics=['mae', 'mse'],
)

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# example_batch = normed_train_data[:10]
# example_result = model.predict(example_batch)
# model.predict(val_X)

In [None]:
EPOCHS = 1000
# EPOCHS = 100
 
history = model.fit(
  train_X, train_y,
  epochs=EPOCHS, validation_split = 0.2, verbose=0)

In [None]:
# score = model.evaluate(x_test, y_test, batch_size=16)
model.evaluate(val_X, val_y, batch_size=16)

In [None]:
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist.tail()

In [None]:
predict = model.predict(val_X)
predict

In [None]:
# Positional x coordinates 0..len(val_y)-1 for plotting actual vs. predicted values.
x_axis = list(range(len(val_y)))

In [None]:
# Actual vs. predicted sale counts for the held-out rows. Labels, a legend
# and a title are needed to tell the two point clouds apart.
fig, ax = plt.subplots()
ax.scatter(x_axis, val_y, label='actual')
ax.scatter(x_axis, predict, label='predicted')
ax.set(
    title='Held-out rows: actual vs. predicted sale_count',
    xlabel='row position in val_y',
    ylabel='sale_count',
)
ax.legend()
plt.show()