In [None]:
import numpy as np 
import pandas as pd 
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))
# Suppress every Python warning for the remainder of the session.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the training CSV; the first column becomes the index and is parsed as dates.
train = pd.read_csv(
    "/kaggle/input/demand-forecasting-kernels-only/train.csv",
    index_col=0,
    parse_dates=True,
)
train.head()

In [None]:
# Report the date range covered by the training data.
# (The labels previously said "test set" although the values come from `train`.)
print('Min date of train set: %s' % train.index.min().date())
print('Max date of train set: %s' % train.index.max().date())

In [None]:
# Load the test CSV; the second column becomes the index and is parsed as dates.
test = pd.read_csv(
    "/kaggle/input/demand-forecasting-kernels-only/test.csv",
    index_col=1,
    parse_dates=True,
)
test.head()

In [None]:
# Report the date range covered by the test data.
test_first_day = test.index.min().date()
print('Min date of test set: %s' % test_first_day)
test_last_day = test.index.max().date()
print('Max date of test set: %s' % test_last_day)

In [None]:
# Number of days between the last training date and the last test date.
last_train_day = train.index.max().date()
last_test_day = test.index.max().date()
lag_size = (last_test_day - last_train_day).days
print('Forecast lag size', lag_size)

In [None]:
# Split the target from the features and derive calendar features from the date index.
# The target is selected by name so the cell does not depend on column order.
sales = train['sales'].values
train_data = train.drop(columns=['sales'])
# Year is encoded as an offset from the first training year, so it starts at 0.
train_data['year'] = train_data.index.year-train_data.index.year.min()
train_data['month'] = train_data.index.month
train_data['day'] = train_data.index.day
train_data['day_of_week'] = train_data.index.dayofweek
# DatetimeIndex.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week yields the same ISO week number. It is converted to a plain
# int64 array so the column dtype matches the other integer features and no
# index alignment is attempted on the (non-unique) date index.
train_data['week_of_year']  = train_data.index.isocalendar().week.astype('int64').to_numpy()
train_data[:5]

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the engineered training features.
train_data.info()

In [None]:
# Determine the embedding size for each category
def embsize(cat_vars, data=None):
    """Compute the cardinality and embedding width of each categorical column.

    Parameters
    ----------
    cat_vars : iterable of str
        Names of the categorical columns to inspect.
    data : pandas.DataFrame, optional
        Frame holding the columns. When omitted, the notebook-level
        ``train_data`` frame is used (the original behaviour).

    Returns
    -------
    (dict, dict)
        ``cat_sizes`` maps each column to its number of distinct values;
        ``cat_embsizes`` maps each column to ``min(50, size // 2 + 1)``.
    """
    if data is None:
        # Fall back to the notebook global so existing calls keep working.
        data = train_data
    cat_sizes = {cat: data[cat].nunique() for cat in cat_vars}
    # Half the number of unique values plus one, capped at 50 dimensions.
    cat_embsizes = {cat: min(50, size // 2 + 1) for cat, size in cat_sizes.items()}
    return cat_sizes, cat_embsizes

In [None]:
# SMAPE evaluation metric
import keras.backend as K
import tensorflow as tf
def my_smape(f,a):
    """Symmetric mean absolute percentage error, usable as a Keras loss.

    Computes ``mean(2 * |f - a| / (|f| + |a|))`` over all elements. The
    expression is symmetric in its two arguments, so the order in which
    Keras passes targets and predictions does not matter.

    A small ``K.epsilon()`` is added to the denominator: when a target and
    its prediction are both exactly zero the original ratio is 0/0, which
    yields NaN and poisons the mean.
    """
    abs_error = K.abs(f - a)
    scale = K.abs(f) + K.abs(a) + K.epsilon()
    return K.mean(2 * abs_error / scale)

In [None]:
from keras.layers import Input, Embedding, Reshape, Concatenate, Dense
from keras.models import Model
from keras import optimizers
def build_nn(cat_vars, cont_vars, cat_sizes, cat_embsizes, learning_rate):
    """Build and compile an entity-embedding regression network.

    Parameters
    ----------
    cat_vars : list of str
        Categorical feature names; each gets its own 1-wide input and embedding.
    cont_vars : list of str
        Continuous feature names; they share one input of width ``len(cont_vars)``.
    cat_sizes : dict
        Per-category size; the embedding vocabulary is ``cat_sizes[cat] + 1``.
    cat_embsizes : dict
        Per-category embedding width.
    learning_rate : float
        Learning rate of the SGD optimizer.

    Returns
    -------
    A compiled Keras ``Model`` whose inputs are ordered as the continuous
    input followed by one input per entry of ``cat_vars``, with a single
    linear output unit and ``my_smape`` as the loss.
    """
    continuous_in = Input((len(cont_vars),), name='cont_vars')
    model_inputs = [continuous_in]
    features = [continuous_in]
    for name in cat_vars:
        emb_dim = cat_embsizes[name]
        categorical_in = Input((1,), name=name)
        model_inputs.append(categorical_in)
        embedded = Embedding(cat_sizes[name]+1, emb_dim, input_length=1)(categorical_in)
        # Drop the length-1 sequence axis so the embedding can be concatenated.
        features.append(Reshape((emb_dim,))(embedded))
    merged = Concatenate()(features)
    hidden = Dense(128, kernel_initializer = 'uniform', activation= 'relu')(merged)
    output = Dense(1)(hidden)
    network = Model(model_inputs, output)
    network.compile(optimizer=optimizers.SGD(learning_rate=learning_rate), loss=my_smape)
    return network

In [None]:
 def transform_data(x, cont, cat_vars):
    """Convert a feature frame into the list-of-arrays layout fed to the model.

    The first element is the float32 matrix of the ``cont`` columns; it is
    followed by one array per column named in ``cat_vars``, in that order.
    """
    continuous_block = x[cont].astype('float32').values
    categorical_blocks = [x[name].values for name in cat_vars]
    return [continuous_block] + categorical_blocks

<br>
<br>
<br>
<br>
<br>
<br>

In [None]:
# Categorical features fed to the embedding layers, in model-input order.
cat_best = [
    'item',
    'store',
    'year',
    'month',
    'day',
    'day_of_week',
    'week_of_year',
]
# No continuous features are used.
cont_best = list()

In [None]:
# Cardinality and embedding width for every categorical feature in cat_best.
cat_sizes, cat_embsizes = embsize(cat_best)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for validation, shuffled with a fixed seed.
# NOTE(review): rows are sampled at random, not split by date — confirm this is intended for a forecast.
seed = 7
x_train, x_val, y_train, y_val = train_test_split(
    train_data,
    sales,
    test_size=0.33,
    shuffle=True,
    random_state=seed,
)

In [None]:
# Reshape the training and validation frames into the list of numpy arrays
# (continuous block first, then one array per categorical feature) expected by the Keras model.
X_train, X_val = (
    transform_data(part, cont_best, cat_best) for part in (x_train, x_val)
)

In [None]:
# Tune the number of epochs: fit once with a validation set and inspect the
# per-epoch training and validation loss recorded in the history.
model = build_nn(cat_best, cont_best, cat_sizes, cat_embsizes, 0.01)
# Keras documents validation_data as a tuple (x_val, y_val); a list is not a
# documented form, so the pair is passed as a tuple.
results = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=64, verbose=0)
df2 = pd.DataFrame(results.history)
df2

In [None]:
# Number of epochs used by the learning-rate search and by the final fit.
epochs_fit = 9

In [None]:
# Tune the learning rate: train a fresh model per candidate and report its
# validation loss (the model is compiled with SMAPE as the loss).
learning_rates = [0.01, 0.02, 0.03]
for rate in learning_rates:
    model_r = build_nn(cat_best, cont_best, cat_sizes, cat_embsizes, learning_rate=rate)
    model_r.fit(X_train, y_train, batch_size=64, epochs=epochs_fit, verbose=0)
    val_smape = model_r.evaluate(X_val, y_val, verbose=0)
    print('smape: %f using %f' % (val_smape, rate))

In [None]:
# Learning rate used to build the final model.
rate_fit = 0.03

In [None]:
# Full training set (no validation hold-out) in the list-of-arrays layout used by the model.
data_train = transform_data(train_data, cont_best, cat_best)

In [None]:
# Build the final model with the selected learning rate and epoch count,
# fit it on the full training set, then print the layer summary.
model = build_nn(cat_best, cont_best, cat_sizes, cat_embsizes, learning_rate=rate_fit)
model.fit(data_train, sales, batch_size=64, epochs=epochs_fit, verbose=0)
model.summary()

In [None]:
# Derive the same calendar features for the test set as for the training set.
# The year offset uses the first *training* year so both sets share one encoding.
test['year'] = test.index.year-train_data.index.year.min()
test['month'] = test.index.month
test['day'] = test.index.day
test['day_of_week'] = test.index.dayofweek
# DatetimeIndex.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week yields the same ISO week number. It is converted to a plain
# int64 array so no index alignment is attempted on the (non-unique) date index.
test['week_of_year'] = test.index.isocalendar().week.astype('int64').to_numpy()
test[:5]

In [None]:
# Test features in the list-of-arrays layout used by the model.
X_test = transform_data(test, cont_best, cat_best)

In [None]:
# Predict sales for the test set and write them into the sample submission.
test_preds = model.predict(X_test)
submission = pd.read_csv("/kaggle/input/demand-forecasting-kernels-only/sample_submission.csv", index_col=0)
# The network ends in a single-unit Dense layer, so predictions come back as a
# column matrix; flatten it to 1-D before assigning it as one column instead of
# relying on pandas' implicit handling of a 2-D array.
# NOTE(review): this assumes the submission rows are in the same order as the test rows — confirm.
submission['sales'] = np.ravel(test_preds)
submission.to_csv('submission.csv')

In [None]:
# Preview the first five rows of the submission.
submission.head()