# Import packages and modules

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import packages and modules
import numpy as np
import pandas as pd
import tensorflow as tf
from datetime import timedelta
from keras import backend as K
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.losses import mean_squared_error
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from tensorflow import keras

# Seed NumPy's and TensorFlow's global random generators with the same value.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Load data into datasets

In [None]:
# Load data into dataframes
# All competition CSVs are read from one directory; keeping it in a single
# constant lets the notebook be pointed at another copy of the data by
# editing one line instead of five.
INPUT_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'

df_items = pd.read_csv(f'{INPUT_DIR}/items.csv')
df_item_categories = pd.read_csv(f'{INPUT_DIR}/item_categories.csv')
# The 'date' column is parsed as a datetime, reading the day before the month.
df_sales_train = pd.read_csv(f'{INPUT_DIR}/sales_train.csv', parse_dates=['date'], dayfirst=True)
df_shops = pd.read_csv(f'{INPUT_DIR}/shops.csv')
df_test = pd.read_csv(f'{INPUT_DIR}/test.csv')

In [None]:
# Preview the first five rows of df_items.
df_items.head(5)

In [None]:
# Column dtypes of df_items as inferred by read_csv.
df_items.dtypes

In [None]:
# Preview the first five rows of df_item_categories.
df_item_categories.head(5)

In [None]:
# Column dtypes of df_item_categories as inferred by read_csv.
df_item_categories.dtypes

In [None]:
# Preview the first five rows of df_shops.
df_shops.head(5)

In [None]:
# Column dtypes of df_shops as inferred by read_csv.
df_shops.dtypes

In [None]:
# Preview the first five rows of df_sales_train.
df_sales_train.head(5)

In [None]:
# Column dtypes of df_sales_train; 'date' was requested as a parsed date at load time.
df_sales_train.dtypes

In [None]:
# Sum of the item_cnt_day column over every row of df_sales_train
# (computed before any rows are filtered out further down).
df_sales_train['item_cnt_day'].sum()

In [None]:
# Preview the first five rows of df_test.
df_test.head(5)

In [None]:
# Column dtypes of df_test as inferred by read_csv.
df_test.dtypes

In [None]:
# Print the (rows, columns) shape of each loaded dataframe, one per line,
# in the order: items, item categories, sales, shops, test.
for frame in (df_items, df_item_categories, df_sales_train, df_shops, df_test):
    print(frame.shape)

# Data cleansing

The 1st and 2nd most purchased items seem to be outliers, so let's exclude them by keeping only the rows whose `item_cnt_day` is below 1000.

In [None]:
# Keep only the daily records whose item_cnt_day is strictly below the cutoff.
ITEM_CNT_DAY_CUTOFF = 1000
below_cutoff = df_sales_train['item_cnt_day'] < ITEM_CNT_DAY_CUTOFF
df_sales_train = df_sales_train.loc[below_cutoff]

Some shops seem to be the same, so let's combine them.

In [None]:
# Remap shop ids 0 -> 57, 1 -> 58 and 11 -> 10; every other id is left as is.
# None of the target ids is itself a source id, so one mapping is enough.
duplicate_shop_ids = {0: 57, 1: 58, 11: 10}
df_sales_train['shop_id'] = df_sales_train['shop_id'].replace(duplicate_shop_ids)

# Transform data into time series

In [None]:
# Build one row per (shop_id, item_id) and one column per date_block_num,
# holding the summed item_cnt_day for that block (0 where there were no sales).
#
# - `values` is passed as a scalar, not a list: a list produces two-level
#   columns, and merging such a frame with the flat-column df_test is
#   deprecated / rejected by recent pandas releases.
# - `aggfunc='sum'` replaces `np.sum`, which pandas now warns about when it is
#   passed as a callable.
# - Month columns are named item_cnt_mth_<date_block_num>; their left-to-right
#   order is still ascending date_block_num, which the positional slicing
#   later in the notebook relies on.
df_train = (
    df_sales_train
    .pivot_table(index=['shop_id', 'item_id'],
                 values='item_cnt_day',
                 columns='date_block_num',
                 fill_value=0,
                 aggfunc='sum')
    .add_prefix('item_cnt_mth_')
    .rename_axis(columns=None)  # drop the leftover 'date_block_num' axis label
    .reset_index()
)

In [None]:
# Display the pivoted per-(shop_id, item_id) table as the cell output.
df_train

In [None]:
# (rows, columns) of the pivoted table.
df_train.shape

In [None]:
# Display df_test as the cell output.
df_test

# Join test set with the time series, and then training will be based on it

In [None]:
# Attach the monthly series to every test row. how='left' keeps each df_test
# row (and its ID) even when the (shop_id, item_id) pair has no sales history.
dataset = df_test.merge(df_train, on=['shop_id', 'item_id'], how='left')

# Predictions are later paired with df_test['ID'] by position, so the join must
# not add or drop rows.
assert len(dataset) == len(df_test), "merge changed the number of test rows"

# Pairs absent from df_train get NaN in every month column; treat them as
# zero sales. (A left merge cannot introduce missing "ID" values, so no
# separate handling of "ID" is needed.)
dataset = dataset.fillna(0)

In [None]:
# Display the merged test/time-series table as the cell output.
dataset

# Define train, valid and test sets

Note that the train set uses date_block_num 0 to 31 as features (with 32 as the target), the valid set uses date_block_num 1 to 32 (with 33 as the target), and the test set uses date_block_num 2 to 33, so every set has the same sequence length.

In [None]:
# Month columns only: the identifier columns are dropped once and the
# remaining values are sliced by position for each split.
monthly_counts = dataset.drop(columns=['shop_id', 'item_id', 'ID']).values

# Train: all but the last two columns -> second-to-last column (clipped to [0, 20]).
X_train, y_train = monthly_counts[:, :-2], monthly_counts[:, -2:-1].clip(0, 20)

# Valid: window shifted by one column -> last column (clipped to [0, 20]).
X_valid, y_valid = monthly_counts[:, 1:-1], monthly_counts[:, -1:].clip(0, 20)

# Test: window shifted by two columns; it has no target.
X_test = monthly_counts[:, 2:]

print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape)

Scale the data sets as neural networks will work better on scaled data.

In [None]:
# Fit the min-max scaler on the training window only, then apply the same
# fitted scaling to all three splits.
mm_scaler = preprocessing.MinMaxScaler().fit(X_train)

# Append a trailing axis of size 1 so each array becomes 3-D, matching the
# LSTM's input_shape=[None, 1].
X_train, X_valid, X_test = (
    np.expand_dims(mm_scaler.transform(split), axis=-1)
    for split in (X_train, X_valid, X_test)
)

print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape)

Train a model using LSTM.

In [None]:
def root_mean_squared_error(y_true, y_pred):
    """Return the square root of the mean squared difference of the inputs.

    Uses Keras backend ops; the mean is taken with no axis argument, i.e.
    over all elements of ``y_pred - y_true``.
    """
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

# Network: two stacked LSTM layers (30 units each, with input and recurrent
# dropout of 0.3) feeding a single Dense output unit. input_shape=[None, 1]
# leaves the sequence length open and expects one feature per time step.
lstm_options = dict(dropout=0.3, recurrent_dropout=0.3)
model = keras.models.Sequential()
model.add(keras.layers.LSTM(30, return_sequences=True, input_shape=[None, 1], **lstm_options))
model.add(keras.layers.LSTM(30, **lstm_options))
model.add(keras.layers.Dense(1))

# Optimise mean squared error; RMSE is tracked as a metric under the name
# 'rmse', which is what the callbacks below monitor as 'val_rmse'.
adam = tf.keras.optimizers.Adam(learning_rate=0.001)
rmse_metric = tf.keras.metrics.RootMeanSquaredError(name='rmse')
model.compile(optimizer=adam, loss='mse', metrics=[rmse_metric])

# Stop after 5 epochs without a lower val_rmse and restore the best weights.
early_stopping = EarlyStopping(
    monitor='val_rmse',
    mode='min',
    patience=5,
    restore_best_weights=True,
)

# Multiply the learning rate by 0.8 after 2 epochs without improvement in
# val_rmse, wait 3 epochs before resuming monitoring, never go below 1e-5.
reduceLROnPlat = ReduceLROnPlateau(
    monitor='val_rmse',
    mode='auto',
    factor=0.8,
    patience=2,
    cooldown=3,
    min_lr=0.00001,
)

model.summary()

history = model.fit(
    X_train,
    y_train,
    epochs=10,
    validation_data=(X_valid, y_valid),
    callbacks=[early_stopping, reduceLROnPlat],
)

# Make predictions

In [None]:
# Predict on the test window and clip the outputs to [0, 20], the same range
# the training and validation targets were clipped to.
prediction = model.predict(X_test).clip(0, 20)

In [None]:
# Display the clipped prediction array as the cell output.
prediction

In [None]:
# Shape of the prediction array.
prediction.shape

In [None]:
# Pair each test ID with the first (only used) output column of the
# predictions. Values are left unrounded; the rounding step is disabled.
submission = pd.DataFrame({
    'ID': df_test['ID'],
    'item_cnt_month': prediction[:, 0],
})

# Generate submission file

In [None]:
# Write the submission to the current working directory without the row index.
submission.to_csv(path_or_buf='submission.csv', index=False)