# Load files

## Train data

In [None]:
import pandas as pd

# Read the training sales CSV into a DataFrame.
fname_sales_train = '../input/competitive-data-science-predict-future-sales/sales_train.csv'
df_sales_train = pd.read_csv(filepath_or_buffer=fname_sales_train)
# Bare expression: the notebook renders a preview of the frame.
df_sales_train

In [None]:
# Number of missing values in each column of the training sales table.
df_sales_train.isna().sum()

## Shop data

In [None]:
# Read the shops CSV into a DataFrame.
fname_shops = '../input/competitive-data-science-predict-future-sales/shops.csv'
df_shops = pd.read_csv(filepath_or_buffer=fname_shops)
# Bare expression: the notebook renders a preview of the frame.
df_shops

In [None]:
# Number of missing values in each column of the shops table.
df_shops.isna().sum()

## Item data

In [None]:
# Read the items CSV into a DataFrame.
fname_items = '../input/competitive-data-science-predict-future-sales/items.csv'
df_items = pd.read_csv(filepath_or_buffer=fname_items)
# Bare expression: the notebook renders a preview of the frame.
df_items

In [None]:
# Number of missing values in each column of the items table.
df_items.isna().sum()

## Category data

In [None]:
# Read the item-categories CSV into a DataFrame.
fname_item_categories = '../input/competitive-data-science-predict-future-sales/item_categories.csv'
df_item_categories = pd.read_csv(filepath_or_buffer=fname_item_categories)
# Bare expression: the notebook renders a preview of the frame.
df_item_categories

In [None]:
# Number of missing values in each column of the item-categories table.
df_item_categories.isna().sum()

## Test data

In [None]:
# Read the test CSV (the rows to predict for) into a DataFrame.
fname_test = '../input/competitive-data-science-predict-future-sales/test.csv'
df_test = pd.read_csv(filepath_or_buffer=fname_test)
# Bare expression: the notebook renders a preview of the frame.
df_test

# Preprocessing

## Remove outliers

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the white-grid seaborn theme to every plot drawn from here on.
sns.set_theme(style="whitegrid")

# Horizontal box plot of item prices on a wide, short canvas, used to spot
# extreme values by eye.
plt.figure(figsize=(12, 3))
sns.boxplot(data=df_sales_train['item_price'], orient='h')

In [None]:
# Keep only rows whose price is below 100000; rows at or above that value are
# discarded as outliers.
df_sales_train = df_sales_train.loc[df_sales_train['item_price'] < 100000]

In [None]:
# Horizontal box plot of the daily item counts, used to spot extreme values
# by eye.
plt.figure(figsize=(12, 3))
sns.boxplot(data=df_sales_train['item_cnt_day'], orient='h')

In [None]:
# Keep only rows whose daily count is below 900; rows at or above that value
# are discarded as outliers.
df_sales_train = df_sales_train.loc[df_sales_train['item_cnt_day'] < 900]

## Count monthly sales

In [None]:
# One row per (shop_id, item_id) pair and one column per date_block_num value.
# Each cell is the sum of item_cnt_day for that pair in that block, with 0
# where the pair has no rows.
df = pd.pivot_table(
    df_sales_train,
    values='item_cnt_day',
    index=['shop_id', 'item_id'],
    columns='date_block_num',
    aggfunc='sum',
    fill_value=0,
)
df

In [None]:
# Move the (shop_id, item_id) index back into ordinary columns and clear the
# name left on the column axis by the pivot.
df = df.reset_index().rename_axis(None, axis=1)
df

## Merge past sales data into the test set

In [None]:
# Left join keeps every test row, including (shop_id, item_id) pairs that
# never appear in the pivoted history.
df_test_src = df_test.merge(df, on=['shop_id', 'item_id'], how='left')
# Drop the ID column and fill the gaps left by unmatched pairs with 0.
df_test_src = df_test_src.drop(columns=['ID']).fillna(0)
df_test_src

## Get explanatory and objective variables

In [None]:
# Training features: every column after the first two, except the last one,
# which is held out as the target.
X_train = df[df.columns[2:-1]]
X_train

In [None]:
# Training target: the last column of the pivoted table.
y_train = df[df.columns[-1]]
y_train

In [None]:
# Test features: skip the first three columns so the window is shifted one
# column later than the training window.
X_test = df_test_src[df_test_src.columns[3:]]
X_test

## Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column min/max from the training features only, then apply
# that same scaling to both the training and the test features.
X_scaler = MinMaxScaler().fit(X_train)
X_train = X_scaler.transform(X_train)
X_test = X_scaler.transform(X_test)

# LSTM Modeling

## Neural Network

In [None]:
import tensorflow as tf

# Take the sequence length from the feature matrix instead of hard-coding it,
# so the network still matches the data if the number of history columns
# changes.
n_timesteps = X_train.shape[1]

# Named `inputs` rather than `input` so the builtin input() is not shadowed
# for the rest of the notebook session.
inputs = tf.keras.layers.Input(shape=(n_timesteps, 1))

# Three stacked LSTM layers of 32 units. The first two return the full
# sequence so the next LSTM receives one vector per timestep; the last returns
# only its final state.
x = tf.keras.layers.LSTM(32, return_sequences=True, dropout=0.1)(inputs)
x = tf.keras.layers.LSTM(32, return_sequences=True, dropout=0.1)(x)
x = tf.keras.layers.LSTM(32)(x)

# Single linear unit: one regression value per input sequence.
output = tf.keras.layers.Dense(1)(x)

model = tf.keras.models.Model(inputs, output)
model.summary()

In [None]:
# Train with mean squared error using the Adam optimizer at its default settings.
model.compile(loss='mse', optimizer='adam')

# Learning

In [None]:
import numpy as np

epochs = 100
batch_size = 64

callbacks = [
    # Stop once validation loss has failed to improve for 5 consecutive
    # epochs, and restore the weights from the best epoch seen.
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
]

# Keras takes the `validation_split` fraction from the END of the arrays,
# before any shuffling. The rows here follow the pivot table's
# (shop_id, item_id) ordering, so an unshuffled split would validate only on
# the last pairs. Permute the rows first (fixed seed for repeatability) so
# the hold-out is a random sample of pairs.
rng = np.random.default_rng(0)
order = rng.permutation(len(X_train))

history = model.fit(np.expand_dims(np.asarray(X_train)[order], axis=-1),
                    np.asarray(y_train)[order],
                    validation_split=0.2,
                    epochs=epochs,
                    batch_size=batch_size,
                    callbacks=callbacks)

# Prediction

In [None]:
# Add a trailing axis of size 1 so each row is passed to the model as a
# sequence of single-feature timesteps.
y_pred = model.predict(np.expand_dims(X_test, axis=-1))
y_pred

In [None]:
# Drop the size-1 axes from the prediction array, then bound every value to
# the range [0, 20].
y_pred = np.clip(np.squeeze(y_pred), 0, 20)
y_pred

In [None]:
# Build the submission frame by row position instead of index alignment:
# the i-th prediction belongs to the i-th test row. np.ravel always yields a
# 1-D array, even for a single prediction, and the DataFrame constructor
# raises if the two columns differ in length rather than padding with NaN.
answer = pd.DataFrame({
    'ID': df_test['ID'].to_numpy(),
    'item_cnt_month': np.ravel(y_pred),
})
answer

In [None]:
# Write the submission CSV without the DataFrame index column.
fname_output = './submission.csv'
answer.to_csv(path_or_buf=fname_output, index=False)