<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load" data-toc-modified-id="Load-1"><center style="background-color: #99bbff ; width: 60%">Load</center></a></span><ul class="toc-item"><li><span><a href="#Sales-train" data-toc-modified-id="Sales-train-1.1">Sales train</a></span></li><li><span><a href="#Item-categories" data-toc-modified-id="Item-categories-1.2">Item categories</a></span></li><li><span><a href="#Items" data-toc-modified-id="Items-1.3">Items</a></span></li><li><span><a href="#Shops" data-toc-modified-id="Shops-1.4">Shops</a></span></li><li><span><a href="#Test" data-toc-modified-id="Test-1.5">Test</a></span></li></ul></li><li><span><a href="#Preprocessing" data-toc-modified-id="Preprocessing-2"><center style="background-color: #99bbff ; width: 60%">Preprocessing</center></a></span></li><li><span><a href="#Train-models" data-toc-modified-id="Train-models-3"><center style="background-color: #99bbff ; width: 60%">Train models</center></a></span><ul class="toc-item"><li><span><a href="#XGBoost" data-toc-modified-id="XGBoost-3.1">XGBoost</a></span></li><li><span><a href="#LightGBM" data-toc-modified-id="LightGBM-3.2">LightGBM</a></span></li>
    <li><span><a href="#LSTM" data-toc-modified-id="LSTM-3.3">LSTM</a></span></li>
    </ul></li><li><span><a href="#Submit" data-toc-modified-id="Submit-4"><center style="background-color: #99bbff ; width: 60%">Submit</center></a></span></li></ul></div>

# <center style="background-color:#99bbff; width:60%;">Load</center>

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

from pandas_profiling import ProfileReport

# Notebook-wide configuration.
INPUT_DIR = '../input/competitive-data-science-predict-future-sales'  # folder holding the input CSVs
TARGET = 'item_cnt_month'  # name of the prediction column written to the submission
RANDOM_STATE = 42  # seed passed to the gradient-boosting models

# Plot styling applied to every seaborn/matplotlib figure.
sns.set(font_scale=1.5, style='whitegrid')

In [None]:
# Read every input table from INPUT_DIR; the names on the left are matched
# positionally with the file stems on the right.
(
    sales_train_df,
    item_categories_df,
    items_df,
    shops_df,
    test_df,
    submission_df,
) = (
    pd.read_csv(f'{INPUT_DIR}/{stem}.csv')
    for stem in (
        'sales_train',
        'item_categories',
        'items',
        'shops',
        'test',
        'sample_submission',
    )
)

## Sales train

In [None]:
# Print the column dtypes and non-null counts, then render the frame itself
# as the cell output.
sales_train_df.info()
sales_train_df

In [None]:
# Build a profiling report for the sales table (minimal=True requests the
# library's reduced report), save it as a standalone HTML file and embed it
# in the notebook.
profile = ProfileReport(sales_train_df, progress_bar=False, minimal=True)
profile.to_file('sales_train_df.html')
profile.to_notebook_iframe()

## Item categories

In [None]:
# Print the column dtypes and non-null counts, then render the frame itself
# as the cell output.
item_categories_df.info()
item_categories_df

In [None]:
# Build a profiling report for the item-categories table, save it as a
# standalone HTML file and embed it in the notebook.
profile = ProfileReport(item_categories_df, progress_bar=False)
profile.to_file('item_categories_df.html')
profile.to_notebook_iframe()

## Items

In [None]:
# Print the column dtypes and non-null counts, then render the frame itself
# as the cell output.
items_df.info()
items_df

In [None]:
# Build a profiling report for the items table, save it as a standalone HTML
# file and embed it in the notebook.
profile = ProfileReport(items_df, progress_bar=False)
profile.to_file('items_df.html')
profile.to_notebook_iframe()

## Shops

In [None]:
# Print the column dtypes and non-null counts, then show the first 15 shops
# as the cell output.
shops_df.info()
shops_df.head(15)

In [None]:
# Build a profiling report for the shops table, save it as a standalone HTML
# file and embed it in the notebook.
profile = ProfileReport(shops_df, progress_bar=False)
# Use a shops-specific file name: writing to 'items_df.html' here would
# overwrite the items report saved by the Items section.
profile.to_file('shops_df.html')
profile.to_notebook_iframe()

## Test

In [None]:
# Print the column dtypes and non-null counts, then render the frame itself
# as the cell output.
test_df.info()
test_df

# <center style="background-color:#99bbff; width:60%;">Preprocessing</center>

In [None]:
# Build a wide sales matrix: one row per (shop_id, item_id) pair and one
# column per date_block_num, holding the summed item_cnt_day for that block
# (0 where the pair has no sales rows in the block).
#
# `values` is passed as a scalar rather than a one-element list so that the
# resulting columns stay single-level. With a list, pivot_table returns
# two-level columns ('item_cnt_day', <block>), and merging that frame with the
# single-level test frame is a merge "between different levels", which pandas
# warns about or rejects depending on the version. The numbers are identical.
#
# pivot_table returns a new frame and leaves sales_train_df untouched, so no
# defensive copy is needed.
pre_df = sales_train_df.pivot_table(
    index=['shop_id', 'item_id'],
    columns='date_block_num',
    values='item_cnt_day',
    aggfunc='sum',
    fill_value=0,
).reset_index()

pre_df

In [None]:
# Attach the per-block sales history to every (shop_id, item_id) pair listed
# in the test set. Pairs with no history get 0 in every block, and the
# identifier columns are removed so that only the sales columns remain.
full_train_df = (
    test_df
    .merge(pre_df, on=['shop_id', 'item_id'], how='left')
    .fillna(0)
    .drop(columns=['ID', 'shop_id', 'item_id'])
)

full_train_df

In [None]:
# Slide a fixed-width window over the date-block columns of full_train_df:
# each split uses all-but-two consecutive blocks as features and the block
# right after the window as the target.
block_matrix = full_train_df.values

# Train: features stop two blocks before the end; target is the second-to-last block.
X_train = block_matrix[:, :-2]
y_train = block_matrix[:, -2:-1].ravel()

# Validation: same window shifted forward by one block; target is the last block.
X_valid = block_matrix[:, 1:-1]
y_valid = block_matrix[:, -1:].ravel()

# Test: the most recent window, with no known target.
X_test = block_matrix[:, 2:]

X_train

# <center style="background-color:#99bbff; width:60%;">Train models</center>

## XGBoost

In [None]:
%%time

xgb_model = XGBRegressor(
    # learning_rate=0.05,
    max_depth=16,
    n_estimators=200,
    seed=RANDOM_STATE,
)

xgb_model.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="rmse",
              eval_set=[(X_train, y_train), (X_valid, y_valid)], verbose=10)

In [None]:
# Score the XGBoost model on the validation window.
y_pred = xgb_model.predict(X_valid)
# Take the square root of the MSE explicitly instead of passing squared=False:
# that keyword is deprecated in recent scikit-learn releases, whereas this
# form gives the same RMSE on every version.
print('XGBoost RMSE =', np.sqrt(mean_squared_error(y_valid, y_pred)))

## LightGBM

In [None]:
%%time

lgbm_model = LGBMRegressor(
#     learning_rate=0.05,
    max_depth=16,
    n_estimators=200,
    seed=RANDOM_STATE,
)

lgbm_model.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="rmse",
               eval_set=[(X_train, y_train), (X_valid, y_valid.ravel())], verbose=10)

In [None]:
# Score the LightGBM model on the validation window.
y_pred = lgbm_model.predict(X_valid)
# Take the square root of the MSE explicitly instead of passing squared=False:
# that keyword is deprecated in recent scikit-learn releases, whereas this
# form gives the same RMSE on every version.
print('LGBM RMSE =', np.sqrt(mean_squared_error(y_valid, y_pred)))

## LSTM

In [None]:
# Start from a clean Keras state so that re-running this cell rebuilds the
# model from scratch.
tf.keras.backend.clear_session()

# Each sample is a flat vector of 32 values; it is reshaped into a sequence
# of 32 steps with one feature per step before being fed to the LSTM, and a
# single dense unit produces the regression output.
lstm_model = tf.keras.Sequential()
lstm_model.add(tf.keras.layers.Reshape(target_shape=(32, 1,), input_shape=(32,)))
lstm_model.add(tf.keras.layers.LSTM(units=32, input_shape=(32, 1)))
lstm_model.add(tf.keras.layers.Dropout(0.4))
lstm_model.add(tf.keras.layers.Dense(1))

# MSE is both the training loss and the tracked metric; the metric name
# 'mse' is what the early-stopping callback monitors as 'val_mse'.
lstm_model.compile(
    optimizer=tf.keras.optimizers.Adam(0.1),
    loss='mse',
    metrics=['mse'],
)

lstm_model.summary()

In [None]:
%%time

early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_mse', patience=10)

lstm_model.fit(X_train, y_train, batch_size=4096, epochs=30,
          validation_data=(X_valid, y_valid),
          callbacks=[early_stop])

In [None]:
# Score the LSTM model on the validation window.
y_pred = lstm_model.predict(X_valid)
# Take the square root of the MSE explicitly instead of passing squared=False:
# that keyword is deprecated in recent scikit-learn releases, whereas this
# form gives the same RMSE on every version.
print('LSTM RMSE =', np.sqrt(mean_squared_error(y_valid, y_pred)))

# <center style="background-color:#99bbff; width:60%;">Submit</center>

In [None]:
# Start from the sample submission and add one column of test-window
# predictions per model. assign() works on a copy, so submission_df itself
# is left unchanged.
result_df = submission_df.assign(
    XGB=xgb_model.predict(X_test),
    LGBM=lgbm_model.predict(X_test),
    LSTM=lstm_model.predict(X_test),
)
result_df

In [None]:
# Blend the three models with fixed weights (0.05 / 0.05 / 0.9, summing to 1),
# so the LSTM prediction dominates the final value.
# NOTE(review): the blended predictions are written as-is, without clipping —
# confirm whether the competition metric expects values limited to a fixed range.
result_df[TARGET] = (
    0.05 * result_df['XGB']
    + 0.05 * result_df['LGBM']
    + 0.9 * result_df['LSTM']
)

# Keep only the columns the submission file needs and write it out.
submission_df = result_df.loc[:, ['ID', TARGET]]
submission_df.to_csv('output.csv', index=False)
submission_df

EOF