In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, name) for name in names]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
%matplotlib inline 

import tensorflow as tf
from sklearn import preprocessing

import random
# Seed both Python's and TensorFlow's random generators from one constant
# so the two values cannot drift apart.
SEED = 53
random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Directory holding the competition CSV files (trailing slash is required
# because file names are appended directly to it).
BASE = "../input/competitive-data-science-predict-future-sales/"

# Lookup tables.
item_cat = pd.read_csv(f"{BASE}item_categories.csv")
item = pd.read_csv(f"{BASE}items.csv")
shops = pd.read_csv(f"{BASE}shops.csv")

# Sales records used for training and the rows to predict.
sales_train = pd.read_csv(f"{BASE}sales_train.csv")
sales_test = pd.read_csv(f"{BASE}test.csv")

In [None]:
def basic_eda(df):
    """Print a quick structural and statistical overview of a DataFrame.

    The report contains, in order: the first five rows, the ``info()``
    summary, ``describe()`` statistics, the column index, the dtypes, the
    per-column count of missing values, and the shape.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarise. It is not modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("---------- TOP 5 RECORDS --------")
    print(df.head(5))
    print("---------- INFO -----------------")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line after the report.
    df.info()
    print("---------- Describe -------------")
    print(df.describe())
    print("---------- Columns --------------")
    print(df.columns)
    print("---------- Data Types -----------")
    print(df.dtypes)
    print("------- Missing Values ----------")
    # isna() is an alias of isnull(), so a separate "NULL values" section
    # would print exactly the same numbers again; report them once.
    print(df.isnull().sum())
    print("----- Shape Of Data -------------")
    print(df.shape)

**EDA**

In [None]:
# Run the basic_eda report on the training sales table loaded from sales_train.csv.
print("============================= Sales Data =============================")
basic_eda(sales_train)

In [None]:
# Run the basic_eda report on the test table loaded from test.csv.
print("============================= Test data =============================")
basic_eda(sales_test)

In [None]:
# Run the basic_eda report on the table loaded from item_categories.csv.
print("============================= Item Categories =============================")
basic_eda(item_cat)

In [None]:
# Run the basic_eda report on the table loaded from items.csv.
print("============================= Items =============================")
basic_eda(item)

In [None]:
# Run the basic_eda report on the table loaded from shops.csv.
print("============================= Shops =============================")
basic_eda(shops)

In [None]:
# Pairwise correlation between the numeric columns of the training sales.
# Non-numeric columns are excluded explicitly: DataFrame.corr() raises on
# them in pandas >= 2.0, whereas older versions dropped them silently.
corr = sales_train.select_dtypes(include="number").corr()

# Keep every feature with a defined, non-zero correlation to the daily count
# (NaN correlations compare False and are therefore left out).
top_corr_features = corr.index[corr["item_cnt_day"].abs() > 0]

fig, ax = plt.subplots(figsize=(6, 6))
# Reuse the matrix computed above instead of correlating the frame again.
g = sns.heatmap(corr.loc[top_corr_features, top_corr_features],
                annot=True, cmap="YlGnBu", ax=ax)
ax.set_title("Correlation between numeric sales columns");

**Data Preprocessing**

In [None]:
# Find outliers in the columns sales_train.'item_cnt_day',sales_train.'item_price'
# One box plot per column, side by side, to make extreme values visible.
cols = ['item_cnt_day', 'item_price']
n_panels = len(cols)
fig, ax = plt.subplots(ncols=n_panels, figsize=(10 * n_panels, 6), sharex=True)
fig.subplots_adjust(wspace=0.2)

for panel, column in zip(ax, cols):
    panel.boxplot(sales_train[column])
    panel.set_xlabel(column)
    panel.set_ylabel("Count")

**Observations:**

item_cnt_day has outlier > 2000
item_price has outlier > 300000

**Next:**

Remove observed outliers

In [None]:
# Remove Outliers

# Thresholds taken from the box-plot observations: daily counts above 2000
# and prices above 300000 are treated as outliers.
is_outlier = (sales_train['item_cnt_day'] > 2000) | (sales_train['item_price'] > 300000)

# Filter with a boolean mask rather than dropping `.index[0]`: this removes
# every offending row (not only the first match per column) and does not
# raise IndexError when the cell is run a second time, after the outliers
# are already gone.
sales_train = sales_train.loc[~is_outlier].reset_index(drop=True)
sales_train

In [None]:
# Find anomalies in the columns sales_train.'item_cnt_day',sales_train.'item_price'
# Plot each column against the row index to spot anomalous values.
cols = ['item_cnt_day', 'item_price']
n_panels = len(cols)
fig, ax = plt.subplots(ncols=n_panels, figsize=(10 * n_panels, 6), sharex=True)
fig.subplots_adjust(wspace=0.2)

for panel, column in zip(ax, cols):
    panel.plot(sales_train[column])
    panel.set_xlabel(column)
    panel.set_ylabel("Count")

**Observations:**

Column 'item_cnt_day' has some negative values, which are probably items that were returned. Since we will be working with monthly counts, we leave the negative values in so that we get a correct net count of items sold when we aggregate by month.

**Feature Engineering**

In [None]:
# One row per (shop_id, item_id), one column per date_block_num, each cell
# holding the summed item_cnt_day for that block; missing combinations are 0.
dataset = sales_train.pivot_table(
    values=['item_cnt_day'],
    index=['shop_id', 'item_id'],
    columns=['date_block_num'],
    aggfunc='sum',
    fill_value=0,
)

dataset

In [None]:
# Attach the monthly sales history in `dataset` to every (shop_id, item_id)
# pair of the test set; pairs with no history get all-zero counts.
test_Data = sales_test.copy()
test_Data = test_Data.pivot_table(index = ['shop_id','item_id'],fill_value = 0)

# `dataset` is pivoted with values=['item_cnt_day'] (a list), which gives it
# two column levels, while `test_Data` has one. pandas >= 2.0 refuses to merge
# frames with different numbers of column levels, so drop the constant outer
# level first. The guard keeps this cell valid if `dataset` is already flat.
monthly_counts = dataset
if isinstance(monthly_counts.columns, pd.MultiIndex):
    monthly_counts = monthly_counts.droplevel(0, axis=1)

Combine_train_test = pd.merge(test_Data, monthly_counts, how = 'left', on = ['shop_id','item_id']).fillna(0)
# Order rows by test ID so they can later be matched to the test rows by position.
Combine_train_test = Combine_train_test.sort_values(by = 'ID')
Combine_train_test.head(10)

In [None]:
# (rows, columns) of the training sales table.
sales_train.shape

In [None]:
# (rows, columns) of the merged test-pair / monthly-history table.
Combine_train_test.shape

In [None]:
# Drop the ID
Combine_train_test = Combine_train_test.drop(columns = ['ID'])

# Train Data
X_train = np.array(Combine_train_test.values[:,:-1]).reshape(np.array(Combine_train_test.values[:,:-1]).shape[0],np.array(Combine_train_test.values[:,:-1]).shape[1], 1)

# The target 
y_train = Combine_train_test.values[:,-1:]

X_test = np.array(Combine_train_test.values[:,1:]).reshape(np.array(Combine_train_test.values[:,1:]).shape[0],np.array(Combine_train_test.values[:,1:]).shape[1], 1)

**Data Preparation**

In [None]:
# dataset.reset_index(inplace = True)
# dataset = pd.merge(sales_test,dataset,on = ['item_id','shop_id'],how = 'left')
# dataset
# dataset.fillna(0,inplace = True)
# dataset.drop(['shop_id','item_id','ID'],inplace = True, axis = 1)
# dataset.shape

# For X we keep all columns except the last one
# X_train = np.expand_dims(dataset.values[:,:-1],axis = 2)
# the last column is our label
# y_train = dataset.values[:,-1:]

# for test we keep all the columns except the first one
# X_test = np.expand_dims(dataset.values[:,1:],axis = 2)

# lets have a look on the shape 
# print(X_train.shape,y_train.shape,X_test.shape)

**Model Training and Prediction**

In [None]:
# Save the model to "Model.h5" whenever the validation loss improves.
Model_Check_point = tf.keras.callbacks.ModelCheckpoint("Model.h5",
                                                monitor = 'val_loss',
                                                verbose = 1,
                                                save_best_only = True)

# Monitor 'val_loss', not 'val_accuracy': the model is a regressor compiled
# with MSE only, so no 'val_accuracy' value is ever logged and a callback
# watching it would never trigger.
early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                            min_delta=0,
                            patience=10,
                            verbose=0,
                            mode='auto')

# Patience is shorter than the early-stopping patience; with equal values
# training would stop in the same epoch the learning rate is first halved,
# so the reduction could never have any effect.
lr_reducer = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                            factor=0.5,
                            patience=5)

call_backs = [early_stopping_callback, lr_reducer, Model_Check_point]

In [None]:
def build_model(n_timesteps=33, n_features=1, learning_rate=0.002):
    """Build and compile the stacked bidirectional-LSTM regressor.

    Architecture: BiLSTM(128) -> BiLSTM(64) -> Dropout(0.2) -> Flatten ->
    Dense(32, relu) -> Dense(1). The model is compiled with Adam and
    mean-squared-error loss, and its summary is printed.

    Parameters
    ----------
    n_timesteps : int, default 33
        Length of each input sequence (second axis of the input array).
    n_features : int, default 1
        Number of features per timestep (third axis of the input array).
    learning_rate : float, default 0.002
        Learning rate passed to the Adam optimizer.

    Returns
    -------
    tf.keras.Model
        The compiled, untrained model.
    """
    model = tf.keras.models.Sequential()
    # Declare the input shape with an explicit Input object rather than an
    # `input_shape` kwarg on the first layer.
    model.add(tf.keras.Input(shape=(n_timesteps, n_features)))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
    model.add(tf.keras.layers.Dropout(0.2))
    # Both LSTMs return full sequences, so flatten before the dense head.
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(32, activation='relu', kernel_initializer='uniform'))
    model.add(tf.keras.layers.Dense(1))

    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate), loss = 'mse', metrics=['mse'])

    model.summary()

    return model

# Take the input shape from the training tensor instead of hard-coding it.
model = build_model(n_timesteps=X_train.shape[1], n_features=X_train.shape[2])

In [None]:
# Train for up to 100 epochs in batches of 512, holding out 20% of the rows for
# validation; `call_backs` supplies early stopping, LR reduction and checkpointing.
# NOTE(review): per the Keras docs, validation_split takes the LAST fraction of
# the arrays before shuffling. Rows here are sorted by test ID, so the hold-out
# is not a random sample -- consider shuffling the rows first; TODO confirm.
model.fit(X_train, y_train, validation_split = 0.2, epochs = 100, batch_size = 512, verbose = 1, callbacks = call_backs)

**Submit Predictions**

In [None]:
# Reload the best checkpoint written during training. compile=False skips
# restoring the optimizer and loss, which are not needed for inference.
model = tf.keras.models.load_model('./Model.h5', compile=False)

# predict() returns shape (n_rows, 1); flatten it so it maps cleanly onto a
# single DataFrame column.
predictions = model.predict(X_test).ravel()

# NOTE(review): X_test rows are ordered by ID while the IDs below come in
# test.csv file order -- this assumes test.csv is already sorted by ID; confirm.
submission = pd.DataFrame(sales_test['ID'])
# The competition clips true targets to [0, 20], so clip predictions to match.
submission['item_cnt_month'] = np.clip(predictions, 0, 20)
submission.to_csv('submission.csv',index = False)
submission