In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
from datetime import timedelta
from dateutil.relativedelta import relativedelta
from IPython.display import display
import os
from datetime import datetime
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, LSTM, Activation, Dropout
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import random
import time
from sklearn.preprocessing import StandardScaler
from keras.preprocessing.sequence import TimeseriesGenerator

In [None]:
# All competition CSVs live in the same Kaggle input folder; keep that path in one place
# so the notebook can be pointed at another location by editing a single line.
DATA_DIR = '../input/competitive-data-science-predict-future-sales'

cat = pd.read_csv(f'{DATA_DIR}/item_categories.csv')
items = pd.read_csv(f'{DATA_DIR}/items.csv')
train = pd.read_csv(f'{DATA_DIR}/sales_train.csv')
sample_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
shops = pd.read_csv(f'{DATA_DIR}/shops.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Preview the first rows of the test set.
test.head()

In [None]:
# Preview the first rows of the item table.
items.head()

In [None]:
# Preview the submission template.
sample_sub.head()

In [None]:
# Drop the template's `item_cnt_month` column from the submission frame.
# errors='ignore' keeps this cell safe to re-run once the column is already gone.
sample_sub = sample_sub.drop(columns='item_cnt_month', errors='ignore')

In [None]:
# Same preview of the item table as earlier in the notebook.
items.head()

In [None]:
# Preview the item-category table.
cat.head()

In [None]:
# Attach each sales row's item category through a left join on item_id.
data = pd.merge(
    train,
    items[['item_category_id', 'item_id']],
    on='item_id',
    how='left',
)

In [None]:
# Count sales rows per shop to find the top-selling shops.
plt.figure(figsize=(19,8))
# Pass the column by keyword: newer seaborn releases treat the first positional
# argument as `data`, not as the x variable.
sns.countplot(x='shop_id', data=data, palette='viridis')
plt.show()

In [None]:
# Look up the names of the top-selling shops.
# NOTE(review): the ids are hard-coded, presumably read off the count plot — verify.
shops[shops['shop_id'].isin([6, 25, 31, 42, 54, 57, 27, 28])]

In [None]:
# Number of sales rows per item, as a one-column frame indexed by item id.
# The explicit rename keeps the count column called 'item_id' (the name the following
# cells select) on every pandas version; pandas >= 2.0 would otherwise call it 'count'.
items_count = data['item_id'].value_counts().rename('item_id').rename_axis(None).to_frame()

In [None]:
# Keep only the items that appear in more than 5000 sales rows.
top_items = items_count[items_count['item_id']>5000]

In [None]:
# Bar chart of the row count of each top item (x: item id from the index, y: count column).
plt.figure(figsize=(19,8)) 
sns.barplot(x=top_items.index, y="item_id", data=top_items, palette = "Blues")

In [None]:
# Show the ids of the top items as a plain array.
np.array(top_items.index)

In [None]:
# Names of the top items. The ids are taken directly from `top_items` instead of a
# pasted copy of its index, so the lookup cannot drift from the computed ranking.
items[items['item_id'].isin(top_items.index)] # top items name

In [None]:
# Number of sales rows per item category, as a one-column frame indexed by category id.
# The explicit rename keeps the count column called 'item_category_id' (the name the
# following cells select) on every pandas version; pandas >= 2.0 would call it 'count'.
items_category_count = (
    data['item_category_id']
    .value_counts()
    .rename('item_category_id')
    .rename_axis(None)
    .to_frame()
)

In [None]:
# Keep only the categories that appear in more than 15000 sales rows.
top_items_category = items_category_count[items_category_count['item_category_id']>15000]

In [None]:
# Bar chart of the row count of each top category (x: category id from the index, y: count column).
plt.figure(figsize=(19,8)) 
sns.barplot(x=top_items_category.index, y="item_category_id", data=top_items_category, palette = "Blues")

In [None]:
# Show the ids of the top categories as a plain array.
np.array(top_items_category.index)

In [None]:
# Names of the top categories. The ids are taken directly from `top_items_category`
# instead of a pasted copy of its index, so the lookup follows the computed ranking.
cat[cat['item_category_id'].isin(top_items_category.index)] # top items category name

## Data cleaning, outliers, time series trend & cyclicality

In [None]:
# Review the outliers in item prices.
# Pass the Series as `x=`: newer seaborn releases treat the first positional argument as `data`.
ax = sns.boxplot(x=data['item_price'])

In [None]:
# Cap the item price at 100,000: a price around 300,000 is treated as an error rather than
# a plausible anomaly, so rows at or above the cap are dropped.
data = data[data['item_price']<100000]

In [None]:
# Review the outliers in the number of products sold per row.
# Negative values are visible here and carry no meaning for this analysis; the other
# outliers may be plausible anomalies and are kept because an LSTM is used later.
# Pass the Series as `x=`: newer seaborn releases treat the first positional argument as `data`.
ax = sns.boxplot(x=data['item_cnt_day'])

In [None]:
# Inspect the rows with more than 700 units sold, to check whether such
# high-sales days look like a yearly recurring event.
data[data['item_cnt_day']>700]

In [None]:
# Count (per column) the rows whose sold quantity is below 1, i.e. negative or zero.
# This cell only counts them; the removal happens in the next cell.
data[data['item_cnt_day']<1].count()

In [None]:
# Keep only the rows with a strictly positive sold quantity.
data = data[data['item_cnt_day']>0]

In [None]:
# Summary statistics, transposed so each column of `data` becomes a row.
data.describe().T

In [None]:
# The price column contained a -1 value; keep only strictly positive prices.

data = data[data['item_price']>0]

In [None]:
# Number of missing values per column.
data.isnull().sum()

In [None]:
# Correlation between the numeric features.
plt.figure(figsize=(10,10))

# `date` is still a string column at this point; restrict to numeric columns so that
# corr() does not fail on it under pandas >= 2.0.
sns.heatmap(data.select_dtypes(include='number').corr(),cmap='viridis',annot=True)

In [None]:
# Work on a copy so the cleaned `data` frame stays untouched by the steps below.
df = data.copy()

In [None]:
# One line per item category: sum of item_cnt_day within each date block.
df.pivot_table('item_cnt_day', index='date_block_num', columns='item_category_id', aggfunc='sum').plot(figsize=(19,8))
# Legend is placed outside the axes (to the right) in three columns.
plt.legend(title="item_category_id", fontsize=10, title_fontsize=15, loc=(1.01, 0.01), ncol=3)
plt.ylabel('Total category per date block');

In [None]:
# One line per item category: largest single item_cnt_day value within each date block.
df.pivot_table('item_cnt_day', index='date_block_num', columns='item_category_id', aggfunc='max').plot(figsize=(19,8))
# Legend is placed outside the axes (to the right) in three columns.
plt.legend(title="item_category_id", fontsize=10, title_fontsize=15, loc=(1.01, 0.01), ncol=3)
# The aggregation here is `max`, so the axis is labelled as a maximum rather than a total.
plt.ylabel('Max item_cnt_day per category per date block');

In [None]:
# Total units sold per date block, wrapped in a one-column frame for plotting.
df_cnt_grouped = df.groupby('date_block_num')['item_cnt_day'].sum()
df_cnt = pd.DataFrame(df_cnt_grouped)
plt.figure(figsize = (15,5))
df_cnt['item_cnt_day'].plot() # the sales trend goes down over time
# Blocks 11 and 23 (December of the two full years) were the high season.

In [None]:
# Per the Kaggle data description, the training set holds daily history from
# January 2013 to October 2015 — check it.
# `date` has not been parsed yet (that happens in the next cell), so this max/min
# is taken on the raw column values.
print(df['date'].max())
print(df['date'].min())

In [None]:
# Parse the "dd.mm.yyyy" strings into real datetimes.
# pd.to_datetime is vectorised and, unlike a per-row strptime loop, does not fail if the
# cell is re-run after the column has already been converted.
df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')

In [None]:
# Latest and earliest date again, now on the parsed datetime column.
print(df['date'].max())
print(df['date'].min())

## Downsize the dataset

In [None]:
def downsizing(old_data, item_category_col, item_col, date_block_col, date_col, shop_col, sales_col, new_sales_avg):
    """Aggregate sales rows to one row per (date block, shop, item category).

    Parameters
    ----------
    old_data : pandas.DataFrame
        Row-level sales data. This frame is what gets aggregated (the function no
        longer reaches for a notebook-level ``df``).
    item_category_col, item_col, date_block_col, date_col, shop_col, sales_col : str
        Names of the corresponding columns in ``old_data``.
    new_sales_avg : str
        Name of the column to create for the per-row average of ``sales_col``.

    Returns
    -------
    pandas.DataFrame
        One row per (``date_block_col``, ``shop_col``, ``item_category_col``) with:
        ``sales_col`` summed, ``item_col`` replaced by the number of rows in the
        group, ``date_col`` set to the first date seen in the group, and
        ``new_sales_avg`` = summed sales / row count.
    """
    grouped = old_data.groupby([date_block_col, shop_col, item_category_col])

    # 'first' keeps the first date of each group, like the former list(x)[0] helper.
    df_adj = grouped.agg({sales_col: 'sum', item_col: 'count', date_col: 'first'})

    # Average units per sales row of the group (not per distinct item).
    df_adj[new_sales_avg] = df_adj[sales_col] / df_adj[item_col]

    return df_adj.reset_index()

In [None]:
# Aggregate the cleaned sales to (date block, shop, item category) level.
df_adj = downsizing(df, 'item_category_id', 'item_id', 'date_block_num', 'date', 'shop_id', 'item_cnt_day', 'item_cnt_day_avg')

In [None]:
# Preview the aggregated frame.
df_adj.head()

In [None]:
def blocks_shrinker(data_f, date, size, random_state=None):
    """Balance the date blocks so that every block keeps exactly ``size`` rows.

    The smallest block is first trimmed to ``size`` rows by removing its leading
    rows; then ``size`` rows are drawn at random (without replacement) from every
    block.

    Parameters
    ----------
    data_f : pandas.DataFrame
        Input frame.
    date : str
        Name of the date-block column to balance on.
    size : int
        Number of rows to keep per date block. Must be at least 1 and no larger
        than the smallest block.
    random_state : int, numpy.random.RandomState or None, optional
        Seed (or generator) for the sampling. ``None`` (the default) uses NumPy's
        global random state, as before.

    Returns
    -------
    pandas.DataFrame
        Rows ordered by date block, with a fresh 0..n-1 index and the ``date``
        column first, followed by the remaining columns in their original order.

    Raises
    ------
    ValueError
        If ``data_f`` has no rows, ``size`` is below 1, or ``size`` exceeds the
        smallest block. (Previously an error string was returned for the last
        case, which only failed later when the caller used it as a frame.)
    """
    # how many rows fall inside each date block, smallest block first
    class_size = data_f[date].value_counts().sort_values()
    if class_size.empty:
        raise ValueError("data_f has no rows to balance on column {0!r}".format(date))

    smallest_count = class_size.iloc[0]
    smallest_block = class_size.index[0]

    if size < 1:
        raise ValueError("size must be a positive integer, got {0}".format(size))
    if size > smallest_count:
        raise ValueError(" The size you enter {0} while the smallest date block is {1}".format(size, smallest_count))

    # Trim the leading surplus rows of the smallest block. Done by position so that
    # duplicate index labels cannot remove more rows than intended.
    surplus = smallest_count - size
    keep = np.ones(len(data_f), dtype=bool)
    keep[np.flatnonzero((data_f[date] == smallest_block).values)[:surplus]] = False
    trimmed = data_f[keep]

    # One shared generator so successive blocks do not reuse the same draw.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Sample each block explicitly instead of relying on groupby.apply keeping the
    # grouping column, which differs between pandas versions.
    samples = [block.sample(n=size, random_state=rng) for _, block in trimmed.groupby(date)]
    balanced = pd.concat(samples, ignore_index=True)

    ordered_cols = [date] + [col for col in balanced.columns if col != date]
    return balanced[ordered_cols]

In [None]:
# Balance the aggregated data to 500 rows per date block.
try_df = blocks_shrinker(df_adj, 'date_block_num', 500)

In [None]:
# Preview the balanced frame.
try_df.head()

In [None]:
# Work on a copy so the balanced frame stays available unchanged.
new_data = try_df.copy()

In [None]:
# Summary statistics of the balanced frame, one row per column.
new_data.describe().T

### Prepare the train data for LSTM 

In [None]:
# Fit a MinMaxScaler on the shop and category ids and store the scaled values
# in two new columns; `scaler` stays fitted on exactly these two columns.
scaler = MinMaxScaler()
new_data[['scaled_shop_id', 'scaled_item_category_id']] = scaler.fit_transform(new_data[['shop_id', 'item_category_id']])

In [None]:
# Index by date and sort chronologically.
# NOTE(review): this cell consumes the `date` column, so re-running it on its own
# output will fail — re-run from the `new_data = try_df.copy()` cell instead.
new_data = new_data.set_index('date').sort_index()

In [None]:
# Model inputs (the two scaled id columns) plus the target column.
train_data = new_data[['scaled_shop_id', 'scaled_item_category_id', 'item_cnt_day_avg']]

## Prepare the test data for LSTM 

In [None]:
# Preview the test set before preparing it for the model.
test.head()

In [None]:
# Attach each test row's item category through a left join on item_id.
all_test = pd.merge(
    test,
    items[['item_category_id', 'item_id']],
    on='item_id',
    how='left',
)

In [None]:
# Work on a copy so `all_test` keeps the raw ids.
test_conv = all_test.copy()

In [None]:
def get_scaled_dic(full_train_df, item, scaled_item):
    """Build a lookup from raw ids to their scaled values, taken from the training frame.

    The result is meant to be fed to ``DataFrame.replace`` so the test set receives
    the same scaled value the training set used for each id. Each id therefore maps
    to a single scalar (a list value is not a usable replacement).

    Parameters
    ----------
    full_train_df : pandas.DataFrame
        Frame holding both the raw and the scaled column.
    item : str
        Name of the raw id column.
    scaled_item : str
        Name of the column holding the scaled version of ``item``.

    Returns
    -------
    dict
        ``{raw id: scaled value}`` for every id present in ``full_train_df``.

    Raises
    ------
    ValueError
        If some id is paired with more than one distinct scaled value.
    """
    # Rows with a missing id carry no usable key, so they are left out.
    pairs = full_train_df[[item, scaled_item]].dropna(subset=[item]).drop_duplicates()

    ambiguous = pairs.loc[pairs[item].duplicated(), item].unique()
    if len(ambiguous):
        raise ValueError(
            "{0} values map to more than one {1} value: {2}".format(item, scaled_item, list(ambiguous))
        )

    return dict(zip(pairs[item].tolist(), pairs[scaled_item].tolist()))

In [None]:
# Scale the test ids with the scaler already fitted on the training ids.
# Unlike a value-by-value replace, this also scales ids that never occurred in the
# balanced training sample, and reading from `all_test` keeps the cell safe to re-run.
id_cols = ['shop_id', 'item_category_id']
test_conv[id_cols] = scaler.transform(all_test[id_cols])

In [None]:
# item_id is not a model input; drop it.
test_conv =test_conv.drop('item_id', axis=1)

In [None]:
# Use the submission ID as the index, leaving only the feature columns.
test_data = test_conv.set_index('ID')

In [None]:
# Split the training frame into the two scaled features and the target.
X_train = train_data[['scaled_shop_id', 'scaled_item_category_id']]
y_train = train_data['item_cnt_day_avg']

In [None]:
def get_lstm(train_X_init, train_y_init, n_step, b_size, epoch, verbose=1):
    """Build and fit the LSTM regressor on sliding windows of the training data.

    Parameters
    ----------
    train_X_init : array-like, shape (n_samples, n_features)
        Training features. These are the data actually used (the function no
        longer reads the notebook-level ``X_train``).
    train_y_init : array-like, shape (n_samples,)
        Training targets, aligned row by row with ``train_X_init``.
    n_step : int
        Window length handed to the time-series generator and the LSTM input.
    b_size : int
        Batch size of the generator.
    epoch : int
        Number of passes over the data.
    verbose : int, optional
        Keras verbosity, forwarded to ``fit`` (previously ignored).

    Returns
    -------
    The fitted Keras ``Sequential`` model.
    """
    # Plain arrays: the generator slices its inputs by integer position, which is
    # unambiguous for ndarrays but goes through the index for pandas objects.
    features = np.asarray(train_X_init)
    targets = np.asarray(train_y_init)
    if features.ndim == 1:
        features = features.reshape(-1, 1)

    train_data_gen = tf.keras.preprocessing.sequence.TimeseriesGenerator(features, targets, length=n_step,
                                                                         batch_size=b_size)

    model = Sequential()

    # The feature count comes from the data instead of a hard-coded 2.
    model.add(LSTM(128, input_shape=(n_step, features.shape[1])))

    # NOTE(review): softmax on hidden layers is unusual for a regression model;
    # kept unchanged here so the architecture is not silently altered — confirm intent.
    model.add(Dense(128, activation='softmax'))
    model.add(Dense(64, activation='softmax'))

    model.add(Dense(1))

    # define the loss function / optimization strategy, and fit
    # the model with the desired number of passes over the data (epochs)
    model.compile(loss='mean_squared_error', optimizer='adam')
    # `fit` accepts generators directly; `fit_generator` is deprecated.
    model.fit(train_data_gen, epochs=epoch, verbose=verbose)
    return model

In [None]:
# Training settings: window length, batch size and number of epochs.
# These names are also read as globals by `predict_gen` further down.
n_step = 500
b_size = 64
epoch = 1

# Fit the LSTM on the scaled training features.
model = get_lstm(X_train, y_train, n_step, b_size, epoch)

In [None]:
def predict_gen(test_X_init):
    """Predict with the fitted model over sliding windows of ``test_X_init``.

    Reads the notebook-level ``model``, ``n_step`` and ``b_size``. The generator
    needs a target array, so an all-zero placeholder of matching length is used.

    NOTE(review): the generator appears to yield fewer windows than there are
    input rows (the first ``n_step`` rows have no full window) — confirm how the
    predictions line up with the test rows.
    """
    placeholder_targets = np.zeros(len(test_X_init))
    windows = tf.keras.preprocessing.sequence.TimeseriesGenerator(
        test_X_init,
        placeholder_targets,
        length=n_step,
        batch_size=b_size,
    )
    return model.predict(windows, verbose=0)

In [None]:
# Predict on the prepared test features.
generator_pred = predict_gen(test_data)

In [None]:
# Check how many predictions came back.
generator_pred.shape

In [None]:
def sub_df(pred_array, yhat, submission_df):
    """Attach model predictions to the submission frame.

    pred_array: model output, one row per prediction
    yhat ('str'): name to give the prediction column
    submission_df: the submission data frame

    Predictions are matched to ``submission_df`` by row index with a left join,
    so every submission row is kept; rows left without a prediction are then
    filled by ``Series.interpolate()`` with its default settings.
    """
    predictions = pd.DataFrame(pred_array, columns=[yhat])

    merged = submission_df.merge(
        predictions,
        how='left',
        left_index=True,
        right_index=True,
    )
    merged[yhat] = merged[yhat].interpolate()

    return merged

In [None]:
# Build the submission frame. The prediction column must be named 'item_cnt_month',
# matching the column of the sample submission (the previous 'tem_cnt_month' was a typo).
submission = sub_df(generator_pred, 'item_cnt_month', sample_sub)