In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# /kaggle/input/competitive-data-science-predict-future-sales/shops.csv
# /kaggle/input/competitive-data-science-predict-future-sales/items.csv
# /kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv
# /kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv
# All competition CSVs are read from one input directory; name it once so the
# location can be changed in a single place instead of in five path literals.
DATA_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'

shops = pd.read_csv(f'{DATA_DIR}/shops.csv')
items = pd.read_csv(f'{DATA_DIR}/items.csv')
item_categories = pd.read_csv(f'{DATA_DIR}/item_categories.csv')
sales_train = pd.read_csv(f'{DATA_DIR}/sales_train.csv')
test_data = pd.read_csv(f'{DATA_DIR}/test.csv')

# Data Understanding 
* Adding Month and Year to the dataset to understand how the items sales are moving over months in different yeats
* Plotting sum of item_cnt_day in a month for different years
* Plotting mean of item_prices in a month for different years
* Plotting total sales in a month for different years

In [None]:
# Parse the date column once and derive the calendar month and year from it.
# The explicit format assumes the dates are stored day-first as dd.mm.yyyy
# (TODO confirm against sales_train.csv). Without a format, pandas can read an
# ambiguous value such as 02.01.2013 month-first and report the wrong month;
# with it, a value that does not match raises instead of being mis-parsed.
sale_dates = pd.to_datetime(sales_train['date'], format='%d.%m.%Y')
sales_train['month'] = sale_dates.dt.month
sales_train['year'] = sale_dates.dt.year
sales_train.head()


In [None]:
# Uncomment the next line if seaborn is not already installed in the environment.
# !pip3 --no-cache-dir install seaborn
import seaborn as sns
# Use the "darkgrid" seaborn style for the plots drawn after this call.
sns.set_style("darkgrid")

In [None]:
# Sum of item_cnt_day per (month, year), drawn as one line per year.
month_year_group = (
    sales_train
    .groupby(['month', 'year'], as_index=False)['item_cnt_day']
    .sum()
)
sns.lineplot(
    data=month_year_group,
    x="month",
    y="item_cnt_day",
    style="year",
    markers=True,
    dashes=False,
)

In [None]:
# Mean item_price per (month, year), drawn as one line per year.
month_year_group = (
    sales_train
    .groupby(['month', 'year'], as_index=False)['item_price']
    .mean()
)
sns.lineplot(
    data=month_year_group,
    x="month",
    y="item_price",
    style="year",
    markers=True,
    dashes=False,
)

In [None]:
# Revenue of each row = units sold that day * unit price.
sales_train['total_sale'] = sales_train['item_cnt_day'].mul(sales_train['item_price'])

# Sum of total_sale per (month, year), drawn as one line per year.
total_sale_group = (
    sales_train
    .groupby(['month', 'year'], as_index=False)
    .agg({'total_sale': 'sum'})
)
sns.lineplot(
    data=total_sale_group,
    x="month",
    y="total_sale",
    style="year",
    markers=True,
    dashes=True,
)

**Understanding Outliers**
1. Plotting Boxplot for item_price to understand if some prices are overly priced.
1. Plotting Boxplot for item_cnt_day to understand if some items were sold in bulk.

In [None]:
# Inspect the spread of item_price to spot outliers
# (item_cnt_day is inspected the same way in the next cell).
sns.boxplot(data=sales_train, x='item_price')


In [None]:
# Inspect the spread of item_cnt_day to spot outliers.
sns.boxplot(data=sales_train, x='item_cnt_day')

**Data Cleaning**

In [None]:
# Keep only rows whose item_price lies strictly between 0 and 45000,
# reporting the frame's shape before and after the filter.
print("Shape before removing less then 0 or greater then 45000 item prices", sales_train.shape)
price_in_range = sales_train['item_price'].gt(0) & sales_train['item_price'].lt(45000)
sales_train = sales_train[price_in_range]
print("Shape after removing less then 0 or greater then 45000 item prices", sales_train.shape)

In [None]:
# Keep only rows whose item_cnt_day lies strictly between 0 and 800,
# reporting the frame's shape before and after the filter.
print("Shape before removing less then 0 or greater then 800 item_cnt_day", sales_train.shape)
count_in_range = sales_train['item_cnt_day'].gt(0) & sales_train['item_cnt_day'].lt(800)
sales_train = sales_train[count_in_range]
print("Shape after removing less then 0 or greater then 800 item_cnt_day", sales_train.shape)

In [None]:
# Remove rows whose shop_id or item_id does not occur in the test set.
print("Shape before removing shops and items which are not in test set",sales_train.shape)
in_test_shops = sales_train['shop_id'].isin(test_data['shop_id'].unique())
in_test_items = sales_train['item_id'].isin(test_data['item_id'].unique())
# .copy() makes the result an independent frame, so the later
# `sales_train['total_sale'] = ...` assignment does not raise
# SettingWithCopyWarning on a filtered slice.
sales_train = sales_train[in_test_shops & in_test_items].copy()
print("Shape after removing shops and items which are not in test set",sales_train.shape)

**Data Understanding after removing outliers**

In [None]:
# item_price distribution after the cleaning steps above.
sns.boxplot(data=sales_train, x='item_price')

In [None]:
# item_cnt_day distribution after the cleaning steps above.
sns.boxplot(data=sales_train, x='item_cnt_day')

In [None]:
# Sum of item_cnt_day per (month, year) on the cleaned data, one line per year.
month_year_group = (
    sales_train
    .groupby(['month', 'year'], as_index=False)['item_cnt_day']
    .sum()
)
sns.lineplot(
    data=month_year_group,
    x="month",
    y="item_cnt_day",
    style="year",
    markers=True,
    dashes=False,
)

In [None]:
# Mean item_price per (month, year) on the cleaned data, one line per year.
month_year_group = (
    sales_train
    .groupby(['month', 'year'], as_index=False)['item_price']
    .mean()
)
sns.lineplot(
    data=month_year_group,
    x="month",
    y="item_price",
    style="year",
    markers=True,
    dashes=False,
)

In [None]:
# Recompute per-row revenue (units sold * unit price) on the cleaned frame.
sales_train['total_sale'] = sales_train['item_cnt_day'].mul(sales_train['item_price'])

# Sum of total_sale per (month, year), drawn as one line per year.
total_sale_group = (
    sales_train
    .groupby(['month', 'year'], as_index=False)
    .agg({'total_sale': 'sum'})
)
sns.lineplot(
    data=total_sale_group,
    x="month",
    y="total_sale",
    style="year",
    markers=True,
    dashes=True,
)

**Training the Model**

In [None]:
# Collapse the daily rows into one total item_cnt_day per
# (date_block_num, item_id, shop_id) combination.
sales_train_monthly = (
    sales_train
    .groupby(['date_block_num', 'item_id', 'shop_id'], as_index=False)['item_cnt_day']
    .sum()
)

In [None]:
# Reshape to wide form: one row per (shop_id, item_id) and one column per
# date_block_num. Combinations with no row for a block are filled with 0.
pivoted_train_data = (
    sales_train_monthly
    .pivot(index=['shop_id', 'item_id'], columns='date_block_num', values='item_cnt_day')
    .fillna(0)
    .reset_index()
)

In [None]:
# (rows, columns) of the wide frame: one row per (shop_id, item_id) pair.
pivoted_train_data.shape

In [None]:
# Right join onto test_data so every test row is kept; (shop_id, item_id)
# pairs without any history in pivoted_train_data get 0 in every column.
test_dataset = (
    pd.merge(pivoted_train_data, test_data, on=['shop_id', 'item_id'], how='right')
    .fillna(0)
)

In [None]:
# Sanity check: after the right merge the row count should equal that of
# test_data (compare with test_data.shape in the next cell).
test_dataset.shape

In [None]:
# Shape of the raw test set, for comparison with test_dataset.shape.
test_data.shape

In [None]:
# Drop the identifier columns so only the per-date_block_num count columns
# remain. Rebinding the name (instead of inplace=True) keeps the step an
# explicit assignment and leaves the merged frame object itself unmodified.
test_dataset = test_dataset.drop(columns=['shop_id', 'item_id', 'ID'])

In [None]:
# The LSTM below is declared with input_shape=(timesteps, features), i.e. it
# consumes 3-D batches (samples, timesteps, features). The count table is 2-D,
# so a trailing feature axis of length 1 is appended to each input window.
counts = test_dataset.values

X_train = counts[:, :-1, np.newaxis]  # every column except the last
y_train = counts[:, -1:]              # the last column is the training target

# Same-length window shifted forward by one column, used for prediction.
X_test = counts[:, 1:, np.newaxis]

print(X_train.shape,y_train.shape,X_test.shape)

In [None]:
# print(X_train.shape,y_train.shape,X_test.shape)

In [None]:
from keras.models import Sequential
from keras.layers import Dense,Dropout,LSTM

# Network: a 64-unit LSTM over each window, 50% dropout, then a single linear
# output unit. The input shape (timesteps, features) is taken from X_train.
model = Sequential([
    LSTM(64, input_shape=X_train.shape[1:]),
    Dropout(0.5),
    Dense(1),
])

model.summary()

In [None]:
# Mean squared error is used both as the training loss and as the reported metric.
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mean_squared_error'],
)

In [None]:
# Train for 10 epochs in mini-batches of 32 samples.
model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    batch_size=32,
)

In [None]:
# Predict one value per test row from the shifted window.
submit_data = model.predict(X_test)

# The key column is named 'ID' to match the column of the same name in
# test.csv. NOTE(review): confirm this is the header the submission expects.
submission = pd.DataFrame({'ID': test_data['ID'], 'item_cnt_month': submit_data.ravel()})

# DataFrame.clip returns a new object, so the result has to be assigned back;
# previously it was discarded and the predictions were never clipped. Only the
# prediction column is clipped -- clipping the whole frame would also squash
# the identifier column into [0, 20].
submission['item_cnt_month'] = submission['item_cnt_month'].clip(0, 20)
submission.head()

In [None]:
# Write the submission to test_sub.csv in the working directory, without the
# DataFrame index column.
submission.to_csv('test_sub.csv',index = False)