In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# Running this cell (click Run or press Shift+Enter) prints the names of the entries found directly under that directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

 [amitnikhade.com](http://amitnikhade.com)

# Future sales Prediction using:
* LSTM
* Linear regression

# Pipeline
1. Problem Definition
2. Analyzing data
3. Preparing training and test data
4. Creating the model
5. Evaluate model
6. Improve performance
7. Present performances
8. Save results

## Load Data

In [None]:
# Load the raw competition tables from the Kaggle input folder.
test = pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv')
item_categories = pd.read_csv('../input/competitive-data-science-predict-future-sales/item_categories.csv')
items = pd.read_csv('../input/competitive-data-science-predict-future-sales/items.csv')
shops = pd.read_csv('../input/competitive-data-science-predict-future-sales/shops.csv')
sales = pd.read_csv('../input/competitive-data-science-predict-future-sales/sales_train.csv')
# Parse the day-first dates with an explicit format instead of the
# `infer_datetime_format` flag, which pandas deprecated in 2.0. An explicit
# format is unambiguous and fast on every pandas version.
# The competition file writes dates as dd.mm.yyyy (e.g. 02.01.2013).
sales['date'] = pd.to_datetime(sales['date'], format='%d.%m.%Y')

## Analyze data

In [None]:
# Quick look at the daily sales table: first rows, schema, numeric summary.
print(sales.head())
print('____________________________')
# DataFrame.info() prints its own report and returns None, so call it directly;
# wrapping it in print() adds a stray "None" line to the output.
sales.info()
print('____________________________')
print(sales.describe())

In [None]:
# Quick look at the test table: first rows, schema, numeric summary.
print(test.head())
print('____________________________')
# info() writes the report itself and returns None; printing its return value
# would only append "None".
test.info()
print('____________________________')
print(test.describe())

In [None]:
# Quick look at the items table: first rows, schema, numeric summary.
print(items.head())
print('____________________________')
# info() writes the report itself and returns None; printing its return value
# would only append "None".
items.info()
print('____________________________')
print(items.describe())

In [None]:
# Quick look at the shops table: first rows, schema, numeric summary.
print(shops.head())
print('____________________________')
# info() writes the report itself and returns None; printing its return value
# would only append "None".
shops.info()
print('____________________________')
print(shops.describe())

In [None]:
# Quick look at the item categories table: first rows, schema, numeric summary.
print(item_categories.head())
print('____________________________')
# info() writes the report itself and returns None; printing its return value
# would only append "None".
item_categories.info()
print('____________________________')
print(item_categories.describe())

In [None]:
# NOTE(review): this cell repeats the shops summary already printed two cells
# above; consider deleting it.
print(shops.head())
print('____________________________')
# info() writes the report itself and returns None; printing its return value
# would only append "None".
shops.info()
print('____________________________')
print(shops.describe())

###  Dataset merging operation

In [None]:
# Attach category attributes to each item, then enrich every sales row with
# its shop and its item/category attributes. Inner joins keep only rows that
# have a match on the join key.
df_item = items.merge(item_categories, how='inner', on='item_category_id')
sales_train = sales.merge(shops, how='inner', on='shop_id')
sales = sales_train.merge(df_item, how='inner', on='item_id')

### Keeping only the training rows whose shop_id and item_id both occur in the test dataset

In [None]:
# Keep a sales row only when its shop and its item each occur somewhere in test.
in_test_shops = sales['shop_id'].isin(test['shop_id'].unique())
in_test_items = sales['item_id'].isin(test['item_id'].unique())
sales = sales[in_test_shops & in_test_items]

In [None]:
# Display the sales frame after restricting it to shops and items found in test.
sales

### Outliers

In [None]:
import seaborn as sns
# Box plot of the daily item counts, used to spot extreme values.
sns.boxplot(data=sales, x='item_cnt_day')

In [None]:
import seaborn as sns
# Box plot of item prices, used to spot extreme values.
sns.boxplot(data=sales, x='item_price')

### Remove outliers

In [None]:
# Drop outliers in one pass: keep rows with a strictly positive price below
# 300000 and a daily count below 1000, then renumber the rows from zero.
sales = (
    sales
    .query('0 < item_price < 300000 and item_cnt_day < 1000')
    .reset_index(drop=True)
)

### Aggregation on monthly basis


In [None]:
# Collapse the daily rows to one row per (month block, shop, item).
group_keys = ["date_block_num", "shop_id", "item_id"]
monthly_columns = ['date_block_num', 'date', 'shop_id', 'item_id', 'item_price', 'item_cnt_day']
monthly_aggs = {
    "date_block_num": 'mean',
    "date": ["min", 'max'],      # first and last sale date within the group
    "item_price": "mean",
    "item_cnt_day": "sum",       # units sold in the group
}
sales = sales.groupby(group_keys)[monthly_columns].agg(monthly_aggs)


In [None]:
# Display the aggregated frame (indexed by date_block_num, shop_id, item_id).
sales

In [None]:
# Keep only the summed item counts (the 'sum' column under 'item_cnt_day') and
# turn the group keys (date_block_num, shop_id, item_id) back into columns.
# The earlier `.apply(list)` rebuilt the same column through Python lists and
# added nothing, so the frame is selected directly.
sales = sales['item_cnt_day'].reset_index()

In [None]:
# Display the flattened frame: group keys as columns plus the 'sum' column.
sales

**merging test with sales data on item_id and shop_id**

In [None]:
# Left join keeps every test row; (shop, item) pairs with no sales history get
# NaN in the columns coming from `sales`.
sales_data = test.merge(sales, how='left', on=['item_id', 'shop_id'])

**Fill NaN values with 0 and drop shop_id and item_id**

In [None]:
# Replace missing values with 0 and drop the key columns that are no longer
# needed (the test ID identifies each row from here on).
# Chaining avoids in-place mutation, and errors='ignore' lets the cell be
# re-run without a KeyError once the columns are already gone.
sales_data = (
    sales_data
    .fillna(0)
    .drop(columns=['shop_id', 'item_id'], errors='ignore')
)

In [None]:
# Display the merged frame after filling NaNs and dropping shop_id / item_id.
sales_data

**preparing for time series data format**

In [None]:
# Reshape to wide form: one row per test ID, one column per date_block_num,
# each cell holding the summed 'sum' values for that ID and month block.
sales_data = pd.pivot_table(
    sales_data,
    values='sum',
    index='ID',
    columns='date_block_num',
    aggfunc='sum',
)

In [None]:
# Display the pivoted frame (rows: ID, columns: date_block_num).
sales_data

**Fill NaN values with zero**

In [None]:
# ID / month-block combinations with no entry come out of the pivot as NaN;
# set them to 0.
sales_data = sales_data.fillna(0)

In [None]:
# Preview the first 20 rows of the filled pivot table.
sales_data.head(20)

**Dividing the data into features and a label, as in a supervised learning problem**

In [None]:
# Features: every monthly column except the last one.
X = sales_data.iloc[:, :-1]

In [None]:
# Label: the last monthly column.
y = sales_data.iloc[:, -1]

**splitting to training and testing**

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.20, random_state=1)
X_train, X_test, Y_train, Y_test = split_parts

**Linear Regression**

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares model with scikit-learn's default settings.
regressor = LinearRegression()
# Fit on the training rows; as the cell's last expression, the fitted
# estimator is also displayed.
regressor.fit(X_train, Y_train)

## Performance

In [None]:
from sklearn.metrics import mean_squared_error
# Mean squared error on the rows used for fitting and on the held-out rows.
print('Train set mse:', mean_squared_error(Y_train, regressor.predict(X_train)))
print('Test set mse:', mean_squared_error(Y_test, regressor.predict(X_test)))
# R^2 on the held-out rows. This line used to score (X_train, Y_train) while
# labelling the result as the test score.
print('Test set score:', regressor.score(X_test, Y_test))

In [None]:
#cross validation score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit
ts_cross_val = TimeSeriesSplit(n_splits=5)
print('cross_val_score',cross_val_score(regressor, X, y, cv = ts_cross_val, scoring= "neg_mean_squared_error"))

In [None]:
# Build the forecast for every test ID, not just the 20% hold-out rows in
# X_test (that gave a partial submission of values for an already known month).
# The model maps all-but-the-last monthly column to the last one, so feeding it
# all-but-the-first column (same width, shifted by one month) predicts the
# month after the last observed one.
X_submit = sales_data[sales_data.columns[1:]]
# .values passes a plain array so differing column labels cannot trigger a
# feature-name check in scikit-learn.
lr_forecast = regressor.predict(X_submit.values)
# The competition scores against monthly counts clipped to [0, 20], so clipping
# the predictions to the same range can only reduce the error.
submission = pd.DataFrame({'ID': X_submit.index, 'item_cnt_month': lr_forecast.clip(0, 20)})

In [None]:
# Write the submission to the working directory without the DataFrame index column.
submission.to_csv('submission.csv',index = False)

## LSTM
Long short term memory cells for time series

In [None]:
import keras
from keras.models import Sequential
from keras.layers import LSTM,Dense,Dropout
from keras.models import load_model, Model

# Stacked LSTM regressor. Input is a sequence of 33 steps with one feature per
# step; output is a single value.
model = Sequential([
    LSTM(units=128, return_sequences=True, input_shape=(33, 1)),
    Dropout(0.5),
    # Second recurrent layer returns only its final state.
    LSTM(units=64, return_sequences=False, activation='relu'),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(16),
    Dense(1),
])

# Mean squared error is both the training loss and the reported metric.
model.compile(optimizer='Nadam', loss='mse', metrics=['mean_squared_error'])
model.summary()

In [None]:
#Splitting data for LSTM's
from sklearn.model_selection import train_test_split
monthly_matrix = sales_data.values
# Inputs: all but the last monthly column, with a trailing feature axis so the
# array is (rows, steps, 1). Target: the last column, kept two-dimensional.
X_train = monthly_matrix[:, :-1, np.newaxis]
y_train = monthly_matrix[:, -1:]
# validation_split makes Keras set aside 10% of the rows for validation.
model.fit(X_train, y_train, epochs=10, batch_size=1024, validation_split=0.1)

In [None]:
# performance 
import matplotlib.pyplot as plt
plt.plot(model.history.history['loss'], label='Train loss')
plt.plot(model.history.history['val_loss'], label='Validation loss')
plt.legend(loc='best')
# plt.title
plt.title('Regular LSTM')
plt.xlabel('Epochs')
plt.ylabel('MSE')

In [None]:
# Predict on the same array the LSTM was fitted on.
train_pred = model.predict(X_train)

Performance on train data

In [None]:
# Root mean squared error of the LSTM on the array passed to fit(). That array
# includes the rows Keras set aside through validation_split.
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
print('Train rmse:', train_rmse)

In [None]:
# Inputs for the forecast: every monthly column except the first, i.e. the
# training window shifted forward by one month.
X_test = sales_data.iloc[:, 1:]
# NOTE(review): y_test is the first monthly column that was just dropped, not
# the value that follows the X_test window. No ground truth for that next month
# exists in sales_data, so treat y_test with care.
y_test = sales_data.iloc[:, 0]

In [None]:
# Add the trailing feature axis the LSTM expects, then predict one value per row.
icm = model.predict(X_test.values[:, :, np.newaxis])

performance on test data

In [None]:
# NOTE(review): y_test is the first monthly column, while icm is the model
# output for the window that ends at the last column. This figure therefore
# compares values from different months and is not a real test error.
test_rmse = np.sqrt(mean_squared_error(y_test, icm))
print('test rmse:', test_rmse)

In [None]:
# Flatten the (rows, 1) prediction array with ravel() instead of a hard-coded
# reshape(214200,), so the cell works for any number of rows.
# The competition scores against monthly counts clipped to [0, 20], so clipping
# the predictions to the same range can only reduce the error.
lstm_forecast = icm.ravel().clip(0, 20)
submission = pd.DataFrame({'ID': X_test.index, 'item_cnt_month': lstm_forecast})
# This overwrites any submission.csv written earlier in the notebook.
submission.to_csv('submission.csv', index=False)

# [amitnikhade.com](http://amitnikhade.com)