In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime
import time
import calendar
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from sklearn.model_selection import train_test_split

# Read csv data

In [None]:
# Load the three competition tables (training rows, rows to predict, per-store
# metadata) in a single pass; the unpacking order matches the path order.
train, test, store = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/rossmann-store-sales/train.csv',
        '../input/rossmann-store-sales/test.csv',
        '../input/rossmann-store-sales/store.csv',
    )
)

# Data PreProcessing

Convert date to a format that can be used for prediction.

In [None]:
# Parse the ISO-formatted 'Date' strings into datetime64 values on both frames.
for frame in (train, test):
    frame['Date'] = pd.to_datetime(frame['Date'], format='%Y-%m-%d')


In [None]:
# Show the first rows of every table; display() renders each argument in turn.
display(train.head(), test.head(), store.head())

In [None]:
# Missing values per column in the training table.
train.isna().sum()

In [None]:
# Missing values per column in the test table.
test.isna().sum()

In [None]:
# Missing values per column in the store metadata table.
store.isna().sum()

Remove rows with null values.

Convert non-numerical values to numerical values.

In [None]:
# Inspect the distinct StoreType labels so we know which values need encoding.
store.StoreType.value_counts()

In [None]:
# Inspect the distinct Assortment labels so we know which values need encoding.
store.Assortment.value_counts()

In [None]:
# Replace the single-letter category labels with integer codes.
# NOTE: labels missing from a mapping become NaN, so this cell must run only
# once per freshly loaded `store` frame.
store_type_codes = {'a': 1, 'b': 2, 'c': 3, 'd': 4}
assortment_codes = {'a': 1, 'b': 2, 'c': 3}

store['StoreType'] = store['StoreType'].map(store_type_codes)
store['Assortment'] = store['Assortment'].map(assortment_codes)

In [None]:
# Confirm the encoded columns now hold integer codes.
store.head(n=5)

In [None]:
# Attach the per-store metadata to every sales row. A left join keeps every row
# of train/test even when a store has no metadata entry.
data = train.merge(store, on='Store', how='left', copy=False)
data_test = test.merge(store, on='Store', how='left', copy=False)
data.head(n=5)


In [None]:
# Preview the merged test frame.
data_test.head(n=5)

In [None]:
# Remove rows containing any NaN (the model cannot consume them) and drop exact
# duplicate rows so they do not skew the data.
# NOTE(review): because every NaN row is dropped here, the fillna() calls on the
# competition/promo columns inside data_preprocess() never see a missing value,
# and rows of data_test are discarded as well -- confirm this is intended.
data = data.dropna().drop_duplicates()
data_test = data_test.dropna().drop_duplicates()


`PromoInterval` needs to be turned into a numeric value,
so we will use 1 if the row's month is one of the promo months and 0 otherwise.

In [None]:
#credit https://www.kaggle.com/rohinigarg/random-forest-and-xgboost-parameter-tuning
def checkpromomonth(row):
    """Return 1 when the row's month appears in its promo interval, else 0.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``'Month_Name'`` (month abbreviation) and
        ``'PromoInterval'`` (comma-separated month abbreviations).

    Returns
    -------
    int
        1 if ``row['Month_Name']`` occurs in ``row['PromoInterval']``, 0 otherwise.
    """
    interval = row['PromoInterval']
    # A missing interval (NaN / None) is not a string; ``in`` would raise
    # TypeError on it, so treat it as "no promo month".
    if not isinstance(interval, str):
        return 0
    # Substring test on purpose: an abbreviation such as 'Sep' also matches an
    # interval entry spelled 'Sept'.
    return 1 if row['Month_Name'] in interval else 0


In [None]:
# How many remaining rows belong to stores with / without Promo2.
data.Promo2.value_counts()

In [None]:
def data_preprocess(data):
  """Engineer numeric model features on ``data`` in place.

  The frame is mutated and nothing is returned. It must contain the columns
  'StateHoliday', 'Date', 'CompetitionOpenSinceYear',
  'CompetitionOpenSinceMonth', 'Promo2', 'Promo2SinceYear', 'Promo2SinceWeek'
  and 'PromoInterval'.

  Added columns: 'Day', 'Week', 'CompetitionOpenSince' (months),
  'Promo2OpenSince' (months, approximated as 4 weeks per month) and
  'IsPromoMonth' (0/1).
  Dropped columns: every source column consumed above plus the temporary
  'Year', 'Month' and 'Month_Name' helpers.

  Because the input columns are dropped, calling this twice on the same frame
  raises KeyError.
  """
  # remove remaining non-numeric values ('0' appears both as str and as int)
  data['StateHoliday'] = data['StateHoliday'].map({'0': 0, 0: 0, 'a': 1, 'b': 2, 'c': 3})

  # date preprocessing
  data['Date'] = pd.to_datetime(data['Date'])
  dates = data['Date'].dt
  data['Year'] = dates.year
  data['Month'] = dates.month
  data['Month_Name'] = data['Month'].apply(lambda x: calendar.month_abbr[x])
  data['Day'] = dates.day
  # Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
  # isocalendar().week is its replacement. Fall back for older pandas.
  if hasattr(dates, 'isocalendar'):
    data['Week'] = dates.isocalendar().week.astype(np.int64)
  else:
    data['Week'] = dates.weekofyear

  # collapse the competition-open year/month pair into "months since opening"
  data['CompetitionOpenSinceYear'] = data.CompetitionOpenSinceYear.fillna(
      1900).astype(np.int32)
  data['CompetitionOpenSinceMonth'] = data.CompetitionOpenSinceMonth.fillna(
      1).astype(np.int32)
  competition_months = (
      12 * (data['Year'] - data['CompetitionOpenSinceYear'])
      + (data['Month'] - data['CompetitionOpenSinceMonth'])
  )
  # a competitor that opens after the row's date counts as 0 months
  data['CompetitionOpenSince'] = competition_months.clip(lower=0)

  # same idea for Promo2: year/week pair -> approximate months since start
  data['Promo2SinceYear'] = data.Promo2SinceYear.fillna(1900).astype(np.int32)
  data['Promo2SinceWeek'] = data.Promo2SinceWeek.fillna(1).astype(np.int32)
  promo_months = (
      12 * (data['Year'] - data['Promo2SinceYear'])
      + (data['Week'] - data['Promo2SinceWeek']) / float(4)
  )
  data['Promo2OpenSince'] = promo_months.clip(lower=0)

  # 1 when the row's month is listed in the store's PromoInterval
  data['IsPromoMonth'] = data.apply(checkpromomonth, axis=1)

  # drop columns that have been folded into the newly generated ones
  data.drop(
      columns=[
          'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear',
          'Promo2SinceYear', 'Promo2SinceWeek', 'Promo2',
          'Date', 'Month_Name', 'PromoInterval',
          'Year', 'Month',
      ],
      inplace=True,
  )


                                           

In [None]:
# Build the engineered features on the merged training frame (mutates `data`).
data_preprocess(data)
data.head(n=5)

In [None]:
# Verify that feature engineering left no missing values in the training frame.
data.isna().sum()

In [None]:
# Apply the same feature engineering to the merged test frame (mutates `data_test`).
data_preprocess(data_test)
data_test.head(n=5)

In [None]:
# Verify that feature engineering left no missing values in the test frame.
data_test.isna().sum()

In [None]:
def data_normalize(data):
    """Min-max scale ``data`` so every column spans the range [0, 1].

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Numeric values; a DataFrame is scaled column by column.

    Returns
    -------
    Same type as ``data``
        ``(data - min) / (max - min)``. A constant column (max == min) maps to
        0 instead of the NaN that a 0/0 division would produce.
    """
    lowest = data.min()
    span = data.max() - lowest
    # Guard the zero range of constant columns; NaN ranges are left untouched.
    if isinstance(span, pd.Series):
        span = span.where(span != 0, 1)
    elif span == 0:
        span = 1
    return (data - lowest) / span


In [None]:
# Scale both frames to [0, 1]. Each frame is scaled with its OWN min/max, so
# the train and test scales are not shared.
data_n, data_test_n = (data_normalize(frame) for frame in (data, data_test))

In [None]:
# The (normalised) 'Sales' column is the regression target.
target_n = data_n.Sales
target_n.shape[0]

In [None]:
# Remove the target from the feature matrix now that it lives in `target_n`.
data_n.drop(columns='Sales', inplace=True)
data_n.head(n=5)

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible. (The former `.tolist()` calls were removed: their results were
# discarded, so they only built two very large throw-away lists.)
x_train, x_test, y_train, y_test = train_test_split(
    data_n, target_n, test_size=0.2, random_state=4
)
x_train.shape

# Architecture and RNN - LSTM



In [None]:
from numpy import array

In [None]:
# Keras LSTM layers expect 3-D input shaped (samples, timesteps, features).
# Every row is fed as its own one-step sequence, and all sizes are derived from
# the data instead of hard-coded row/column counts. The validation split gets
# the same treatment so it can be passed to fit()/predict() unchanged.
# reshape(..., -1) keeps this cell idempotent when it is re-run.
x_train = np.asarray(x_train)
x_test = np.asarray(x_test)
x_train = x_train.reshape((len(x_train), 1, -1))
x_test = x_test.reshape((len(x_test), 1, -1))

# One target value per sample, matching the model's Dense(1) output.
y_train = np.asarray(y_train).reshape((-1, 1))
y_test = np.asarray(y_test).reshape((-1, 1))

In [None]:
# Start from an empty Sequential container; layers are added in the next cell.
model = Sequential()

In [None]:
# Stack: LSTM(16) emitting the full sequence -> LSTM(1) emitting only its last
# step -> Dense(1) regression head. The input shape is read from x_train.
timesteps, n_features = x_train.shape[1], x_train.shape[2]
for layer in (
    LSTM(16, input_shape=(timesteps, n_features), return_sequences=True),
    LSTM(1, return_sequences=False),
    Dense(1),
):
    model.add(layer)

# Mean squared error loss, optimised with Adam.
model.compile(loss='mean_squared_error', optimizer='adam')





In [None]:
# Print the layer-by-layer overview of the model built above.
model.summary()

In [None]:
# Train for a single epoch in mini-batches of 32, evaluating on the held-out
# split after the epoch; `history` keeps the recorded loss values.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=1,
    validation_data=(x_test, y_test),
)

In [None]:
# Predict on the held-out split. The Keras method is `predict`; `predicts`
# does not exist and raised AttributeError.
results = model.predict(x_test)

In [None]:
# Compare the first predictions (red) with the true targets (green).
# Both series are flattened to 1-D so they line up with the x positions,
# whether they arrive as arrays, column vectors or pandas objects.
predicted = np.asarray(results).ravel()
actual = np.asarray(y_test).ravel()
n_points = min(20, len(predicted), len(actual))
positions = range(n_points)

fig, ax = plt.subplots()
ax.scatter(positions, predicted[:n_points], c='r', label='predicted')
ax.scatter(positions, actual[:n_points], c='g', label='actual')
ax.set_xlabel('sample index')
ax.set_ylabel('normalised sales')
ax.set_title('Predicted vs. actual sales (first samples)')
ax.legend()
plt.show()