In [None]:
#Import the libraries
import pandas as pd
import numpy as np
import scipy.stats as sp
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

In [None]:
#Preprocessing

###Fix the seed of NumPy's global random generator so NumPy draws repeat between runs
SEED = 7
np.random.seed(SEED)

###Split a string in two at the pos-th occurrence of a separator
def split(strng, sep, pos):
    """Cut ``strng`` into a ``(head, tail)`` pair at a field boundary.

    The string is tokenised on ``sep``. The first ``pos`` tokens are joined
    back together with ``sep`` to form the head, and the remaining tokens are
    joined the same way to form the tail.

    Returns a 2-tuple of strings; either part is ``""`` when it has no tokens.
    """
    pieces = strng.split(sep)
    head = sep.join(pieces[:pos])
    tail = sep.join(pieces[pos:])
    return head, tail

###Build (window, next value) samples from a 2-D array of values
def create_dataset(dataset, look_back=1):
    """Turn a column of values into supervised-learning pairs.

    ``dataset`` is indexed as a 2-D array and only column 0 is read.
    Sample ``k`` pairs the window ``dataset[k:k+look_back, 0]`` with the value
    immediately after it, ``dataset[k+look_back, 0]``.

    Exactly ``len(dataset) - look_back - 1`` samples are produced, so the last
    window that would still have a following value is not emitted.

    Returns ``(X, y)`` as NumPy arrays built from the collected windows and
    targets.
    """
    n_samples = len(dataset) - look_back - 1
    windows = [dataset[start:start + look_back, 0] for start in range(n_samples)]
    targets = [dataset[start + look_back, 0] for start in range(n_samples)]
    return np.array(windows), np.array(targets)

###Merge submission and visit data
# Folder that holds the input CSV files; edit this one line to run elsewhere.
DATA_DIR = 'C:/Users/Akshay/Documents/Rashi/recruit-restaurant-visitor-forecasting'
# Rows dated before this day are history; rows on or after it come from the
# submission template. ISO 'YYYY-MM-DD' strings order correctly as plain text.
FORECAST_START = '2017-04-23'

df_sub = pd.read_csv(DATA_DIR + '/sample_submission.csv')
# Each id is cut at its second "_" into (air_store_id, visit_date).
# Plain lists are used instead of building one pd.Series per row.
id_parts = [split(str(sub_id), "_", 2) for sub_id in df_sub['id']]
df_sub['air_store_id'] = [store for store, _ in id_parts]
df_sub['visit_date'] = [day for _, day in id_parts]
# drop() returns a new frame, so the result has to be assigned to take effect.
df_sub = df_sub.drop(columns=['id'])
# Stores that appear in the submission template; the model cell iterates these.
lst = df_sub.air_store_id.unique()
df = pd.read_csv(DATA_DIR + '/air_visit_data.csv')
# Median is taken from the real visit data only, before template rows are added.
medianValue = round(df['visitors'].median())
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([df, df_sub], ignore_index=True, sort=False)

###Resample dates and perform median imputation
df.index = pd.to_datetime(df['visit_date'])
# One row per store per calendar day between that store's first and last date;
# days without a record sum to 0. Selecting the column before resampling always
# yields a (store, date)-indexed Series, whatever the groups' date ranges are.
df = df.groupby('air_store_id')['visitors'].resample('D').sum().reset_index()
df['visit_date'] = df['visit_date'].dt.strftime('%Y-%m-%d')
# Historical zero-visitor days are replaced with the overall median.
mask = (df['visitors'] == 0) & (df['visit_date'] < FORECAST_START)
# Single .loc assignment instead of chained indexing, so the write is
# guaranteed to reach df (chained assignment may write to a temporary copy).
df.loc[mask, 'visitors'] = medianValue
subDF = df

In [None]:
#RNN/LSTM model
# Trains one small LSTM per store in `lst` on that store's scaled daily visitor
# history from `subDF`, then writes the model's outputs for the most recent
# history windows as the predictions for the store's forecast-period rows.
# Result: `dfObjSubm` (columns id, visitors), also saved to file_LSTM.csv.

# Imported here because this cell clears the Keras session between stores.
from keras import backend as K

look_back = 7
# First forecast day; ISO 'YYYY-MM-DD' strings order correctly as plain text.
forecast_start = '2017-04-23'
# Per-store result frames are collected and concatenated once after the loop;
# concatenating inside the loop is quadratic and starts from an object-dtype
# empty frame.
submission_parts = []
for i, item in enumerate(lst, start=1):
    print(i)
    print(item)
    dr = subDF[subDF['air_store_id'] == item]
    # .copy() gives an independent frame, so the column assignments below do
    # not target a slice of subDF.
    df_result = dr[dr['visit_date'] >= forecast_start].copy()
    dataset = dr.loc[dr['visit_date'] < forecast_start, 'visitors'].values
    dataset = dataset.astype('float32')
    # create_dataset yields len(rows) - look_back - 1 windows, so this many
    # trailing history rows give exactly one prediction per forecast row
    # (39 forecast rows -> look_back + 40 rows, the previously hard-coded size).
    n_forecast = len(df_result)
    n_test_rows = look_back + n_forecast + 1
    if len(dataset) < n_test_rows:
        # Fail before the expensive training instead of after it with a
        # shape/length error.
        raise ValueError(
            'store %s has %d history rows but %d are needed to predict %d rows'
            % (item, len(dataset), n_test_rows, n_forecast))
    #Normalize the dataset
    scaler = MinMaxScaler(feature_range=(0, 1))
    dataset = scaler.fit_transform(dataset.reshape(-1, 1))
    #Split into train and test data
    # NOTE: train is the whole history and test is its tail, so the two overlap.
    train = dataset
    test = dataset[-n_test_rows:]
    trainX, trainY = create_dataset(train, look_back)
    testX, _ = create_dataset(test, look_back)
    #Reshape input according to model requirement: (samples, 1, look_back)
    trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
    testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
    # Release the previous store's model state; otherwise every iteration adds
    # another model to the backend's global state and memory keeps growing.
    K.clear_session()
    model = Sequential()
    model.add(LSTM(4, input_shape=(1, look_back)))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(trainX, trainY, epochs=100, batch_size=1, verbose=2)
    #Predict the output and invert the scaling back to visitor counts
    testPredict = scaler.inverse_transform(model.predict(testX))
    # Vectorised id construction; split()[0] keeps only the date part should
    # visit_date ever carry a time component.
    df_result['id'] = (df_result['air_store_id'] + '_'
                       + df_result['visit_date'].astype(str).str.split().str[0])
    # predict() returns shape (n, 1); flatten it to one value per row.
    df_result['visitors'] = testPredict.ravel()
    submission_parts.append(df_result[['id', 'visitors']])
if submission_parts:
    dfObjSubm = pd.concat(submission_parts, ignore_index=True)
else:
    # No stores to process: keep an empty, correctly typed frame.
    dfObjSubm = pd.DataFrame({'id': pd.Series(dtype='object'),
                              'visitors': pd.Series(dtype='float64')})
dfObjSubm['visitors'] = dfObjSubm['visitors'].round()
print(dfObjSubm)
dfObjSubm.to_csv('file_LSTM.csv', index=False)