In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Show every column when a DataFrame is rendered.
pd.set_option('display.max_columns', None)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# SettingWithCopyWarning is not importable from the same module in every
# pandas release, so try the public location first, then the legacy one, and
# skip the filter when the class is not available at all.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# - Data Understanding

In [2]:
# Read the three input tables (sales history, store metadata, weekly features).
train, stores, features = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv.zip', 'stores.csv', 'features.csv.zip')
)


In [3]:
# Print the (rows, columns) of the sales table, then display its column labels.
print(train.shape)
train.columns

(421570, 5)


Index(['Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday'], dtype='object')

In [4]:
# Print the (rows, columns) of the store table, then display its column labels.
print(stores.shape)
stores.columns

(45, 3)


Index(['Store', 'Type', 'Size'], dtype='object')

In [5]:
# Print the (rows, columns) of the features table, then display its column labels.
print(features.shape)
features.columns

(8190, 12)


Index(['Store', 'Date', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2',
       'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment',
       'IsHoliday'],
      dtype='object')

In [6]:
# Attach store metadata and the weekly features to every sales row.
# The join keys are written out explicitly; they are exactly the columns the
# tables have in common, so the result matches the implicit-key merge while
# being protected against a new shared column silently changing the join.
Data = (
    train
    .merge(stores, how='left', on='Store')
    .merge(features, how='left', on=['Store', 'Date', 'IsHoliday'])
)

# Parse the date column once and derive the calendar features from it.
Data['Date'] = pd.to_datetime(Data['Date'])
Data['Year'] = Data['Date'].dt.year
Data['Month'] = Data['Date'].dt.month
# `.dt.week` is deprecated (and gone in pandas 2); isocalendar().week yields the
# same ISO week number. It comes back as UInt32, so cast to a plain int64.
Data['Week'] = Data['Date'].dt.isocalendar().week.astype('int64')
Data['DayOfTheMonth'] = Data['Date'].dt.day

# Order rows chronologically.
Data = Data.sort_values(by='Date')

In [7]:
# Preview the first rows of the merged, date-sorted frame.
Data.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Year,Month,Week,DayOfTheMonth
0,1,1,2010-02-05,24924.5,False,A,151315,42.31,2.572,,,,,,211.096358,8.106,2010,2,5,5
277665,29,5,2010-02-05,15552.08,False,B,93638,24.36,2.788,,,,,,131.527903,10.064,2010,2,5,5
277808,29,6,2010-02-05,3200.22,False,B,93638,24.36,2.788,,,,,,131.527903,10.064,2010,2,5,5
277951,29,7,2010-02-05,10820.05,False,B,93638,24.36,2.788,,,,,,131.527903,10.064,2010,2,5,5
278094,29,8,2010-02-05,20055.64,False,B,93638,24.36,2.788,,,,,,131.527903,10.064,2010,2,5,5


In [8]:
# Preview the last rows of the merged, date-sorted frame.
Data.tail()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Year,Month,Week,DayOfTheMonth
379648,40,87,2012-10-26,24638.96,False,A,155083,49.65,3.917,3605.71,55.98,0.28,486.81,1389.06,138.728161,4.145,2012,10,43,26
181170,19,30,2012-10-26,3740.12,False,A,203819,56.49,4.071,5430.75,90.07,,904.34,1665.77,138.728161,7.992,2012,10,43,26
181313,19,31,2012-10-26,3128.17,False,A,203819,56.49,4.071,5430.75,90.07,,904.34,1665.77,138.728161,7.992,2012,10,43,26
181599,19,33,2012-10-26,5740.14,False,A,203819,56.49,4.071,5430.75,90.07,,904.34,1665.77,138.728161,7.992,2012,10,43,26
421569,45,98,2012-10-26,1076.8,False,B,118221,58.85,3.882,4018.91,58.08,100.0,211.94,858.33,192.308899,8.667,2012,10,43,26


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
Data.describe()

Unnamed: 0,Store,Dept,Weekly_Sales,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Year,Month,Week,DayOfTheMonth
count,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,150681.0,111248.0,137091.0,134967.0,151432.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0
mean,22.200546,44.260317,15981.258123,136727.915739,60.090059,3.361027,7246.420196,3334.628621,1439.421384,3383.168256,4628.975079,171.201947,7.960289,2010.968591,6.44951,25.826762,15.673131
std,12.785297,30.492054,22711.183519,60980.583328,18.447931,0.458515,8291.221345,9475.357325,9623.07829,6292.384031,5962.887455,39.159276,1.863296,0.796876,3.243217,14.151887,8.753549
min,1.0,1.0,-4988.94,34875.0,-2.06,2.472,0.27,-265.76,-29.1,0.22,135.16,126.064,3.879,2010.0,1.0,1.0,1.0
25%,11.0,18.0,2079.65,93638.0,46.68,2.933,2240.27,41.6,5.08,504.22,1878.44,132.022667,6.891,2010.0,4.0,14.0,8.0
50%,22.0,37.0,7612.03,140167.0,62.09,3.452,5347.45,192.0,24.6,1481.31,3359.45,182.31878,7.866,2011.0,6.0,26.0,16.0
75%,33.0,74.0,20205.8525,202505.0,74.28,3.738,9210.9,1926.94,103.99,3595.04,5563.8,212.416993,8.572,2012.0,9.0,38.0,23.0
max,45.0,99.0,693099.36,219622.0,100.14,4.468,88646.76,104519.54,141630.61,67474.85,108519.28,227.232807,14.313,2012.0,12.0,52.0,31.0


In [10]:
# (rows, columns) of the merged frame.
Data.shape

(421570, 20)

In [11]:
# Column dtypes of the merged frame.
Data.dtypes

Store                     int64
Dept                      int64
Date             datetime64[ns]
Weekly_Sales            float64
IsHoliday                  bool
Type                     object
Size                      int64
Temperature             float64
Fuel_Price              float64
MarkDown1               float64
MarkDown2               float64
MarkDown3               float64
MarkDown4               float64
MarkDown5               float64
CPI                     float64
Unemployment            float64
Year                      int64
Month                     int64
Week                      int64
DayOfTheMonth             int64
dtype: object

# - Business Understanding

# - Prepare Data

In [12]:
# Number of missing values per column.
Data.isna().sum()

Store                 0
Dept                  0
Date                  0
Weekly_Sales          0
IsHoliday             0
Type                  0
Size                  0
Temperature           0
Fuel_Price            0
MarkDown1        270889
MarkDown2        310322
MarkDown3        284479
MarkDown4        286603
MarkDown5        270138
CPI                   0
Unemployment          0
Year                  0
Month                 0
Week                  0
DayOfTheMonth         0
dtype: int64

In [13]:
# Macro indicators: replace any missing value with the column mean.
for _col in ('CPI', 'Unemployment'):
    Data[_col] = Data[_col].fillna(Data[_col].mean())

# MarkDown1..MarkDown5: a missing value is replaced with 0.
# (The alternative of deleting these five columns is not applied.)
for _idx in range(1, 6):
    _col = f'MarkDown{_idx}'
    Data[_col] = Data[_col].fillna(0)

In [14]:
# Re-count missing values per column after the fill step.
Data.isna().sum()

Store            0
Dept             0
Date             0
Weekly_Sales     0
IsHoliday        0
Type             0
Size             0
Temperature      0
Fuel_Price       0
MarkDown1        0
MarkDown2        0
MarkDown3        0
MarkDown4        0
MarkDown5        0
CPI              0
Unemployment     0
Year             0
Month            0
Week             0
DayOfTheMonth    0
dtype: int64

In [15]:
# Number of rows for each value of the 'Type' column.
Data['Type'].value_counts()

A    215478
B    163495
C     42597
Name: Type, dtype: int64

In [16]:
# One-hot encode 'Type': one indicator column per distinct value.
Type_Cat = pd.get_dummies(Data['Type'])

# Swap the original 'Type' column for the indicator columns, which are
# appended after the remaining columns.
Data = pd.concat([Data.drop(columns='Type'), Type_Cat], axis=1)
Data.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Year,Month,Week,DayOfTheMonth,A,B,C
0,1,1,2010-02-05,24924.5,False,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,2010,2,5,5,1,0,0
277665,29,5,2010-02-05,15552.08,False,93638,24.36,2.788,0.0,0.0,0.0,0.0,0.0,131.527903,10.064,2010,2,5,5,0,1,0
277808,29,6,2010-02-05,3200.22,False,93638,24.36,2.788,0.0,0.0,0.0,0.0,0.0,131.527903,10.064,2010,2,5,5,0,1,0
277951,29,7,2010-02-05,10820.05,False,93638,24.36,2.788,0.0,0.0,0.0,0.0,0.0,131.527903,10.064,2010,2,5,5,0,1,0
278094,29,8,2010-02-05,20055.64,False,93638,24.36,2.788,0.0,0.0,0.0,0.0,0.0,131.527903,10.064,2010,2,5,5,0,1,0


# - Data Modeling

In [17]:
# Target: weekly sales. Features: every other column except the raw date.
y = Data['Weekly_Sales']
X = Data.drop(['Weekly_Sales', 'Date'], axis=1)

print(X.shape, y.shape, sep='\n')

(421570, 20)
(421570,)


In [18]:
# Positional 80/20 hold-out. X and y come from the date-sorted frame, so the
# first 80 % of rows are used for training and the remaining rows for testing.
# The cut point is derived from the data instead of being hard-coded; integer
# arithmetic avoids float rounding (421570 rows -> 337256, as before).
split_idx = len(X) * 8 // 10

x_train = X.iloc[:split_idx]  # 80%
x_test = X.iloc[split_idx:]   # 20%
y_train = y.iloc[:split_idx]  # 80%
y_test = y.iloc[split_idx:]   # 20%
print(type(x_train))

<class 'pandas.core.frame.DataFrame'>


In [19]:
#x_train,x_test,y_train,y_test=train_test_split(X,y,test_size = 0.2,random_state=0)

In [20]:
# Shapes of the four splits, in the order x_train, x_test, y_train, y_test.
for _split in (x_train, x_test, y_train, y_test):
    print(_split.shape)

(337256, 20)
(84314, 20)
(337256,)
(84314,)


In [21]:
# Display the first 100 rows of the training features.
x_train.head(100)

Unnamed: 0,Store,Dept,IsHoliday,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Year,Month,Week,DayOfTheMonth,A,B,C
0,1,1,False,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,2010,2,5,5,1,0,0
277665,29,5,False,93638,24.36,2.788,0.0,0.0,0.0,0.0,0.0,131.527903,10.064,2010,2,5,5,0,1,0
277808,29,6,False,93638,24.36,2.788,0.0,0.0,0.0,0.0,0.0,131.527903,10.064,2010,2,5,5,0,1,0
277951,29,7,False,93638,24.36,2.788,0.0,0.0,0.0,0.0,0.0,131.527903,10.064,2010,2,5,5,0,1,0
278094,29,8,False,93638,24.36,2.788,0.0,0.0,0.0,0.0,0.0,131.527903,10.064,2010,2,5,5,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267695,28,6,False,206302,49.47,2.962,0.0,0.0,0.0,0.0,0.0,126.442065,13.975,2010,2,5,5,1,0,0
267838,28,7,False,206302,49.47,2.962,0.0,0.0,0.0,0.0,0.0,126.442065,13.975,2010,2,5,5,1,0,0
267981,28,8,False,206302,49.47,2.962,0.0,0.0,0.0,0.0,0.0,126.442065,13.975,2010,2,5,5,1,0,0
268124,28,9,False,206302,49.47,2.962,0.0,0.0,0.0,0.0,0.0,126.442065,13.975,2010,2,5,5,1,0,0


In [22]:
# Shape of the training target.
y_train.shape

(337256,)

In [23]:
import numpy as np

# Convert every split to a float32 NumPy array.
x_train, x_test, y_train, y_test = (
    np.asarray(split).astype(np.float32)
    for split in (x_train, x_test, y_train, y_test)
)


In [24]:
# Standardize the features column-wise with statistics computed on the
# training split only; the test split reuses those same statistics.
mea = np.mean(x_train, axis=0)
std = np.std(x_train, axis=0)

# A column that is constant in the training split has std == 0, which would
# turn the division below into NaN/inf. Dividing by 1 instead leaves such a
# column at exactly 0 after centering.
std[std == 0] = 1.0

x_train = (x_train - mea) / std
x_test = (x_test - mea) / std

In [25]:
# Import every Keras symbol from `tensorflow.keras`. Mixing layers from the
# standalone `keras` package with `tensorflow.keras` models can fail when the
# two installed packages do not correspond to the same implementation.
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, GRU
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError
from tensorflow.keras.models import Sequential

In [27]:
def get_model(params, input_shape):
    """Build and compile a stacked-GRU regression model.

    The network is a stack of GRU layers, each followed by Dropout, and a
    single linear output unit. All GRU layers except the last return full
    sequences so that they can be stacked.

    Parameters
    ----------
    params : dict
        Required keys: ``"gru_units"`` (units per GRU layer), ``"dropout"``
        (dropout rate) and ``"loss"`` (loss passed to ``compile``).
        Optional keys: ``"gru_layers"`` (number of GRU/Dropout pairs,
        default 4) and ``"learning_rate"`` (Adam learning rate, default 0.003).
        Other keys are ignored here; the optimizer is always Adam.
    input_shape : int
        Length of the first input axis; each sample is declared with shape
        ``(input_shape, 1)``.

    Returns
    -------
    A compiled ``Sequential`` model that reports RMSE and MAE as metrics.

    Raises
    ------
    ValueError
        If ``"gru_layers"`` is smaller than 1.
    """
    n_layers = params.get("gru_layers", 4)
    if n_layers < 1:
        raise ValueError("params['gru_layers'] must be at least 1")
    learning_rate = params.get("learning_rate", 0.003)

    model = Sequential()
    for layer_idx in range(n_layers):
        # Only the first layer declares the input shape.
        layer_kwargs = {"input_shape": (input_shape, 1)} if layer_idx == 0 else {}
        model.add(GRU(
            units=params["gru_units"],
            # The last GRU emits only its final state for the Dense head.
            return_sequences=layer_idx < n_layers - 1,
            **layer_kwargs,
        ))
        model.add(Dropout(rate=params["dropout"]))

    model.add(Dense(1, activation="linear"))

    model.compile(
        loss=params["loss"],
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        metrics=[RootMeanSquaredError(), MeanAbsoluteError()],
    )

    return model

In [28]:
# Hyper-parameters shared by model construction, early stopping and training.
params = dict(
    loss="mean_squared_error",
    optimizer="adam",
    dropout=0.2,
    gru_units=138,
    epochs=100,
    batch_size=128,
    es_patience=10,
)

# The model's input length is the number of feature columns.
n_features = x_train.shape[1]
model = get_model(params, n_features)

In [29]:
# Stop training once the validation RMSE has not improved (decreased) for
# `es_patience` consecutive epochs.
es_callback = tf.keras.callbacks.EarlyStopping(
    patience=params["es_patience"],
    monitor='val_root_mean_squared_error',
    mode='min',
)

In [None]:
# Train with early stopping. The validation metrics that drive early stopping
# are computed on a slice held out from the TRAINING data, not on the test
# split: using (x_test, y_test) here would let the test data choose the
# stopping epoch and make any later test score optimistic.
# Keras takes the validation slice from the end of the arrays before shuffling.
model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    epochs=params["epochs"],
    batch_size=params["batch_size"],
    verbose=1,
    callbacks=[es_callback],
)

Epoch 1/100