This is part 2 of the Rossmann Sales prediction series. It deals only with finding the best model for predicting Rossmann sales.


To learn more about the preprocessing steps, check out https://www.kaggle.com/amithanayak/cleaning-feature-engg-eda-rossmann-sales

In [None]:
#import required libraries
from sklearn.model_selection import train_test_split 
from sklearn import preprocessing
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the cleaned, feature-engineered Rossmann dataset; the first CSV column
# becomes the frame's index.
DATA_PATH = "../input/cleaning-feature-engg-eda-rossmann-sales/final_RossmannSales.csv"
df = pd.read_csv(DATA_PATH, index_col=0)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Encode the State column as integer codes.
# pd.factorize numbers the distinct values 0, 1, 2, ... in order of first
# appearance in a single pass. Unlike replacing one value at a time in place,
# a freshly assigned code can never be mistaken for a raw value that has not
# been encoded yet. Missing values, if any, receive the code -1.
df["State"] = pd.factorize(df["State"])[0].astype(int)

In [None]:
# Order the rows by Date so that the positional train/test split that follows
# puts earlier rows in train and later rows in test.
# NOTE(review): Date is not parsed as a datetime when the CSV is read; if it is
# stored as text this sort is lexicographic — confirm the format sorts
# chronologically (e.g. YYYY-MM-DD).
df = df.sort_values(by="Date")

In [None]:
# (rows, columns) of the sorted frame.
df.shape

In [None]:
# Positional 80/20 split of the sorted frame: the first 80% of rows train the
# models, the remaining 20% are held out for testing. Date, the Sales target
# and CompetitionOpenSince are excluded from the feature matrices.
split_at = int(df.shape[0] * 80 / 100)
non_feature_cols = ["Date", "Sales", "CompetitionOpenSince"]
train_part, test_part = df[:split_at], df[split_at:]

X_train = train_part.drop(columns=non_feature_cols)
Y_train = train_part["Sales"]
X_test = test_part.drop(columns=non_feature_cols)
Y_test = test_part["Sales"]

In [None]:
# Sanity-check the shapes of the four splits (printed space-separated).
print(*(part.shape for part in (X_train, Y_train, X_test, Y_test)))

## Using Linear Regression to predict 'Sales'

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit an ordinary least-squares linear regression on the training split as a
# baseline model.
linear_reg_model=LinearRegression()
linear_reg_model.fit(X_train,Y_train)

In [None]:
# Predict on the held-out rows and put predictions next to the true sales in
# one frame with a fresh 0..n-1 index.
y_pred = linear_reg_model.predict(X_test)
pred_df = pd.DataFrame({'Predictions': y_pred, 'Actual': Y_test}).reset_index(drop=True)
pred_df.head()

In [None]:
# Compare actual and predicted sales for the first 25 test rows.
# Positional .iloc slicing is used because a tuple-style key such as [:25,]
# is only meaningful on a MultiIndex Series and is rejected by recent pandas.
plt.figure(figsize=(10, 10))
pred_df["Actual"].iloc[:25].plot.line()
pred_df["Predictions"].iloc[:25].plot.line()
plt.legend()
plt.show()

In [None]:
# R^2 (coefficient of determination) of the linear model on the test split.
linear_reg_model.score(X_test,Y_test)

## Using Lasso Regression to predict 'Sales'

In [None]:
from sklearn.linear_model import Lasso
# L1-regularised linear regression with penalty strength alpha=2, fitted on
# the same training split as the plain linear model.
Lasso_reg_model = Lasso(alpha=2)
Lasso_reg_model.fit(X_train,Y_train)

In [None]:
# Predict with the Lasso model and pair predictions with the true sales in a
# frame re-indexed from 0.
y_pred = Lasso_reg_model.predict(X_test)
pred_df = pd.DataFrame({'Predictions': y_pred, 'Actual': Y_test}).reset_index(drop=True)
pred_df.head()

In [None]:
# Compare actual and Lasso-predicted sales for the first 25 test rows.
# Positional .iloc slicing is used because a tuple-style key such as [:25,]
# is only meaningful on a MultiIndex Series and is rejected by recent pandas.
plt.figure(figsize=(10, 10))
pred_df["Actual"].iloc[:25].plot.line()
pred_df["Predictions"].iloc[:25].plot.line()
plt.legend()
plt.show()

In [None]:
# R^2 (coefficient of determination) of the Lasso model on the test split.
Lasso_reg_model.score(X_test,Y_test)

## Using Gradient Boosted Decision Trees to predict 'Sales'

In [None]:
import lightgbm as lgb

# Gradient-boosted decision trees on the same train split.
# The leaf limit is spelled `num_leaves` in LightGBM; `n_leaves` is not a
# recognised parameter or alias, so passing it left the default leaf count in
# effect instead of the intended 50.
base_lgb_model = lgb.LGBMRegressor(
    num_leaves=50,
    n_estimators=700,
    max_depth=-1,  # no depth limit; tree size is bounded by num_leaves
    learning_rate=0.3,  #0.1
    subsample=1,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1,
)

base_lgb_model.fit(X_train, Y_train)

In [None]:
# Predict with the boosted-tree model and pair predictions with the true
# sales in a frame re-indexed from 0.
y_pred = base_lgb_model.predict(X_test)
pred_df = pd.DataFrame({'Predictions': y_pred, 'Actual': Y_test}).reset_index(drop=True)
pred_df.head()

In [None]:
# Compare actual and boosted-tree-predicted sales for the first 25 test rows.
# Positional .iloc slicing is used because a tuple-style key such as [:25,]
# is only meaningful on a MultiIndex Series and is rejected by recent pandas.
plt.figure(figsize=(10, 10))
pred_df["Actual"].iloc[:25].plot.line()
pred_df["Predictions"].iloc[:25].plot.line()
plt.legend()
plt.show()

In [None]:
# R^2 (coefficient of determination) of the boosted-tree model on the test split.
base_lgb_model.score(X_test,Y_test)

## Time Series Analysis

In [None]:
import random

# Columns used for the single-store time-series exploration.
time_features = df[["Year", "Month", "Day", "Week", "Season", "Promo", "Open", "Store", "Sales", "Date"]]
# Choose a random store from the ids actually present in the data, so the
# selection can never come back empty if some ids are missing.
x = random.choice(time_features["Store"].unique().tolist())
print(x)
time_features = time_features.loc[time_features['Store'] == x]

In [None]:
# Sales by month for the selected store, restricted to 2013.
plt.figure(figsize=(15, 15))
sns.set(style="whitegrid")
sales_2013 = time_features[time_features["Year"] == 2013]
sns.lineplot(data=sales_2013, x="Month", y="Sales")

In [None]:
# Sales by month for the selected store, restricted to 2014.
plt.figure(figsize=(15, 15))
sns.set(style="whitegrid")
sales_2014 = time_features[time_features["Year"] == 2014]
sns.lineplot(data=sales_2014, x="Month", y="Sales")

In [None]:
# Sales by month for the selected store, restricted to 2015.
plt.figure(figsize=(15, 15))
sns.set(style="whitegrid")
sales_2015 = time_features[time_features["Year"] == 2015]
sns.lineplot(data=sales_2015, x="Month", y="Sales")

As there are no clear cyclic trends in Sales over time, building an RNN seems like a moot point.

In [None]:
import tensorflow as tf
import keras
import keras.backend as kb

In [None]:
# Separate the regression target (Sales) from the model inputs; Date is
# dropped as well, so only the remaining columns are fed to the network.
time_target = time_features["Sales"]
time_features = time_features.drop(["Sales", "Date"], axis=1)

In [None]:
# Positional 80 / 15 / 5 split of the selected store's rows into
# train / test / validation.
# Both boundaries are derived from the store-level frame. Bounding the test
# slice by the size of the full dataset `df` would make it run to the end of
# the store's rows and swallow the whole validation set.
n_rows = time_features.shape[0]
train_end = int(n_rows * 80 / 100)
test_end = int(n_rows * 95 / 100)

X_train = time_features.iloc[:train_end]
Y_train = time_target.iloc[:train_end]
X_test = time_features.iloc[train_end:test_end]
Y_test = time_target.iloc[train_end:test_end]
X_val = time_features.iloc[test_end:]
Y_val = time_target.iloc[test_end:]

In [None]:
# Embedding -> two stacked LSTMs -> dense regression head for single-store sales.
time_model = keras.models.Sequential()
# Each of the 8 feature columns is looked up in one shared 2500-entry
# embedding table.
# NOTE(review): assumes every feature value is an integer in [0, 2500) — confirm.
time_model.add(keras.layers.Embedding(input_dim=2500, output_dim=4, input_length=8))
time_model.add(keras.layers.LSTM(256, return_sequences=True))
# The last recurrent layer returns only its final state, so the network emits
# one value per sample to match the scalar Sales target (returning the full
# sequence would give an output of shape (batch, 8, 1)).
time_model.add(keras.layers.LSTM(256))
time_model.add(keras.layers.Dense(64, activation="relu"))
time_model.add(keras.layers.Dense(1))

In [None]:
# Optimise mean squared error with Adam and report mean absolute error
# alongside the loss.
time_model.compile(
    loss="mean_squared_error",
    optimizer="adam",
    metrics=["mean_absolute_error"],
)

In [None]:
# Train for 30 epochs in batches of 50, validating on the validation split
# after each epoch; the returned history is kept for plotting.
hist = time_model.fit(
    X_train,
    Y_train,
    epochs=30,
    batch_size=50,
    validation_data=(X_val, Y_val),
)

In [None]:
# Evaluate on the test split and print every reported metric next to its name.
results = time_model.evaluate(X_test, Y_test)
for metric_name, metric_value in zip(time_model.metrics_names, results):
    print(metric_name, ":", metric_value)

In [None]:
# Training vs. validation loss per epoch (at most the first 30 epochs).
loss_curves = {'Train': hist.history['loss'], 'Val': hist.history['val_loss']}
for curve in loss_curves.values():
    plt.plot(curve[:30])
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(list(loss_curves), loc='upper right')
plt.show()