In [None]:
#import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the train, test, stores and features CSV files (in that order)
train, test, store, feature = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/walmart-recruiting-store-sales-forecasting/train.csv.zip",
        "../input/walmart-recruiting-store-sales-forecasting/test.csv.zip",
        "../input/walmart-recruiting-store-sales-forecasting/stores.csv",
        "../input/walmart-recruiting-store-sales-forecasting/features.csv.zip",
    )
)

In [None]:
# Report the (rows, columns) shape of each loaded table
for label, frame in (('train: ', train), ('feature: ', feature), ('stores ', store)):
    print(label, frame.shape)


In [None]:
# Inner-join train with feature on (Store, Date), then with store on Store.
sales_with_features = pd.merge(train, feature, how='inner', on=['Store', 'Date'])
df = pd.merge(sales_with_features, store, how='inner', on=['Store'])
print(df.shape)
df.head()

In [None]:
# The merge produced two holiday flags (IsHoliday_x / IsHoliday_y):
# keep the first under the plain name IsHoliday and discard the second.
df = df.rename(columns={'IsHoliday_x': 'IsHoliday'}).drop(columns=['IsHoliday_y'])
df.columns


In [None]:
# Shape (rows, columns) of the merged dataframe
df.shape

In [None]:
# Count missing values per column
df.isna().sum()

In [None]:
# Summary statistics of the dataframe
df.describe()

In [None]:
# Information about the dataframe: column data types and non-null counts
df.info()

# Exploratory Data Analysis

In [None]:
# Scatter of every weekly sales record against its store number
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(df['Store'], df['Weekly_Sales'])
ax.set_xlabel('Store')
ax.set_ylabel('Weekly Sales')
ax.set_title('45 stores weekly sales')

In [None]:
# Weekly sales distribution per department, split by the holiday flag (outliers hidden)
data = df[['Dept', 'Weekly_Sales', 'IsHoliday']].copy()
plt.figure(figsize=(22, 6))
plt.title('Box Plot of Weekly Sales by Department and Holiday')
fig = sns.boxplot(data=data, x='Dept', y='Weekly_Sales', hue="IsHoliday", showfliers=False)

In [None]:
# Weekly sales distribution per store number, coloured by store type (outliers hidden)
data_8 = df[['Store', 'Weekly_Sales', 'Type']].copy()
plt.figure(figsize=(20, 6))
plt.title('Box Plot of Weekly Sales by Store Number and Store Type')
fig = sns.boxplot(data=data_8, x='Store', y='Weekly_Sales', hue='Type', showfliers=False)

In [None]:
# Sales per department
fig = plt.figure(figsize=(20, 6))
dept_ax = sns.barplot(data=df, x='Dept', y='Weekly_Sales')
dept_ax.set_title('Weekly sales as per department')

In [None]:
# Weekly sales for holiday vs. non-holiday rows
holiday_ax = sns.barplot(data=df, x='IsHoliday', y='Weekly_Sales')
holiday_ax.set_title('Holiday vs Weekly Sales')
plt.show()

In [None]:
# Weekly sales per store type
type_ax = sns.barplot(data=df, x='Type', y='Weekly_Sales')
type_ax.set_title('Which type of store contribute more to Weekly Sales?')
plt.show()

In [None]:
# Weekly sales per store size
fig = plt.figure(figsize=(26, 10))
size_ax = sns.barplot(data=df, x='Size', y='Weekly_Sales')
size_ax.set_title('Size vs Weekly Sales')
plt.show()

In [None]:
# Weekly sales by department, split by the holiday flag.
# The holiday column is now actually used (hue) and the title names the chart
# type that is drawn (a bar plot, not a box plot).
data = df[['Dept', 'Weekly_Sales', 'IsHoliday']].copy()
plt.figure(figsize=(20,6))
plt.title('Bar Plot of Weekly Sales by Department and Holiday')
fig = sns.barplot(x='Dept', y='Weekly_Sales', hue='IsHoliday', data=data)

In [None]:
# Derive Year, Month and (ISO) Week columns from the Date column.
df['Date'] = pd.to_datetime(df['Date'])
df['Year']  = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week is the replacement. It returns a nullable UInt32,
# so cast to a plain int64 for downstream numeric code.
df['Week']  = df['Date'].dt.isocalendar().week.astype('int64')

# Plot of Weekly Sales against week number.
# NOTE(review): this draws one line through every row in frame order; aggregating
# per week first (e.g. a groupby mean) would probably give a more readable chart.
plt.figure(figsize=(15,5))
plt.title('Weekly Sales by Week')
plt.xlabel('Week')
plt.ylabel('Weekly Sales')
plt.plot(df.Week,df.Weekly_Sales)
plt.show()

In [None]:
# Preview the first rows of the dataframe
df.head()

# **Data preprocessing**

In [None]:
# Count missing values per column before cleaning
df.isnull().sum()

In [None]:
# 1. drop rows in which every column is null
# 2. drop columns in which every row is null
# 3. replace all remaining missing values with 0
df = (
    df.dropna(axis=0, how="all")
      .dropna(axis=1, how="all")
      .fillna(0)
)

In [None]:
# Count missing values per column after cleaning
df.isna().sum()

In [None]:
# Keep only rows whose Weekly_Sales is zero or positive (sales cannot be negative).
non_negative_sales = df['Weekly_Sales'] >= 0
df = df[non_negative_sales]
df.shape

In [None]:
# Heatmap of the pairwise correlation between attributes.
# Only numeric/boolean columns are correlated: since pandas 2.0 DataFrame.corr()
# no longer silently skips text or datetime columns and raises instead.
plt.figure(figsize=(20, 10))
numeric_columns = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_columns.corr(),annot=True)

# Time Series Modelling

In [None]:
# It is necessary to have the date columns present in the dataset to be available in the datetime format as in ARIMA model it is required to see the sales values in date-wise fashion.
# Hence converting the string formatted Date into datetime format.
df.Date = pd.to_datetime(df.Date,format='%Y-%m-%d')
df.index = df.Date
df = df.drop('Date', axis=1)
df = df.resample('MS').mean() # Resmapling the time series data with month starting first.

In [None]:
#weekly sales with respect to date as index
df['Weekly_Sales'].plot(figsize=(12,5))

In [None]:
# Train-Test splitting of time series data
train_data = df[:int(0.7*(len(df)))]
test_data = df[int(0.7*(len(df))):]
print('Train data:\n')
print(train_data.tail())
print('='*50,'\n')
print('Test data:\n')
print(test_data.head())
print('Train:', train_data.shape)
print('Test:', test_data.shape)


In [None]:
# ARIMA takes univariate data.
train_data = train_data['Weekly_Sales']
test_data = test_data['Weekly_Sales']
# Plot of Weekly_Sales with respect to years in train and test.
train_data.plot(figsize=(20,8), title= 'Weekly_Sales', fontsize=14)
test_data.plot(figsize=(20,8), title= 'Weekly_Sales', fontsize=14)
plt.show()


In [None]:
# Decomposition of time series data. It is necessary to see whether the trend, seasonality and residual are present in data or not.
from statsmodels.tsa.seasonal import seasonal_decompose
result = seasonal_decompose(df['Weekly_Sales'], model='additive')
plt.figure(figsize=(20,6))
result.plot()
plt.show()

In [None]:
# A check of sationarity of data using Dicky-Fuller test.
from statsmodels.tsa.stattools import adfuller
result = adfuller(df['Weekly_Sales'])
print('ADF Statistic: {}'.format(result[0]))
print('p-value: {}'.format(result[1]))
print('Critical Values:')
for key, value in result[4].items():
    print('\t{}: {}'.format(key, value))

In [None]:
# Install the pmdarima library, which provides the auto_arima model used below.
# NOTE(review): the version is not pinned; consider `%pip install pmdarima==<version>`
# so the package is installed into the kernel's own environment and runs stay reproducible.
!pip install pmdarima

In [None]:
# auto_arima model on train data.
# auto_arima searches the (p, q, P, Q) grid and returns the best model already
# fitted on train_data, so no separate .fit() call is needed afterwards.
# NOTE(review): `m` is left at its default of 1; per the pmdarima documentation a
# period of 1 means a non-seasonal fit, so the seasonal arguments below have no
# effect. Pass m=12 if yearly seasonality on the monthly series is intended - confirm.
from pmdarima.arima import auto_arima
model_auto_arima = auto_arima(train_data, trace=True,start_p=0, start_q=0, start_P=0, start_Q=0,
                  max_p=10, max_q=10, max_P=10, max_Q=10, seasonal=True,
                  stepwise=False, suppress_warnings=True, D=1, max_D=10,
                  error_action='ignore',approximation = False)
model_auto_arima


In [None]:
# Predict the test period: one forecast value per row of test_data
forecast = model_auto_arima.predict(n_periods=len(test_data))
# predict() may hand back a plain array or an indexed Series depending on the
# pmdarima version; np.asarray keeps only the values so they are aligned to the
# test dates by position, never by label.
forecast = pd.DataFrame({'Prediction': np.asarray(forecast)}, index=test_data.index)
# Plot train, test and forecast together (previously the figure was created but
# left empty because the plotting calls were commented out).
plt.figure(figsize=(20,6))
plt.title('Prediction of Weekly Sales using Auto ARIMA model', fontsize=20)
plt.plot(train_data, label='Train')
plt.plot(test_data, label='Test')
plt.plot(forecast, label='Prediction using ARIMA Model')
plt.legend(loc='best')
plt.xlabel('Date', fontsize=14)
plt.ylabel('Weekly Sales', fontsize=14)
plt.show()
print(forecast.head())
print(test_data.head())


In [None]:
# Error metrics of the ARIMA forecast against the held-out test values
import math
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
arima_mse = mean_squared_error(test_data, forecast)
print('Mean Squared Error (MSE) of ARIMA: ', arima_mse)
print('Root Mean Squared Error (RMSE) of ARIMA: ', math.sqrt(arima_mse))
print('Mean Absolute Deviation (MAD) of ARIMA: ', mean_absolute_error(test_data, forecast))


# Machine Learning Models

In [None]:
# Build the test dataset: join test with store, then with feature
# (both joins use pandas' defaults: inner join on the shared column names).
test_stores = test.merge(store)
test = test_stores.merge(feature)
test.head()

In [None]:
#Numeric Encoding
# Convert the categorical 'Type' and boolean 'IsHoliday' columns to numbers.
# Only these two columns are passed through the lookup: DataFrame.applymap is
# deprecated (pandas 2.1+), running it over every cell is slow, and it also matches
# unrelated numeric 0/1 values because 0 == False and 1 == True as dict keys.
code_numeric = {"A": 1,"B": 2,"C": 3,False: 0,True: 1}


def encode_categoricals(frame):
    """Return a copy of `frame` with 'Type' and 'IsHoliday' mapped through code_numeric.

    A column that is absent from `frame` is skipped. Values that have no entry
    in code_numeric are left unchanged.
    """
    encoded = frame.copy()
    for column in ('Type', 'IsHoliday'):
        if column in encoded.columns:
            encoded[column] = encoded[column].map(lambda value: code_numeric.get(value, value))
    return encoded


df = encode_categoricals(df)
test = encode_categoricals(test)

In [None]:
#Train-test split data
from sklearn.model_selection import train_test_split
y = df['Weekly_Sales']
# 'Date' is dropped when it is a column; errors='ignore' keeps this cell from
# raising KeyError when an earlier cell already removed it or moved it to the index.
X = df.drop(columns=['Weekly_Sales', 'Date'], errors='ignore')
# Fixed seeds make both splits, and every metric computed from them, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) # Train:Test = 70:30 splitting.
X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, test_size=0.3, random_state=42) #Train:CV = 70:30 splitting.

In [None]:
#Random Forest Regressor model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor

# random_state pins the bootstrap samples and feature draws so the scores
# reported below are the same on every run.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [None]:
# Error metrics of the Random Forest predictions on the held-out split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
rf_mse = mean_squared_error(y_test, y_pred)
print("Random Forest Regressor")
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", rf_mse)
print("RMSE:", np.sqrt(rf_mse))
print("r2_Score:", r2_score(y_test, y_pred))

In [None]:
#Decision Tree Regressor model
# random_state fixes how ties between equally good splits are broken, so the
# fitted tree and the metrics below are reproducible.
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
print("Decision Tree Regressor")
print("MAE:",mean_absolute_error(y_test,y_pred_dt))
print("MSE:",mean_squared_error(y_test,y_pred_dt))
print("RMSE:",np.sqrt(mean_squared_error(y_test,y_pred_dt)))
print("r2_Score:",r2_score(y_test,y_pred_dt))

In [None]:
# K-nearest-neighbours regressor (k = 11): fit, predict, report errors
knn = KNeighborsRegressor(n_neighbors=11)
y_pred_knn = knn.fit(X_train, y_train).predict(X_test)
knn_mse = mean_squared_error(y_test, y_pred_knn)
print("KNN Regressor")
print("MAE:", mean_absolute_error(y_test, y_pred_knn))
print("MSE:", knn_mse)
print("RMSE:", np.sqrt(knn_mse))
print("r2_Score:", r2_score(y_test, y_pred_knn))

In [None]:
# Ordinary least-squares linear regression: fit, predict, report errors
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
y_pred_lr = lr.fit(X_train, y_train).predict(X_test)
lr_mse = mean_squared_error(y_test, y_pred_lr)
print("Linear Regression")
print("MAE:", mean_absolute_error(y_test, y_pred_lr))
print("MSE:", lr_mse)
print("RMSE:", np.sqrt(lr_mse))
print("r2_Score:", r2_score(y_test, y_pred_lr))

In [None]:
# Count missing values per column in the test data
test.isna().sum()

In [None]:
# Missing CPI / Unemployment values get their column mean; every other gap becomes 0
CPI_mean = test['CPI'].mean()
Unemp_mean = test['Unemployment'].mean()
test = test.fillna({'CPI': CPI_mean, 'Unemployment': Unemp_mean}).fillna(0)
# Parse Date as datetime, then derive day-of-week, month and year from it
test['Date'] = pd.to_datetime(test['Date'])
test = test.assign(
    Date_dayofweek=test['Date'].dt.dayofweek,
    Date_month=test['Date'].dt.month,
    Date_year=test['Date'].dt.year,
)
test.head()

In [None]:
# Build the prediction matrix with exactly the columns the model is trained on,
# in the same order. The training features carry Year / Month / Week, so the same
# three are derived here from the test dates. Passing differently named or ordered
# columns makes scikit-learn raise (or, on old versions, silently mis-assign features).
test_dates = pd.to_datetime(test['Date'])
test_kaggle = test.assign(
    Year=test_dates.dt.year,
    Month=test_dates.dt.month,
    Week=test_dates.dt.isocalendar().week.astype('int64'),
)
missing_features = [column for column in X_train.columns if column not in test_kaggle.columns]
if missing_features:
    raise KeyError('test data lacks training feature(s): {}'.format(missing_features))
test_kaggle = test_kaggle[list(X_train.columns)]  # this selection also leaves out 'Date'
# Fit on the training split; random_state keeps the submission reproducible.
model_rf = RandomForestRegressor(n_estimators=80, random_state=42).fit(X_train,y_train)
y_pred = model_rf.predict(test_kaggle) # Predict the final test data that Kaggle has provided.

In [None]:
# Assemble the Kaggle submission: Id is "<Store>_<Dept>_<Date>", and Weekly_Sales is
# the Random Forest prediction held in y_pred.
submission_ids = test.Store.astype(str).str.cat(
    [test.Dept.astype(str), test.Date.astype(str)], sep='_'
)
submission = pd.DataFrame({"Id": submission_ids, "Weekly_Sales": y_pred})
# Write the final submission file without the index column.
submission.to_csv('Weekly Sales Prediction.csv', index=False)