# Walmart Stores Sales Forecasting
**Kaan Akkartal**

# 1. Library and Data Loading

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
from sklearn.linear_model import LinearRegression
from sklearn import tree

In [None]:
# Read the four competition tables from the Kaggle input directory.
# Three of the files are zipped CSVs; pandas' default compression='infer'
# picks the codec from the ".zip" suffix.
stores = pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/stores.csv')
features = pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/features.csv.zip')
train = pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/train.csv.zip')
test = pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/test.csv.zip')

# 2. Data Observation

Four different tables are loaded. Looking at the first and last rows of the "Features" table, we observe that it begins on 2010-02-05 and ends on 2013-07-26.

In [None]:
features.head()

In [None]:
features.tail()

In [None]:
features.shape

In [None]:
features.info()

There are missing values in Markdown, CPI and Unemployment variables.

Below, the number of missing values per column is calculated. More than half of the observations are missing in the Markdown variables.

In [None]:
features.isnull().sum()

In [None]:
features.describe()

As in the "Features" table, the "Train" table starts on 2010-02-05, but it ends on 2012-10-26. The table contains the weekly sales amount and the holiday flag. In addition, there are store and department values.

In [None]:
train.head()

In [None]:
train.tail()

There are no missing values in "Train" table.

In [None]:
train.info()

"Test" table begins where "Train" table ends in terms of date. So it starts with "2012-11-02". 

In [None]:
test.head()

In [None]:
test.tail()

No missing values in "test" table. 

In [None]:
test.info()

In [None]:
stores.head()

In [None]:
stores.tail()

No missing values in "stores" table.

In [None]:
stores.info()

# 3. Data Manipulation

In [None]:
features

In [None]:
train

In [None]:
# Left-join the feature table onto the training rows. No `on=` is given, so
# pandas joins on every column name the two frames have in common.
df = train.merge(features, how="left")
df

In [None]:
# Attach the per-store attributes; again a left join on the shared column
# names, so every training row is kept.
df = df.merge(stores, how="left")
df

In [None]:
df.isnull().sum()

In [None]:
# Convert the Date column to datetime64 so it can serve as a time index.
df["Date"] = pd.to_datetime(df["Date"])

In [None]:
# Use the date as the row index (in place); "Date" stops being a regular column.
df.set_index("Date", inplace=True)

In [None]:
df.head()

In [None]:
df.index

Markdown variables are dropped because more than half of the observations are missing.

In [None]:
df.isnull().sum()

In [None]:
# Remove the five MarkDown columns in place, then preview the result.
# `columns=` already implies the column axis, so no separate `axis` is needed.
df.drop(
    columns=["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"],
    inplace=True,
)
df.head()

In [None]:
# Store and Dept are identifiers, not quantities: store them as categoricals.
df["Store"] = df["Store"].astype("category")
df["Dept"] = df["Dept"].astype("category")

In [None]:
df.dtypes

In [None]:
df.describe()

# 4. Data Visualization

In [None]:
# Scatter of weekly sales over time, with the x-axis clipped to the observed
# date range and the tick labels turned vertical.
ax = sns.scatterplot(x=df.index, y="Weekly_Sales", data=df)
first_date, last_date = df.index.min(), df.index.max()
ax.set_xlim(first_date, last_date)
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(90)

In [None]:
sns.boxplot(x = "Weekly_Sales", data = df);

In [None]:
# Show the observation(s) holding the largest weekly sales value.
# Series.max() is vectorised and skips NaN, whereas the builtin max() walks
# the whole Series element by element in Python.
df.Weekly_Sales[df.Weekly_Sales == df.Weekly_Sales.max()]

The top 5 sales take place in Thanksgiving week.

In [None]:
# Five largest weekly sales values, in descending order, as a plain list.
top5_sales = list(df.Weekly_Sales.nlargest(5).values)
top5_sales

In [None]:
# Full rows whose Weekly_Sales is one of the top-5 values, largest first.
df[df["Weekly_Sales"].isin(top5_sales)].sort_values("Weekly_Sales", ascending=False)

Boxplotting for each variable. 

In [None]:
# One box plot per column, laid out on a 3x5 grid of subplots.
# `subplots` has to be a real bool (or an iterable of column groups): recent
# pandas versions raise "subplots should be a bool or an iterable" for `1`.
# NOTE(review): non-numeric columns in the selection (categorical Store/Dept,
# string Type) are presumably skipped by pandas' box plotting - confirm.
df[['Store', 'Dept', 'Weekly_Sales', 'IsHoliday', 'Temperature',
       'Fuel_Price', 'CPI', 'Unemployment', 'Type', 'Size']].plot(kind='box',subplots=True,layout=(3,5),figsize=(14,12))

Because of possible outlier values, the "Weekly_Sales" and "Unemployment" variables are examined more closely. Unlike "Weekly_Sales", the "Unemployment" values are close to a normal distribution.

In [None]:
# Histogram of weekly sales, with the x-axis limited to 0..150000.
# The figure size must be configured BEFORE the figure is created; the
# original set it after plt.xlim() had already created the figure, so it
# never applied to this plot.
# NOTE(review): the original width was 1 inch, assumed to be a typo for 10
# (the width used by the other single-variable plots in this notebook).
rcParams['figure.figsize'] = 10,6.0
ax = sns.distplot(df.Weekly_Sales, kde = False)
ax.set_xlim(0, 150000);

In [None]:
rcParams['figure.figsize'] = 5,6.0
sns.distplot(df.Unemployment, kde = False);

In [None]:
rcParams['figure.figsize'] = 14,5
sns.lineplot(x = df.index, y = "Weekly_Sales", data = df);

Sales increase dramatically at year end. Christmas and Thanksgiving affect those values directly.

### **Barplot and lineplot diagrams for each variable**

In [None]:
rcParams['figure.figsize'] = 16.0,6.0
sns.barplot(x = "Store", y = "Weekly_Sales", data = df);

In [None]:
rcParams['figure.figsize'] = 16.0,6.0
sns.barplot(x = "Dept", y = "Weekly_Sales", data = df);

In [None]:
rcParams['figure.figsize'] = 10,6.0
sns.lineplot(x = "Fuel_Price", y = "Weekly_Sales", data = df);

In [None]:
rcParams['figure.figsize'] = 10,6.0
sns.lineplot(x = "CPI", y = "Weekly_Sales", data = df);

In [None]:
rcParams['figure.figsize'] = 10,6.0
sns.lineplot(x = "Unemployment", y = "Weekly_Sales", data = df);

In [None]:
rcParams['figure.figsize'] = 16,6.0
sns.lineplot(x = df.index, y = "Weekly_Sales", hue = "Type", data = df);

Sales rate by store type: (A>B>C). 

In [None]:
sns.boxplot(x = "Type", y = "Size", data = df);

Store size is also aligned with A>B>C. So it is correlated to sales rate.

In [None]:
sns.lineplot(x = "Size" , y = "Weekly_Sales", hue = "Type", data = df);

In [None]:
rcParams['figure.figsize'] = 10,6.0
sns.lineplot(x = "Temperature" , y = "Weekly_Sales", data = df);

In [None]:
sns.lineplot(x = df.index.month, y = "Weekly_Sales", data = df);

In [None]:
sns.lineplot(x = df.index.month, y = "Weekly_Sales", hue = "IsHoliday", data = df);

# 5. Correlation

In [None]:
# Correlation heatmap over the numeric (and boolean) columns only.
# At this point `Type` still holds strings and Store/Dept are categoricals;
# pandas >= 2.0 no longer drops such columns silently in DataFrame.corr() and
# raises instead. Selecting the dtypes explicitly works on old and new pandas.
sns.heatmap(
    df.select_dtypes(include=["number", "bool"]).corr(),
    vmin=-1,
    vmax=1,
    annot=True,
);

In [None]:
df.dtypes

In [None]:
# Treat the holiday flag and the store type as categorical features so that
# get_dummies() will one-hot encode them later on.
df["IsHoliday"] = df["IsHoliday"].astype("category")
df["Type"] = df["Type"].astype("category")

In [None]:
df.dtypes

# 6. Modelling

### Model Preparation

In [None]:
df

In [None]:
df_X = df.drop("Weekly_Sales", axis = 1)
df_X

In [None]:
df_Y = df.Weekly_Sales
df_Y

In [None]:
df_X = pd.get_dummies(df_X, drop_first = True)

In [None]:
# Move the Date index back into a regular column (inserted at position 0) and
# replace the index with a default integer range. The next cell relies on
# Date being the first column.
df_X.reset_index(inplace = True)

In [None]:
# Drop the first column by position. After the preceding reset_index() that
# column is Date, so the feature matrix no longer contains the date.
# NOTE: this is positional - running this cell twice would silently drop a
# real feature.
df_X = df_X.iloc[:,1:]
df_X

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. A fixed random_state makes the
# split - and therefore every score reported below - reproducible between runs.
# NOTE(review): this is a random split of time-ordered data, so rows from the
# same weeks land in both sets; a date-based split may give a more honest
# estimate of forecasting error.
df_X_train, df_X_validation, df_Y_train, df_Y_validation = train_test_split(
    df_X, df_Y, test_size=0.2, random_state=42
)

In [None]:
df_X_train.head()

In [None]:
df_X_validation

### ML Algorithms 

**LINEAR REGRESSION**

In [None]:
lm = LinearRegression()

In [None]:
model = lm.fit(df_X_train, df_Y_train)

In [None]:
lm.score(df_X_validation, df_Y_validation)

In [None]:
lm.score(df_X_train, df_Y_train)

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

In [None]:
cross_val_score(model, df_X_train, df_Y_train, cv = 10, scoring = "r2").mean()

In [None]:
# 10-fold cross-validated RMSE on the training split: scikit-learn reports the
# negated MSE per fold, so flip the sign, take the root, then average the folds.
cv_neg_mse = cross_val_score(
    model,
    df_X_train,
    df_Y_train,
    cv=10,
    scoring="neg_mean_squared_error",
)
np.sqrt(-cv_neg_mse).mean()

In [None]:
y_pred = lm.predict(df_X_validation)

In [None]:
y_pred

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
mse = mean_squared_error(df_Y_validation, y_pred)
mse

In [None]:
rmse = np.sqrt(mse)
rmse

In [None]:
df_Y.values.mean()

**DECISION TREE**

In [None]:
clf = tree.DecisionTreeRegressor()

In [None]:
model = clf.fit(df_X_train, df_Y_train)

In [None]:
clf.score(df_X_validation, df_Y_validation)

In [None]:
cross_val_score(model, df_X_train, df_Y_train, cv = 10, scoring = "r2").mean()

In [None]:
y_pred = clf.predict(df_X_validation)

In [None]:
mse = mean_squared_error(df_Y_validation, y_pred)
mse

In [None]:
rmse = np.sqrt(mse)
rmse

 **XGBoost**

In [None]:
from xgboost import XGBRegressor

In [None]:
xgb_model = XGBRegressor().fit(df_X_train, df_Y_train)

In [None]:
xgb_model.score(df_X_validation, df_Y_validation)

In [None]:
y_pred = xgb_model.predict(df_X_validation)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
np.sqrt(mean_squared_error(df_Y_validation, y_pred))

 **Light GBM**

In [None]:
from lightgbm import LGBMRegressor

In [None]:
lgbm = LGBMRegressor()
lgbm_model = lgbm.fit(df_X_train, df_Y_train)

In [None]:
lgbm_model.score(df_X_validation, df_Y_validation)

In [None]:
y_pred = lgbm_model.predict(df_X_validation)

In [None]:
rmse = np.sqrt(mean_squared_error(df_Y_validation, y_pred))
rmse

**Model Validation and Tuning**

In [None]:
lgbm_grid = {'n_estimators': [20, 40, 100, 200, 500, 1000],
             'learning_rate': [0.1, 0.01, 0.5]}

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
lgbm = LGBMRegressor()
lgbm_cv_model = GridSearchCV(lgbm, lgbm_grid, cv=5, n_jobs = -1, verbose = 2)

In [None]:
lgbm_cv_model.fit(df_X_train, df_Y_train)

In [None]:
lgbm_cv_model.best_params_

In [None]:
# Build the final model from whatever the grid search actually selected,
# instead of hard-coding values copied from one earlier run
# (previously learning_rate=0.5, n_estimators=1000).
lgbm_tuned = LGBMRegressor(**lgbm_cv_model.best_params_)

lgbm_tuned = lgbm_tuned.fit(df_X_train, df_Y_train)

In [None]:
y_pred = lgbm_tuned.predict(df_X_validation)

In [None]:
np.sqrt(mean_squared_error(df_Y_validation, y_pred))

In [None]:
lgbm_tuned.score(df_X_validation, df_Y_validation)

Linear Regression, Decision Tree, XGBoost and Light GBM models are applied. The best RMSE value is observed with the tuned Light GBM model (4645). R2 is 96%. Therefore, the final prediction will be done with the tuned Light GBM model.

### Prediction

The "Test", "Features" and "Stores" tables are merged.

In [None]:
test.head()

In [None]:
features.head()

In [None]:
stores.head()

In [None]:
# Inner-join the feature table onto the test rows, on the column names the
# two frames share; test rows without a matching feature row are dropped.
df_test = test.merge(features, how="inner")
df_test

In [None]:
# Add the per-store attributes to the test rows (inner join on shared columns).
df_test = df_test.merge(stores, how="inner")
df_test

In [None]:
df_test.info()

Markdown variables were not in the model, so they are also dropped in "test" table. 

In [None]:
# Remove the five MarkDown columns from the test frame as well, in place.
df_test.drop(
    columns=["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"],
    inplace=True,
)
df_test.head()

In [None]:
df_test.info()

In [None]:
# Parse the dates and use them as the row index, mirroring the training frame.
df_test["Date"] = pd.to_datetime(df_test["Date"])
df_test.set_index("Date", inplace=True)
df_test.head()

In [None]:
# Same categorical conversions that were applied to the training frame.
df_test["IsHoliday"] = df_test["IsHoliday"].astype("category")
df_test["Store"] = df_test["Store"].astype("category")
df_test["Dept"] = df_test["Dept"].astype("category")
df_test["Type"] = df_test["Type"].astype("category")

In [None]:
df_test.dtypes

In [None]:
df_test.describe()

In [None]:
# Share of missing values in the two economic indicators, computed from the
# data itself rather than from row counts copied by hand out of info()
# (the original expression was (115064-76902)/115064).
df_test[["CPI", "Unemployment"]].isnull().mean()

The missing value rate of the CPI and Unemployment variables is high. Since they are included in the model, the observations with missing values are deleted.

In [None]:
df_test

In [None]:
# Discard, in place, every test row that has a missing value in any column.
# No prediction will be produced for the rows removed here.
df_test.dropna(axis=0, how="any", inplace=True)

In [None]:
df_test.head()

In [None]:
df_test.info()

In [None]:
# NOTE: this binds a second name to the SAME DataFrame - no copy is made.
# The in-place reset_index() applied to dftest_X further down therefore also
# changes df_test (integer index, "Date" as a column), and the later
# "Prediction_Sales" assignment and the final lineplot on df_test.Date rely
# on that side effect. Do not replace this with .copy() without adapting them.
dftest_X = df_test 

In [None]:
dftest_X.head()

In [None]:
# Move the Date index into a regular first column and switch to a default
# integer index. dftest_X is still the same object as df_test at this point,
# so df_test is modified as well.
dftest_X.reset_index(inplace = True)

In [None]:
dftest_X.head()

In [None]:
# Drop the first column by position (Date, after the preceding reset_index).
# The slice creates a new DataFrame, so from here on dftest_X is no longer
# the same object as df_test, which keeps its Date column.
dftest_X = dftest_X.iloc[:,1:]

In [None]:
dftest_X.head()

In [None]:
# One-hot encode the categorical test features the same way as the training
# features, then force exactly the training column set and order: categories
# present only in train become all-zero columns and test-only categories are
# dropped. Without this, a differing set of Store/Dept levels would give the
# model a matrix whose columns do not line up with what it was fitted on.
dftest_X = pd.get_dummies(dftest_X, drop_first = True)
dftest_X = dftest_X.reindex(columns = df_X.columns, fill_value = 0)

In [None]:
predictions = lgbm_tuned.predict(dftest_X)
predictions

In [None]:
predictions.dtype

In [None]:
predictions = pd.Series(predictions)

In [None]:
predictions.describe()

In [None]:
df_test

In [None]:
# Attach the predictions by position, not by index label. Assigning a Series
# aligns on the index, which only matched before because df_test happened to
# carry a fresh 0..n-1 index; np.asarray makes the row-for-row pairing explicit
# and also works if `predictions` is still a NumPy array.
df_test["Prediction_Sales"] = np.asarray(predictions)

In [None]:
df_test

In [None]:
df_test.describe()

In [None]:
# Predicted weekly sales over the test period (seaborn aggregates the rows
# that share a date).
sns.lineplot(x="Date", y="Prediction_Sales", data=df_test);