In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
print("Setup Complete")

In [None]:
# Locations of the competition CSV files; they all live in one Kaggle input folder.
input_dir = "../input/store-sales-time-series-forecasting"
train_filepath = f"{input_dir}/train.csv"
oil_filepath = f"{input_dir}/oil.csv"
holidays_filepath = f"{input_dir}/holidays_events.csv"
transactions_filepath = f"{input_dir}/transactions.csv"
stores_filepath = f"{input_dir}/stores.csv"



In [None]:
# Read the CSV files into DataFrames.
# train is indexed by its 'id' column; the other three are indexed by 'date'.
# Only oil and holidays ask read_csv to parse that index as dates.
train_data = pd.read_csv(train_filepath, index_col="id")
oil_data, holidays_data = (
    pd.read_csv(csv_path, index_col="date", parse_dates=True)
    for csv_path in (oil_filepath, holidays_filepath)
)
transactions_data = pd.read_csv(transactions_filepath, index_col="date")



In [None]:
# Convert every 'date' field to pandas datetime.
# train_data keeps 'date' as a regular column (its index is 'id'). The other
# three frames were read with index_col="date", so their dates live in the
# index; indexing them with ['date'] raises KeyError, hence the index is
# converted instead.
DATE_FORMAT = "%Y-%m-%d"
train_data['date'] = pd.to_datetime(train_data['date'], format=DATE_FORMAT)
for dated_frame in (oil_data, holidays_data, transactions_data):
    dated_frame.index = pd.to_datetime(dated_frame.index, format=DATE_FORMAT)

In [None]:
# Preview the first rows of the training data.
train_data.head()

In [None]:
# Preview the last rows of the training data.
train_data.tail()

In [None]:
# Preview the first rows of the oil price data.
oil_data.head()


# **Data Preparation with EDAs**

The following EDA provides an overview of the data and data relationships 

In [None]:
# Bar chart of sales for each product family.
fig, ax = plt.subplots(figsize=(16, 6))
sns.barplot(data=train_data, x='sales', y='family', ax=ax)
ax.set_title('Sales per product family')
ax.set_xlabel('Sales')
ax.set_ylabel('Product Family')



In [None]:
# Line chart of the oil price series over its date index.
fig, ax = plt.subplots(figsize=(16, 6))
sns.lineplot(data=oil_data['dcoilwtico'], label="Oil price", ax=ax)
ax.set_title('Ecuador Oil Price')
ax.set_xlabel('Date')
ax.set_ylabel('Price')

In [None]:
# Scatter plot of sales against the onpromotion value of each training row.
fig, ax = plt.subplots(figsize=(16, 6))
sns.scatterplot(data=train_data, x='onpromotion', y='sales', ax=ax)
ax.set_title('Promotion and Sales from 2013 to 2017')
ax.set_xlabel('Discount Promotion')
ax.set_ylabel('Sales')


Prepare relevant data for Regression and Machine Learning.
Address NaN cell values.
Add a dummy variable for holidays.

In [None]:
# Attach the oil price to every training row; the left join keeps all sales rows.
train_data_m1 = pd.merge(train_data, oil_data, how='left', on='date')
train_data_m1

In [None]:
# Preview the first rows of the merged sales + oil table.
train_data_m1.head()

In [None]:
# Preview the last rows of the merged sales + oil table.
train_data_m1.tail()

In [None]:
# Daily oil price: mean of dcoilwtico over the rows of each date.
oil_price = train_data_m1.groupby('date')['dcoilwtico'].mean()
oil_price

In [None]:
# Daily average sales: mean of the sales column over the rows of each date.
ave_sales = train_data_m1.groupby('date')['sales'].mean()
ave_sales

In [None]:
# Show the daily average sales as a one-column DataFrame.
# The result is only displayed; ave_sales itself stays a Series.
ave_sales.to_frame()

In [None]:
# Show the daily oil price as a one-column DataFrame.
# The result is only displayed; oil_price itself stays a Series.
oil_price.to_frame()

In [None]:
# Put average sales and oil price side by side, aligned on their shared date index.
ave_oil_sales = ave_sales.to_frame().join(oil_price)
ave_oil_sales

In [None]:
# Fill gaps in the oil price: carry the previous value forward, then back-fill
# whatever is still missing at the start of the series.
# ffill()/bfill() replace fillna(method=...), which recent pandas deprecates.
ave_oil_sales['dcoilwtico'] = ave_oil_sales['dcoilwtico'].ffill().bfill()
ave_oil_sales

**Diagnostics of Variables**

In [None]:
#Ave Sales Diagnostics

import statsmodels.api as sm
from scipy.stats import norm  # no longer needed in this cell; kept so later cells that reference it still run
import pylab

from statsmodels.stats.diagnostic import lilliefors

# Run the diagnostics on the observed daily average sales. Previously the
# series was replaced by random normal draws, so the plot and the test
# described random numbers instead of the data.
ave_sales_values = ave_sales.dropna().to_numpy()

# Q-Q plot: fit=True standardises the sample so it is comparable with the 45-degree line.
sm.qqplot(ave_sales_values, fit=True, line='45')
pylab.show()

# Lilliefors normality test on the same observations.
statistic,pvalue = lilliefors(ave_sales_values)
print('statistic=%.3f, p=%.3f\n' %  (statistic, pvalue))
if pvalue > 0.05:
    print('Probably Gaussian')
else:
    print('Probably not Gaussian')
        

*Lilliefors Test for Normality*

The Lilliefors test is a normality test based on the Kolmogorov–Smirnov test. Like the Q–Q plot above, it is used to check whether the data come from a normal distribution.

If the p-value ≤ 0.05, we reject the null hypothesis, i.e. we conclude that the distribution of our variable is not normal/Gaussian.
If the p-value > 0.05, we fail to reject the null hypothesis, i.e. the data are consistent with a normal/Gaussian distribution (this does not prove normality).

In [None]:
# Fill missing oil prices: forward-fill first, then back-fill any leading gaps.
# ffill()/bfill() replace fillna(method=...), which recent pandas deprecates.
oil_price = oil_price.ffill().bfill()
oil_price


In [None]:
#Oil Price Diagnostics

# Use the observed (gap-filled) oil price series. Previously oil_price was
# overwritten with random normal draws, so the diagnostics did not describe the data.
oil_price_values = oil_price.dropna().to_numpy()

# Q-Q plot: fit=True standardises the sample so it is comparable with the 45-degree line.
sm.qqplot(oil_price_values, fit=True, line='45')
pylab.show()

# Lilliefors normality test on the same observations.
statistic,pvalue = lilliefors(oil_price_values)
print('statistic=%.3f, p=%.3f\n' %  (statistic, pvalue))
if pvalue > 0.05:
    print('Probably Gaussian')
else:
    print('Probably not Gaussian')

In [None]:
# Flag every row of the holidays/events table with a numeric 1.
# An integer (not the string '1') keeps the column numeric once the
# non-holiday dates are later filled with 0.
holidays_data['Dummy Holiday'] = 1
holidays_data

In [None]:
# Holiday flag as a Series indexed by date, with one entry per date.
# The holidays table may list several events on the same date; keeping a single
# row per date stops the later left merges on 'date' from duplicating sales rows.
dummy_holidays = holidays_data["Dummy Holiday"]
dummy_holidays = dummy_holidays[~dummy_holidays.index.duplicated(keep='first')]
dummy_holidays

In [None]:
# Show the holiday flag as a one-column DataFrame (display only; nothing is reassigned).
dummy_holidays.to_frame()

In [None]:
# Add the holiday flag to the daily sales/oil table; dates without a holiday get NaN.
final_df = pd.merge(ave_oil_sales, dummy_holidays, how='left', on='date')
final_df

In [None]:
# Replace NaN (dates without a holiday) with 0 and store the flag as integers,
# so the column is numeric for the correlation, mean and regression steps
# even if the flag was created as the string '1'.
final_df['Dummy Holiday'] = final_df['Dummy Holiday'].fillna(0).astype(int)
final_df

In [None]:
# Daily average of the onpromotion column, to be added to final_df.
ave_promotion = train_data_m1.groupby('date')['onpromotion'].mean()
ave_promotion

In [None]:
# Show the daily average promotion count as a one-column DataFrame (display only).
ave_promotion.to_frame()

In [None]:
# Add the daily average promotion count as a new column, matched on date.
final_df = pd.merge(final_df, ave_promotion, how='left', on='date')
final_df

In [None]:
# Ave_promotion Diagnostics

# Use the observed daily average promotion counts. Previously ave_promotion was
# overwritten with random normal draws, so the diagnostics did not describe the data.
ave_promotion_values = ave_promotion.dropna().to_numpy()

# Q-Q plot: fit=True standardises the sample so it is comparable with the 45-degree line.
sm.qqplot(ave_promotion_values, fit=True, line='45')
pylab.show()

# Lilliefors normality test on the same observations.
statistic,pvalue = lilliefors(ave_promotion_values)
print('statistic=%.3f, p=%.3f\n' %  (statistic, pvalue))
if pvalue > 0.05:
    print('Probably Gaussian')
else:
    print('Probably not Gaussian')

In [None]:
import numpy as np
import pylab
import scipy.stats as stats
import seaborn as sns

# Distribution of the observed daily average promotion counts (taken from final_df),
# not of random draws. histplot replaces the deprecated sns.distplot.
ax = sns.histplot(final_df['onpromotion'], kde=True, stat='density')

In [None]:
# Distribution of the observed (gap-filled) daily oil price from final_df,
# not of random draws. histplot replaces the deprecated sns.distplot.
ax = sns.histplot(final_df['dcoilwtico'], kde=True, stat='density')

In [None]:
# Distribution of the observed daily average sales from final_df,
# not of random draws. histplot replaces the deprecated sns.distplot.
ax = sns.histplot(final_df['sales'], kde=True, stat='density')

In [None]:
# Regression plot: overall average sales against average products on promotion.
fig, ax = plt.subplots(figsize=(16, 6))
sns.regplot(data=final_df, x='onpromotion', y='sales', ax=ax)
ax.set_title('Average Sales and Products on Promotion')
ax.set_xlabel('Average Products on Promotion')
ax.set_ylabel('Average Sales')

In [None]:
# Regression plot: overall average sales against the oil price.
# The title used to say "Products on Promotion" (copied from the previous plot);
# it now names the variable actually on the x-axis.
plt.figure(figsize=(16,6))
plt.title('Average Sales and Oil Price')
sns.regplot(x=final_df['dcoilwtico'], y=final_df['sales'])
plt.xlabel('Oil Price')
plt.ylabel('Average Sales')

Narrow the analysis on a specific product group - Grocery Sales

In [None]:
# Correlation matrix of the daily modelling table, drawn as an annotated heat map.
fig, ax = plt.subplots(figsize=(5, 3), dpi=150)
corr_all = final_df.corr()
sns.heatmap(corr_all, annot=True, ax=ax)

In [None]:
# Average sales for every (date, product family) pair.
ave_family_sales = train_data_m1.groupby(["date", "family"])['sales'].mean()
ave_family_sales

In [None]:
# Show the per-date, per-family averages as a DataFrame (display only).
ave_family_sales.to_frame()

In [None]:
# Keep only the rows whose product family is 'GROCERY I'.
is_grocery = train_data_m1['family'] == 'GROCERY I'
grocery_sales = train_data_m1[is_grocery]
grocery_sales

In [None]:
# Daily average of GROCERY I sales.
ave_grocery_sales = grocery_sales.groupby('date')['sales'].mean()
ave_grocery_sales

In [None]:
# Show the daily GROCERY I averages as a one-column DataFrame (display only).
ave_grocery_sales.to_frame()

In [None]:
# Daily oil price (mean of dcoilwtico per date) over the GROCERY I rows.
oil_prices = grocery_sales.groupby('date')['dcoilwtico'].mean()
oil_prices

In [None]:
# Put average grocery sales and oil price side by side, aligned on their shared date index.
oil_ave_grocery_sales = ave_grocery_sales.to_frame().join(oil_prices)
oil_ave_grocery_sales

In [None]:
# Fill gaps in the oil price: forward-fill first, then back-fill any leading gaps.
# ffill()/bfill() replace fillna(method=...), which recent pandas deprecates.
oil_ave_grocery_sales['dcoilwtico'] = oil_ave_grocery_sales['dcoilwtico'].ffill().bfill()
oil_ave_grocery_sales

In [None]:
# Add the holiday flag to the grocery table; dates without a holiday get NaN.
grocery_final_df = pd.merge(oil_ave_grocery_sales, dummy_holidays, how='left', on='date')
grocery_final_df

In [None]:
# Replace NaN (dates without a holiday) with 0 and store the flag as integers,
# so the column is numeric for the correlation and regression steps
# even if the flag was created as the string '1'.
grocery_final_df['Dummy Holiday'] = grocery_final_df['Dummy Holiday'].fillna(0).astype(int)
grocery_final_df

In [None]:
# Daily average of the onpromotion column within GROCERY I.
ave_grocery_promo = grocery_sales.groupby('date')['onpromotion'].mean()
ave_grocery_promo

In [None]:
# Add the daily average grocery promotion count as a new column, matched on date.
grocery_final_df = pd.merge(grocery_final_df, ave_grocery_promo, how='left', on='date')
grocery_final_df



In [None]:
# Regression plot: average grocery sales against average products on promotion.
fig, ax = plt.subplots(figsize=(16, 6))
sns.regplot(data=grocery_final_df, x='onpromotion', y='sales', ax=ax)
ax.set_title('Average Grocery Sales and Products on Promotion')
ax.set_xlabel('Average Products on Promotion')
ax.set_ylabel('Average Sales')

In [None]:
# Regression plot: average grocery sales against the oil price.
# The title used to say "Products on Promotion" (copied from the previous plot);
# it now names the variable actually on the x-axis.
plt.figure(figsize=(16,6))
plt.title('Average Grocery Sales and Oil Price')
sns.regplot(x=grocery_final_df['dcoilwtico'], y=grocery_final_df['sales'])
plt.xlabel('Oil Price')
plt.ylabel('Average Sales')

**Diagnostics for Grocery Sales, Promotion**

In [None]:
# Ave_grocery_sales Diagnostics

# Use the observed daily average grocery sales. Previously the series was
# overwritten with random normal draws, so the diagnostics did not describe the data.
ave_grocery_sales_values = ave_grocery_sales.dropna().to_numpy()

# Q-Q plot: fit=True standardises the sample so it is comparable with the 45-degree line.
sm.qqplot(ave_grocery_sales_values, fit=True, line='45')
pylab.show()

# Lilliefors normality test on the same observations.
statistic,pvalue = lilliefors(ave_grocery_sales_values)
print('statistic=%.3f, p=%.3f\n' %  (statistic, pvalue))
if pvalue > 0.05:
    print('Probably Gaussian')
else:
    print('Probably not Gaussian')

In [None]:
# ave_grocery_promo Diagnostics

# Use the observed daily average grocery promotion counts. Previously the series
# was overwritten with random normal draws, so the diagnostics did not describe the data.
ave_grocery_promo_values = ave_grocery_promo.dropna().to_numpy()

# Q-Q plot: fit=True standardises the sample so it is comparable with the 45-degree line.
sm.qqplot(ave_grocery_promo_values, fit=True, line='45')
pylab.show()

# Lilliefors normality test on the same observations.
statistic,pvalue = lilliefors(ave_grocery_promo_values)
print('statistic=%.3f, p=%.3f\n' %  (statistic, pvalue))
if pvalue > 0.05:
    print('Probably Gaussian')
else:
    print('Probably not Gaussian')

In [None]:
# Correlation matrix of the grocery modelling table, drawn as an annotated heat map.
fig, ax = plt.subplots(figsize=(5, 3), dpi=150)
corr_grocery = grocery_final_df.corr()
sns.heatmap(corr_grocery, annot=True, ax=ax)

# **Regression Analysis by OLS and Random Forest**

# **First: OLS Regression with Average Daily Sales (All products)**

In [None]:
# OLS regression of average total sales on the explanatory variables.
# Step 1: split final_df into the target (Y: the 'sales' column, kept as a
# one-column DataFrame) and the features (X: every other column).
Y = final_df[['sales']]
X = final_df.drop(columns=['sales'])

In [None]:
# Display the target frame.
Y

In [None]:
# Display the feature frame.
X

In [None]:
# Y is already a DataFrame at this point, so re-wrapping it leaves its contents unchanged.
Y = pd.DataFrame(Y)
Y

In [None]:
# X is already a DataFrame at this point, so re-wrapping it leaves its contents unchanged.
X = pd.DataFrame(X)
X

In [None]:
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

In [None]:
# Add an intercept column to the features for the statsmodels OLS fit.
# NOTE(review): X itself is overwritten, so the later train/test splits of X
# also carry this constant column.
X = sm.add_constant(X)
X

In [None]:
# Fit the OLS regression of average daily sales on the explanatory variables
# (both cast to float) and print the summary table.
ols_target = Y.astype(float)
ols_features = X.astype(float)
est = sm.OLS(ols_target, ols_features).fit()
print(est.summary())

# **Second: OLS Regression with Average Grocery Sales**

In [None]:
# Target for the grocery OLS regression: the 'sales' column as a one-column DataFrame.
Y_grocery = grocery_final_df[['sales']]
Y_grocery

In [None]:
# Features for the grocery regression: every column except the target.
X_grocery = grocery_final_df.drop(columns=['sales'])
X_grocery

In [None]:
# Add an intercept column to the grocery features for the statsmodels OLS fit.
# NOTE(review): X_grocery itself is overwritten, so the later train/test splits
# of X_grocery also carry this constant column.
X_grocery = sm.add_constant(X_grocery)
X_grocery

In [None]:
# Fit the grocery-sales OLS model (target and features cast to float) and print its summary.
grocery_target = Y_grocery.astype(float)
grocery_features = X_grocery.astype(float)
model_g = sm.OLS(grocery_target, grocery_features)
result = model_g.fit()
print(result.summary())

In [None]:
# Step 2: Import Library

from sklearn.model_selection import train_test_split

In [None]:
# Step 3: Perform an 80/20 train/test split (test_size = 0.2; the old comment said 70/30).
# random_state fixes the shuffle so the metrics below are reproducible across runs.

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [None]:
# Step 4: Check the dimensions of the four splits (train X, train Y, test X, test Y).

tuple(split.shape for split in (X_train, Y_train, X_test, Y_test))


In [None]:
# Step 4: Import library for Linear Regression Model

from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

# **Third: Random Forest Regression with Ave Daily Sales (all products)**

In [None]:
# Random Forest for Regression

from sklearn.preprocessing import StandardScaler

# Standardise the features: learn the scaling on the training split only,
# then apply it to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest on the training split and predict the test split.
regressor = RandomForestRegressor(n_estimators=100, random_state=42)
# Y_train is a one-column DataFrame; flatten it to 1-D so scikit-learn does not
# warn about a column-vector y.
regressor.fit(X_train, np.ravel(Y_train))
y_pred = regressor.predict(X_test)

In [None]:
# Actual sales of the test split.
Y_test

In [None]:
# Random-forest predictions for the test split.
y_pred

In [None]:
# Shape of the prediction array.
print(y_pred.shape)

**Print Metrics of Random Forest Regression**

In [None]:
from sklearn import metrics

# Error metrics of the random-forest predictions on the test split.
mae_all = metrics.mean_absolute_error(Y_test, y_pred)
print('Mean Absolute Error:', mae_all)
mse_all = metrics.mean_squared_error(Y_test, y_pred)
print('Mean Squared Error:', mse_all)
print('R2:', metrics.r2_score(Y_test, y_pred))
print('Explained Variance:', metrics.explained_variance_score(Y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(Y_test, y_pred)))

In [None]:
# Table of actual test-set sales, to be compared with the predictions.
# .copy() makes Y_test_copy an independent frame; plain assignment only
# created a second name for Y_test.
Y_test_copy = Y_test.copy()
df_pred = Y_test_copy.rename(columns={'sales' : 'Actual Values'})
df_pred

In [None]:
# Add the predictions and the absolute percentage difference from the actual values.
df_pred['Predicted Values'] = y_pred
abs_gap = abs(df_pred['Predicted Values'] - df_pred['Actual Values'])
df_pred['% Difference'] = abs_gap / df_pred['Actual Values'] * 100
df_pred

# **Fourth: Light GBM with Ave Sales**

In [None]:
# LGBM

from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as ltb

In [None]:
# Re-split the unscaled features for LightGBM: 80/20 train/test (test_size = 0.2;
# the old comment said 70/30). random_state makes the split reproducible.

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [None]:
# Fit a LightGBM regressor with default settings and predict the test split.

model = ltb.LGBMRegressor()
# The target is passed as the 1-D 'sales' Series instead of a one-column
# DataFrame, which is the shape the estimator expects for y.
model.fit(X_train.astype(float), Y_train['sales'].astype(float))
print(); print(model)

# expected_y stays a DataFrame with a 'sales' column; later cells rely on that.
expected_y  = Y_test.astype(float)
predicted_y = model.predict(X_test.astype(float))

In [None]:
# Actual sales of the test split (as floats).
expected_y

In [None]:
# Shape and values of the LightGBM predictions.
print(predicted_y.shape)
predicted_y

In [None]:
# R^2, then mean squared log error, of the LightGBM predictions.
lgbm_r2 = metrics.r2_score(expected_y, predicted_y)
print(lgbm_r2)
lgbm_msle = metrics.mean_squared_log_error(expected_y, predicted_y)
print(lgbm_msle)

In [None]:
# Comparison table of actual vs. predicted sales with the absolute % difference.
# Work on a copy: `df_lgbm = expected_y` made both names refer to the same
# DataFrame, so the added columns leaked into expected_y and re-running the
# metrics cell would then fail.
df_lgbm = expected_y.copy()
df_lgbm['Predicted Values'] = predicted_y
df_lgbm['% Difference'] = abs(df_lgbm['Predicted Values'] - df_lgbm['sales'])/df_lgbm['sales']*100

df_lgbm = df_lgbm.rename(columns={'sales' : 'Actual Values'})
df_lgbm

# **Fifth: Random Forest Regression with Average Grocery Sales**

In [None]:
# Random Forest for Grocery Sales
# Perform an 80/20 train/test split (test_size = 0.2; the old comment said 70/30).
# random_state makes the split reproducible.

X_grocery_train, X_grocery_test, Y_grocery_train, Y_grocery_test = train_test_split(X_grocery, Y_grocery, test_size = 0.2, random_state = 42)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the grocery features: learn the scaling on the training split only,
# then apply it to both splits.
sc = StandardScaler().fit(X_grocery_train)
X_grocery_train = sc.transform(X_grocery_train)
X_grocery_test = sc.transform(X_grocery_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 200-tree random forest on the grocery training split and predict the test split.
regressor = RandomForestRegressor(n_estimators=200, random_state=100)
# Y_grocery_train is a one-column DataFrame; flatten it to 1-D so scikit-learn
# does not warn about a column-vector y.
regressor.fit(X_grocery_train, np.ravel(Y_grocery_train))
y_grocery_pred = regressor.predict(X_grocery_test)

In [None]:
# Random-forest predictions for the grocery test split.
y_grocery_pred

In [None]:
# Shape of the prediction array.
print(y_grocery_pred.shape)

In [None]:
# Actual grocery sales of the test split.
Y_grocery_test

Print the Metrics of Random Forest Regression

In [None]:
from sklearn import metrics

# Error metrics of the grocery random-forest predictions, printed one per line.
grocery_scorers = {
    'Mean Absolute Error:': metrics.mean_absolute_error,
    'Mean Squared Error:': metrics.mean_squared_error,
    'R2:': metrics.r2_score,
    'Explained Variance:': metrics.explained_variance_score,
}
for metric_label, scorer in grocery_scorers.items():
    print(metric_label, scorer(Y_grocery_test, y_grocery_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(Y_grocery_test, y_grocery_pred)))

In [None]:
# Comparison table of actual vs. predicted grocery sales with the absolute % difference.
# Work on a copy: `df_grf = Y_grocery_test` made both names refer to the same
# DataFrame, so the added columns leaked into Y_grocery_test and re-running the
# metrics cell would then fail.
df_grf = Y_grocery_test.copy()
df_grf['Predicted Values'] = y_grocery_pred
df_grf['% Difference'] = abs(df_grf['Predicted Values'] - df_grf['sales'])/df_grf['sales']*100

df_grf = df_grf.rename(columns={'sales' : 'Actual Values'})
df_grf

# **Sixth: Light GBM with Average Grocery Sales**

In [None]:
# Light GBM for Grocery Sales (the old comment said Random Forest)
# Re-split the unscaled features: 80/20 train/test (test_size = 0.2; the old
# comment said 70/30). random_state makes the split reproducible.

X_grocery_train, X_grocery_test, Y_grocery_train, Y_grocery_test = train_test_split(X_grocery, Y_grocery, test_size = 0.2, random_state = 42)

In [None]:
# Fit a LightGBM regressor with default settings on the grocery data and predict the test split.
model = ltb.LGBMRegressor()
# The target is passed as the 1-D 'sales' Series instead of a one-column
# DataFrame, which is the shape the estimator expects for y.
model.fit(X_grocery_train.astype(float), Y_grocery_train['sales'].astype(float))
print(); print(model)

# expected_g_y stays a DataFrame with a 'sales' column; later cells rely on that.
expected_g_y  = Y_grocery_test.astype(float)
predicted_g_y = model.predict(X_grocery_test.astype(float))

In [None]:
# Actual grocery sales of the test split (as floats).
expected_g_y

In [None]:
# LightGBM predictions for the grocery test split.
predicted_g_y

In [None]:
# R^2, then mean squared log error, of the grocery LightGBM predictions.
grocery_r2 = metrics.r2_score(expected_g_y, predicted_g_y)
print(grocery_r2)
grocery_msle = metrics.mean_squared_log_error(expected_g_y, predicted_g_y)
print(grocery_msle)

In [None]:
# Comparison table of actual vs. predicted grocery sales with the absolute % difference.
# Work on a copy: `df_glgbm = expected_g_y` made both names refer to the same
# DataFrame, so the added columns leaked into expected_g_y and re-running the
# metrics cell would then fail.
df_glgbm = expected_g_y.copy()
df_glgbm['Predicted Values'] = predicted_g_y
df_glgbm['% Difference'] = abs(df_glgbm['Predicted Values'] - df_glgbm['sales'])/df_glgbm['sales']*100


df_glgbm = df_glgbm.rename(columns={'sales' : 'Actual Values'})
df_glgbm

# **Seventh: Random Forest Regression with Average Daily Sales, with daily transactions per store location used in the regression**

In [None]:
# Path to the reshaped transactions CSV (a separate input dataset from the competition files).

transact_filepath = "../input/transaction-data-final/transactions_data_F.csv"

In [None]:
# Read the reshaped transactions table, using its "Id" column as the index.
transact_data = pd.read_csv(transact_filepath, index_col="Id")

In [None]:
# Preview the table; the column values refer to store codes.
transact_data.head()

In [None]:
# Parse the 'date' column as datetimes in YYYY-MM-DD format.
transact_data['date'] = pd.to_datetime(transact_data['date'], format ="%Y-%m-%d")

In [None]:
# Column means of the transactions table (the next cell uses them to fill NaN).
transact_data.mean()

In [None]:
# Replace missing values with the mean of their own column.
# numeric_only=True keeps the datetime 'date' column out of the mean computation,
# so the fill values are well defined across pandas versions.
transact_data = transact_data.fillna(transact_data.mean(numeric_only=True))
transact_data

In [None]:
# Add the transactions table (one column per store number) to the daily table, matched on date.
final_df2 = pd.merge(final_df, transact_data, how='left', on="date")
final_df2

In [None]:
# Use the 'date' column as the index of the merged table.
final_df3 = final_df2.set_index('date')
final_df3

In [None]:
# Total number of NaN cells in the whole table.
final_df3.isnull().sum().sum()

In [None]:
# Fill NaN with the mean of each column, i.e. the mean daily number of
# transactions per store location.
# numeric_only=True restricts the means to numeric columns, so a non-numeric
# column cannot break the computation.
final_df3 = final_df3.fillna(final_df3.mean(numeric_only=True))
final_df3

In [None]:
# Re-count the NaN cells after filling; 0 means none are left.
final_df3.isnull().sum().sum()

In [None]:
# Step 1: split final_df3 into the target (the 'sales' column) and the features (everything else).
y_fin = final_df3['sales']
x_fin = final_df3.drop(columns=['sales'])

In [None]:
# Display the feature frame.
x_fin

In [None]:
# Display the target (a Series at this point).
y_fin

In [None]:
# Convert the target Series into a one-column DataFrame.
y_fin = pd.DataFrame(y_fin)
y_fin

In [None]:
# x_fin is already a DataFrame, so re-wrapping it leaves its contents unchanged.
x_fin = pd.DataFrame(x_fin)
x_fin

In [None]:
# Step 2: Import Library

from sklearn.model_selection import train_test_split

In [None]:
# Step 3: Perform a 70/30 train/test split.
# random_state fixes the shuffle so the metrics below are reproducible across runs.

x_fin_train, x_fin_test, y_fin_train, y_fin_test = train_test_split(x_fin, y_fin, test_size = 0.3, random_state = 42)

In [None]:
# Step 4: Import library for Linear Regression Model

from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features: learn the scaling on the training split only,
# then apply it to both splits.
sc = StandardScaler().fit(x_fin_train)
x_fin_train = sc.transform(x_fin_train)
x_fin_test = sc.transform(x_fin_test)

In [None]:
# Scaled training features.
x_fin_train

In [None]:
# Scaled test features.
x_fin_test

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 20-tree random forest on the training split and predict the test split.
regressor = RandomForestRegressor(n_estimators=20, random_state=0)
# y_fin_train is a one-column DataFrame; flatten it to 1-D so scikit-learn
# does not warn about a column-vector y.
regressor.fit(x_fin_train, np.ravel(y_fin_train))
y_fin_pred = regressor.predict(x_fin_test)

**Print Metrics of Random Forest Regression**

In [None]:
from sklearn import metrics

# Error metrics of the random-forest predictions (model with store transactions).
for metric_label, scorer in (
    ('Mean Absolute Error:', metrics.mean_absolute_error),
    ('Mean Squared Error:', metrics.mean_squared_error),
    ('R2:', metrics.r2_score),
    ('Explained Variance:', metrics.explained_variance_score),
):
    print(metric_label, scorer(y_fin_test, y_fin_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_fin_test, y_fin_pred)))

In [None]:
# Comparison table of actual vs. predicted sales with the absolute % difference.
# Work on a copy: `df_rfall = y_fin_test` made both names refer to the same
# DataFrame, so the added columns leaked into y_fin_test and re-running the
# metrics cell would then fail.
df_rfall = y_fin_test.copy()
df_rfall['Predicted Values'] = y_fin_pred
df_rfall['% Difference'] = abs(df_rfall['Predicted Values'] - df_rfall['sales'])/df_rfall['sales']*100


df_rfall = df_rfall.rename(columns={'sales' : 'Actual Values'})
df_rfall

# **Eighth: Light GBM Regression with Average Daily Sales, with daily transactions per store location used in the regression**

In [None]:
# LGBM

from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')

import lightgbm as ltb

In [None]:
# 70/30 train/test split of the unscaled features for LightGBM.
# random_state fixes the shuffle so the metrics below are reproducible across runs.
x_f_train, x_f_test, y_f_train, y_f_test = train_test_split(x_fin, y_fin, test_size = 0.3, random_state = 42)

In [None]:
# Fit a LightGBM regressor with default settings and predict the test split.

model = ltb.LGBMRegressor()
# The target is passed as the 1-D 'sales' Series instead of a one-column
# DataFrame, which is the shape the estimator expects for y.
model.fit(x_f_train.astype(float), y_f_train['sales'].astype(float))
print(); print(model)

# expected_y stays a DataFrame with a 'sales' column; later cells rely on that.
expected_y  = y_f_test.astype(float)
predicted_y = model.predict(x_f_test.astype(float))

In [None]:
# Actual sales of the test split (as floats).
expected_y

In [None]:
# LightGBM predictions for the test split.
predicted_y

In [None]:
# R^2, then mean squared log error, of the LightGBM predictions (model with store transactions).
lgbm_all_r2 = metrics.r2_score(expected_y, predicted_y)
print(lgbm_all_r2)
lgbm_all_msle = metrics.mean_squared_log_error(expected_y, predicted_y)
print(lgbm_all_msle)

In [None]:
# Comparison table of actual vs. predicted sales with the absolute % difference.
# Work on a copy: `df_lgbmall = expected_y` made both names refer to the same
# DataFrame, so the added columns leaked into expected_y and re-running the
# metrics cell would then fail.
df_lgbmall = expected_y.copy()
df_lgbmall['Predicted Values'] = predicted_y
df_lgbmall['% Difference'] = abs(df_lgbmall['Predicted Values'] - df_lgbmall['sales'])/df_lgbmall['sales']*100


df_lgbmall = df_lgbmall.rename(columns={'sales' : 'Actual Values'})
df_lgbmall