In [112]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn import ensemble
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression,ridge_regression
import statsmodels.api as sm
from sklearn.tree import DecisionTreeRegressor
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show the names of the files available in the input directory.
print(os.listdir(path="../input"))

# Any results you write to the current directory are saved as output.


In [94]:
# Load the three input files. The Date column of the train and test sets is
# parsed into datetimes while reading; store.csv is read with default settings.
givenTrain = pd.read_csv(filepath_or_buffer="../input/train.csv", parse_dates=["Date"])
givenTest = pd.read_csv(filepath_or_buffer="../input/test.csv", parse_dates=["Date"])
givenStore = pd.read_csv(filepath_or_buffer="../input/store.csv")

In [95]:
# Normalise the StateHoliday code: rows holding the string "0" get the
# integer 0 in that column only. The column must be named explicitly via
# .loc -- assigning through a bare boolean mask (`frame[mask] = 0`) writes 0
# into EVERY column of the matching rows, wiping their Store, Sales, Open, ...
isStringZero = givenTrain["StateHoliday"] == "0"
givenTrain.loc[isStringZero, "StateHoliday"] = 0

In [96]:
# Number of missing values per column in the training set.
givenTrain.isna().sum()

In [97]:
# Number of missing values per column in the test set.
givenTest.isna().sum()

In [99]:
# Number of missing values per column in the store table.
givenStore.isna().sum()

Checking Data Sanity:
    Making sure no erroneous values exist, e.g. sales recorded for a store that is marked as closed.

In [100]:
# Display any rows that record non-zero sales although the store is flagged
# as closed (Open == 0); such rows would be inconsistent.
givenTrain.loc[givenTrain['Open'].eq(0) & givenTrain['Sales'].ne(0)]

For this analysis, only Stores that are open are considered.

In [101]:
# Keep only the rows where the store was open. An explicit copy is taken so
# that later in-place edits act on an independent frame rather than on a
# slice of givenTrain (which would raise SettingWithCopyWarning).
openStores = givenTrain.loc[givenTrain['Open'] == 1].copy()


Categorical values are converted to numerical values for analysis.

In [102]:
# Remove the Date column by reassignment rather than inplace=True: openStores
# is a filtered view of givenTrain, and mutating it in place triggers
# SettingWithCopyWarning. errors='ignore' keeps the cell re-runnable once the
# column has already been removed.
openStores = openStores.drop(columns='Date', errors='ignore')

In [103]:
# One-hot encode the non-numeric columns of both frames.
# The indicator dtype is pinned to uint8 so the dummy columns are numeric on
# every pandas version: pandas >= 2.0 produces bool columns by default, and a
# frame mixing bool with int/float columns is converted to an object array,
# which the statsmodels OLS fit further down rejects.
openStores = pd.get_dummies(openStores, dtype=np.uint8)
storeData = pd.get_dummies(givenStore, dtype=np.uint8)

In [104]:
# Inspect the column dtypes of the encoded store table.
storeData.dtypes

Merging training and store data

In [105]:
# Attach the store attributes to every open-day row. The join key is named
# explicitly: without `on`, pandas joins on ALL column names the two frames
# share, so an accidental name collision would silently change the key.
mergeData = pd.merge(openStores, storeData, how="inner", on="Store")

Handling Null Values 

In [107]:
# Number of missing values per column after merging.
mergeData.isnull().sum()

Variables related to Competition, together with Promo2SinceWeek and Promo2SinceYear, are dropped for now.

In [110]:
# Drop the competition-related columns and the Promo2 start week/year.
# The labels are passed as a plain list via `columns=`; handing drop() a
# DataFrame slice only works because iterating a frame yields its column
# names, and it materialises a needless copy of those columns first.
cleanData = mergeData.drop(
    columns=[
        'CompetitionDistance',
        'CompetitionOpenSinceMonth',
        'CompetitionOpenSinceYear',
        'Promo2SinceWeek',
        'Promo2SinceYear',
    ]
)

In [113]:
# Number of missing values per column that remain after the drop.
cleanData.isnull().sum()

In [114]:
# Store and Open have no explanatory power and Sales is the target variable,
# so all three are excluded from the predictors. Labels are given as a list
# through `columns=` rather than as a DataFrame slice.
X = cleanData.drop(columns=['Sales', 'Store', 'Open'])
y = cleanData['Sales']

Feature Selection:
 Correlated variables are checked using the heat map shown below.
 Based on the results, variables that are highly correlated (positively or negatively) with another variable are removed from the model.


In [119]:
# Pairwise correlation matrix of all columns, rendered as a heat map.
corr = cleanData.corr()
sns.heatmap(data=corr)

From the correlation plot, it is evident that 'Assortment_a' and 'Assortment_c' are correlated.

The following pairs of variables are also correlated:

'StoreType_a' and 'StoreType_c'

'StateHoliday_a' and 'StateHoliday_0'

In [120]:
# Predictor matrix after feature selection: besides the target and the
# Store/Open columns, one dummy from each correlated group identified on the
# heat map is removed. Labels are given as a list through `columns=` rather
# than as a DataFrame slice.
X1 = cleanData.drop(
    columns=[
        'Sales',
        'Store',
        'Open',
        'PromoInterval_Feb,May,Aug,Nov',
        'StoreType_a',
        'Assortment_a',
        'StateHoliday_a',
    ]
)
y = cleanData['Sales']

In [121]:
# Ordinary least squares of Sales on the selected predictors; add_constant
# appends the intercept column before fitting.
X2 = sm.add_constant(X1)
est = sm.OLS(endog=y, exog=X2)
est2 = est.fit()
print(est2.summary())

In [124]:
def unRestrictedModel(cleanData):
    """Fit a linear regression of Sales on the selected predictors, Promo included.

    Parameters
    ----------
    cleanData : pandas.DataFrame
        Frame that contains 'Sales' and every column named in ``excluded``.

    Returns
    -------
    float
        Square root of the mean absolute error on the held-out 33 % split.
        NOTE(review): sqrt(MAE) is not a standard metric (RMSE may have been
        intended). It is kept because it orders models exactly as MAE does and
        RestrictedModel reports the same quantity, so the two stay comparable.
    """
    # Target, Store/Open and the dummies removed during feature selection.
    excluded = ['Sales', 'Store', 'Open', 'PromoInterval_Feb,May,Aug,Nov',
                'StoreType_a', 'Assortment_a', 'StateHoliday_a']
    predictiveVariables = cleanData.drop(columns=excluded)
    toPredict = cleanData['Sales']
    # Fixed random_state: the same rows are held out on every call.
    X_train, X_test, y_train, y_test = train_test_split(
        predictiveVariables, toPredict, test_size=0.33, random_state=42)
    lm = LinearRegression().fit(X_train, y_train)
    yPredict = lm.predict(X_test)
    return np.sqrt(mean_absolute_error(y_test, yPredict))
#unRestrictedModel(cleanData)
    

In [47]:
def RestrictedModel(cleanData): #Dropping promo
    """Fit a linear regression of Sales on the selected predictors, Promo excluded.

    Parameters
    ----------
    cleanData : pandas.DataFrame
        Frame that contains 'Sales' and every column named in ``excluded``.

    Returns
    -------
    float
        Square root of the mean absolute error on the held-out 33 % split.
        NOTE(review): sqrt(MAE) is not a standard metric (RMSE may have been
        intended). It is kept because it orders models exactly as MAE does and
        unRestrictedModel reports the same quantity, so the two stay comparable.
    """
    # Same exclusions as the unrestricted model, plus the Promo indicator.
    excluded = ['Promo', 'Sales', 'Store', 'Open', 'PromoInterval_Feb,May,Aug,Nov',
                'StoreType_a', 'Assortment_a', 'StateHoliday_a']
    predictiveVariables = cleanData.drop(columns=excluded)
    toPredict = cleanData['Sales']
    # Fixed random_state: the same rows are held out on every call.
    X_train, X_test, y_train, y_test = train_test_split(
        predictiveVariables, toPredict, test_size=0.33, random_state=42)
    lm = LinearRegression().fit(X_train, y_train)
    yPredict = lm.predict(X_test)
    return np.sqrt(mean_absolute_error(y_test, yPredict))
#RestrictedModel(cleanData)

In [125]:
# Resampling comparison of the two linear models: in each round the data is
# resampled with replacement, both models are fitted on the same resample, and
# the round is counted when the model WITH Promo has the larger error.
# NOTE(review): the sample fraction grows from 1 % to 100 % over the rounds;
# a textbook bootstrap would draw frac=1 in every round -- confirm the intent.
N_ROUNDS = 100
randomNess = 0
for i in range(1, N_ROUNDS + 1):
    f = i / N_ROUNDS
    # Seeding each round makes the count (and pVal) reproducible on re-run.
    df = cleanData.sample(frac=f, replace=True, random_state=i)
    errorFullModel = unRestrictedModel(df)
    errorNonPromoModel = RestrictedModel(df)
    if errorFullModel > errorNonPromoModel:
        randomNess += 1
le = len(cleanData)  # not used within this cell
print(randomNess)
# Share of rounds in which including Promo did not reduce the error.
pVal = randomNess / N_ROUNDS
pVal

**Interpretation**:
  
   After resampling the given data with replacement 100 times, the model with promos fits the data better than the model without promos. This is evident from the p value, which is the fraction of rounds in which the model with promos had the larger error.
   
   



**Supporting the above claim with other regressors**:

Checking if Mean Absolute Error changes with respect to the presence of Promo

In [129]:
# Gradient boosting comparison with and without Promo among the predictors.
# Both models use the same hyper-parameters and the same seeded train/test
# split, so a difference in MAE reflects the Promo column rather than a
# different random partition of the rows.
alwaysDropped = ['Sales', 'Store', 'Open', 'PromoInterval_Feb,May,Aug,Nov',
                 'StoreType_a', 'Assortment_a', 'StateHoliday_a']
# 'loss' is left at its default (least squares): the spelling 'ls' is no
# longer accepted by current scikit-learn releases, while the default selects
# the same loss on old and new versions alike.
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.02}
Y = cleanData['Sales']

# Gradient Boosting Regressor with promos included in explanatory variables
X = cleanData.drop(columns=alwaysDropped)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
clf = ensemble.GradientBoostingRegressor(random_state=42, **params)
clf.fit(X_train, y_train)
mae = mean_absolute_error(y_test, clf.predict(X_test))
print("Mean Absolute Error with Promos included: %.4f" % mae)

# Gradient Boosting Regressor after removing promos in explanatory variables
X = cleanData.drop(columns=['Promo'] + alwaysDropped)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
clf = ensemble.GradientBoostingRegressor(random_state=42, **params)
clf.fit(X_train, y_train)
mae = mean_absolute_error(y_test, clf.predict(X_test))
print("Mean Absolute Error without Promos included: %.4f" % mae)

In [130]:
# Random forest comparison with and without Promo among the predictors.
# - The estimator is reached through the imported `ensemble` module; the bare
#   name RandomForestRegressor is never imported in this notebook.
# - Each forest is fitted on the TRAINING split only. Fitting on the full X
#   would let the model see the test rows and make the reported MAE
#   meaningless.
# - Both splits share one seed so the two models are scored on the same rows.
alwaysDropped = ['Sales', 'Store', 'Open', 'PromoInterval_Feb,May,Aug,Nov',
                 'StoreType_a', 'Assortment_a', 'StateHoliday_a']
Y = cleanData['Sales']

rf = ensemble.RandomForestRegressor(n_estimators=100, random_state=42)
X = cleanData.drop(columns=alwaysDropped)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
rf.fit(X_train, y_train)
mae = mean_absolute_error(y_test, rf.predict(X_test))
print("MAE with Promos included: %.4f" % mae)

rf1 = ensemble.RandomForestRegressor(n_estimators=100, random_state=42)
X = cleanData.drop(columns=['Promo'] + alwaysDropped)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
rf1.fit(X_train, y_train)
mae1 = mean_absolute_error(y_test, rf1.predict(X_test))
print("MAE without Promos included: %.4f" % mae1)

**Conclusion:**

From the above results, one can conclude that the variable 'Promo' is significant in explaining sales in the Rossmann data.