# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import cufflinks as cf
import plotly.express as px
from plotly.offline import download_plotlyjs,iplot,plot,init_notebook_mode
init_notebook_mode(connected=True)
# go_offline has to be *called*; the bare attribute reference was a no-op, so
# cufflinks was never actually switched to offline mode.
cf.go_offline()
import warnings
warnings.filterwarnings('ignore')
import scipy.stats as st
from scipy.stats import ttest_1samp,ttest_ind,ttest_rel,f_oneway,chi2_contingency

# Reading Data sets

In [None]:
features = pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/features.csv.zip')
stores = pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/stores.csv')
test = pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/test.csv.zip')
train = pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/train.csv.zip')

# Data Understanding

## Data Info

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" after each report.
for pos,(name,frame) in enumerate([('Features',features),('stores',stores),('train',train),('test',test)]):
  if pos:
    # separator between two consecutive reports
    print('***'*40,'\n')
  print(name)
  frame.info()


## Shape of Loaded Data

In [None]:
# Report the (rows, columns) shape of every loaded frame, each followed by a rule.
loaded_frames = {'Features':features,'stores':stores,'train':train,'test':test}
for pos,(name,frame) in enumerate(loaded_frames.items(),start=1):
  print(name)
  print(frame.shape)
  # the rule after the last frame is printed without the trailing blank line
  if pos < len(loaded_frames):
    print('***'*40,'\n')
  else:
    print('***'*40)


## First 5 rows of each loaded data set

In [None]:
features.head()

In [None]:
stores.head()

In [None]:
train.head()

In [None]:
test.head()

# Data Preparation and Feature Engineering

## Merging features and store datasets

In [None]:
df1 = pd.merge(features,stores,on='Store',how='inner')

## Merging df1 to train and test data sets simultaneously

In [None]:
df_train = pd.merge(df1,train,on=['Date','Store','IsHoliday'],how='inner')
df_test = pd.merge(df1,test,on=['Date','Store','IsHoliday'],how='inner')

## Labeling train and test data in a new column '(train/test)'

In [None]:
# Creating a column to identify the test and train data
df_train['train/test'] = 'train'
df_test['train/test'] = 'test'

## Adding weekly sales column to test data set


> Since the test data set has no Weekly_Sales values, we add the column and fill it with null values (NaN).



In [None]:
df_test['Weekly_Sales'] = np.nan

## Concatenating train and test data set


> Concatenating the data sets to perform data preprocessing on the whole data.



In [None]:
data = pd.concat([df_train,df_test],axis=0,ignore_index=True)

## Understanding the merged data (complete data)

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.describe()

**Inferences:**


1.   
2.   List item



In [None]:
# Checking the number of rows having weekly sales 0 and below
data[data['Weekly_Sales']<=0].shape

In [None]:
# Removing the rows having weekly sales 0 and below
data = data.drop(data[(data['Weekly_Sales']<=0)&(data['train/test']=='train')].index)

## Checking the null values in the final data set

In [None]:
data.isnull().sum()

In [None]:
# percentage null values in each column of the data set
(data.isnull().sum()/data.shape[0])*100

**Inferences**


1. List item
2.  



## Treating the null values

**Since, Markdown columns are explaining the marketing campaign details, also, the percentage null values are above 50%, so we will impute the null values with 0**

In [None]:
# Imputing the null values in markdown columns with 0.
# The columns are selected by name (MarkDown1, MarkDown2, ...) instead of by
# position, so the fill cannot land on unrelated columns if the column order
# changes or the cell is re-run after the markdown columns were dropped.
markdown_cols = data.filter(regex=r'(?i)^markdown\d+$').columns
data[markdown_cols] = data[markdown_cols].fillna(0)

**Treating the null values in cpi and unemployment columns by imputing with 'forward fill' method because it is a huge data and we are having only 7% missing values in these two columns.**

In [None]:
# fillna(method='ffill') is deprecated in recent pandas; DataFrame.ffill() is the
# equivalent forward fill (each gap takes the last valid value above it).
data[['CPI','Unemployment']] = data[['CPI','Unemployment']].ffill()

In [None]:
# Again checking the null values 
data.isnull().sum()

**In order to identify and analyze whether the promoting markdown campaigns have any significant impact on sales or not, we will create a new feature markdown where 0 signifies promotional campaign not done and 1 if it is done**

In [None]:
# Binary promotion flag: 1 if at least one MarkDownN column is non-zero, else 0.
# Columns are picked by name rather than by position, and the check is done per
# column so that positive and negative markdown amounts cannot cancel out to a
# misleading zero sum. Missing values are treated as "no markdown".
markdown_cols = data.filter(regex=r'(?i)^markdown\d+$').columns
data['markdown'] = data[markdown_cols].fillna(0).ne(0).any(axis=1).astype('int64')

In [None]:
data.head()

In [None]:
# The MarkDown1..MarkDown5 columns were imputed with 0 and summarised in the
# binary 'markdown' column (1 = some markdown activity, 0 = none), so the
# individual promotional columns (more than 50% missing) are removed here.
# They are selected by name: the regex requires a trailing digit, so the new
# 'markdown' flag is kept, and re-running the cell is a harmless no-op instead
# of dropping whatever columns now sit at positions 4..8.
markdown_cols = data.filter(regex=r'(?i)^markdown\d+$').columns
data = data.drop(columns=markdown_cols)
data.head()

**Since, there are 81 departments for each store we will group the departments on the basis of the frequency of purchases from that department. Here we are grouping the departments into five types namely:'rare','less frequent','moderately frequent','very frequent','most frequent'**

In [None]:
# Relative frequency of every department (share of rows), computed once.
dept_share = data['Dept'].value_counts(normalize=True)
print(dept_share.round(4).describe(percentiles=[0.20,0.40,0.60,0.80]))

# Lookup table Dept -> Dept_freq, attached to every row through an inner merge.
dep = pd.DataFrame({'Dept_freq':dept_share.values,'Dept':dept_share.index})
data_n = pd.merge(data,dep,on='Dept',how='inner')

# Bucket the frequencies into five ordered department types; the helper
# frequency column is discarded once the label has been assigned.
labels = ['rare','less frequent','moderately frequent','very frequent','most frequent']
bins = [0,0.0115,0.0136,0.0149,0.0153,np.inf]
data_n['Dep_type'] = pd.cut(data_n['Dept_freq'],bins=bins,labels=labels)
data_n = data_n.drop(columns='Dept_freq')

**Type of Departments (on the basis of purchase frequency):**


*   Rare = 
*   List item



In [None]:
# converting Date into Datetime format
data_n['Date'] = pd.to_datetime(data_n['Date'],format='%Y-%m-%d',)


In [None]:
# Extracting week and year from the dates as we will not use dates in the model building
# NOTE(review): this import is not used by the lines below (the .dt accessor is a
# pandas attribute, unrelated to this name); kept in case later cells rely on it.
from datetime import date as dt
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# dt.isocalendar().week yields the same ISO week number (cast from the nullable
# UInt32 result to a plain int64 column).
data_n['Week'] = data_n['Date'].dt.isocalendar().week.astype('int64')
data_n['year'] = data_n['Date'].dt.year

In [None]:
data_n[data_n['IsHoliday']==True][['Date','Week','year','IsHoliday']].drop_duplicates()

**Inference:**


1.   We can see that holidays occur in the same weeks in every year.




In [None]:
# dropping some columns which will not be needed in the model
data_n = data_n.drop(['Date'],axis=1)

In [None]:
data_n.head()

In [None]:
# Checking the shape of the dataframe once again
print('Number of rows = {}\nNumber of Columns = {}'.format(data_n.shape[0],data_n.shape[1]))

# Univariate Analysis

**To check the count and balance of categorical variables we will plot bar plots for each categorical data**

In [None]:
# converting to object type data for univariate analysis
data_n[['Store','markdown','Dept','Dep_type','IsHoliday']] = data_n[['Store','markdown','Dept','Dep_type','IsHoliday']].astype('object')


In [None]:
# creating arrays of categorical and numerical column names
cat_col = data_n.select_dtypes(include='object').columns.drop('train/test')
num_col = data_n.select_dtypes(include='number').columns

## Checking the outliers through boxplots of numerical features present in the data set

In [None]:
# One horizontal boxplot per numerical column, stacked vertically.
# The grid is sized from num_col instead of a hard-coded 13 rows (which left
# most of the figure empty), and the series is passed through the x= keyword
# so the call is unambiguous across seaborn versions.
fig,axes = plt.subplots(len(num_col),1,figsize=(15,20),squeeze=False)
for ax,col in zip(axes.ravel(),num_col):
  sns.boxplot(x=data_n[col],ax=ax)
  ax.set_ylabel(col)
plt.tight_layout()
plt.show()

**Percentage Outliers in the Numerical Columns**

In [None]:
# Share of IQR outliers (values below q1-1.5*iqr or above q3+1.5*iqr) in every
# numerical column. A vectorised boolean mask replaces the Python loop over
# each value, and the share is scaled by 100 so the printed number really is
# the percentage the label announces (it used to be a 0..1 fraction).
# NaN values compare as False, so they count as non-outliers, as before.
for col in num_col:
  q1,q3 = data_n[col].quantile([0.25,0.75])
  iqr = q3-q1
  uw = q3+1.5*iqr
  lw = q1-1.5*iqr
  outlier_mask = (data_n[col]<lw)|(data_n[col]>uw)
  print('Percentage of outliers in '+col+' ={}'.format(100*outlier_mask.mean()))

**Checking if multicollinearity is present between the independent variables**

In [None]:
# Correlation matrix of the numerical columns only: data_n also holds string and
# object columns, which DataFrame.corr() no longer skips silently in pandas >= 2.0.
cr = data_n.select_dtypes(include='number').corr()
plt.figure(figsize=(10,10), dpi=80)
# Only correlations with |r| >= 0.4 are shown; weaker cells are masked out (NaN).
sns.heatmap(cr[(cr>=0.4)|(cr<=-0.4)],annot=True,cmap='coolwarm')

**Inference: From the above graph we observe that year and fuel price have high correlation among themselves thus we will remove the year column before fitting any model to the data**

In [None]:
import matplotlib.pylab as pylab
params = {'axes.labelsize':'x-large',
          'axes.titlesize':'x-large',
          'xtick.labelsize':'x-small',
          'ytick.labelsize':'x-small'}
pylab.rcParams.update(params)

In [None]:
# Count plot of every categorical column except the high-cardinality
# 'Store' and 'Dept', laid out row by row on a 2x2 grid.
fig,axes = plt.subplots(2,2,figsize=(15,10),dpi=100)
small_cat_cols = cat_col.drop(['Store','Dept'])
for idx,col in enumerate(small_cat_cols):
  row,col1 = divmod(idx,2)
  sns.countplot(x = data_n[col],ax=axes[row,col1])
plt.show()

**Inferences:**


1.   Data taken on the holidays is less than normal days
2.   Number of Type A stores are highest and Type C stores are lowest in the US
3.   The number of stores doing promotions are equal in proportion to the number of stores not doing the promotions.
4.   The number of departments of the 'most frequent' type is the highest compared to the other types.



In [None]:
# Row counts per store (top panel) and per department (bottom panel).
fig,axes = plt.subplots(2,1,figsize=(25,8),dpi=100)
for ax,col in zip(axes,['Store','Dept']):
  sns.countplot(x = data_n[col],ax=ax,palette='icefire')
plt.show()

**Inferences:**

1. From the data we can observe that some stores are visited more frequently by people.
2. From the second graph we can see that people have purchased more items from some particular departments than compared to others.

In [None]:
data_n_train = data_n[data_n['train/test']=='train']
data_n_test = data_n[data_n['train/test']=='test']

In [None]:
params1 = {'axes.labelsize':'x-large',
          'axes.titlesize':'x-large',
          'xtick.labelsize':'medium',
          'ytick.labelsize':'medium'}
pylab.rcParams.update(params1)

In [None]:
data_n_train.groupby(['year','Week']).agg({'Weekly_Sales':['mean']}).plot(figsize=(18,8))
plt.ylabel('Average weekly Sales')
plt.show()

**Inference:** 
1. From the above graph we can observe that there is some seasonality in the Average weekly sales over the course of approximately 3 years.
2. We can also observe that Average weekly sales is high at the end of the year.

In [None]:
# Mean weekly sales per calendar week, computed separately for each year.
week_sales_2010 = data_n_train[data_n_train['year']==2010].groupby('Week')['Weekly_Sales'].agg('mean')
week_sales_2011 = data_n_train[data_n_train['year']==2011].groupby('Week')['Weekly_Sales'].agg('mean')
week_sales_2012 = data_n_train[data_n_train['year']==2012].groupby('Week')['Weekly_Sales'].agg('mean')
plt.figure(figsize=(18,8),dpi=100)
# x/y are passed by keyword (seaborn >= 0.12 rejects them positionally) and each
# line carries its own label, so the legend cannot get out of sync with the data.
for year_label,weekly_mean in [('2010',week_sales_2010),('2011',week_sales_2011),('2012',week_sales_2012)]:
  sns.lineplot(x=weekly_mean.index,y=weekly_mean.values,label=year_label)
plt.legend()
plt.ylabel('Average Weekly Sales')
# np.arange excludes its stop value: 53 is needed for the tick of week 52
plt.xticks(np.arange(1,53,step=1))
plt.show()

In [None]:
#
plt.figure(figsize=(15,10), dpi=100)
sns.barplot(x='Store',y='Weekly_Sales',data=data_n_train,color='grey')


**Inference: From the above graph we can see that there is a significant difference in the average weekly sales of different stores.**

**Statistical test to prove that there is a significant difference in the average weekly sales of different stores**

In [None]:
store_grp = data_n_train['Weekly_Sales'].groupby(data_n_train['Store'])

**Checking the normality of the samples**

**Hypothesis**

    H0: The data set is normal
    H1: The data set is not normal

In [None]:
# Shapiro-Wilk normality test on the weekly sales of every store.
# Iterating the groupby visits exactly the store ids present in the data (in
# sorted order) instead of assuming the ids are the contiguous range 1..nunique().
for store_id,store_sales in store_grp:
  print('Store '+str(store_id))
  print()
  print(st.shapiro(store_sales))
  print('***'*40)

**From the above results we observe that p-value < 0.05(alpha), thus the test rejects null Hypothesis, that is the data is not normal. Since, the data is not normal we have to use non parametric test to check whether the means of each of these samples are equal or not.**

**Hypothesis for checking equality of means between samples:**

    H0: The means of the samples are equal
    H1: The means of the samples are not equal

In [None]:
# Kruskal-Wallis H-test across the weekly sales of all stores.
# The per-store samples are unpacked straight from the groupby, replacing 45
# hand-written get_group() calls and covering whichever store ids are present.
st.kruskal(*[store_sales for store_id,store_sales in store_grp])

**Inference: From the above test we can see that pvalue<0.05 (alpha) which means we reject null hypothesis. Which means that there is a significant difference in the weekly sales of each store of walmart** 

In [None]:
plt.figure(figsize=(20,10), dpi=100)
sns.barplot(x='Dept',y='Weekly_Sales',data=data_n_train)


In [None]:
#
plt.figure(figsize=(15,10))
sns.boxplot(x='Type',y='Weekly_Sales',data=data_n_train,showfliers=False)


**Statistical Significance (non parametric) Test for Assessing the Weekly Sales on the basis of Type of Store**

In [None]:
df_typeA = data_n_train[data_n_train['Type']=='A']['Weekly_Sales']
df_typeB = data_n_train[data_n_train['Type']=='B']['Weekly_Sales']
df_typeC = data_n_train[data_n_train['Type']=='C']['Weekly_Sales']


**Checking the normality of the samples**

**Hypothesis**

    H0: The data set is normal
    H1: The data set is not normal

In [None]:
for i,sample in enumerate([df_typeA,df_typeB,df_typeC]):
  s,p = st.shapiro(sample)
  print('The P-value for the above test for the sample '+str(i)+' ={}'.format(p))

**From the above results we can see that the pvalues < 0.05, Thus, we reject null Hypothesis which means that the data is not normal. Thus we will apply non parametric test to check whether there is a significant difference in sales due to the type of the store.**

In [None]:
print(st.kruskal(df_typeA,df_typeB,df_typeC))

**From the above result we can see that since the p-value < alpha, we reject the null hypothesis that means there is a significant effect of Type of store on the weekly sales.** 

In [None]:
plt.figure(figsize=(15,8), dpi=100)
sns.boxplot(x='Dep_type',y='Weekly_Sales',data=data_n_train,hue='IsHoliday',showfliers=False)

**Inference:** 
 

1.   From the above graph we can say that the most frequently visited departments gave the highest weekly sales.
2.   There is not much difference in the average weekly sales whether or not there is a holiday or not.



In [None]:
plt.figure(figsize=(20,8), dpi=100)
plt.subplot(1,2,1)
sns.boxplot(x='Dep_type',y='Weekly_Sales',data=data_n_train)
plt.subplot(1,2,2)
sns.violinplot(x='Dep_type',y='Weekly_Sales',data=data_n_train,hue='IsHoliday')

**Statistical Significance Test for Assessing the Weekly Sales on the basis of Type of Department Type**

In [None]:
# Weekly sales split by department type, one sample per label defined for Dep_type.
# df_dt4 / df_dt5 previously repeated 'moderately frequent', so the tests below
# compared three identical samples and never saw 'less frequent' or 'rare'.
df_dt1 = data_n_train[data_n_train['Dep_type']=='most frequent']['Weekly_Sales']
df_dt2 = data_n_train[data_n_train['Dep_type']=='very frequent']['Weekly_Sales']
df_dt3 = data_n_train[data_n_train['Dep_type']=='moderately frequent']['Weekly_Sales']
df_dt4 = data_n_train[data_n_train['Dep_type']=='less frequent']['Weekly_Sales']
df_dt5 = data_n_train[data_n_train['Dep_type']=='rare']['Weekly_Sales']

**Checking the normality of the samples**

**Hypothesis**

    H0: The data set is normal
    H1: The data set is not normal

In [None]:
for i,sample in enumerate([df_dt1,df_dt2,df_dt3,df_dt4,df_dt5]):
  s,p = st.shapiro(sample)
  print('The P-value for the above test for the sample '+str(i)+' ={}'.format(p))

**From the above results we can see that the pvalues < 0.05, Thus, we reject null Hypothesis which means that the data is not normal. Thus we will apply non parametric test to check whether there is a significant difference in sales due to the type of the department.**

In [None]:
print(st.kruskal(df_dt1,df_dt2,df_dt3,df_dt4,df_dt5))

**Since, the pvalue < 0.05 (alpha), means we reject null hypothesis thus, we can say that there is a significant difference in the weekly sales due to department type.**

**From the above graph we can infer that:**

1.   We can see that there is not much difference in weekly sales in any type of department whether there is a holiday or not, except for the moderately visited departments, where there are more outliers on holidays. That means that, during particular holidays, people tend to buy more products from moderately visited departments.



In [None]:
# Analyzing the average sales at each store on normal days and holidays through visualizations
# The bar chart is drawn on an explicitly created axes: DataFrame.plot() would
# otherwise open its own figure and leave the plt.figure() one empty.
fig,ax = plt.subplots(figsize=(15,10), dpi=100)
pd.crosstab(index = data_n_train['Store'],columns = data_n_train['IsHoliday'],values = data_n_train['Weekly_Sales'],aggfunc='mean').plot(kind='bar',ax=ax)

In [None]:
data_n_train['Weekly_Sales'].groupby(data_n_train['markdown']).mean().plot(kind='bar')

**Inference: From the above graph we can observe that there is no significant difference in average weekly sales with or without promotional activities.**

**Statistical test to prove if there is a significant difference in the average weekly sales due to promotional markdowns**

In [None]:
mark_grp = data_n_train['Weekly_Sales'].groupby(data_n_train['markdown'])

**Checking the normality of the samples**

**Hypothesis**

    H0: The data set is normal
    H1: The data set is not normal

In [None]:
for i in range(data_n_train['markdown'].nunique()):
  print('markdown = {}'.format(i))
  print()
  print('P-value = {}'.format(st.shapiro(mark_grp.get_group(i))[1]))
  print('***'*40)

**Inference: From the above results we can see that the p-value < 0.05(alpha), thus we reject the null hypothesis, that is the data in samples are not normal.**

**Hypothesis for checking equality of means between samples:**

    H0: The means of the samples are equal
    H1: The means of the samples are not equal

In [None]:
print('P-value = {}'.format(st.mannwhitneyu(mark_grp.get_group(0),mark_grp.get_group(1))[1]))

**Inference: From the above results we can observe that p-value>0.05(alpha), which means the test failed to reject null hypothesis. Thus, we can conclude that there is no significant difference in the weekly sales due to promotional markdowns run by the Walmart stores across the US.**

In [None]:
plt.figure(figsize=(10,10), dpi=100)
plt.subplot(2,2,1)
sns.scatterplot(x='Temperature',y='Weekly_Sales',data=data_n_train)
plt.subplot(2,2,2)
sns.scatterplot(x='Fuel_Price',y='Weekly_Sales',data=data_n_train)
plt.subplot(2,2,3)
sns.scatterplot(x='CPI',y='Weekly_Sales',data=data_n_train)
plt.subplot(2,2,4)
sns.scatterplot(x='Unemployment',y='Weekly_Sales',data=data_n_train)
plt.show()

In [None]:
df_new_train = data_n_train.drop(['train/test','year'],axis=1)
df_new_train.head()

In [None]:
df_new_test = data_n_test.drop(['train/test','year','Weekly_Sales'],axis=1)
df_new_test.head()

# Shuffling the Train and Test data set

In [None]:
df_new_train = df_new_train.sample(frac=1,random_state=10).reset_index(drop=True)
df_new_train.head()

In [None]:
df_new_test = df_new_test.sample(frac=1,random_state=10).reset_index(drop=True)
df_new_test.head()

In [None]:
df_new_train.info()

In [None]:
df_new_test.info()

In [None]:
df_new_train= df_new_train.drop(['Dept','Store'],axis=1)
df_new_test= df_new_test.drop(['Dept','Store'],axis=1)


In [None]:
# Ordinal encoding of the categorical train columns.
# replace() on an object column only yields a numeric dtype through silent
# downcasting, which newer pandas deprecates; the explicit astype('int64')
# guarantees numeric columns for the quantile / log steps that follow and keeps
# the cell safe to re-run (already-encoded values pass through unchanged).
df_new_train['IsHoliday'] = df_new_train['IsHoliday'].replace({False:0,True:1}).astype('int64')
df_new_train['Type'] = df_new_train['Type'].replace({'A':3,'B':2,'C':1}).astype('int64')
df_new_train['Dep_type'] = df_new_train['Dep_type'].replace({'most frequent':5,'very frequent':4,'moderately frequent':3,'less frequent':2,'rare':1}).astype('int64')
df_new_train['markdown'] = df_new_train['markdown'].astype('int64')


In [None]:
# Ordinal encoding of the categorical test columns (same mapping as the train set).
# replace() on an object column only yields a numeric dtype through silent
# downcasting, which newer pandas deprecates; the explicit astype('int64')
# guarantees numeric columns and keeps the cell safe to re-run.
df_new_test['IsHoliday'] = df_new_test['IsHoliday'].replace({False:0,True:1}).astype('int64')
df_new_test['Type'] = df_new_test['Type'].replace({'A':3,'B':2,'C':1}).astype('int64')
df_new_test['Dep_type'] = df_new_test['Dep_type'].replace({'most frequent':5,'very frequent':4,'moderately frequent':3,'less frequent':2,'rare':1}).astype('int64')
df_new_test['markdown'] = df_new_test['markdown'].astype('int64')


In [None]:
df_new_train.head()

In [None]:
df_new_test.head()

In [None]:
df_new_train.dtypes

In [None]:
# Capping the outliers of the train-set columns, leaving out the target
# 'Weekly_Sales' and the columns 'IsHoliday', 'markdown' and 'Week'.
# Rows are not removed: values beyond the IQR whiskers are pulled back to the
# nearest whisker (q1-1.5*iqr or q3+1.5*iqr).
for col in df_new_train.columns.drop(['Weekly_Sales','IsHoliday','markdown','Week']):
  q1,q3 = df_new_train[col].quantile([0.25,0.75])
  iqr = q3-q1
  lw = q1-1.5*iqr
  uw = q3+1.5*iqr
  # Series.clip is the vectorised equivalent of the two element-wise apply() passes
  df_new_train[col] = df_new_train[col].clip(lower=lw,upper=uw)



In [None]:
# checking through box plots that the outliers have been capped
# The grid is sized from the plotted columns instead of a hard-coded 7 rows, the
# series is passed through x= so the call is unambiguous across seaborn versions,
# and tight_layout() runs once after all panels exist rather than on every pass.
capped_cols = df_new_train.columns.drop(['Weekly_Sales','IsHoliday','markdown','Week'])
fig,axes = plt.subplots(len(capped_cols),1,figsize=(15,10),squeeze=False)
for ax,col in zip(axes.ravel(),capped_cols):
  sns.boxplot(x=df_new_train[col],ax=ax)
  ax.set_ylabel(col)
plt.tight_layout()
plt.show()

In [None]:
# distribution of the data points in the column using KDE plot 
plt.figure(figsize=(15,15))
for i,col in enumerate(df_new_train.columns,1):
  plt.subplot(4,3,i)
  sns.distplot(df_new_train[col])
  plt.tight_layout()

# Creating two separate Data sets: With outliers in Target Variable, Without Outliers in the Target Variable

In [None]:
q1,q3 = df_new_train['Weekly_Sales'].quantile([0.25,0.75])
iqr = q3-q1
lw = q1-1.5*iqr
uw = q3+1.5*iqr

In [None]:
# creating data set without outliers
# between() is inclusive on both ends, so rows lying exactly on a whisker count
# as non-outliers; this makes the subset the exact complement of the
# "outliers only" filter (Weekly_Sales < lw or > uw) instead of losing those rows.
df_outliersna = df_new_train[df_new_train['Weekly_Sales'].between(lw,uw)]

In [None]:
df_outliersna.head()

In [None]:
# dropping the z score column from the data set, if it is present
# No cell visible in this notebook creates 'wekly_sales_z_score', so an
# unconditional drop raises KeyError on a fresh Restart & Run All;
# errors='ignore' turns the drop into a safe no-op when the column is absent.
df_outliersna = df_outliersna.drop(columns='wekly_sales_z_score',errors='ignore')

In [None]:
df_outliersna.info()

In [None]:
df_outliersna.describe()

In [None]:
# checking the distribution of the variables in the data set
plt.figure(figsize=(15,20))
for i,col in enumerate(df_outliersna.columns,1):
  plt.subplot(4,3,i)
  sns.distplot(df_outliersna[col])
  plt.tight_layout()

In [None]:
plt.figure(figsize=(10,10))
sns.heatmap(df_outliersna.corr(),annot=True,cmap='coolwarm')
plt.show()

In [None]:
# creating data set with outliers only 
df_outliers = df_new_train[(df_new_train['Weekly_Sales']<lw)|(df_new_train['Weekly_Sales']>uw)]

In [None]:
df_outliers.head()

In [None]:
# No cell visible in this notebook creates 'wekly_sales_z_score', so an
# unconditional drop raises KeyError on a fresh Restart & Run All;
# errors='ignore' turns the drop into a safe no-op when the column is absent.
df_outliers = df_outliers.drop(columns='wekly_sales_z_score',errors='ignore')

In [None]:
df_outliers.info()

In [None]:
df_outliers.describe()

In [None]:
# checking the distribution of the variables in the data set
plt.figure(figsize=(15,20))
for i,col in enumerate(df_outliers.columns,1):
  plt.subplot(4,3,i)
  sns.distplot(df_outliers[col])
  plt.tight_layout()

In [None]:
plt.figure(figsize=(10,10))
sns.heatmap(df_outliers.corr(),annot=True,cmap='coolwarm')
plt.show()

# Modeling for the whole data set

In [None]:
# Doing logarithmic transformation to make the data less skewed
## We will not consider logarithmic transformation of columns: ISHoliday,Type, Markdown,and Dep_type because these columns are encoded and thus logarithmic tranformation won't be a right choice to deal with this data

for col in df_new_train.columns.drop(['IsHoliday','Type','markdown','Dep_type']):
  df_new_train[col] = np.log(df_new_train[col])

In [None]:
# distribution of the data points in the column using KDE plot 
plt.figure(figsize=(15,15))
for i,col in enumerate(df_new_train.columns,1):
  plt.subplot(4,3,i)
  sns.distplot(df_new_train[col])
  plt.tight_layout()

# Model Building

**Assigning independent and dependent variables**

In [None]:
# assigning the independent and the dependent variables
x = df_new_train.drop('Weekly_Sales',axis=1)
y = df_new_train['Weekly_Sales']

### Building Base Model

In [None]:
import statsmodels.api as sm
xc = sm.add_constant(x)
model = sm.OLS(y,xc).fit()
model.summary()

**From the above model summary we can see that markdown column is insignificant in determining the Weekly Sales.** 

In [None]:
# removing the markdown column
xc1 = xc.drop('markdown',axis=1)
model1 = sm.OLS(y,xc1).fit()
model1.summary()

In [None]:
model1.params

In [None]:
resids = model1.resid

In [None]:
# model1 was fitted on xc1 (the design matrix without 'markdown'), so the
# predictions must use xc1 too; xc has one extra column and does not match
# the fitted parameters.
y_pred = model1.predict(xc1)

### Testing the Assumptions

**Test of normality**
  
    H0: The residuals are normal
    H1: The residuals are not normal

In [None]:
# testing the normality of the residuals using the Jarque-Bera test
# jarque_bera returns (statistic, p-value); they are unpacked so the line
# labelled "P-value" prints only the p-value instead of the whole result.
jb_stat,jb_pvalue = st.jarque_bera(resids)
print('JB statistic = {}'.format(jb_stat))
print('P-value  = {}'.format(jb_pvalue))

In [None]:
sns.distplot(resids,fit=st.norm)

In [None]:
st.probplot(resids,plot=plt)
plt.show()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
pd.DataFrame([vif(xc.values,i) for i in range(xc.shape[1])],index = xc.columns,columns=['vif']).sort_values('vif',ascending=False)

In [None]:
import statsmodels.stats.api as sms
print(sms.het_goldfeldquandt(resids,xc))

In [None]:
print(sm.stats.diagnostic.linear_rainbow(model1))

In [None]:
df2 = df_new_train.sample(frac=0.1,random_state = 10).reset_index(drop=True)
df2.head()

In [None]:
df2.shape

In [None]:
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV,cross_val_score,KFold,train_test_split
x = df2.drop('Weekly_Sales',axis=1)
y = df2['Weekly_Sales']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state = 42)

In [None]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,StackingClassifier
import lightgbm as lgb

In [None]:
# creating instances for each Regression Machine Learning Algorithm
lr = LinearRegression()
dtr = DecisionTreeRegressor()
rfr = RandomForestRegressor()
lgbmr = lgb.LGBMRegressor()

**Modeling**

In [None]:
def model_res(algo,x_train=x_train,x_test=x_test,y_train=y_train,y_test=y_test):
  """Fit `algo` on the train split and print its coefficients and fit metrics.

  Parameters
  ----------
  algo : estimator exposing fit(), predict(), coef_ and intercept_.
  x_train, x_test, y_train, y_test : train/test features and targets. The
      defaults are bound once, when this cell is executed, to the notebook
      globals of the same names; later reassignment of those globals does not
      change the defaults unless this cell is run again.

  Prints the coefficient table, the intercept, and R-squared / RMSE / MAE on
  both the train and the test split.

  NOTE(review): coef_ and intercept_ are read unconditionally, so only
  estimators that expose them will work; the tree / forest / LightGBM
  regressors instantiated earlier presumably do not — confirm before passing them.
  """
  algo.fit(x_train,y_train)
  # one row per feature column, labelled with the training column names
  cof_df = pd.DataFrame(algo.coef_,index=x_train.columns,columns=['Coefs_lr'])
  print(cof_df)
  print()
  print('Intercept = {}'.format(algo.intercept_))
  print('***'*40)
  y_pred_train = algo.predict(x_train)
  y_pred_test = algo.predict(x_test)

  # RMSE is the square root of sklearn's mean_squared_error
  print('Evaluation of the model on Train data set')
  print('R-squared = {}'.format(r2_score(y_train,y_pred_train)))
  print('RMSE = {}'.format(np.sqrt(mean_squared_error(y_train,y_pred_train))))
  print('MAE = {}'.format(mean_absolute_error(y_train,y_pred_train)))
  print('***'*40)
  print('Evaluation of the model on Test data set')
  print('R-squared = {}'.format(r2_score(y_test,y_pred_test)))
  print('RMSE = {}'.format(np.sqrt(mean_squared_error(y_test,y_pred_test))))
  print('MAE = {}'.format(mean_absolute_error(y_test,y_pred_test)))