## **Acknowledgements**
#### This kernel builds on the following excellent kernels:
   - https://www.kaggle.com/ayushikaushik/eda-modelling-cross-validation
   - https://www.kaggle.com/yepp2411/walmart-prediction-1-eda-with-time-and-space
   - https://www.kaggle.com/avelinocaio/walmart-store-sales-forecasting

<a class="anchor" id="0.1"></a>
## **Table of Contents**
1. [Import libraries](#1)
2. [Download datasets](#2)
3. [EDA](#3)
4. [Preparing for modeling](#4)
5. [Prediction](#5)

<a class="anchor" id="1"></a>
## 1. Import libraries 
##### [Back to Table of Contents](#0.1)

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns
from scipy import stats
from scipy.special import boxcox1p

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, plot_tree

import warnings
warnings.filterwarnings("ignore")

from pandasql import sqldf
def pysqldf(q):
    """Run the SQL query `q` via pandasql's `sqldf`, resolving tables from this module's globals."""
    return sqldf(q, globals())

<a class="anchor" id="2"></a>
## 2. Download datasets 
##### [Back to Table of Contents](#0.1)

In [None]:
# Read the five competition files in one pass; the order of the paths matches
# the order of the names on the left-hand side.
features, train, stores, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/walmart-recruiting-store-sales-forecasting/features.csv.zip',
        '../input/walmart-recruiting-store-sales-forecasting/train.csv.zip',
        '../input/walmart-recruiting-store-sales-forecasting/stores.csv',
        '../input/walmart-recruiting-store-sales-forecasting/test.csv.zip',
        '../input/walmart-recruiting-store-sales-forecasting/sampleSubmission.csv.zip',
    )
)

In [None]:
# Preview the first five rows of the features table.
features.head(n=5)

In [None]:
# Preview the first five rows of the training table.
train.head(n=5)

In [None]:
# Preview the first five rows of the stores table.
stores.head(n=5)

In [None]:
# Preview the first five rows of the test table.
test.head(n=5)

In [None]:
# Preview the first five rows of the sample submission.
sample_submission.head(n=5)

In [None]:
# Attach the per-store columns from `stores` to every features row (inner join on Store).
full_feat = pd.merge(features, stores, on='Store', how='inner')

In [None]:
# Preview the first five rows of the merged features/stores table.
full_feat.head(n=5)


<a class="anchor" id="3"></a>
## 3. EDA
##### [Back to Table of Contents](#0.1)



In [None]:
# Column dtypes as a one-column table. It is printed explicitly because a
# notebook cell only renders its final expression, and `info()` below prints
# its own report and returns None.
print(pd.DataFrame(full_feat.dtypes, columns=['Type']))
full_feat.info()

In [None]:
# Preview the first five rows of the training table.
train.head(n=5)

In [None]:
# Side-by-side dtypes of the train and test columns.
pd.DataFrame(dict(Type_Train=train.dtypes, Type_Test=test.dtypes))

In [None]:
# Print the `info()` report for train, a separator line, then the report for test.
print("train info\n")
train.info()
separator = "***" * 16
print(separator, "\ntest info\n")
test.info()

In [None]:
# Convert the Date column of each table to datetime.
full_feat['Date'] = pd.to_datetime(full_feat['Date'])
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])

In [None]:
# Derive the ISO calendar week and year of each date from one isocalendar() call.
iso_calendar = full_feat.Date.dt.isocalendar()
full_feat['Week'] = iso_calendar.week
full_feat['Year'] = iso_calendar.year

In [None]:
# Preview the first five rows, now including Week and Year.
full_feat.head(n=5)

In [None]:
# Join the training rows with their store/date features, then order the
# result by store, department and date with a fresh 0..n-1 index.
train_detail = (
    pd.merge(train, full_feat, how='inner', on=['Store', 'Date', 'IsHoliday'])
    .sort_values(by=['Store', 'Dept', 'Date'])
    .reset_index(drop=True)
)

In [None]:
# Join the test rows with their store/date features, then order the result
# by store, department and date with a fresh 0..n-1 index.
test_detail = (
    pd.merge(test, full_feat, how='inner', on=['Store', 'Date', 'IsHoliday'])
    .sort_values(by=['Store', 'Dept', 'Date'])
    .reset_index(drop=True)
)

In [None]:
# Preview the first five rows of the enriched training table.
train_detail.head(n=5)

In [None]:
# Preview the first five rows of the enriched test table.
test_detail.head(n=5)

In [None]:
# Missing-value summary for train_detail: one row per column that has at
# least one null, ordered from the most to the least affected column.
# The share is scaled by 100 so it matches its "% of Null" label.
tr_null = (train_detail.isnull().mean() * 100).sort_values(ascending=False)
tr_null2 = train_detail.isnull().sum()
null_data = pd.concat([tr_null, tr_null2], axis=1).rename(
    columns={0: "% of Null", 1: "# of Null"}
)
# Keep every column with any missing value (a column with a single null counts too).
null_data = null_data[null_data['# of Null'] > 0]
null_data

In [None]:
# List the distinct (Year, Week) pairs of full_feat rows flagged IsHoliday and
# label each one, by its position within the year when ordered by week
# (1st..4th), with a holiday name and the weekday it falls on. Rows dated on or
# before '2012-11-01' are tagged 'Train Data', later ones 'Test Data'.
# NOTE(review): the labels assume each year's holiday weeks appear in the order
# Super Bowl, Labor Day, Thanksgiving, Christmas — confirm for partial years.
pysqldf("""
SELECT
    T.*,
    case
        when ROW_NUMBER() OVER(partition by Year order by week) = 1 then 'Super Bowl'
        when ROW_NUMBER() OVER(partition by Year order by week) = 2 then 'Labor Day'
        when ROW_NUMBER() OVER(partition by Year order by week) = 3 then 'Thanksgiving'
        when ROW_NUMBER() OVER(partition by Year order by week) = 4 then 'Christmas'
    end as Holyday,
    case
        when ROW_NUMBER() OVER(partition by Year order by week) = 1 then 'Sunday'
        when ROW_NUMBER() OVER(partition by Year order by week) = 2 then 'Monday'
        when ROW_NUMBER() OVER(partition by Year order by week) = 3 then 'Thursday'
        when ROW_NUMBER() OVER(partition by Year order by week) = 4 and Year = 2010 then 'Saturday'
        when ROW_NUMBER() OVER(partition by Year order by week) = 4 and Year = 2011 then 'Sunday'
        when ROW_NUMBER() OVER(partition by Year order by week) = 4 and Year = 2012 then 'Tuesday'
    end as Day
    from(
        SELECT DISTINCT
            Year,
            Week,
            case 
                when Date <= '2012-11-01' then 'Train Data' else 'Test Data' 
            end as Data_type
        FROM full_feat
        WHERE IsHoliday = True) as T""")

In [None]:
# Mean weekly sales per calendar week, computed separately for each year.
weekly_sales_2010 = train_detail[train_detail.Year == 2010].groupby('Week')['Weekly_Sales'].mean()
weekly_sales_2011 = train_detail[train_detail.Year == 2011].groupby('Week')['Weekly_Sales'].mean()
weekly_sales_2012 = train_detail[train_detail.Year == 2012].groupby('Week')['Weekly_Sales'].mean()

plt.figure(figsize=(20, 8))
# x/y are passed by keyword: recent seaborn releases no longer accept them
# as positional arguments.
sns.lineplot(x=weekly_sales_2010.index, y=weekly_sales_2010.values)
sns.lineplot(x=weekly_sales_2011.index, y=weekly_sales_2011.values)
sns.lineplot(x=weekly_sales_2012.index, y=weekly_sales_2012.values)
plt.grid()
plt.xticks(np.arange(1, 53, step=1))
plt.legend(['2010', '2011', '2012'], loc='best', fontsize=16)
plt.title('Average Weekly Sales - Per Year', fontsize=18)
plt.ylabel('Sales', fontsize=16)
plt.xlabel('Week', fontsize=16)
plt.show()

In [None]:
# Mark one additional (Year, Week) per year as a holiday week.
# NOTE(review): these weeks presumably correspond to Easter — confirm.
for yr, wk in ((2010, 13), (2011, 16), (2012, 14)):
    in_week = (train_detail.Year == yr) & (train_detail.Week == wk)
    train_detail.loc[in_week, 'IsHoliday'] = True

in_test_week = (test_detail.Year == 2013) & (test_detail.Week == 13)
test_detail.loc[in_test_week, 'IsHoliday'] = True

In [None]:
# Mean and median of Weekly_Sales across all store/department rows, per date.
weekly_sales_mean = train_detail['Weekly_Sales'].groupby(train_detail['Date']).mean()
weekly_sales_median = train_detail['Weekly_Sales'].groupby(train_detail['Date']).median()

plt.figure(figsize=(20, 8))
# x/y are passed by keyword: recent seaborn releases no longer accept them
# as positional arguments.
sns.lineplot(x=weekly_sales_mean.index, y=weekly_sales_mean.values)
sns.lineplot(x=weekly_sales_median.index, y=weekly_sales_median.values)
plt.grid()
plt.legend(['Mean', 'Median'], loc='best', fontsize=16)
plt.title('Weekly Sales - Mean and Median', fontsize=18)
plt.ylabel('Sales', fontsize=16)
plt.xlabel('Date', fontsize=16)
plt.show()

In [None]:
# Mean Weekly_Sales per store, shown as one bar per store.
weekly_sales = train_detail['Weekly_Sales'].groupby(train_detail['Store']).mean()
plt.figure(figsize=(20, 8))
# x/y are passed by keyword: recent seaborn releases no longer accept them
# as positional arguments.
sns.barplot(x=weekly_sales.index, y=weekly_sales.values, palette='dark')
plt.grid()
plt.title('Average Sales - per Store', fontsize=18)
plt.ylabel('Sales', fontsize=16)
plt.xlabel('Store', fontsize=16)
plt.show()

In [None]:
# Mean Weekly_Sales per department, shown as one bar per department.
weekly_sales = train_detail['Weekly_Sales'].groupby(train_detail['Dept']).mean()
plt.figure(figsize=(25, 8))
# x/y are passed by keyword: recent seaborn releases no longer accept them
# as positional arguments.
sns.barplot(x=weekly_sales.index, y=weekly_sales.values, palette='dark')
plt.grid()
plt.title('Average Sales - per Dept', fontsize=18)
plt.ylabel('Sales', fontsize=16)
plt.xlabel('Dept', fontsize=16)
plt.show()

In [None]:
sns.set(style="white")

# Pairwise correlations of the numeric/boolean columns only; train_detail
# still holds the string `Type` and datetime `Date` columns at this point,
# which newer pandas refuses to correlate unless told to skip them.
corr = train_detail.corr(numeric_only=True)

# Hide the upper triangle (and the diagonal) so each pair is shown once.
# The builtin `bool` replaces the removed `np.bool` alias.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(20, 15))

# Kept for any later use; the heatmap below uses the 'summer' colormap instead.
cmap = sns.diverging_palette(220, 10, as_cmap=True)

plt.title('Correlation Matrix', fontsize=18)

sns.heatmap(corr, mask=mask, cmap='summer', vmax=.3, center=0,
            square=True, linewidths=.5,
            cbar_kws={"shrink": .5}, annot=True)

plt.show()

In [None]:
# Remove the fuel-price and markdown columns from both detail tables.
unused_columns = ['Fuel_Price', 'MarkDown1', 'MarkDown2',
                  'MarkDown3', 'MarkDown4', 'MarkDown5']
train_detail = train_detail.drop(columns=unused_columns)
test_detail = test_detail.drop(columns=unused_columns)

In [None]:
def make_discrete_plot(feature):
    """Plot Weekly_Sales against a discrete column of train_detail.

    Draws two side-by-side panels on one figure: a box plot on the left and
    a strip plot on the right, both with `feature` on the x axis.

    Args:
        feature: name of the train_detail column to plot on the x axis.
    """
    figure = plt.figure(figsize=(20, 8))
    layout = GridSpec(1, 2)
    sales = train_detail.Weekly_Sales
    categories = train_detail[feature]

    # Left panel: distribution of sales per category.
    box_ax = figure.add_subplot(layout[0, 0])
    sns.boxplot(y=sales, x=categories, ax=box_ax)
    box_ax.set_ylabel('Sales', fontsize=16)
    box_ax.set_xlabel(feature, fontsize=16)

    # Right panel: the individual observations per category.
    strip_ax = figure.add_subplot(layout[0, 1])
    sns.stripplot(y=sales, x=categories, ax=strip_ax)
    strip_ax.set_ylabel('Sales', fontsize=16)
    strip_ax.set_xlabel(feature, fontsize=16)

    figure.show()

In [None]:
def make_continuous_plot(feature):
    """Plot a continuous column of train_detail and its relation to Weekly_Sales.

    The right panel shows the distribution of `feature`; the left panel is a
    scatter plot of Weekly_Sales against `feature`, titled with their
    correlation and the skewness of `feature` (both rounded to 2 decimals).

    Args:
        feature: name of the train_detail column to plot.
    """
    values = train_detail[feature]
    sales = train_detail['Weekly_Sales']

    figure = plt.figure(figsize=(18, 8))
    layout = GridSpec(1, 2)

    # Right panel: distribution of the feature.
    dist_ax = figure.add_subplot(layout[0, 1])
    sns.distplot(values, ax=dist_ax, color='green')
    dist_ax.set_title('Distribution\n')

    # Left panel: sales versus the feature.
    scatter_ax = figure.add_subplot(layout[0, 0])
    sns.scatterplot(y=sales, x=values, ax=scatter_ax, color='red')

    corr_text = str(np.round(sales.corr(values), 2))
    # NaNs are ignored when measuring skewness.
    skew_text = str(np.round(stats.skew(values, nan_policy='omit'), 2))
    scatter_ax.set_title('Linear\n' + 'Corr: ' + corr_text + ', Skew: ' + skew_text)

    figure.show()

In [None]:
# Sales by holiday flag.
make_discrete_plot(feature='IsHoliday')

In [None]:
# Sales by store type.
make_discrete_plot(feature='Type')

In [None]:
# Pie chart of how many stores belong to each type.
type_counts = stores.Type.value_counts()
labels = type_counts.index.tolist()
sizes = type_counts.values.tolist()
explode = (0.05, 0.02, 0)
plt.figure(figsize=(5, 5))
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    autopct='%1.1f%%',
    startangle=60,
    textprops={'fontsize': 18},
    colors=['#f538cc', '#fa5282', '#facc69'],
)
plt.title('Different types of stores');

In [None]:
# Encode the store type letters as integers. The result is assigned back to
# the column instead of calling `replace(..., inplace=True)` on an
# attribute-accessed column: that form is chained assignment and does not
# update the DataFrame under pandas Copy-on-Write.
type_codes = {'A': 1, 'B': 2, 'C': 3}
train_detail['Type'] = train_detail['Type'].replace(type_codes)
test_detail['Type'] = test_detail['Type'].replace(type_codes)

In [None]:
# Distribution of Temperature and its relation to sales.
make_continuous_plot(feature='Temperature')

In [None]:
# Distribution of CPI and its relation to sales.
make_continuous_plot(feature='CPI')

In [None]:
# Distribution of Unemployment and its relation to sales.
make_continuous_plot(feature='Unemployment')

In [None]:
# Distribution of Size and its relation to sales.
make_continuous_plot(feature='Size')

In [None]:
# Remove the temperature, unemployment and CPI columns from both detail tables.
macro_columns = ['Temperature', 'Unemployment', 'CPI']
train_detail = train_detail.drop(columns=macro_columns)
test_detail = test_detail.drop(columns=macro_columns)

<a class="anchor" id="4"></a>
## 4. Preparing for modeling
##### [Back to Table of Contents](#0.1)

In [None]:
# Preview the first five rows of the table the model will be trained on.
train_detail.head(n=5)

In [None]:
# Feature matrix and target vector for training.
train_feature_names = ['Store', 'Dept', 'IsHoliday', 'Size', 'Week', 'Type', 'Year']
X_train = train_detail[train_feature_names]
Y_train = train_detail['Weekly_Sales']

<a class="anchor" id="5"></a>
## 5. Prediction
##### [Back to Table of Contents](#0.1)


In [None]:
# Fit a random forest regressor on the training features.
RF = RandomForestRegressor(
    n_estimators=58,
    max_depth=27,
    max_features=6,
    min_samples_split=3,
    min_samples_leaf=1,
)
RF.fit(X_train, Y_train)

In [None]:
# Draw the first tree of the fitted forest.
plt.figure(figsize=(20, 12))
# feature_names is passed as a plain list: some scikit-learn releases reject a
# pandas Index for this parameter.
plot_tree(RF.estimators_[0], filled=True, rounded=True,
          feature_names=list(X_train.columns))

In [None]:
# Build the test feature matrix (same columns, same order as training) and predict.
test_feature_names = ['Store', 'Dept', 'IsHoliday', 'Size', 'Week', 'Type', 'Year']
X_test = test_detail[test_feature_names]
predict = RF.predict(X_test)

In [None]:
# Predictions keyed by store, department and week. `assign` builds a new
# frame, so the column is not written onto a slice of X_test
# (which triggers SettingWithCopyWarning).
Final = X_test[['Store', 'Dept', 'Week']].assign(Weekly_Sales=predict)

In [None]:
# Week-52 adjustment: when the previous row's prediction for the same
# store/department is more than twice the week-52 prediction, add 3/7 of it
# to the week-52 value; every other row is passed through unchanged.
#
# Fixes relative to the naive query:
#   * `3.0/7` instead of `3/7` — SQLite divides two integers as integers, so
#     `3/7` evaluates to 0 and the adjustment silently added nothing.
#   * `order by rowid` inside the window makes "previous row" mean the row
#     that precedes it in `Final`, instead of an unspecified order.
#   * the outer `order by row_id` returns rows in the same order as `Final`,
#     which the positional copy into the submission relies on.
Final_adj = pysqldf("""
    SELECT
        Store,
        Dept,
        Week,
        Weekly_Sales,
        case
            when Week = 52 and last_sales > 2*Weekly_Sales then Weekly_Sales+(3.0/7)*last_sales
            else Weekly_Sales
        end as Weekly_Sales_Adjusted
    from(
        SELECT
            rowid as row_id,
            Store,
            Dept,
            Week,
            Weekly_Sales,
            case
                when Week = 52 then lag(Weekly_Sales) over(partition by Store, Dept order by rowid)
            end as last_sales
        from Final)
    order by row_id""")

In [None]:
# Copy the adjusted predictions into the submission template (aligned on the
# row index) and write it out without the index column.
adjusted_sales = Final_adj['Weekly_Sales_Adjusted']
sample_submission['Weekly_Sales'] = adjusted_sales
sample_submission.to_csv('submission.csv', index=False)