# 1 - Importing used Libraries

Importing all the libraries used in this work

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import missingno as msno
from sklearn.model_selection import train_test_split, cross_val_predict, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
import shap
import warnings
warnings.filterwarnings('ignore')

In [None]:
!pip install xgboost==1.0.0

In [None]:
import xgboost

# 2 - Used Functions

This cell contains all the functions built for use in this work

In [None]:
class CreateFeatures(BaseEstimator, TransformerMixin):
    """Add calendar, holiday and economic features to a sales frame.

    Stateless transformer: ``fit`` learns nothing. ``transform`` reads the
    module-level ``df_4holidays`` table to label holiday weeks and the weeks
    adjacent to them, and expects the columns ``Date``, ``Temperature``,
    ``Unemployment``, ``CPI``, ``Fuel_Price`` and ``Size`` to be present.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Return ``self`` unchanged; present only for the Pipeline API."""
        return self

    @staticmethod
    def _iso_week(dates):
        """Return the ISO week number of each date in a datetime Series."""
        if hasattr(dates.dt, 'isocalendar'):
            # `Series.dt.weekofyear` was removed in pandas 2.0; this is its
            # documented replacement.
            # assumes there are no missing dates, otherwise the integer cast
            # fails -- TODO confirm
            return dates.dt.isocalendar().week.astype('int64')
        # Older pandas versions that do not have `isocalendar` yet.
        return dates.dt.weekofyear

    def transform(self, X, y=None):
        """Return a new frame with the engineered columns appended.

        The input frame is copied first so the caller's data is not modified.
        The left merges return a frame with a fresh integer index.
        """
        X = X.copy()
        dates = pd.to_datetime(X.Date)  # parse once, reuse for week and year
        X['week_of_year'] = self._iso_week(dates)
        X['year'] = dates.dt.year

        # One left merge per holiday view: the holiday week itself, then the
        # two adjacent-week views stored in `df_4holidays`.
        holiday_views = (
            ('current_week', 'week_holiday'),
            ('last_week', 'last_week_holiday'),
            ('next_week', 'next_week_holiday'),
        )
        for week_col, label_col in holiday_views:
            X = X.merge(df_4holidays[['year', week_col, label_col]],
                        how='left',
                        left_on=['year', 'week_of_year'],
                        right_on=['year', week_col])

        X['prop_to_buy'] = (X.Temperature * (100 - X.Unemployment)) / (X.CPI * X.Fuel_Price)
        X['move_cost'] = X.CPI / X.Fuel_Price
        # NOTE(review): this multiplies by the unemployment rate, whereas
        # `prop_to_buy` uses (100 - Unemployment) -- confirm which is intended.
        X['revenue_potential'] = (100 * X.Unemployment) * X.Size
        return X

class FeatureSelector(BaseEstimator, TransformerMixin):
    """One-hot encode the categorical columns and keep the modelling features.

    The output always has exactly the columns listed in the module-level
    ``FEATURES_TO_MODEL``, in that order.
    """

    # Columns that must be treated as categories rather than numbers.
    CATEGORICAL_COLUMNS = ('Store', 'Dept', 'Type', 'week_holiday',
                           'last_week_holiday', 'next_week_holiday')

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Return ``self`` unchanged; present only for the Pipeline API."""
        return self

    def transform(self, X, y=None):
        """Return the one-hot encoded frame restricted to ``FEATURES_TO_MODEL``.

        The input frame is copied, so the caller's data is left untouched.
        """
        X = X.copy()
        for col in self.CATEGORICAL_COLUMNS:
            X[col] = X[col].astype(str)
        X = pd.get_dummies(X)
        # A category level absent from this frame (e.g. a rare department in
        # a validation split) produces no dummy column; plain indexing would
        # raise KeyError. Reindexing adds such columns filled with zeros.
        return X.reindex(columns=FEATURES_TO_MODEL, fill_value=0)
    
class FeatureSelector1(BaseEstimator, TransformerMixin):
    """One-hot encode the categorical columns and keep every resulting column.

    The column layout is learned in ``fit`` and enforced in ``transform``, so
    frames transformed later (e.g. a validation split) have the same columns
    in the same order as the training frame.
    """

    # Columns that must be treated as categories rather than numbers.
    CATEGORICAL_COLUMNS = ('Store', 'Dept', 'Type', 'week_holiday',
                           'last_week_holiday', 'next_week_holiday')

    def __init__(self):
        pass

    def _encode(self, X):
        """Return a one-hot encoded copy of ``X`` (the input is not modified)."""
        X = X.copy()
        for col in self.CATEGORICAL_COLUMNS:
            X[col] = X[col].astype(str)
        return pd.get_dummies(X)

    def fit(self, X, y=None):
        """Remember the encoded column layout of ``X``."""
        self.columns_ = list(self._encode(X).columns)
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Fit and transform in one pass to avoid encoding ``X`` twice."""
        encoded = self._encode(X)
        self.columns_ = list(encoded.columns)
        return encoded

    def transform(self, X, y=None):
        """Return the encoded frame aligned to the columns seen in ``fit``.

        If the transformer was never fitted, the encoded frame is returned
        as-is, which matches the previous stateless behaviour.
        """
        encoded = self._encode(X)
        columns = getattr(self, 'columns_', None)
        if columns is None:
            return encoded
        # Dummy columns missing from this frame are added as zeros; levels
        # never seen during fit are dropped.
        return encoded.reindex(columns=columns, fill_value=0)


class FillNaValues(BaseEstimator, TransformerMixin):
    """Impute missing values in the engineered sales frame.

    Numeric columns listed in ``MEDIAN_COLUMNS`` are filled with medians
    learned in ``fit``; holiday label columns are filled with the string
    ``'None'``; ``IsHoliday`` is filled with ``0``.
    """

    MEDIAN_COLUMNS = ('MarkDown1', 'Unemployment', 'prop_to_buy',
                      'move_cost', 'revenue_potential')
    HOLIDAY_COLUMNS = ('next_week_holiday', 'last_week_holiday', 'week_holiday')

    def __init__(self):
        pass

    @classmethod
    def _medians(cls, X):
        """Return ``{column: median}`` for the numeric columns (NaN skipped)."""
        return {col: X[col].median() for col in cls.MEDIAN_COLUMNS}

    def fit(self, X, y=None):
        """Learn the fill medians from ``X`` so later frames reuse them."""
        self.medians_ = self._medians(X)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the missing values filled.

        If the transformer was never fitted, medians are computed from ``X``
        itself, which matches the previous stateless behaviour.
        """
        X = X.copy()
        medians = getattr(self, 'medians_', None)
        if medians is None:
            medians = self._medians(X)
        # Plain assignment instead of chained `fillna(..., inplace=True)`:
        # the chained form acts on a temporary Series and is not guaranteed
        # to write back into the frame.
        for col, value in medians.items():
            X[col] = X[col].fillna(value)
        for col in self.HOLIDAY_COLUMNS:
            X[col] = X[col].fillna('None')
        X['IsHoliday'] = X['IsHoliday'].fillna(0)
        return X

    
def train_linear_regression(X_train, y_train, X_val, y_val):
    """Fit an ElasticNet, print validation metrics and plot CV predictions.

    Parameters
    ----------
    X_train, y_train : training features and target.
    X_val, y_val : validation features and target, used only for the
        printed R^2, MAE and RMSE.

    Returns
    -------
    The ``ElasticNet`` instance fitted on ``X_train`` / ``y_train``.
    """
    lr = linear_model.ElasticNet(random_state=42)
    lr.fit(X_train, y_train)

    # Predict once and reuse the result for all three metrics.
    val_pred = lr.predict(X_val)
    print('R^2 = {}'.format(r2_score(y_val, val_pred)))
    print('MAE = {}'.format(mean_absolute_error(y_val, val_pred)))
    # RMSE as sqrt(MSE): the `squared=False` keyword is no longer accepted by
    # recent scikit-learn releases.
    print('RMSE = {}'.format(np.sqrt(mean_squared_error(y_val, val_pred))))

    # cross_val_predict returns an array of the same size as `y` where each
    # entry is a prediction obtained by cross validation.
    predicted = cross_val_predict(lr, X_train, y_train, cv=5)
    fig, ax = plt.subplots()
    ax.scatter(y_train, predicted, edgecolors=(0, 0, 0))
    # Dashed diagonal = perfect prediction.
    ax.plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'k--', lw=4)
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    plt.show()
    return lr

def train_xgboost_regressor(X_train, y_train, X_val, y_val):
    """Fit an XGBRegressor, print validation metrics and plot CV predictions.

    Parameters
    ----------
    X_train, y_train : training features and target.
    X_val, y_val : validation features and target, used only for the
        printed R^2, MAE and RMSE.

    Returns
    -------
    The ``XGBRegressor`` instance fitted on ``X_train`` / ``y_train``.
    """
    xgb_model = xgboost.XGBRegressor(random_state=42, n_jobs=-1)
    xgb_model.fit(X_train, y_train)

    # Predict once and reuse the result for all three metrics.
    val_pred = xgb_model.predict(X_val)
    print('R^2 = {}'.format(r2_score(y_val, val_pred)))
    print('MAE = {}'.format(mean_absolute_error(y_val, val_pred)))
    # RMSE as sqrt(MSE): the `squared=False` keyword is no longer accepted by
    # recent scikit-learn releases.
    print('RMSE = {}'.format(np.sqrt(mean_squared_error(y_val, val_pred))))

    # Out-of-fold predictions on the training set for the diagnostic plot.
    predicted = cross_val_predict(xgb_model, X_train, y_train, cv=5)
    fig, ax = plt.subplots()
    ax.scatter(y_train, predicted, edgecolors=(0, 0, 0))
    # Dashed diagonal = perfect prediction.
    ax.plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'k--', lw=4)
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    plt.show()
    return xgb_model

def ensemble_xgb_elastic_net(model_1, model_2, X_train_1, X_train_2, y_train, X_val_1, X_val_2, y_val):
    """Stack two fitted regressors with a linear-regression meta-model.

    Each base model predicts on its own training and validation matrices;
    the two prediction columns become the features of a ``LinearRegression``
    fitted against ``y_train``. Validation R^2, MAE and RMSE are printed.

    Parameters
    ----------
    model_1, model_2 : fitted estimators exposing ``predict``.
    X_train_1, X_val_1 : inputs for ``model_1``.
    X_train_2, X_val_2 : inputs for ``model_2``.
    y_train, y_val : training and validation targets.

    Returns
    -------
    The fitted ``LinearRegression`` meta-model.
    """
    # NOTE(review): the meta-model is trained on predictions the base models
    # make for the data passed as X_train_* -- if that is the data they were
    # fitted on, these are in-sample predictions; confirm this is intended.
    df_train = pd.DataFrame({'feat_model_1': model_1.predict(X_train_1),
                             'feat_model_2': model_2.predict(X_train_2)})
    df_val = pd.DataFrame({'feat_model_1': model_1.predict(X_val_1),
                           'feat_model_2': model_2.predict(X_val_2)})

    model_lr = linear_model.LinearRegression()
    model_lr.fit(df_train, y_train)

    # Predict once and reuse the result for all three metrics.
    val_pred = model_lr.predict(df_val)
    print('R^2 = {}'.format(r2_score(y_val, val_pred)))
    print('MAE = {}'.format(mean_absolute_error(y_val, val_pred)))
    # RMSE as sqrt(MSE): the `squared=False` keyword is no longer accepted by
    # recent scikit-learn releases.
    print('RMSE = {}'.format(np.sqrt(mean_squared_error(y_val, val_pred))))
    return model_lr


# 3 - Loading the database

In this step I load all the data sources provided for this problem.

In [None]:
# Read the four competition files from the Kaggle input directory.
# stores.csv is a plain CSV; the other three are read directly from their
# .csv.zip paths.
df_stores = pd.read_csv('/kaggle/input/walmart-recruiting-store-sales-forecasting/stores.csv', sep=',')
df_features = pd.read_csv('/kaggle/input/walmart-recruiting-store-sales-forecasting/features.csv.zip', sep=',')
df_train = pd.read_csv('/kaggle/input/walmart-recruiting-store-sales-forecasting/train.csv.zip', sep=',')
# `df_teste` holds the test set; the name is kept as-is because later cells use it.
df_teste = pd.read_csv('/kaggle/input/walmart-recruiting-store-sales-forecasting/test.csv.zip', sep=',')

# 4 - Exploratory Analysis

In this step we will try to explore and understand the data provided ... it is a step to be comfortable with the dataset that we will use in the modeling, and to obtain "insights" from the data

# 4.1 - Stores dataset

First step is to understand the data of each dataset source. I am starting with "stores.csv"

In [None]:
df_stores.shape

In [None]:
df_stores.info()

In [None]:
df_stores.head()

In [None]:
df_stores.Store.nunique()

# 4.1.1 - Type of Store

Let's go deeper in store types:

In [None]:
df_stores['Type'].value_counts()

In [None]:
# Take labels and heights from the same Series so each bar is guaranteed to
# carry its own label: `value_counts()` orders by frequency, which need not
# match the alphabetical order of the type names.
type_counts = df_stores['Type'].value_counts().sort_index()
ax = sns.barplot(x=type_counts.index, y=type_counts.values,
                 palette="Blues_d")
plt.xlabel('Store types')
plt.ylabel("Quantity")
plt.title('Quantity analysis of store types')
sns.despine()
plt.show();


Although we don't know what each type of store is, we can see that types "A" and "B" together account for at least 80% of all stores.

# 4.1.2 - Store Sizes

Now, let's go deeper in the store sizes

In [None]:
sns.distplot(df_stores['Size']);
sns.despine();

In [None]:
df_stores['Size'].plot.hist(density=True);
plt.xlabel('Store Size');
plt.title('Analysing the Size distribution ');
plt.show();

In [None]:
for t in df_stores['Type'].unique():
    df_stores.loc[df_stores.Type==t, 'Size'].plot.hist(density=False, label=t, alpha=0.8);
    plt.xlabel('Store Size');
plt.legend(title='Type of store');
plt.title('Analysing the Size distribution by Type of Store');
plt.show();

In [None]:
for t in df_stores['Type'].unique():
    print('Analysing the Size distribution for Store Type {}'.format(t));
    display(df_stores.loc[df_stores.Type==t, 'Size'].describe())
    df_stores.loc[df_stores.Type==t, 'Size'].plot.hist(density=True, label=t, alpha=0.8);
    plt.xlabel('Store Size');
    plt.legend(title='Type of store');
    plt.title('');
    plt.show();

Interesting! Type "A" stores are the largest, with an average size of ~180k units of measurement, followed by type "B" stores, with an average size of ~102k units, and finally type "C" stores, with an average size of ~40k units.


# 4.2 - Features dataset

Now let's explore the feature dataset!

In [None]:
df_features.shape

In [None]:
df_features.info()

In [None]:
df_features.head()

# 4.2.1 - Date

We will analyze if all stores have the same number of lines (sales dates) and if the minimum and maximum dates coincide.


In [None]:
df_features.groupby(['Store']).agg({'Date':['count', 'min','max']})

Nice! We have 182 records for each store, and the records range from 2010-02-05 to 2013-07-26


# 4.2.2 - Temperature

Let's understand how the temperature distribution of the base is based on the dates


In [None]:
df_features.Temperature.plot.hist(density=True, alpha=0.85);
plt.xlabel('Temperature');
plt.title('Temperature Distribution for the whole dataset');
plt.show()

Now, grouping by Stores, let's analyze how the temperature variable behaves

In [None]:
df_features.groupby(['Store']).agg({'Temperature': ['min','mean','max', 'std']})

Most stores have an average temperature between 50-70. What draws attention is the coldest store of all (Store 7) ... with an average temperature of ~ 37, and a minimum of -7! Is there a relationship between the temperature of the place and the amount of sales? In warmer places do people buy more? We will study this soon.


The table above also shows that the hottest store is 33! Let's compare the temperature distribution of the warmest store with the coldest store.


In [None]:
df_features.loc[df_features.Store==7, 'Temperature'].plot.hist(label='Store 7', density=True, alpha=.6);
df_features.loc[df_features.Store==33, 'Temperature'].plot.hist(label='Store 33', density=True, alpha=.6);
plt.xlabel('Temperature');
plt.legend();
plt.title('Comparing the temperature of the hottest store to the coldest store');
plt.show();

# 4.2.3 - Fuel Price

Let's look at the fuel price distribution


In [None]:
df_features.Fuel_Price.plot.hist();
plt.xlabel('Fuel Price');
plt.title('Fuel Price Distribution for the whole dataset');
plt.show()

Below we will analyze the evolution of the fuel price over time for each Store


In [None]:
# The date range in the x label is the same for every store, so compute it
# once instead of scanning the whole Date column on every iteration.
first_date = min(df_features.Date)
last_date = max(df_features.Date)
for i in df_features.Store.unique():
    df_features.loc[df_features.Store==i, 'Fuel_Price'].plot();
    plt.title('Fuel Price through time for Store {}'.format(i));
    plt.ylabel('Fuel Price');
    plt.xticks([]);
    plt.xlabel('Time {} to {}'.format(first_date, last_date));
    plt.show();

On average, the price of fuel for all stores followed the same upward and downward trend during the period. The regions of Stores 44, 41, 38, 33, 32, 28, 17, 16, 13 and 7 are worth noting: perhaps these stores are in nearby regions, and some other external factor may have influenced the further decline in those regions.


# 4.2.4 - MarkDown

Let's analyze and understand the "MarkDown" variables: anonymized data related to the promotional markdowns that Walmart is running.


In [None]:
df_features.sample()

In [None]:
for mark in [i for i in df_features.columns if 'Mark' in i]:
    df_features[mark].plot.hist(density=False);
    plt.title('Distribuition of {}'.format(mark));
    plt.show();  

The "Markdown" column with more information (greater dispersion in the distribution) is Markdown1. Let's look at it in more detail


In [None]:
df_features.groupby(['Store']).agg({'MarkDown1':['min','mean', 'max']})

In [None]:
for st in df_features.Store.unique():
    df_features.loc[df_features.Store==st, 'MarkDown1'].plot.hist(density=True, alpha=0.8, label=st);
    plt.xlabel('MarkDown1');
    plt.title('MarkDown1 distribution for Store {}'.format(st));
    plt.show();

# 4.2.5 - CPI (The Consumer Price Index)

In [None]:
df_features.CPI.plot.hist();

In [None]:
for st in df_features.Store.unique():
    df_features.loc[df_features.Store==st, 'CPI'].plot.hist(density=True, alpha=0.8, label=st);
    plt.xlabel('CPI value');
    plt.title('CPI distribution for Store {}'.format(st));
    plt.show();

In [None]:
df_features.groupby('Store').agg({'CPI': ['min', 'mean', 'max']})

# 4.2.5 - Unemployment

In [None]:
df_features.sample()

In [None]:
df_features.Unemployment.plot.hist();

In [None]:
df_features.groupby('Store').agg({'Unemployment':['min', 'mean', 'max']})

Stores in regions with higher unemployment rates are expected to have less sales


# 4.2.6 - Is Holiday

In [None]:
df_features.IsHoliday.dtype

In [None]:
df_features.IsHoliday = df_features.IsHoliday.astype(int)

In [None]:
# Number of holiday weeks per store. The aggregation is named by string;
# passing the Python builtin `sum` to `agg` is deprecated in recent pandas.
df_features.groupby(['Store']).agg({'IsHoliday': 'sum'})

# 4.2.7 - CPI vs Fuel Price

In [None]:
df_features.sample()

In [None]:
ax = sns.scatterplot(x="CPI", y="Fuel_Price",hue='Store', data=df_features)

# 4.2.8 - CPI vs Unemployment

In [None]:
ax = sns.scatterplot(x="CPI", y="Unemployment",hue='Store', data=df_features)

# 4.3 - Merging datasets

In [None]:
df_train.head()

In [None]:
df_teste.head()

In [None]:
df_stores.head()

In [None]:
df_features.head()

In [None]:
(df_train.Store.dtype == df_stores.Store.dtype, df_teste.Store.dtype == df_stores.Store.dtype )

In [None]:
df_temp_train = df_train.merge(df_stores, how='left', on='Store')
df_temp_test = df_teste.merge(df_stores, how='left', on='Store')

In [None]:
df_temp_test.sample()

In [None]:
df_temp_train.sample()

In [None]:
(df_temp_train.Store.dtype == df_features.Store.dtype, df_temp_train.Date.dtype == df_features.Date.dtype)

In [None]:
(df_temp_test.Store.dtype == df_features.Store.dtype, df_temp_test.Date.dtype == df_features.Date.dtype)

In [None]:
df_train_full = df_temp_train.merge(df_features, how='left', on=['Store', 'Date'])

In [None]:
df_test_full = df_temp_test.merge(df_features, how='left', on=['Store', 'Date'])

In [None]:
df_train_full.shape

In [None]:
df_test_full.shape

In [None]:
df_train_full.head()

In [None]:
# Compare the two holiday flags row by row: equal totals alone would not
# prove the columns agree, and one of them is dropped in the next cell.
(df_train_full.IsHoliday_x.astype(int) == df_train_full.IsHoliday_y).all()

In [None]:
df_train_full.drop('IsHoliday_x', axis=1,inplace=True)

In [None]:
df_train_full.rename(columns={'IsHoliday_y':'IsHoliday'}, inplace=True)

In [None]:
df_test_full.head()

In [None]:
# Compare the two holiday flags row by row: equal totals alone would not
# prove the columns agree, and one of them is dropped in the next cell.
(df_test_full.IsHoliday_x.astype(int) == df_test_full.IsHoliday_y).all()

In [None]:
df_test_full.drop('IsHoliday_x', axis=1,inplace=True)

In [None]:
df_test_full.rename(columns={'IsHoliday_y':'IsHoliday'}, inplace=True)

# 4.4 - Plug-in the Four Largest Holidays in the dataset

The weeks including these holidays are weighted five times higher in the evaluation than non-holiday weeks:

- **Super Bowl**: 12-Feb-10, 11-Feb-11, 10-Feb-12, 8-Feb-13
- **Labor Day**: 10-Sep-10, 9-Sep-11, 7-Sep-12, 6-Sep-13
- **Thanksgiving**: 26-Nov-10, 25-Nov-11, 23-Nov-12, 29-Nov-13
- **Christmas**: 31-Dec-10, 30-Dec-11, 28-Dec-12, 27-Dec-13

In [None]:
dict_lgt_hlds ={'Super_Bowl': ['12-Feb-10', '11-Feb-11', '10-Feb-12', '8-Feb-13']
               ,'Labor_Day': ['10-Sep-10', '9-Sep-11', '7-Sep-12', '6-Sep-13']
                ,'Thanksgiving': ['26-Nov-10', '25-Nov-11', '23-Nov-12', '29-Nov-13']
                ,'Christmas': ['31-Dec-10', '30-Dec-11', '28-Dec-12', '27-Dec-13']
               }

In [None]:
lista = []

In [None]:
# Start from an empty list so re-running this cell does not append the same
# holidays again; duplicated rows here would duplicate sales rows in the
# later left merges against the holiday table.
lista.clear()
for hol, holiday_dates in dict_lgt_hlds.items():
    for dt in holiday_dates:
        ts = pd.to_datetime(dt)  # parse once, reuse for year and week
        lista.append([hol, ts.year, ts.week])

In [None]:
df_4holidays = pd.DataFrame(lista, columns=['week_holiday','year', 'current_week'])

In [None]:
# Week numbers adjacent to each holiday week, used as merge keys later.
# Naming is from the point of view of the row being labelled:
#   last_week = holiday week + 1 -> the week AFTER the holiday
#               ("the holiday was last week")
#   next_week = holiday week - 1 -> the week BEFORE the holiday
#               ("the holiday is next week")
# NOTE(review): the +/-1 arithmetic does not wrap at year boundaries, so a
# holiday in the last week of a year gets a `last_week` that may not exist
# in that year -- confirm whether the week after Christmas should be matched.
df_4holidays['last_week'] =df_4holidays['current_week'] +1
df_4holidays['next_week'] =df_4holidays['current_week'] - 1
# Both adjacent-week views carry the name of the holiday they relate to.
df_4holidays['last_week_holiday'] = df_4holidays['week_holiday']
df_4holidays['next_week_holiday'] = df_4holidays['week_holiday']

In [None]:
df_4holidays

Transforming the dataset for week view of the year:


In [None]:
# Parse the dates once and derive both calendar columns from the result.
train_dates = pd.to_datetime(df_train_full.Date)
if hasattr(train_dates.dt, 'isocalendar'):
    # `Series.dt.weekofyear` was removed in pandas 2.0; `isocalendar().week`
    # is its documented replacement.
    # assumes there are no missing dates, otherwise the integer cast fails
    # -- TODO confirm
    df_train_full['week_of_year'] = train_dates.dt.isocalendar().week.astype('int64')
else:
    # Older pandas versions that do not have `isocalendar` yet.
    df_train_full['week_of_year'] = train_dates.dt.weekofyear
df_train_full['year'] = train_dates.dt.year

In [None]:
sns.countplot(data=df_train_full, x='week_of_year');
plt.xticks(rotation=45);

In [None]:
sns.countplot(data=df_train_full, x='year');


Marking the weeks of big holidays

In [None]:
df_train_full_4h = df_train_full.merge(df_4holidays[['year', 'current_week', 'week_holiday']]
                                       , how='left', left_on=['year', 'week_of_year'],right_on=['year', 'current_week'])
    

Our hypothesis here is that sales in the weeks before and after the big holidays are also affected! So we are going to mark these weeks in the dataset.


Marking the weeks after the big holidays (`last_week_holiday`: the holiday happened in the previous week)


In [None]:
df_train_full_4h = df_train_full_4h.merge(df_4holidays[['year', 'last_week', 'last_week_holiday']]
                                       , how='left', left_on=['year', 'week_of_year'],right_on=['year', 'last_week'])

Marking the weeks before the big holidays (`next_week_holiday`: the holiday happens in the following week)


In [None]:
df_train_full_4h = df_train_full_4h.merge(df_4holidays[['year', 'next_week', 'next_week_holiday']]
                                       , how='left', left_on=['year', 'week_of_year'],right_on=['year', 'next_week'])

In [None]:
pd.options.display.max_columns=None

In [None]:
df_train_full_4h.head()

# 5 - Bivariate Analysis


At this stage, the idea is to analyze the main variables of the base with the variable response of the problem, in this case the amount of weekly sales.

# 5.1 - Store x Weekly Sales

In [None]:
data = df_train_full_4h.groupby('Store').agg({'Weekly_Sales':'mean', 'Type':'max', 'Size':'mean'}).reset_index()

In [None]:
data.groupby('Type').agg({'Weekly_Sales':'mean'})

In [None]:
ax = sns.scatterplot(x="Store", y="Weekly_Sales",hue='Type', data=data)

There is a bias that bigger stores will sell more, so let's weigh sales by store size and see what happens


In [None]:
data['sales_per_size'] = data['Weekly_Sales'] /data['Size']

In [None]:
data.groupby('Type').agg({'sales_per_size':'mean'})

In [None]:
ax = sns.scatterplot(x="Store", y="sales_per_size",hue='Type', data=data)

Wow! Type C stores are twice as efficient in their sales considering their size ... given that a larger store can generate a higher fixed cost.


# 5.2 - Dept vs Sales

Are there departments that sell more than others?


In [None]:
data = df_train_full_4h.groupby(['Type','Dept']).agg({'Weekly_Sales':'mean'}).reset_index()

In [None]:
ax = sns.scatterplot(x="Dept", y="Weekly_Sales",hue='Type' ,data=data)

We conclude that the store and department type variables discriminate well as to the gross value sold


# 5.3 - Temperature vs Sales

Does temperature influence people's propensity to spend more money?


In [None]:
data = df_train_full_4h.groupby(['Temperature']).agg({'Weekly_Sales':'mean'}).reset_index()

In [None]:
ax = sns.scatterplot(x="Temperature", y="Weekly_Sales" ,data=data)

In [None]:
data['temp_bins'] = pd.cut(data.Temperature, bins=10).astype(str)

In [None]:
# Group by the interval bins themselves so the groups come out in numeric
# order, and only then turn the labels into strings for plotting. Grouping
# and plotting string labels orders them lexicographically, which puts
# single-digit bins after double-digit ones.
temp_means = (data.groupby(pd.cut(data.Temperature, bins=10), observed=True)['Weekly_Sales']
                  .mean()
                  .rename_axis('temp_bins')
                  .reset_index())
temp_means['temp_bins'] = temp_means['temp_bins'].astype(str)
# sort=False keeps the numeric bin order established above.
ax = sns.lineplot(x="temp_bins", y="Weekly_Sales", data=temp_means, sort=False)
plt.xticks(rotation=45);

In [None]:
ax = sns.scatterplot(x="Temperature", y="Weekly_Sales",hue='Type' ,data=df_train_full_4h)

Analyzing the temperature alone does not tell us much, the biggest purchases happen at the average temperature observed in the dataset, which is the average temperature for the regions analyzed


# 5.4 - Fuel Price vs Sales

Does cheaper gasoline influence people's propensity to go out more to buy?


In [None]:
ax = sns.scatterplot(x="Fuel_Price", y="Weekly_Sales",hue='Type' ,data=df_train_full_4h)

Another variable that alone doesn't tell us much, later we will try to create new variables using the fuel price 


# 5.5 - MarkDown1 vs Sales

How does this promotion related variable behave?


In [None]:
ax = sns.scatterplot(x="MarkDown1", y="Weekly_Sales",hue='Type' ,data=df_train_full_4h)

Apparently MarkDown1 has a slightly negative relationship to sales ... lower MarkDown, higher sales?


# 5.6 - CPI vs Sales

Do regions with a lower CPI have a propensity to spend more?

In [None]:
ax = sns.scatterplot(x="CPI", y="Weekly_Sales",hue='Type' ,data=df_train_full_4h)

Nice! As expected, regions with a lower CPI have a slight tendency to spend more than regions with a higher CPI, customers who experience less price increases spend more


# 5.7 - Unemployment vs Sales

Do places with a lower unemployment rate spend more money?


In [None]:
ax = sns.scatterplot(x="Unemployment", y="Weekly_Sales",hue='Type' ,data=df_train_full_4h)

Another hypothesis confirmed here: stores in regions with a high unemployment rate sell less!


# 5.8 - Holiday	vs Sales

Certainly these special holidays drive people to spend more, shall we check?


In [None]:
df_train_full_4h.groupby('IsHoliday').agg({'Weekly_Sales':'mean'})

In [None]:
ax = sns.lineplot(x="IsHoliday", y="Weekly_Sales", markers=True ,data=df_train_full_4h.groupby('IsHoliday').agg({'Weekly_Sales':'mean'}).reset_index())
plt.xticks(rotation=45);

In [None]:
df_train_full_4h.groupby('week_holiday').agg({'Weekly_Sales':'mean'})

In [None]:
ax = sns.lineplot(x="week_holiday", y="Weekly_Sales", markers=True ,data=df_train_full_4h.groupby('week_holiday').agg({'Weekly_Sales':'mean'}).reset_index())
plt.xticks(rotation=45);

In [None]:
df_train_full_4h.groupby('last_week_holiday').agg({'Weekly_Sales':'mean'})

In [None]:
ax = sns.lineplot(x="last_week_holiday", y="Weekly_Sales", markers=True ,data=df_train_full_4h.groupby('last_week_holiday').agg({'Weekly_Sales':'mean'}).reset_index())
plt.xticks(rotation=45);

In [None]:
df_train_full_4h.groupby('next_week_holiday').agg({'Weekly_Sales':'mean'})

In [None]:
ax = sns.lineplot(x="next_week_holiday", y="Weekly_Sales", markers=True ,data=df_train_full_4h.groupby('next_week_holiday').agg({'Weekly_Sales':'mean'}).reset_index())
plt.xticks(rotation=45);

Weeks that discriminate most: Thanksgiving week, and one week before Christmas


# 6 - Feature Engineering

In [None]:
df_train_full_4h.describe()

# 6.1 - Creating new features

# 6.1.1 - The region's propensity to buy

We know that a low CPI, a low unemployment rate, and a cheap fuel price are factors that encourage people to spend money. So we are going to create a variable that combines these 3 factors (scaled by the temperature, as in the formula below), which we will call the region's propensity to buy.


In [None]:
df_train_full_4h['prop_to_buy'] =  ((df_train_full_4h.Temperature * (100 - df_train_full_4h.Unemployment) ) / (df_train_full_4h.CPI * df_train_full_4h.Fuel_Price ))

In [None]:
ax = sns.scatterplot(x="prop_to_buy", y="Weekly_Sales", hue='Type' ,data=df_train_full_4h)
#plt.xticks(rotation=45);

In [None]:
g = sns.jointplot(x="prop_to_buy", y="Weekly_Sales" ,data=df_train_full_4h,
                  kind="reg", truncate=False,
                  #xlim=(0, 60), ylim=(0, 12),
                  color="m"
                  #, height=7
                 )

# 6.1.2 - Locomotion cost


Here we will try to create a variable that I will call "transportation cost", which relates the CPI and the fuel price


In [None]:
df_train_full_4h['move_cost'] = df_train_full_4h.CPI / df_train_full_4h.Fuel_Price

In [None]:
ax = sns.scatterplot(x="move_cost", y="Weekly_Sales", hue='Type' ,data=df_train_full_4h)
#plt.xticks(rotation=45);

In [None]:
g = sns.jointplot(x="move_cost", y="Weekly_Sales" ,data=df_train_full_4h,
                  kind="reg", truncate=False,
                  #xlim=(0, 60), ylim=(0, 12),
                  color="m"
                  #, height=7
                 )

# 6.1.3 - Revenue potential

Here I will create a variable that I call revenue potential, which relates the size of the store to the local unemployment rate


In [None]:
df_train_full_4h['revenue_potential'] = (100 * df_train_full_4h.Unemployment) * df_train_full_4h.Size

In [None]:
ax = sns.scatterplot(x="revenue_potential", y="Weekly_Sales", hue='Type' ,data=df_train_full_4h)

In [None]:
g = sns.jointplot(x="revenue_potential", y="Weekly_Sales" ,data=df_train_full_4h,
                  kind="reg", truncate=False,
                  #xlim=(0, 60), ylim=(0, 12),
                  color="m"
                  #, height=7
                 )

# 6.2 - Feature Selection

In this step we will prepare the dataset for training. As we intend to use a linear regression, we have to analyze the correlation between the variables, fill in missing values, and normalize the numerical variables, among other treatments.


In [None]:
msno.matrix(df_train_full_4h);

Let’s drop some variables that we found in the exploratory analysis that didn’t make sense in modeling


In [None]:
df_train_sel = df_train_full_4h.drop(['MarkDown2'
                                     ,'MarkDown3'
                                     ,'MarkDown4'
                                     ,'MarkDown5'
                                     ,'year'
                                     ,'Date'
                                     ,'current_week'
                                      ,'last_week'
                                      ,'next_week'
                                      ,'week_of_year'
                                     ], axis=1)

In [None]:
df_train_sel.MarkDown1 = df_train_sel.MarkDown1.fillna(df_train_sel.MarkDown1.dropna().median())

In [None]:
# Assign the filled columns back explicitly: `inplace=True` on a column
# obtained by attribute access acts on a temporary Series and is not
# guaranteed to update the frame.
df_train_sel['next_week_holiday'] = df_train_sel['next_week_holiday'].fillna('None')
df_train_sel['last_week_holiday'] = df_train_sel['last_week_holiday'].fillna('None')
df_train_sel['week_holiday'] = df_train_sel['week_holiday'].fillna('None')

Variables to normalize


In [None]:
FET_TO_SCALER = [
    'Size'
    ,'Temperature'
    ,'Fuel_Price'
    ,'MarkDown1'
    ,'CPI'
    ,'Unemployment'
    ,'prop_to_buy'
    ,'move_cost'
    ,'revenue_potential'
]

In [None]:
scaler = StandardScaler()

In [None]:
df_train_sel[FET_TO_SCALER] = scaler.fit_transform(df_train_sel[FET_TO_SCALER])

In [None]:
df_train_sel.describe()

In [None]:
msno.matrix(df_train_sel);

In [None]:
df_train_sel.info()

In [None]:
# Cast the identifier and label columns to strings so that they are
# one-hot encoded as categories rather than treated as numbers.
for categorical_col in ('Store', 'Dept', 'Type',
                        'week_holiday', 'last_week_holiday', 'next_week_holiday'):
    df_train_sel[categorical_col] = df_train_sel[categorical_col].astype(str)

Transforming categorical variables into binary variables


In [None]:
df_train_dummies = pd.get_dummies(df_train_sel)

In [None]:
df_train_dummies.shape

In [None]:
df_train_dummies.sample(3)

In [None]:
list(df_train_dummies.columns)

In [None]:
GP1 = ['Size'
       ,'Temperature'
       ,'Fuel_Price'
       ,'MarkDown1'
       ,'CPI'
       ,'Unemployment'
       ,'IsHoliday'
       ,'prop_to_buy'
       ,'move_cost'
       ,'revenue_potential'
    
]
GP2 = [
    'week_holiday_Christmas'
    ,'week_holiday_Labor_Day'
    ,'week_holiday_None'
    ,'week_holiday_Super_Bowl'
    ,'week_holiday_Thanksgiving'
    ,'last_week_holiday_Labor_Day'
    ,'last_week_holiday_None'
    ,'last_week_holiday_Super_Bowl'
    ,'last_week_holiday_Thanksgiving'
    ,'next_week_holiday_Christmas'
    ,'next_week_holiday_Labor_Day'
    ,'next_week_holiday_None'
    ,'next_week_holiday_Super_Bowl'
    ,'next_week_holiday_Thanksgiving'
]

GP3 = [
    'Type_A'
    ,'Type_B'
    ,'Type_C'
]

Let's look at the correlation of the variables. The idea here is to eliminate variables that are highly correlated with each other, either positively or negatively. My criterion is to remove variables that have an absolute correlation greater than 0.6.

In [None]:
f, ax = plt.subplots(figsize=(11, 6))
sns.heatmap(df_train_dummies[GP1].corr(), annot=True, linewidths=.5, ax=ax);

In [None]:
f, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(df_train_dummies[GP2].corr(), annot=True, linewidths=.5, ax=ax);

In [None]:
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(df_train_dummies[GP3].corr(), annot=True, linewidths=.5, ax=ax);

Variables selected for modeling


In [None]:
FEATURES_TO_MODEL = [

 'MarkDown1',
 'Unemployment',
 'IsHoliday',
 'prop_to_buy',
 'move_cost',
 'revenue_potential',
 
 'Store_1',
 'Store_10',
 'Store_11',
 'Store_12',
 'Store_13',
 'Store_14',
 'Store_15',
 'Store_16',
 'Store_17',
 'Store_18',
 'Store_19',
 'Store_2',
 'Store_20',
 'Store_21',
 'Store_22',
 'Store_23',
 'Store_24',
 'Store_25',
 'Store_26',
 'Store_27',
 'Store_28',
 'Store_29',
 'Store_3',
 'Store_30',
 'Store_31',
 'Store_32',
 'Store_33',
 'Store_34',
 'Store_35',
 'Store_36',
 'Store_37',
 'Store_38',
 'Store_39',
 'Store_4',
 'Store_40',
 'Store_41',
 'Store_42',
 'Store_43',
 'Store_44',
 'Store_45',
 'Store_5',
 'Store_6',
 'Store_7',
 'Store_8',
 'Store_9',
 
 'Dept_1',
 'Dept_10',
 'Dept_11',
 'Dept_12',
 'Dept_13',
 'Dept_14',
 'Dept_16',
 'Dept_17',
 'Dept_18',
 'Dept_19',
 'Dept_2',
 'Dept_20',
 'Dept_21',
 'Dept_22',
 'Dept_23',
 'Dept_24',
 'Dept_25',
 'Dept_26',
 'Dept_27',
 'Dept_28',
 'Dept_29',
 'Dept_3',
 'Dept_30',
 'Dept_31',
 'Dept_32',
 'Dept_33',
 'Dept_34',
 'Dept_35',
 'Dept_36',
 'Dept_37',
 'Dept_38',
 'Dept_39',
 'Dept_4',
 'Dept_40',
 'Dept_41',
 'Dept_42',
 'Dept_43',
 'Dept_44',
 'Dept_45',
 'Dept_46',
 'Dept_47',
 'Dept_48',
 'Dept_49',
 'Dept_5',
 'Dept_50',
 'Dept_51',
 'Dept_52',
 'Dept_54',
 'Dept_55',
 'Dept_56',
 'Dept_58',
 'Dept_59',
 'Dept_6',
 'Dept_60',
 'Dept_65',
 'Dept_67',
 'Dept_7',
 'Dept_71',
 'Dept_72',
 'Dept_74',
 'Dept_77',
 'Dept_78',
 'Dept_79',
 'Dept_8',
 'Dept_80',
 'Dept_81',
 'Dept_82',
 'Dept_83',
 'Dept_85',
 'Dept_87',
 'Dept_9',
 'Dept_90',
 'Dept_91',
 'Dept_92',
 'Dept_93',
 'Dept_94',
 'Dept_95',
 'Dept_96',
 'Dept_97',
 'Dept_98',
 'Dept_99',
 
 'Type_A',
 
 'Type_C',
 
 'week_holiday_Christmas',
 'week_holiday_Labor_Day',
 'week_holiday_None',
 'week_holiday_Super_Bowl',
 'week_holiday_Thanksgiving',
 'last_week_holiday_Labor_Day',
 'last_week_holiday_None',
 'last_week_holiday_Super_Bowl',
 'last_week_holiday_Thanksgiving',
 'next_week_holiday_Christmas',
 'next_week_holiday_Labor_Day',
 'next_week_holiday_None',
 'next_week_holiday_Super_Bowl',
 'next_week_holiday_Thanksgiving'
]

# 7 - Pipeline to models

In this step we will create pipelines for handling the modeling dataset for both models we intend to use


In [None]:
# Pipeline for the linear model: build the engineered features, fill
# missing values, one-hot encode and keep only FEATURES_TO_MODEL, then
# standardise every remaining column.
pipeline_regression = Pipeline([
                                ('createFeatures', CreateFeatures())                         
                                ,('fillNaValues', FillNaValues()) 
                                ,('featureSelector', FeatureSelector())
                                ,('scaler', StandardScaler())
                                
                               ])

# Pipeline for XGBoost: build the engineered features and one-hot encode,
# keeping all columns. There is no missing-value filling and no scaling step.
pipeline_xgboost = Pipeline([
                                ('createFeatures', CreateFeatures())
                                ,('featureSelector', FeatureSelector1())
                                
                               ])

# 8 - Modeling

# 8.1 - Linear Regression (Elastic Net)

In this step, we will train a Linear Regression Elastic Net that has the regularization parameters l1 and l2 combined (Lasso + Ridge). This is a simpler model computationally speaking and has low variance.


In [None]:
X = df_train_full.drop('Weekly_Sales', axis=1)
y = df_train_full['Weekly_Sales']

Separating the dataset between training and validation


In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=42)

In [None]:
X_train.shape

In [None]:
X_val.shape

Transforming the training and validation dataset with the Pipeline made for regression


In [None]:
X_train_LR = pipeline_regression.fit_transform(X_train) 

In [None]:
# Apply the already-fitted regression pipeline to the validation split
# (transform only, so nothing is re-fitted on validation data).
X_val_LR = pipeline_regression.transform(X_val)

Training the model:


In [None]:
# train_linear_regression is defined earlier in the notebook; presumably it
# fits the Elastic Net on the training data and reports validation metrics.
# The returned model is reused below as the base estimator of the randomized
# search and as model_1 of the ensemble.
lr_model = train_linear_regression(X_train_LR, y_train, X_val_LR, y_val)

# 8.1.1 - Randomized Search for LR Elastic Net

We will try to better calibrate the parameters of Elastic Net and try to obtain better performance. In this step we will use a Randomized Search.


In [None]:
# Candidate values for the Elastic Net hyperparameters explored by the search
# (3 x 7 x 10 = 210 possible combinations).
hyperparameters = {"max_iter": [1, 5, 10],
                   "alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                   "l1_ratio": np.arange(0.0, 1.0, 0.1)}

# Metrics recorded for every candidate. Only 'r2' drives model selection
# (see refit below).
# NOTE(review): make_scorer defaults to greater_is_better=True, so the 'MAE'
# entries in cv_results_ are ranked as if a larger error were better. Selection
# is unaffected because refit='r2', but read rank_test_MAE with that in mind.
scoring = {'MAE': make_scorer(mean_absolute_error), 'r2': make_scorer(r2_score)}

# Randomized search with 5-fold cross validation: 100 sampled candidates,
# all CPU cores, and the best candidate by mean R^2 refitted on the full
# training matrix.
rand_ser = RandomizedSearchCV(
    lr_model,
    hyperparameters,
    random_state=42,
    n_iter=100,
    cv=5,
    verbose=0,
    n_jobs=-1,
    scoring=scoring,
    refit='r2',
)

# Fit randomized search; fit() returns the search object itself, whose
# predict() delegates to the refitted best estimator.
best_model_lr = rand_ser.fit(X_train_LR, y_train)

# Predict once on the validation matrix and reuse the result for all metrics.
y_val_pred_lr = best_model_lr.predict(X_val_LR)

print('R^2 = {}'.format(r2_score(y_val, y_val_pred_lr)))
print('MAE = {}'.format(mean_absolute_error(y_val, y_val_pred_lr)))
# RMSE is computed as sqrt(MSE) rather than via the `squared=False` keyword,
# which newer scikit-learn releases deprecated and then removed.
print('RMSE = {}'.format(np.sqrt(mean_squared_error(y_val, y_val_pred_lr))))

# 8.2 - XGBOOST Regressor

In this step we will train a more complex model with greater variance: a Gradient Boosting model (an ensemble of decision trees), in this case XGBOOST. This model is very useful because it requires little treatment of the input dataset: it accepts nulls, can work well with correlated variables, and, unlike Linear Regression, does not need the variables to be normalized.


In [None]:
# Fit the XGBoost pipeline (feature creation + column selection) on the
# training split and keep the transformed training matrix.
X_train_XG = pipeline_xgboost.fit_transform(X_train)

In [None]:
# Apply the already-fitted XGBoost pipeline to the validation split.
X_val_XG = pipeline_xgboost.transform(X_val)

In [None]:
# train_xgboost_regressor is defined earlier in the notebook; presumably it
# fits the regressor on the training data and reports validation metrics.
# The returned model is reused below by the SHAP explainer and as model_2 of
# the ensemble.
xgb = train_xgboost_regressor(X_train_XG, y_train, X_val_XG, y_val)

Incredible performance!


# 8.2.1 - Model Interpretability - Shapley Values

Another nice thing about XGBOOST is that it lets us inspect which variables matter most. This is very useful for validating that the model makes sense at the level of the individual variables, rather than only looking at performance metrics.


In this step we will use a tool called Shapley Values (SHAP). Its summary plot has three dimensions to analyze. The y-axis ranks the variables by importance, from top (most important) to bottom (least important). The x-axis shows the strength of each variable's contribution: points to the left contribute to a lower model output, and points to the right contribute to a higher model output. The third dimension is the intensity (value) of the variable, represented by color: blue for lower values and red for higher values.


In [None]:
# Build a SHAP tree explainer around the trained XGBoost model.
explainer = shap.TreeExplainer(xgb)

In [None]:
# Load SHAP's JavaScript visualization support into the notebook output.
shap.initjs()
# Compute SHAP values for every row of the XGBoost training matrix.
# check_additivity=False turns off SHAP's check that the per-feature values
# sum to the model output.
shap_values = explainer.shap_values(X_train_XG, check_additivity=False)
# Summary plot of the SHAP values against the matching feature matrix.
shap.summary_plot(shap_values, X_train_XG)

We see that store size and the departments are variables with high predictive power.


# 8.3 - Ensemble XGBOOST + Elastic Net

In this step we go further! How about combining XGBOOST with Elastic Net? Although XGBOOST performed much better than the regression, there are probably cases where the regression's prediction has a smaller error than XGBOOST's, so we will use the predictions of the two models to train a third regression in order to obtain a smaller error. Does it work?


In [None]:
# Combine the two trained models: each model is paired with its own
# pipeline-transformed training and validation matrices, while both share
# the same targets.
model_xgb_elatcnet = ensemble_xgb_elastic_net(
    model_1=lr_model,
    model_2=xgb,
    X_train_1=X_train_LR,
    X_train_2=X_train_XG,
    y_train=y_train,
    X_val_1=X_val_LR,
    X_val_2=X_val_XG,
    y_val=y_val,
)

It worked! We achieved a smaller error by combining the two models!
