# Walmart: Sales Forecast 

## Libraries

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split

from sklearn import metrics, ensemble, linear_model

import xgboost as xgb

import eli5
from eli5.sklearn import PermutationImportance

In [None]:
# Kaggle directory that holds the competition's input files.
INPUT_DIR = '/kaggle/input/walmart-recruiting-store-sales-forecasting'
# CSV paths are relative to the working directory; the shell cell below
# unzips/copies the files there before they are read.
FEATURES_PATH = './features.csv'
STORES_PATH = './stores.csv'
TEST_PATH = './test.csv'
TRAIN_PATH = './train.csv'
SAMPLE_SUBMISSION_PATH = './sampleSubmission.csv'
# Seed passed to the train/validation split, the models and the permutation
# importance so that runs are reproducible.
RANDOM_STATE = 7

In [None]:
!unzip "$INPUT_DIR/*.csv.zip"
!cp "$INPUT_DIR/stores.csv" "/kaggle/working/"
!ls

## Inspect and Cleanup

In [None]:
# Read every competition CSV into its own DataFrame, in a single pass.
csv_paths = (FEATURES_PATH, STORES_PATH, TRAIN_PATH, TEST_PATH, SAMPLE_SUBMISSION_PATH)
features_raw, stores_raw, train_raw, test_raw, sample_submission = map(pd.read_csv, csv_paths)

### Check if there are duplicated rows

In [None]:
print(f'Features duplicates: {features_raw.duplicated().sum()}')
print(f'Stores duplicates: {stores_raw.duplicated().sum()}')
print(f'Train duplicates: {train_raw.duplicated().sum()}')

In [None]:
print(f'Store types: {stores_raw["Type"].unique()}')

### Check if Store IDs match. This is important to see if there are missing store features.

In [None]:
# Compare the sets of store IDs instead of position-by-position arrays, so the
# check depends neither on row order nor on a hard-coded number of stores.
train_store_ids = set(train_raw['Store'].unique())
feature_store_ids = set(features_raw['Store'].unique())
print(f'Match: {train_store_ids == feature_store_ids}')

### Join features, stores and train data

In [None]:
def _join_store_features(sales):
    """Attach store metadata and weekly features to a sales frame.

    Inner-joins ``sales`` with ``stores_raw`` on ``Store`` and with
    ``features_raw`` on ``Store``, ``Date`` and ``IsHoliday``, then orders the
    rows by store, department and date.
    """
    joined = sales.merge(stores_raw, how='inner', on='Store')
    joined = joined.merge(features_raw, how='inner', on=['Store', 'Date', 'IsHoliday'])
    # Sort first and reset afterwards, so the index runs 0..n-1 in sorted order
    # (resetting before sorting leaves a shuffled index behind).
    return joined.sort_values(['Store', 'Dept', 'Date']).reset_index(drop=True)


all_data = _join_store_features(train_raw)
test = _join_store_features(test_raw)

In [None]:
def _add_date_features(frame):
    """Parse ``Date`` and add ``Day``, ``Week``, ``Month``, ``Year`` columns in place.

    ``Day`` and ``Week`` are the ISO weekday and ISO week number; ``Month`` and
    ``Year`` come from the calendar date.
    """
    frame['Date'] = pd.to_datetime(frame['Date'])
    iso_calendar = frame['Date'].dt.isocalendar()
    # isocalendar() yields nullable UInt32 columns; cast them to plain int64 so
    # arrays taken from the frame via `.values` stay numeric rather than object.
    frame['Day'] = iso_calendar['day'].astype('int64')
    frame['Week'] = iso_calendar['week'].astype('int64')
    frame['Month'] = frame['Date'].dt.month
    frame['Year'] = frame['Date'].dt.year


# Apply the same date handling to the training data and the test data.
_add_date_features(all_data)
_add_date_features(test)

In [None]:
# Show NA quantities
all_data.isna().sum()

In [None]:
all_data.fillna(0, inplace=True)

In [None]:
all_data

### Descriptive statistics over dataset

In [None]:
all_data.describe().transpose()

#### Weekly Sales has negative values and a very large maximum value; these look like outliers or anomalies.
#### MarkDown2 and MarkDown3 also have negative values.

In [None]:
# Horizontal box plot of the weekly sales distribution.
_, sales_ax = plt.subplots(figsize=(20, 4))
sales_ax.set_title('Weekly Sales Distribution')
sns.boxplot(data=all_data, x='Weekly_Sales', palette='Set3', ax=sales_ax)
plt.show()

#### This box plot shows many outliers. However, the line plot below shows that the outliers occur mainly in holiday weeks, every year, such as the weeks of Thanksgiving and Christmas.

#### Negative sales will be removed

In [None]:
print(f'Total before remove negative weekly sales: {all_data.shape}')

In [None]:
# Keep only rows with non-negative weekly sales and renumber the index.
non_negative_sales = all_data['Weekly_Sales'] >= 0
all_data = all_data[non_negative_sales].reset_index(drop=True)

In [None]:
print(f'Total after remove negative weekly sales: {all_data.shape}')

#### Negative markdowns are set to zero

In [None]:
# Replace negative values in the two markdown columns with zero; every other
# value is left as it is.
for markdown_col in ('MarkDown2', 'MarkDown3'):
    all_data[markdown_col] = all_data[markdown_col].mask(all_data[markdown_col] < 0, 0)

In [None]:
all_data[all_data['IsHoliday'] == True]['Week'].unique()

In [None]:
# Shared arrow style for the holiday annotations.
arrowprops = dict(arrowstyle='-|>', color='#c449cc', linewidth=2)

plt.figure(figsize=(12,6))
plt.title('Mean of Sales over weeks')
# One line per year: mean weekly sales for each week number.
line1 = sns.lineplot(data=all_data, x='Week', y='Weekly_Sales', hue='Year', estimator='mean', ci=None, palette='Set2')
# Arrow targets are hand-picked (week, sales) coordinates on the plot.
line1.annotate(text="Thanksgiving", xy=(47,22000), xytext=(40,22000), arrowprops=arrowprops)
line1.annotate(text="Christmas", xy=(51,26500), xytext=(45,26500), arrowprops=arrowprops)
plt.show()

## Features Relations

In [None]:
# Correlate numeric and boolean columns only. The frame also holds the object
# column 'Type' and the datetime column 'Date'; pandas >= 2.0 no longer drops
# such columns silently in corr() and raises on them instead.
numeric_data = all_data.select_dtypes(include=['number', 'bool'])
all_data_corr = numeric_data.corr(method='pearson')
# Hide the upper triangle (diagonal included); it mirrors the lower one.
mask = np.triu(np.ones_like(all_data_corr, dtype=bool))

plt.subplots(figsize=(12, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
plt.title('Pearson Correlation Between Features Heatmap')
sns.heatmap(all_data_corr, mask=mask, cmap=cmap, center=0,
            square=True, linewidths=.5, annot=True, fmt='.2f')
plt.show()

In [None]:
plt.subplots(figsize=(12, 9))
plt.title('Pearson Correlation Between Features and Weekly Sales')
# Pick the target's row by label; a positional index silently selects another
# row as soon as the column order of the correlation matrix changes.
weekly_corr = all_data_corr.loc['Weekly_Sales'].to_frame().sort_values('Weekly_Sales', ascending=False)
# Plot every feature except the target's trivial self-correlation. The row is
# kept in `weekly_corr` itself because a later cell slices it off as
# `weekly_corr[1:]`.
feature_corr = weekly_corr.drop(index='Weekly_Sales')
sns.barplot(data=feature_corr, x=feature_corr.index, y='Weekly_Sales')
plt.xticks(rotation=45)
plt.show()

#### All correlations are low, but the positive correlation with store size is interesting: people seem to buy more in bigger stores.

#### Since there is no relevant linear relationship, I will check the Mutual Information (MI) metric, because it can detect any kind of relationship (not only linear ones, as correlation does).

In [None]:
def make_mi_scores(X, y):
    """Return the mutual information between each column of ``X`` and ``y``.

    Object, category and datetime columns are label-encoded on a copy of ``X``
    (datetimes via their ``YYYY-MM-DD`` string). Columns with an integer dtype
    are passed to ``mutual_info_regression`` as discrete features.

    Returns:
        A Series named ``'MI Scores'``, indexed by column name and sorted from
        the highest score to the lowest.
    """
    encoded = X.copy()

    non_numeric = encoded.select_dtypes(['object', 'category', 'datetime64[ns]']).columns
    for column in non_numeric:
        values = encoded[column]
        if values.dtype == 'datetime64[ns]':
            # Factorize the calendar day rather than the raw timestamp.
            values = values.dt.strftime('%Y-%m-%d')
        encoded[column] = values.factorize()[0]

    # After encoding, an integer dtype marks a discrete feature.
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in encoded.dtypes]

    raw_scores = mutual_info_regression(encoded, y, discrete_features=is_discrete, random_state=0)

    return pd.Series(raw_scores, name='MI Scores', index=encoded.columns).sort_values(ascending=False)


def plot_mi_scores(scores):
    """Draw the scores as horizontal bars, sorted ascending and labelled by index."""
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))

    sns.barplot(y=positions, x=ordered, orient='h')
    # Replace the numeric bar positions with the matching index labels.
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")

In [None]:
# Scatter each feature against weekly sales, one facet per feature.
mi_features = weekly_corr.iloc[1:].index
long_format = all_data.melt(id_vars="Weekly_Sales", value_vars=mi_features)
sns.relplot(
    data=long_format,
    x="value",
    y="Weekly_Sales",
    col="variable",
    col_wrap=4,
    facet_kws={"sharex": False},
);

#### Some insights:

- Fuel price does not affect Weekly Sales;
- People buy more when the temperature is between 20ºF and 80ºF: irrelevant;
- CPI does not affect sales in a relevant way; people keep buying even when CPI is high;
- The lower the unemployment rate, the higher the sales.

In [None]:
# Target and inputs for the MI computation: every column except the target
# goes into the feature frame.
mi_y = all_data['Weekly_Sales'].copy()
mi_X = all_data.drop(columns='Weekly_Sales').copy()

mi_scores = make_mi_scores(mi_X, mi_y)

In [None]:
print(mi_scores)

plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(mi_scores)

#### Once again, we see a weak relationship between the features and the target. This means that the features, individually, cannot explain with high confidence the increase or decrease of weekly sales.

## Feature Engineering

In [None]:
train_data = all_data.copy()

In [None]:
test.isna().sum()[test.isna().sum() > 0]

In [None]:
plt.title('CPI Distribution')
sns.boxplot(y=test['CPI'])

In [None]:
plt.title('Unemployment Distribution')
sns.boxplot(y=test['Unemployment'])

#### As the box plots show, the median line is roughly centered, so the distributions seem to be symmetrical. For symmetrical distributions the mean describes the central tendency better than the median, so I will fill NA values with the mean.

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form may operate on a temporary object
# and leave `test` unchanged, in which case the blanket fillna(0) in the next
# cell would overwrite these gaps with zeros instead of the mean.
for column in ('CPI', 'Unemployment'):
    test[column] = test[column].fillna(test[column].mean())

In [None]:
test.fillna(0, inplace=True)

In [None]:
test.isna().sum()

### Encode some categorical features

In [None]:
# Map each store type label to an integer code, numbered from 1 in order of
# first appearance in stores_raw.
store_types = {label: code for code, label in enumerate(stores_raw['Type'].unique(), start=1)}

In [None]:
print(f'Store types: {store_types}')

In [None]:
# Apply the same encoding to the training data and the test data:
# IsHoliday becomes 1/0 and Type becomes its integer code from `store_types`.
for frame in (train_data, test):
    frame['IsHoliday'] = frame['IsHoliday'].map(lambda flag: int(bool(flag)))
    frame['Type'] = frame['Type'].map(lambda label: store_types[label])

## Prepare data

In [None]:
X = train_data.drop(['Date', 'Weekly_Sales'], axis=1).copy()
y = train_data['Weekly_Sales'].copy()

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=RANDOM_STATE, test_size=0.1)

## Baseline model

#### Competition Metric: $\textrm{WMAE} = \frac{1}{\sum{w_i}} \sum_{i=1}^n w_i | y_i - \hat{y}_i |$

In [None]:
def weighted_mae(dataset, expected, predicted):
    """Return the weighted mean absolute error, rounded to two decimals.

    Rows whose ``IsHoliday`` value is truthy get weight 5, all others weight 1.

    Args:
        dataset: DataFrame with an ``IsHoliday`` column, one row per prediction.
        expected: True target values (Series, array or list).
        predicted: Predicted values, in the same row order as ``expected``.

    Returns:
        ``sum(w * |expected - predicted|) / sum(w)`` rounded to two decimals.
    """
    # Work on plain arrays so rows are matched by position. Multiplying Series
    # with different indexes aligns on labels and silently yields NaNs.
    is_holiday = np.asarray(dataset['IsHoliday']).astype(bool)
    weights = np.where(is_holiday, 5, 1)
    errors = np.abs(np.asarray(expected, dtype=float) - np.asarray(predicted, dtype=float))

    return np.round(np.sum(weights * errors) / np.sum(weights), 2)

#### Check whether the features that showed weak correlation earlier really have low influence on a model's predictions when they are shuffled. I will check this using Permutation Importance.

In [None]:
xgb_r = xgb.XGBRegressor(random_state=RANDOM_STATE, objective='reg:squarederror')

In [None]:
xgb_r.fit(X_train.values, y_train)

In [None]:
# Fit eli5's PermutationImportance around the already-trained XGBoost model,
# using the validation split.
permutations = PermutationImportance(xgb_r, random_state=RANDOM_STATE).fit(X_valid, y_valid)
# Build the eli5 weights view with one entry per feature column.
# NOTE(review): `features_pi` does not appear to be used elsewhere in this
# notebook; the next cell builds the same view as `features_weights`.
features_pi = eli5.show_weights(permutations, top=len(X_train.columns), feature_names = X_valid.columns.tolist())

In [None]:
features_weights = eli5.show_weights(permutations, top=len(X_train.columns), feature_names = X_valid.columns.tolist())

In [None]:
from io import StringIO

# Parse the HTML markup of the eli5 view into a DataFrame. The markup is
# wrapped in StringIO because passing a literal HTML string to read_html is
# deprecated in recent pandas releases.
pd.read_html(StringIO(features_weights.data))[0]

#### To understand Permutation Importance, look at the weight values. For example, for Dept, the first value shows how much the model's 'accuracy' decreases when Dept is randomly shuffled; the second value shows how much that decrease varies from one shuffle to another across the internal shuffles of the Permutation Importance algorithm.

#### With this observation, and the others made earlier, I will select only the relevant features for the final model

In [None]:
relevant_features = ['Dept', 'Size', 'Store', 'Week', 'Type', 'IsHoliday', 'Month', 'Year']

In [None]:
X_train_relevant = X_train[relevant_features]
X_valid_relevant = X_valid[relevant_features]

## Compare some models

In [None]:
base_models = {
    'xgbRegressor': xgb.XGBRegressor(random_state=RANDOM_STATE, objective='reg:squarederror'),
    'randomForest': ensemble.RandomForestRegressor(random_state=RANDOM_STATE),
    'extraTrees': ensemble.ExtraTreesRegressor(bootstrap = True, random_state=RANDOM_STATE),
}

In [None]:
# Fit each candidate on the training split and report both metrics on the
# validation split.
for model_name, model in base_models.items():
    model.fit(X_train_relevant.values, y_train)
    
    predictions = model.predict(X_valid_relevant.values)
    # Take the root of the MSE explicitly: the `squared=False` flag has been
    # removed from mean_squared_error in recent scikit-learn releases.
    rmse = np.sqrt(metrics.mean_squared_error(y_valid, predictions))
    
    print(f'Model: {model_name}: \
          \n - WMAE {weighted_mae(X_valid_relevant, y_valid, predictions)} \
          \n - RMSE {rmse:.2f}')

#### Random Forest showed the best result, so I will proceed with it

In [None]:
random_forest = ensemble.RandomForestRegressor(n_estimators=58, max_depth=27, 
                                               min_samples_split=3, min_samples_leaf=1, n_jobs=-1,
                                               random_state=RANDOM_STATE)

In [None]:
random_forest.fit(X_train_relevant.values, y_train)

In [None]:
valid_predictions = random_forest.predict(X_valid_relevant.values)

In [None]:
week_valid_preds = pd.DataFrame({'week': X_valid_relevant['Week'], 'predictions': valid_predictions, 'expected': y_valid})

In [None]:
# Score the tuned forest's own predictions (`valid_predictions`), not the
# `predictions` variable left over from the model-comparison loop. RMSE is the
# root of the MSE, so the cell does not rely on the `squared=False` flag that
# recent scikit-learn releases removed.
valid_rmse = np.sqrt(metrics.mean_squared_error(y_valid, valid_predictions))
print(f'- WMAE {weighted_mae(X_valid_relevant, y_valid, valid_predictions)}\
      \n- RMSE {valid_rmse:.2f}')

## Test and Submission file generation

In [None]:
X_test = test[relevant_features]

In [None]:
test_predictions = random_forest.predict(X_test.values)

In [None]:
# Predictions are written by row position, not matched on an identifier.
# NOTE(review): this assumes the rows of `test` (sorted by Store, Dept, Date
# after the merge) are in the same order as the rows of sampleSubmission.csv,
# and that the inner merges dropped no test rows — confirm.
sample_submission['Weekly_Sales'] = test_predictions

In [None]:
sample_submission

In [None]:
sample_submission.to_csv('submission.csv', index=False)