# Rossmann Store Sales Prediction

Rossmann operates over 3,000 drug stores in 7 European countries. Currently, Rossmann store managers are tasked with predicting their daily sales for up to six weeks in advance. Store sales are influenced by many factors, including promotions, competition, school and state holidays, seasonality, and locality.

With thousands of individual managers predicting sales based on their unique circumstances, the accuracy of results can be quite varied. You are provided with historical sales data for 1,115 Rossmann stores. The task is to forecast the "Sales" column for the test set. Note that some stores in the dataset were temporarily closed for refurbishment.

Importing the required libraries

In [None]:
import os
import pandas as pd
pd.set_option('display.max_columns', 120)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
%matplotlib inline

matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10,6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'

In [None]:
os.chdir("/kaggle/input/")

In [None]:
os.listdir()

In [None]:
# Load the four competition CSV files. The paths are relative, so they resolve
# against the current working directory.
# low_memory=False asks pandas to process train.csv in a single pass rather
# than in chunks (per the pandas docs this avoids mixed-type inference).
ross_df = pd.read_csv('rossmann-store-sales/train.csv', low_memory=False)
store_df = pd.read_csv('rossmann-store-sales/store.csv')
test_df = pd.read_csv('rossmann-store-sales/test.csv')
submission_df = pd.read_csv('rossmann-store-sales/sample_submission.csv')

In [None]:
ross_df

In [None]:
store_df

In [None]:
test_df

In [None]:
submission_df

Merge the information from `store_df` into `ross_df` (the training data) and `test_df`.

In [None]:
merged_df = ross_df.merge(store_df, how='left', on='Store')
merged_test_df = test_df.merge(store_df, how='left', on='Store')

In [None]:
merged_df

### Data Preprocessing and Feature Engineering

In [None]:
merged_df.info()

In [None]:
def split_date(df):
    """Parse the ``Date`` column and add calendar-part columns to ``df``.

    ``Date`` is converted with ``pd.to_datetime`` and the columns ``Year``,
    ``Month``, ``Day`` and ``WeekOfYear`` (ISO calendar week) are added.
    The frame is modified in place and nothing is returned.
    """
    df['Date'] = pd.to_datetime(df['Date'])
    when = df['Date'].dt
    df['Year'], df['Month'], df['Day'] = when.year, when.month, when.day
    df['WeekOfYear'] = when.isocalendar()['week']

In [None]:
split_date(merged_df)
split_date(merged_test_df)

In [None]:
merged_df

#### Stores Open/Closed

In [None]:
merged_df[merged_df.Open == 0].Sales.value_counts()

Instead of trying to model this relationship, it would be better to hard-code it in our predictions, and remove the rows where the store is closed. We won't remove any rows from the test set, since we need to make predictions for every row.

In [None]:
merged_df = merged_df[merged_df.Open == 1].copy()

#### Competition

In [None]:
merged_df

Computing the number of months for which a competitor has been open near the store

In [None]:
def comp_months(df):
    """Add a ``CompetitionOpen`` column holding months of nearby competition.

    The value is ``12 * (Year - CompetitionOpenSinceYear) +
    (Month - CompetitionOpenSinceMonth)``. Negative results (the competitor
    opening date lies after the row's year/month) and missing results are
    both stored as 0. The frame is modified in place; nothing is returned.
    """
    months_open = (
        12 * (df['Year'] - df['CompetitionOpenSinceYear'])
        + (df['Month'] - df['CompetitionOpenSinceMonth'])
    )
    # clip/fillna run vectorised over the whole column instead of calling a
    # Python lambda once per row; NaN passes through clip and is zeroed after.
    df['CompetitionOpen'] = months_open.clip(lower=0).fillna(0)

In [None]:
comp_months(merged_df)
comp_months(merged_test_df)

In [None]:
merged_df[['Date', 'CompetitionDistance', 'CompetitionOpenSinceYear', 'CompetitionOpenSinceMonth', 'CompetitionOpen']].sample(20)

#### Additional Promotion

In [None]:
# Month number -> abbreviation as matched against PromoInterval
# (note the four-letter 'Sept'). Built once rather than on every row.
_PROMO_MONTH_ABBREVIATIONS = {
    1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
    7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec',
}


def check_promo_month(row):
    """Return 1 if the row's month is listed in its ``PromoInterval``, else 0.

    Args:
        row: Mapping-like row (e.g. a ``pandas.Series``) providing
            ``PromoInterval`` (comma-separated month abbreviations, or a
            missing value), ``Promo2Open`` and ``Month``.

    Returns:
        1 when ``Promo2Open`` is truthy and the abbreviation for ``Month``
        appears in ``PromoInterval``; 0 otherwise, including when
        ``PromoInterval`` is missing or ``Month`` is not 1-12.

    A missing ``PromoInterval``/``Promo2Open``/``Month`` column now raises
    ``KeyError`` instead of being silently reported as 0.
    """
    interval = row['PromoInterval']
    # A missing interval arrives as NaN/None rather than a string; handle it
    # explicitly instead of relying on a catch-all exception handler.
    if not isinstance(interval, str):
        return 0
    month_name = _PROMO_MONTH_ABBREVIATIONS.get(row['Month'])
    if row['Promo2Open'] and month_name in interval.split(','):
        return 1
    return 0

def promo_cols(df):
    """Add the ``Promo2Open`` and ``IsPromo2Month`` columns to ``df`` in place.

    ``Promo2Open`` is the time since Promo2 started, in months:
    ``12 * (Year - Promo2SinceYear)`` plus the week difference converted with
    ``* 7 / 30.5``. Negative and missing values become 0, and the result is
    multiplied by ``Promo2``.

    ``IsPromo2Month`` is the per-row result of ``check_promo_month`` multiplied
    by ``Promo2``. Nothing is returned.
    """
    years_elapsed = df.Year - df.Promo2SinceYear
    weeks_elapsed = df.WeekOfYear - df.Promo2SinceWeek
    months_open = 12 * years_elapsed + weeks_elapsed * 7 / 30.5
    non_negative = months_open.map(lambda months: 0 if months < 0 else months)
    df['Promo2Open'] = non_negative.fillna(0) * df['Promo2']

    # check_promo_month reads Promo2Open, so that column must exist first.
    month_flags = df.apply(check_promo_month, axis=1)
    df['IsPromo2Month'] = month_flags * df['Promo2']

In [None]:
promo_cols(merged_df)
promo_cols(merged_test_df)

In [None]:
merged_df[['Date', 'Promo2', 'Promo2SinceYear', 'Promo2SinceWeek', 'PromoInterval', 'Promo2Open', 'IsPromo2Month']].sample(20)

#### Identify Input and Target columns

In [None]:
merged_df.columns

In [None]:
# Columns fed to the model, in the order they are selected from the merged
# frames.
input_cols = [
    'Store',
    'DayOfWeek',
    'Promo',
    'StateHoliday',
    'SchoolHoliday',
    'StoreType',
    'Assortment',
    'CompetitionDistance',
    'CompetitionOpen',
    'Day',
    'Month',
    'Year',
    'WeekOfYear',
    'Promo2',
    'Promo2Open',
    'IsPromo2Month',
]

# Column the model is trained to predict.
target_col = 'Sales'

In [None]:
inputs = merged_df[input_cols].copy()
target = merged_df[target_col].copy()

In [None]:
test_inputs = merged_test_df[input_cols].copy()

In [None]:
inputs

In [None]:
# Columns treated as numbers (imputed and min-max scaled later on). Note that
# the 'Store' identifier is included here.
numeric_cols = [
    'Store',
    'Promo',
    'SchoolHoliday',
    'CompetitionDistance',
    'CompetitionOpen',
    'Promo2',
    'Promo2Open',
    'IsPromo2Month',
    'Day',
    'Month',
    'Year',
    'WeekOfYear',
]

# Columns that are one-hot encoded later on.
categorical_cols = [
    'DayOfWeek',
    'StateHoliday',
    'StoreType',
    'Assortment',
]

#### Imputing Missing Numerical Data

In [None]:
inputs[numeric_cols].isna().sum()

In [None]:
test_inputs[numeric_cols].isna().sum()

In [None]:
max_distance = inputs.CompetitionDistance.max()
max_distance

In [None]:
# Fill missing competition distances with twice the largest distance seen in
# the training inputs. The filled column is assigned back instead of calling
# fillna(inplace=True) on the selected column: that chained in-place call
# triggers a FutureWarning on pandas 2.x and leaves the frame unchanged when
# Copy-on-Write is enabled.
inputs['CompetitionDistance'] = inputs['CompetitionDistance'].fillna(max_distance * 2)
test_inputs['CompetitionDistance'] = test_inputs['CompetitionDistance'].fillna(max_distance * 2)

In [None]:
inputs[numeric_cols].isna().sum()

In [None]:
test_inputs[numeric_cols].isna().sum()

#### Scaling Numeric Values

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler()

In [None]:
scaler.fit(inputs[numeric_cols])

In [None]:
inputs[numeric_cols] = scaler.transform(inputs[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])

In [None]:
inputs

#### Encoding Categorical Data

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current name first and fall back for older releases.
# Either way the encoder returns a dense array and ignores unseen categories.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [None]:
encoder.fit(inputs[categorical_cols])

In [None]:
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out` when present and fall back on older releases.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
else:
    encoded_cols = list(encoder.get_feature_names(categorical_cols))

In [None]:
inputs[encoded_cols] = encoder.transform(inputs[categorical_cols])
test_inputs[encoded_cols] = encoder.transform(test_inputs[categorical_cols])

Extract all data for training

In [None]:
X = inputs[numeric_cols + encoded_cols]

In [None]:
X_test = test_inputs[numeric_cols + encoded_cols]

In [None]:
X

### Train Gradient Boosting Model

In [None]:
from xgboost import XGBRegressor

In [None]:
model = XGBRegressor(random_state=42, n_jobs=-1, n_estimators=20, max_depth=4)

In [None]:
%%time
model.fit(X, target)

#### Prediction

In [None]:
preds = model.predict(X)

In [None]:
preds

#### Evaluating

In [None]:
from sklearn.metrics import mean_squared_error

def rmse(a, b):
    """Return the root mean squared error between ``a`` and ``b``.

    Both arguments are passed straight to scikit-learn, in the given order.

    scikit-learn 1.4 added ``root_mean_squared_error`` and deprecated the
    ``squared=False`` flag of ``mean_squared_error`` (removed in 1.6), so the
    dedicated function is used when available and the flag only on older
    releases.
    """
    try:
        from sklearn.metrics import root_mean_squared_error
    except ImportError:
        return mean_squared_error(a, b, squared=False)
    return root_mean_squared_error(a, b)

In [None]:
rmse(preds, target)

In [None]:
merged_df.Sales.min(), merged_df.Sales.max()

In [None]:
plt.hist(merged_df.Sales.sample(10000));

#### Feature Importance

In [None]:
importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

In [None]:
importance_df.head(10)

In [None]:
sns.barplot(data=importance_df.head(10), x='importance', y='feature')
plt.title('Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Feature');

### K-Fold Cross Validation

In [None]:
from sklearn.model_selection import KFold

In [None]:
kfold = KFold(n_splits=5)

In [None]:
def train_and_evaluate(X_train, train_targets, X_val, val_targets, **params):
    """Fit an ``XGBRegressor`` and score it on training and validation data.

    Args:
        X_train, train_targets: Data the model is fitted on.
        X_val, val_targets: Data used only for scoring.
        **params: Extra keyword arguments forwarded to ``XGBRegressor``
            (``random_state=42`` and ``n_jobs=-1`` are always set).

    Returns:
        Tuple ``(model, train_rmse, val_rmse)``.
    """
    regressor = XGBRegressor(random_state=42, n_jobs=-1, **params)
    regressor.fit(X_train, train_targets)

    def score(features, labels):
        # RMSE of the fitted model's predictions against the given labels.
        return rmse(regressor.predict(features), labels)

    return regressor, score(X_train, train_targets), score(X_val, val_targets)

In [None]:
# Train one model per fold and keep every fitted model in `models`.
models = []

# kfold.split yields index arrays for each fold; they are applied
# positionally with .iloc.
for train_idxs, val_idxs in kfold.split(X):
    X_train, train_targets = X.iloc[train_idxs], target.iloc[train_idxs]
    X_val, val_targets = X.iloc[val_idxs], target.iloc[val_idxs]
    # NOTE(review): this rebinds the notebook-level names `model`, `X_train`,
    # `X_val`, `train_targets` and `val_targets`; after the loop they refer to
    # the last fold.
    model, train_rmse, val_rmse = train_and_evaluate(X_train, 
                                                     train_targets, 
                                                     X_val, 
                                                     val_targets, 
                                                     max_depth=4, 
                                                     n_estimators=20)
    models.append(model)
    print('Train RMSE: {}, Validation RMSE: {}'.format(train_rmse, val_rmse))

In [None]:
import numpy as np

def predict_avg(models, inputs):
    """Average the predictions of several models, element by element.

    Args:
        models: Iterable of objects exposing ``predict(inputs)``.
        inputs: Data passed unchanged to every model's ``predict``.

    Returns:
        The mean of the per-model predictions taken across models (axis 0).
    """
    per_model_predictions = []
    for fitted in models:
        per_model_predictions.append(fitted.predict(inputs))
    return np.mean(per_model_predictions, axis=0)

In [None]:
preds = predict_avg(models, X)

In [None]:
preds

### HyperParameter Tuning and Regularisation

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the rows for validation. The split is seeded, like every
# model in this notebook (random_state=42), so the RMSE figures printed by the
# tuning cells are reproducible across runs.
X_train, X_val, train_targets, val_targets = train_test_split(
    X, target, test_size=0.1, random_state=42)

In [None]:
def test_params(**params):
    """Fit an ``XGBRegressor`` with ``params`` and print its two RMSE scores.

    Uses the notebook-level ``X_train``/``train_targets`` for fitting and
    ``X_val``/``val_targets`` for validation. ``n_jobs=-1`` and
    ``random_state=42`` are always set. Nothing is returned.
    """
    regressor = XGBRegressor(n_jobs=-1, random_state=42, **params)
    regressor.fit(X_train, train_targets)

    fitted_predictions = regressor.predict(X_train)
    train_rmse = rmse(fitted_predictions, train_targets)
    held_out_predictions = regressor.predict(X_val)
    val_rmse = rmse(held_out_predictions, val_targets)

    report = 'Train RMSE: {}, Validation RMSE: {}'
    print(report.format(train_rmse, val_rmse))


#### `n_estimators`


In [None]:
test_params(n_estimators=10)

In [None]:
test_params(n_estimators=30)

In [None]:
test_params(n_estimators=100)

#### `max_depth`

In [None]:
test_params(max_depth=5)

In [None]:
test_params(max_depth=10)

#### `learning_rate`

In [None]:
test_params(n_estimators=50, learning_rate=0.01)

In [None]:
test_params(n_estimators=50, learning_rate=0.1)

In [None]:
test_params(n_estimators=50, learning_rate=0.3)

### Making Predictions

In [None]:
model = XGBRegressor(n_jobs=-1, random_state=42, n_estimators=1000,
                    learning_rate=0.2, max_depth=10, subsample=0.9,
                    colsample_bytree=0.7)

In [None]:
%%time
model.fit(X, target)

Now that the model is trained, we can make predictions on the test set.

In [None]:
test_preds = model.predict(X_test)

In [None]:
submission_df

In [None]:
submission_df['Sales'] = test_preds

In [None]:
test_df.Open.isna().sum()

In [None]:
test_df[test_df.Open.isna()]

The best way to fill the NaN values is to replace them with 1: we have no conclusive evidence of whether the store was open or closed, and treating it as open keeps the model's prediction instead of forcing the sales to 0.

In [None]:
submission_df['Sales'] = submission_df['Sales'] * test_df.Open.fillna(1.)

In [None]:
submission_df.sample(20)

Saving the file for submission