# <span style="color:crimson;">Store Item Demand Forecasting Challenge</span>

* https://www.kaggle.com/c/demand-forecasting-kernels-only
* The task is a 3-month, item-level sales forecast for different stores.
* There are 10 different stores and 50 different items in a 5-year dataset.
* Accordingly, we need to forecast the next 3 months of sales for every store-item combination.

In [None]:
import time
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import lightgbm as lgb
import warnings


# Notebook-wide display settings: never truncate the column list and allow
# wide console output when DataFrames are printed.
for option_name, option_value in (('display.max_columns', None), ('display.width', 500)):
    pd.set_option(option_name, option_value)
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# <span style="color:crimson;">Loading the data</span>

In [None]:
# Load the competition files; 'date' is parsed to datetime so the `.dt`
# accessor can be used for calendar features later on.
train = pd.read_csv('../input/demand-forecasting-kernels-only/train.csv', parse_dates=['date'])
test = pd.read_csv('../input/demand-forecasting-kernels-only/test.csv', parse_dates=['date'])
sample_sub = pd.read_csv('../input/demand-forecasting-kernels-only/sample_submission.csv')
# Stack train and test so that features are engineered once over both.
# ignore_index=True gives the combined frame a unique RangeIndex; without it
# the row labels of train and test (both start at 0) collide, which makes any
# later label-based selection or alignment ambiguous.
df = pd.concat([train, test], sort=False, ignore_index=True)

# <span style="color:crimson;">Exploratory Data Analysis</span>

In [None]:
# Earliest and latest date in the combined (train + test) frame.
df["date"].min(), df["date"].max()


In [None]:

def check_df(dataframe, head=5, tail=5, quan=False):
    """Print a quick structural overview of a DataFrame.

    Prints, in order: shape, column dtypes, the first ``head`` rows, the last
    ``tail`` rows and the number of missing values per column.

    Args:
        dataframe: The pandas DataFrame to inspect.
        head: Number of leading rows to print.
        tail: Number of trailing rows to print.
        quan: When True, additionally print selected quantiles
            (0, 5, 50, 95, 99 and 100 %) of the numeric columns.

    Returns:
        None. Everything is written to stdout.
    """
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(head))
    print("##################### Tail #####################")
    print(dataframe.tail(tail))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())

    if quan:
        print("##################### Quantiles #####################")
        # numeric_only=True keeps this working on frames that also hold
        # string columns; pandas >= 2.0 no longer drops non-numeric columns
        # silently and raises a TypeError instead.
        print(dataframe.quantile([0, 0.05, 0.50, 0.95, 0.99, 1], numeric_only=True).T)

In [None]:
# Structural overview of each raw split, train first and then test.
for frame in (train, test):
    check_df(frame)


In [None]:
# Number of distinct items sold in each store.
df.groupby(["store"])["item"].nunique()

In [None]:
# Sum, mean, median and standard deviation of sales per (store, item) pair.
df.groupby(["store", "item"]).agg({"sales": ["sum", "mean", "median", "std"]})

In [None]:

def missing_values_table(dataframe, na_name=False):
    """Print the count and percentage of missing values per affected column.

    Only columns containing at least one null are listed, ordered by their
    number of missing values (largest first). The percentage is relative to
    the number of rows and rounded to two decimals.

    Args:
        dataframe: The pandas DataFrame to inspect.
        na_name: When True, return the names of the columns with nulls.

    Returns:
        A list of column names (in column order) when ``na_name`` is True,
        otherwise None.
    """
    null_counts = dataframe.isnull().sum()
    na_columns = null_counts[null_counts > 0].index.tolist()

    n_miss = dataframe[na_columns].isnull().sum().sort_values(ascending=False)
    # The ratio is a monotone function of the count, so it shares n_miss's order.
    ratio = n_miss / dataframe.shape[0] * 100
    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")

    return na_columns if na_name else None
# Report missing values of the combined frame; with na_name=True the names of
# the affected columns are returned as well.
missing_values_table(df, na_name=True)

# <span style="color:crimson;">Feature Engineering</span>

In [None]:

def create_date_features(df):
    """Add calendar features derived from the ``date`` column.

    The frame is modified in place and also returned. Added columns:
    ``month``, ``day_of_month``, ``day_of_year``, ``week_of_year`` (ISO week),
    ``day_of_week`` (Monday=0), ``year``, ``is_wknd``, ``is_month_start``,
    ``is_month_end``, ``first_week`` and ``last_week``.

    Args:
        df: DataFrame with a datetime-typed ``date`` column.

    Returns:
        The same DataFrame object with the new columns attached.
    """
    dates = df['date'].dt
    df['month'] = dates.month
    df['day_of_month'] = dates.day
    df['day_of_year'] = dates.dayofyear
    # `.dt.weekofyear` was removed in pandas 2.0; isocalendar().week is the
    # supported equivalent (it returns UInt32, hence the cast).
    df['week_of_year'] = dates.isocalendar().week.astype(int)
    df['day_of_week'] = dates.dayofweek
    df['year'] = dates.year
    # weekday is 0 (Monday) .. 6 (Sunday); integer division by 4 yields 1 for
    # Friday, Saturday and Sunday and 0 otherwise.
    df["is_wknd"] = dates.weekday // 4
    df['is_month_start'] = dates.is_month_start.astype(int)
    df['is_month_end'] = dates.is_month_end.astype(int)
    # Explicit 0/1 flags. The previous version set only the 1s and discarded
    # the result of fillna(0), so every other row stayed NaN.
    df['first_week'] = df['day_of_month'].between(1, 7).astype(int)
    df['last_week'] = (df['day_of_month'] >= 28).astype(int)
    return df
# Attach the calendar features to the combined frame and inspect the result.
df = create_date_features(df)
check_df(df)



In [None]:
def random_noise(dataframe, scale=1.6):
    """Draw one zero-mean Gaussian noise value per row of ``dataframe``.

    Args:
        dataframe: Any sized object; only ``len(dataframe)`` is used.
        scale: Standard deviation of the noise. Defaults to 1.6, the value
            that was previously hard-coded.

    Returns:
        A 1-D numpy array of length ``len(dataframe)``.
    """
    return np.random.normal(scale=scale, size=(len(dataframe),))

# <span style="color:crimson;">Lag/Shifted Features</span>

In [None]:


# Order the rows by date within each store/item series; the shift-based
# features computed afterwards move along this row order.
df = df.sort_values(['store', 'item', 'date'])

check_df(df)

In [None]:

def lag_features(dataframe, lags):
    """Add noisy lagged-sales columns, one per entry of ``lags``.

    For every lag a column ``sales_lag_<lag>`` is written to ``dataframe``
    (in place). It holds the ``sales`` value ``lag`` rows earlier within the
    same (store, item) group, plus Gaussian noise from ``random_noise``.

    Args:
        dataframe: Frame with ``store``, ``item`` and ``sales`` columns,
            expected to be ordered by date within each group.
        lags: Iterable of row offsets to shift by.

    Returns:
        The same DataFrame object with the lag columns attached.
    """
    sales_by_series = dataframe.groupby(["store", "item"])['sales']
    for offset in lags:
        shifted = sales_by_series.transform(lambda series: series.shift(offset))
        dataframe[f'sales_lag_{offset}'] = shifted + random_noise(dataframe)
    return dataframe


# All lags are at least 91 rows — presumably chosen so that they reach back
# beyond the 3-month forecast horizon (TODO confirm).
df = lag_features(df, [91, 98, 105, 112, 119, 126, 182, 364, 546, 728])


# <span style="color:crimson;">Rolling Mean Features</span>


*  A rolling mean creates moving averages over a specified time interval.
*  The `window` parameter sets how many observations are averaged,
*  but by default one of the averaged values is the value of the current observation itself.
*  In order to eliminate today's effect on the moving average,
*  the series is shifted by 1 first and the rolling mean is computed on the shifted values.

In [None]:

def roll_mean_features(dataframe, windows):
    """Add noisy rolling-mean sales columns, one per entry of ``windows``.

    For every window size a column ``sales_roll_mean_<window>`` is written to
    ``dataframe`` (in place). Within each (store, item) group the sales are
    shifted by one row, so the current row is excluded, and then averaged
    with a triangular-weighted rolling window that needs at least 10
    observations. Gaussian noise from ``random_noise`` is added on top.

    Args:
        dataframe: Frame with ``store``, ``item`` and ``sales`` columns,
            expected to be ordered by date within each group.
        windows: Iterable of window sizes in rows.

    Returns:
        The same DataFrame object with the rolling-mean columns attached.
    """
    for window in windows:
        def shifted_triangular_mean(series, size=window):
            previous_sales = series.shift(1)
            return previous_sales.rolling(window=size, min_periods=10, win_type="triang").mean()

        sales_by_series = dataframe.groupby(["store", "item"])['sales']
        rolled = sales_by_series.transform(shifted_triangular_mean)
        dataframe[f'sales_roll_mean_{window}'] = rolled + random_noise(dataframe)
    return dataframe


# Rolling means over windows of 91, 182, 365 and 546 rows; show the last rows
# to eyeball the new columns.
df = roll_mean_features(df, [91,182,365, 546])
df.tail()


# <span style="color:crimson;">Exponentially Weighted Mean Features</span>

* The value at time t depends strongly on the value at time t-1,
* so, to get a better prediction when computing the average,
* we give higher weights to the more recent observations.

In [None]:
def ewm_features(dataframe, alphas, lags):
    """Return a copy of ``dataframe`` with exponentially weighted sales means.

    For every (alpha, lag) pair a column
    ``sales_ewm_alpha_<alpha without the dot>_lag_<lag>`` is added. Within
    each (store, item) group the sales are shifted by ``lag`` rows and then
    smoothed with an exponentially weighted mean using that ``alpha``.

    Args:
        dataframe: Frame with ``store``, ``item`` and ``sales`` columns. It
            is not modified.
        alphas: Iterable of smoothing factors.
        lags: Iterable of row offsets applied before smoothing.

    Returns:
        A new DataFrame holding the original columns plus the EWM columns.
    """
    result = dataframe.copy()
    for alpha in alphas:
        alpha_tag = str(alpha).replace(".", "")
        for lag in lags:
            sales_by_series = result.groupby(["store", "item"])['sales']
            result[f"sales_ewm_alpha_{alpha_tag}_lag_{lag}"] = sales_by_series.transform(
                lambda series: series.shift(lag).ewm(alpha=alpha).mean())
    return result

In [None]:
# Smoothing factors and row offsets; one EWM column is created for every
# (alpha, lag) combination.
alphas = [0.95, 0.9, 0.8, 0.7, 0.5]
lags = [91, 98, 105, 112, 180, 270, 365, 546, 728]

# ewm_features works on a copy, so the result has to be re-assigned.
df = ewm_features(df, alphas, lags)
df.tail()

# <span style="color:crimson;">One-Hot Encoding</span>

In [None]:
# Replace the categorical columns with one indicator column per category.
df = pd.get_dummies(df, columns=['store', 'item', 'day_of_week', 'month'])

In [None]:
# Convert sales to log(1 + sales); predictions are mapped back with expm1.
df['sales'] = np.log1p(df["sales"].values)


# <span style="color:crimson;">LightGBM Model</span>

In [None]:
# Training window: everything strictly before 2017-01-01.
is_before_2017 = df["date"] < "2017-01-01"
train = df.loc[is_before_2017, :]
train["date"].min(), train["date"].max()

# Validation window: the first three months of 2017.
is_2017_q1 = (df["date"] >= "2017-01-01") & (df["date"] < "2017-04-01")
val = df.loc[is_2017_q1, :]

# Model inputs: every column except the target, the identifiers and columns
# whose information is already carried by derived features.
non_feature_columns = ['date', 'id', "sales", "year"]
cols = [col for col in train.columns if col not in non_feature_columns]


# <span style="color:crimson;">Custom Evaluation Metric (SMAPE)</span>

In [None]:

def smape(preds, target):
    """Compute the symmetric mean absolute percentage error, in percent.

    Each pair contributes ``|pred - target| / (|pred| + |target|)``. Pairs
    where both values are exactly zero contribute 0 but still count towards
    the number of observations that the sum is divided by.

    Args:
        preds: Array-like of predictions (list, ndarray, Series, ...).
        target: Array-like of true values with the same length.

    Returns:
        The SMAPE as a float in the range 0 to 200, or NaN for empty input.
    """
    # Coerce to plain arrays so lists work and pandas inputs are compared by
    # position rather than by index label.
    preds = np.asarray(preds, dtype=float)
    target = np.asarray(target, dtype=float)

    n = len(preds)
    if n == 0:
        # Nothing to average; avoid the 0 / 0 division below.
        return float("nan")

    # Drop the 0/0 pairs from the sum while keeping them in the divisor n.
    keep = ~((preds == 0) & (target == 0))
    preds, target = preds[keep], target[keep]
    num = np.abs(preds - target)
    denom = np.abs(preds) + np.abs(target)
    return (200 * np.sum(num / denom)) / n


def lgbm_smape(preds, train_data):
    """SMAPE wrapper in the form passed to LightGBM as ``feval``.

    Predictions and labels are mapped back with ``expm1`` before scoring,
    mirroring the ``log1p`` transform applied to the sales target.

    Args:
        preds: Predictions on the transformed scale.
        train_data: Object exposing ``get_label()`` for the true values on
            the same scale.

    Returns:
        The tuple ``('SMAPE', value, False)``; the final False marks the
        metric as lower-is-better.
    """
    actual_sales = np.expm1(train_data.get_label())
    predicted_sales = np.expm1(preds)
    return 'SMAPE', smape(predicted_sales, actual_sales), False


Why a validation set is used during training:
* If the model begins to memorize the training set instead of learning its patterns,
* the training error keeps getting lower, but the model cannot predict the validation set well
* (because it did not learn the patterns), so the validation error begins to increase.


In [None]:
# Feature matrix and target for the training window.
X_train, Y_train = train[cols], train['sales']

# Feature matrix and target for the validation window.
X_val, Y_val = val[cols], val['sales']

# Sanity check of the resulting shapes.
Y_train.shape, X_train.shape, Y_val.shape, X_val.shape

In [None]:
# Model and training settings. 'num_boost_round' and 'early_stopping_rounds'
# are read back from this dict when the model is trained.
lgb_params = dict(
    metric={'mae'},
    num_leaves=10,
    learning_rate=0.02,
    feature_fraction=0.8,
    max_depth=5,
    verbose=0,
    num_boost_round=15000,
    early_stopping_rounds=200,
    nthread=-1,
)


# The validation Dataset references the training Dataset.
lgbtrain = lgb.Dataset(X_train, label=Y_train, feature_name=cols)
lgbval = lgb.Dataset(X_val, label=Y_val, reference=lgbtrain, feature_name=cols)

In [None]:

# Training-control settings are handed to lgb.train explicitly below, so they
# are filtered out of the booster parameters to avoid passing them twice.
booster_params = {key: value for key, value in lgb_params.items()
                  if key not in ('num_boost_round', 'early_stopping_rounds')}

# The `early_stopping_rounds=` and `verbose_eval=` keyword arguments were
# removed from lgb.train in LightGBM 4.0; the callbacks below are the
# supported replacement and behave the same way on 3.3+.
model = lgb.train(booster_params, lgbtrain,
                  valid_sets=[lgbtrain, lgbval],  # scored after every round
                  num_boost_round=lgb_params['num_boost_round'],
                  feval=lgbm_smape,  # custom SMAPE reported next to the built-in metric
                  callbacks=[
                      # stop once the validation scores stop improving
                      lgb.early_stopping(stopping_rounds=lgb_params['early_stopping_rounds']),
                      # report every 100 iterations
                      lgb.log_evaluation(period=100),
                  ])


In [None]:
# Validation predictions from the best iteration found during training.
# They are on the log1p scale; apply np.expm1 to get sales back.
y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)

In [None]:
# Final Model
#####################################################
# Every row with a known target (the earlier train and validation windows
# together) is used for fitting; rows without a target form the test set.

has_target = df.sales.notna()

train = df.loc[has_target]
Y_train = train['sales']
X_train = train[cols]

test = df.loc[~has_target]
X_test = test[cols]




In [None]:

# Refit on all labelled rows for exactly as many rounds as the validated
# model's best iteration; no validation set or early stopping is used here.
best_rounds = model.best_iteration

lgb_params = dict(
    metric={'mae'},
    num_leaves=10,
    learning_rate=0.02,
    feature_fraction=0.8,
    max_depth=5,
    verbose=0,
    nthread=-1,
    num_boost_round=best_rounds,
)

lgbtrain_all = lgb.Dataset(data=X_train, label=Y_train, feature_name=cols)
final_model = lgb.train(lgb_params, lgbtrain_all, num_boost_round=best_rounds)
test_preds = final_model.predict(X_test, num_iteration=best_rounds)


In [None]:

# Build the submission: integer ids of the test rows and the predictions
# mapped back from the log1p scale to sales.
submission_df = pd.DataFrame({
    'id': test['id'].astype(int),
    'sales': np.expm1(test_preds),
})
submission_df.to_csv('submission.csv', index=False)