# Importing Modules

In [None]:
import time
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import lightgbm as lgb
import warnings

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)
warnings.filterwarnings('ignore')

# Loading Data

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root, file_name)
        print(full_path)

# Creating Data Frame

In [None]:
# Train data (quick preview of the raw file)
pd.read_csv("../input/demand-forecasting-kernels-only/train.csv")

In [None]:
train = pd.read_csv("../input/demand-forecasting-kernels-only/train.csv", parse_dates=['date'])
test = pd.read_csv("../input/demand-forecasting-kernels-only/test.csv", parse_dates=['date'])

In [None]:
train.shape

In [None]:
test.shape

In [None]:
sample_sub = pd.read_csv("../input/demand-forecasting-kernels-only/sample_submission.csv")
sample_sub.head()

In [None]:
# Concatenate train and test so the feature engineering below is applied to both at once.
# NOTE: ignore_index is not set, so each frame keeps its own row labels and the
# combined index contains duplicate labels.
df = pd.concat([train, test], sort=False)
df.head()

In [None]:
df.shape

In [None]:
df.isnull().sum()

# EDA

In [None]:
df['date'].min() ,df['date'].max()

In [None]:
def check_df(dataframe, head=5):
    """Print a quick overview of a DataFrame.

    Prints, in order: the shape, the column dtypes, the first and last
    ``head`` rows, the number of missing values per column, and selected
    quantiles of the numeric columns.

    Args:
        dataframe: The pandas DataFrame to summarise.
        head: Number of rows to show from the top and from the bottom.

    Returns:
        None. Everything is written to standard output.
    """
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(head))
    print("##################### Tail #####################")
    print(dataframe.tail(head))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    # Quantiles are only defined for numeric data; recent pandas versions raise
    # on object/string columns instead of silently skipping them.
    numeric_part = dataframe.select_dtypes(include="number")
    print(numeric_part.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)

In [None]:
check_df(train)

In [None]:
check_df(test)

In [None]:
check_df(sample_sub)

In [None]:
check_df(df)

In [None]:
df["sales"].describe([0.10, 0.30, 0.50, 0.70, 0.80, 0.90, 0.95, 0.99])

* Checking Unique values

In [None]:
df[["store"]].nunique()

In [None]:
df[["item"]].nunique()

In [None]:
df.groupby(["store"])["item"].nunique()

In [None]:
df.groupby(["store", "item"]).agg({"sales": ["sum"]})

In [None]:
df.groupby(["store", "item"]).agg({"sales": ["sum", "mean", "median", "std"]})

# Feature Engineering

# Date Features

In [None]:
df['date'].head()

In [None]:
def create_date_features(df):
    """Add calendar-derived columns computed from the ``date`` column.

    The frame is modified in place and also returned, so both
    ``create_date_features(df)`` and ``df = create_date_features(df)`` work.

    Args:
        df: DataFrame with a datetime64 ``date`` column.

    Returns:
        The same DataFrame with the additional feature columns.
    """
    df['month'] = df.date.dt.month
    df['quarter'] = df.date.dt.quarter
    df['is_q_end'] = df.date.dt.is_quarter_end.astype(int)
    df['is_q_start'] = df.date.dt.is_quarter_start.astype(int)
    df['days_in_month'] = df.date.dt.days_in_month
    df['day_of_month'] = df.date.dt.day
    df['day_of_year'] = df.date.dt.dayofyear
    # `Series.dt.weekofyear` was removed in pandas 2.0; isocalendar().week is the
    # replacement. It yields a nullable UInt32 column, so cast back to int64.
    df['week_of_year'] = df.date.dt.isocalendar().week.astype('int64')
    df['day_of_week'] = df.date.dt.dayofweek
    df['year'] = df.date.dt.year
    # weekday is 0 (Monday) .. 6 (Sunday); integer division by 4 gives 1 for
    # Friday, Saturday and Sunday and 0 otherwise.
    df["is_wknd"] = df.date.dt.weekday // 4
    df['is_month_start'] = df.date.dt.is_month_start.astype(int)
    df['is_month_end'] = df.date.dt.is_month_end.astype(int)
    return df

In [None]:
df = create_date_features(df)
df.head()

In [None]:
# Sales statistics per store, item, year and month
df.groupby(["store", "item", "year","month"]).agg({"sales": ["sum", "mean", "median", "std"]})

In [None]:
# The same sales statistics, broken down by store, item, year and quarter instead of month
df.groupby(["store", "item", "year","quarter"]).agg({"sales": ["sum", "mean", "median", "std"]})

# Random Noise

In [None]:
def random_noise(dataframe):
    """Draw one zero-mean Gaussian value (standard deviation 1.6) per row of ``dataframe``.

    Returns a 1-D numpy array whose length equals ``len(dataframe)``.
    """
    row_count = len(dataframe)
    return np.random.normal(loc=0.0, scale=1.6, size=row_count)

# Lag/Shifted Features

In [None]:
# Order rows by store, item and date (in place) so that the positional shift/rolling
# features computed per (store, item) group below follow chronological order.
df.sort_values(by=['store', 'item', 'date'], axis=0, inplace=True)

In [None]:
check_df(df)

# Checking top-10 observations

In [None]:
df["sales"].head(10)

In [None]:
# First lag (sales shifted by one row)
df["sales"].shift(1).values[0:10]

In [None]:
# Second lag (sales shifted by two rows)
df["sales"].shift(2).values[0:10]

In [None]:
# Third lag (sales shifted by three rows)
df["sales"].shift(3).values[0:10]

In [None]:
# Let's view sales side by side with its lagged versions
pd.DataFrame({"sales": df["sales"].values[0:10],
              "lag1": df["sales"].shift(1).values[0:10],
              "lag2": df["sales"].shift(2).values[0:10],
              "lag3": df["sales"].shift(3).values[0:10],
              "lag4": df["sales"].shift(4).values[0:10]})

In [None]:
df.groupby(["store", "item"])['sales'].head()

In [None]:
df.groupby(["store", "item"])['sales'].transform(lambda x: x.shift(1))

In [None]:
def lag_features(dataframe, lags):
    """Add one noisy lagged-sales column per entry in ``lags``.

    For every lag, sales are shifted by that many rows within each
    (store, item) group, Gaussian noise from ``random_noise`` is added, and the
    result is stored as ``sales_lag_<lag>``. The frame is modified in place and
    returned.
    """
    for lag in lags:
        # Shift inside each store/item series so values never leak across series.
        shifted_sales = dataframe.groupby(["store", "item"])['sales'].shift(lag)
        dataframe[f"sales_lag_{lag}"] = shifted_sales + random_noise(dataframe)
    return dataframe

In [None]:
df = lag_features(df, [91, 98, 105, 112, 119, 126, 182, 364, 456,546, 728,821])
check_df(df)

In [None]:
# Checking Null values in sales data
df[df["sales"].isnull()]

# Rolling Mean Features

In [None]:
df["sales"].head(10)

In [None]:
df["sales"].rolling(window=2).mean().values[0:10]

In [None]:
df["sales"].rolling(window=3).mean().values[0:10]

In [None]:
df["sales"].rolling(window=5).mean().values[0:10]

In [None]:
pd.DataFrame({"sales": df["sales"].values[0:10],
              "roll2": df["sales"].rolling(window=2).mean().values[0:10],
              "roll3": df["sales"].rolling(window=3).mean().values[0:10],
              "roll5": df["sales"].rolling(window=5).mean().values[0:10]})

In [None]:
def roll_mean_features(dataframe, windows):
    """Add one noisy rolling-mean column per entry in ``windows``.

    Within each (store, item) group, sales are shifted by one row and averaged
    with a triangular-weighted rolling window (at least 10 observations
    required). Noise from ``random_noise`` is added and the result is stored as
    ``sales_roll_mean_<window>``. The frame is modified in place and returned.
    """
    for window in windows:
        def lagged_triangular_mean(series, span=window):
            # shift(1) keeps the current row's own sales out of its average
            return series.shift(1).rolling(window=span, min_periods=10, win_type="triang").mean()

        smoothed = dataframe.groupby(["store", "item"])['sales'].transform(lagged_triangular_mean)
        dataframe[f"sales_roll_mean_{window}"] = smoothed + random_noise(dataframe)
    return dataframe

In [None]:
df = roll_mean_features(df, [365, 456,546])

# Exponentially Weighted Mean Features

In [None]:
# Compare a 2-row rolling mean of the lagged sales with exponentially weighted
# means of the same lagged series for several alpha values.
# Each column label matches the alpha used to compute it (alpha=0.1 -> "ewm01").
pd.DataFrame({"sales": df["sales"].values[0:10],
              "roll2": df["sales"].shift(1).rolling(window=2).mean().values[0:10],
              "ewm099": df["sales"].shift(1).ewm(alpha=0.99).mean().values[0:10],
              "ewm095": df["sales"].shift(1).ewm(alpha=0.95).mean().values[0:10],
              "ewm07": df["sales"].shift(1).ewm(alpha=0.7).mean().values[0:10],
              "ewm01": df["sales"].shift(1).ewm(alpha=0.1).mean().values[0:10]})

In [None]:
def ewm_features(dataframe, alphas, lags):
    """Add exponentially weighted mean features for every (alpha, lag) pair.

    Within each (store, item) group, sales are shifted by ``lag`` rows and then
    smoothed with ``ewm(alpha=alpha).mean()``. The column name is
    ``sales_ewm_alpha_<alpha without the dot>_lag_<lag>``. The frame is
    modified in place and returned.
    """
    for alpha in alphas:
        alpha_tag = str(alpha).replace(".", "")
        for lag in lags:
            def lagged_ewm(series, shift_by=lag, smoothing=alpha):
                return series.shift(shift_by).ewm(alpha=smoothing).mean()

            column = f"sales_ewm_alpha_{alpha_tag}_lag_{lag}"
            dataframe[column] = dataframe.groupby(["store", "item"])['sales'].transform(lagged_ewm)
    return dataframe

In [None]:
alphas = [0.95, 0.9, 0.8, 0.7, 0.5]
lags = [91, 98, 105, 112, 180, 270, 365,456,546, 728,821]

In [None]:
df = ewm_features(df, alphas, lags)
check_df(df)

# One-Hot-Encoding

In [None]:
df = pd.get_dummies(df, columns=['store', 'item', 'day_of_week', 'month'])

# Converting sales to log(1+sales)

In [None]:
# Replace sales with log(1 + sales); predictions are mapped back with np.expm1
# when SMAPE is computed and when the submission is written.
df['sales'] = np.log1p(df["sales"].values)
check_df(df)

# Model
* Custom Cost Function
* MAE: mean absolute error
* MAPE: mean absolute percentage error
* SMAPE: Symmetric mean absolute percentage error (adjusted MAPE)

In [None]:
def smape(preds, target):
    """Return the symmetric mean absolute percentage error, in percent.

    Computes ``200 / n * sum(|preds - target| / (|preds| + |target|))`` where
    ``n`` is the total number of pairs. Pairs in which both values are zero
    would be 0/0; they are skipped in the sum (contributing zero error) but
    still count towards ``n``.

    Args:
        preds: Array-like of predicted values (list, ndarray, Series, ...).
        target: Array-like of true values with the same shape as ``preds``.

    Returns:
        The SMAPE as a float in the range 0..200; ``nan`` for empty input.

    Raises:
        ValueError: If ``preds`` and ``target`` have different shapes.
    """
    # Coerce to arrays: with plain lists `preds == 0` is a single bool and the
    # mask below would silently select only the last element.
    preds = np.asarray(preds, dtype=float)
    target = np.asarray(target, dtype=float)
    if preds.shape != target.shape:
        raise ValueError(
            f"preds and target must have the same shape, got {preds.shape} and {target.shape}")

    n = preds.size
    if n == 0:
        # The mean over zero observations is undefined.
        return float("nan")

    keep = ~((preds == 0) & (target == 0))
    preds, target = preds[keep], target[keep]
    num = np.abs(preds - target)
    denom = np.abs(preds) + np.abs(target)
    return float((200 * np.sum(num / denom)) / n)

def lgbm_smape(preds, train_data):
    """Custom LightGBM evaluation function reporting SMAPE on the original scale.

    Both the predictions and the dataset labels are passed through ``np.expm1``
    before being scored with ``smape``.

    Args:
        preds: Raw model predictions.
        train_data: Object exposing ``get_label()`` (the dataset being evaluated).

    Returns:
        The tuple ``('SMAPE', value, False)``; the trailing ``False`` is the
        is-higher-better flag expected from a LightGBM ``feval``.
    """
    actual = np.expm1(train_data.get_label())
    predicted = np.expm1(preds)
    return 'SMAPE', smape(predicted, actual), False

# Time-Based Validation Sets

In [None]:
# Train set: everything before 2017 (i.e. through the end of 2016)
train = df.loc[(df["date"] < "2017-01-01"), :]

In [None]:
# Validation set: the first 3 months of 2017
val = df.loc[(df["date"] >= "2017-01-01") & (df["date"] < "2017-04-01"), :]
val.head()

In [None]:
cols = [col for col in train.columns if col not in ['date', 'id', "sales", "year"]]

In [None]:
Y_train = train['sales']
X_train = train[cols]

Y_val = val['sales']
X_val = val[cols]

In [None]:
# Sanity check: shapes of the train and validation sets
Y_train.shape, X_train.shape, Y_val.shape, X_val.shape

# LightGBM Model

In [None]:
# Parameters for the validation run; the evaluation metric is mean absolute error.
lgb_params =  { 'metric': 'mae',
                'num_leaves' : 12,
                'max_depth': 10,
                'min_child_samples': 5,
                'learning_rate': 0.03,
                'colsample_bytree': 0.5,
                'verbose': 0,
                'num_boost_round': 2000,  # also read explicitly by the lgb.train call below
                'early_stopping_rounds': 200,  # also read explicitly by the lgb.train call below
                'min_child_weight' : 0.1,
                'nthread': -1}

* metric mae: l1, absolute loss, mean_absolute_error, regression_l1
* l2, square loss, mean_squared_error, mse, regression_l2, regression
* rmse, root square loss, root_mean_squared_error, l2_root
* mape, MAPE loss, mean_absolute_percentage_error

In [None]:
# Wrap the frames in LightGBM datasets; the validation set references the training set.
lgbtrain = lgb.Dataset(data=X_train, label=Y_train, feature_name=cols)
lgbval = lgb.Dataset(data=X_val, label=Y_val, reference=lgbtrain, feature_name=cols)

# Round count and early-stopping patience control the training loop rather than
# the booster, so strip them from the params dict and pass them explicitly.
loop_keys = ('num_boost_round', 'early_stopping_rounds')
booster_params = {key: value for key, value in lgb_params.items() if key not in loop_keys}

# The `early_stopping_rounds=` and `verbose_eval=` keyword arguments were removed
# from lgb.train in LightGBM 4.0; the callback API replaces them.
model = lgb.train(booster_params, lgbtrain,
                  valid_sets=[lgbtrain, lgbval],
                  num_boost_round=lgb_params['num_boost_round'],
                  feval=lgbm_smape,
                  callbacks=[lgb.early_stopping(stopping_rounds=lgb_params['early_stopping_rounds']),
                             lgb.log_evaluation(period=100)])
y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)

In [None]:
smape(np.expm1(y_pred_val), np.expm1(Y_val))

# Feature Importance

In [None]:
def plot_lgb_importances(model, plot=False, num=10):
    """Show the most important features of a trained LightGBM model.

    Features are ranked by their share of total gain (in percent). The split
    count is reported alongside.

    Args:
        model: Trained booster exposing ``feature_importance(kind)`` and
            ``feature_name()``.
        plot: If True, draw a horizontal bar plot of the gain shares;
            otherwise print the table.
        num: Number of top features to show, for both the plot and the
            printed table.

    Returns:
        None.
    """
    gain = model.feature_importance('gain')
    feat_imp = pd.DataFrame({'feature': model.feature_name(),
                             'split': model.feature_importance('split'),
                             'gain': 100 * gain / gain.sum()}).sort_values('gain', ascending=False)
    # Honour `num` in both branches (the plot used to be fixed at 25 rows).
    top_features = feat_imp.head(num)
    if plot:
        plt.figure(figsize=(10, 10))
        sns.set(font_scale=1)
        sns.barplot(x="gain", y="feature", data=top_features)
        plt.title('feature')
        plt.tight_layout()
        plt.show()
    else:
        print(top_features)

In [None]:
plot_lgb_importances(model, num=30, plot=True)

In [None]:
lgb.plot_importance(model, max_num_features=20, figsize=(10, 10), importance_type="gain")
plt.show()

# Model Deployment

In [None]:
# Final training data: every row with a known (non-NaN) sales value.
train = df.loc[~df.sales.isna()]
Y_train = train['sales']
X_train = train[cols]

# Rows with missing sales are the ones to predict (presumably the test rows added
# by the earlier concat -- confirm that test.csv carries no sales column).
test = df.loc[df.sales.isna()]
X_test = test[cols]

In [None]:
# Parameters for the final model.
# There is deliberately no `num_boost_round` entry: a round-count key inside the
# params dict takes precedence over lgb.train's `num_boost_round` argument, which
# would discard the early-stopping optimum passed to the final training call.
lgb_params = {'metric': 'mae',
              'num_leaves': 12,
              'max_depth': 10,
              'min_child_samples': 5,
              'learning_rate': 0.03,
              'colsample_bytree': 0.5,
              'verbose': 0,
              'min_child_weight': 0.1,
              'nthread': -1}

In [None]:
lgbtrain_all = lgb.Dataset(data=X_train, label=Y_train, feature_name=cols)

In [None]:
# Retrain on all labeled rows. On the right-hand side `model` is still the booster from
# the validation run, so `model.best_iteration` is read before the name is rebound here.
# NOTE(review): LightGBM gives a round-count key inside the params dict precedence over
# the `num_boost_round` argument -- confirm `lgb_params` does not define one.
model = lgb.train(lgb_params, lgbtrain_all, num_boost_round=model.best_iteration)

In [None]:
# The final booster was trained without a validation set, so it has no early-stopping
# optimum of its own; predict with every tree it contains.
test_preds = model.predict(X_test)

In [None]:
# Validation SMAPE again: y_pred_val and Y_val come from the earlier validation run,
# so this does not score the retrained final model.
smape(np.expm1(y_pred_val), np.expm1(Y_val))

# Model Submission

In [None]:
# Build the submission: the ids of the test rows plus the predictions mapped back
# from log1p space with expm1, written to submission.csv without the index.
submission_df = test[['id', 'sales']].copy()
submission_df['sales'] = np.expm1(test_preds)
submission_df['id'] = submission_df['id'].astype(int)
submission_df.to_csv('submission.csv', index=False)

In [None]:
submission_df.head(20)