In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Three-month, item-level sales forecast for different stores.
# The five-year data set covers 10 stores and 50 items.
# The goal is to forecast sales three months ahead at the store-item level.

import time
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import lightgbm as lgb
import warnings

# Show every column and use a wide console so printed frames are not truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)
# Silence all warnings for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')


def check_df(dataframe, head=5):
    """Print a quick structural overview of a DataFrame.

    Sections are printed in this order: shape, dtypes, the first and last
    ``head`` rows, missing-value counts per column, and selected quantiles
    of the numeric columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise.
    head : int, default 5
        Number of rows shown in the head and tail previews.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(head))
    print("##################### Tail #####################")
    print(dataframe.tail(head))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    # Quantiles are only defined for numeric data. Selecting numeric columns
    # explicitly keeps this working on pandas versions where quantile() no
    # longer drops non-numeric columns on its own and raises instead.
    numeric_part = dataframe.select_dtypes(include="number")
    print(numeric_part.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)


# Loading the data


In [None]:
# Read the competition files; the `date` column is parsed into datetimes on load.
train = pd.read_csv("../input/demand-forecasting-kernels-only/train.csv", parse_dates=['date'])
test = pd.read_csv("../input/demand-forecasting-kernels-only/test.csv", parse_dates=['date'])
sample_sub = pd.read_csv("../input/demand-forecasting-kernels-only/sample_submission.csv")
# Stack train and test so feature engineering runs once over both.
# ignore_index=True gives the combined frame unique row labels; without it each
# input keeps its own labels and the result contains duplicated index values.
df = pd.concat([train, test], sort=False, ignore_index=True)

In [None]:
# Preview the first rows of the combined train + test frame.
df.head()

# EDA


In [None]:
# Earliest and latest date in the combined frame.
df["date"].min(), df["date"].max()

In [None]:
# Structural summary of each input frame, then of the combined frame.
check_df(train)

In [None]:
check_df(test)

In [None]:
check_df(sample_sub)

In [None]:
check_df(df)

In [None]:
# How are sales distributed?
df["sales"].describe([0.10, 0.30, 0.50, 0.70, 0.80, 0.90, 0.95, 0.99])

In [None]:
# How many stores are there?
df[["store"]].nunique()

In [None]:
# How many items are there?
df[["item"]].nunique()

In [None]:
# Does every store carry the same number of unique items?
df.groupby(["store"])["item"].nunique()

In [None]:
# And does every store sell the same amount? Total sales per store-item pair.
df.groupby(["store", "item"]).agg({"sales": ["sum"]})

In [None]:
# Sales statistics broken down by store and item.
df.groupby(["store", "item"]).agg({"sales": ["sum", "mean", "median", "std"]})

# FEATURE ENGINEERING

# Date Features

In [None]:
def create_date_features(df):
    """Add calendar-derived columns computed from the ``date`` column.

    The frame is modified in place and the same object is returned, so both
    ``create_date_features(df)`` and ``df = create_date_features(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime ``date`` column.

    Returns
    -------
    pandas.DataFrame
        The input frame with these columns added: month, quarter, is_q_end,
        is_q_start, days_in_month, day_of_month, day_of_year, week_of_year,
        day_of_week, year, is_wknd, is_month_start, is_month_end.
    """
    dates = df.date.dt
    df['month'] = dates.month
    df['quarter'] = dates.quarter
    df['is_q_end'] = dates.is_quarter_end.astype(int)
    df['is_q_start'] = dates.is_quarter_start.astype(int)
    df['days_in_month'] = dates.days_in_month
    df['day_of_month'] = dates.day
    df['day_of_year'] = dates.dayofyear
    # `.dt.weekofyear` no longer exists in current pandas; isocalendar().week
    # is its replacement. Cast to a plain int column like the other features.
    df['week_of_year'] = dates.isocalendar().week.astype(int)
    df['day_of_week'] = dates.dayofweek
    df['year'] = dates.year
    # weekday is 0 for Monday .. 6 for Sunday, so floor division by 4 yields 0
    # for Monday-Thursday and 1 for Friday-Sunday: Friday is flagged as weekend.
    # NOTE(review): kept as-is; confirm that including Friday is intended.
    df["is_wknd"] = dates.weekday // 4
    df['is_month_start'] = dates.is_month_start.astype(int)
    df['is_month_end'] = dates.is_month_end.astype(int)
    return df


In [None]:

# Add the calendar columns to the combined frame.
df = create_date_features(df)


In [None]:
# Now that month information is available, sales statistics can be viewed per store-item-year-month.
df.groupby(["store", "item", "year","month"]).agg({"sales": ["sum", "mean", "median", "std"]})

In [None]:
# Likewise, sales statistics per store-item-year-quarter.
df.groupby(["store", "item", "year","quarter"]).agg({"sales": ["sum", "mean", "median", "std"]})

# Random Noise

In [None]:
def random_noise(dataframe, scale=1.6, rng=None):
    """Return one zero-mean Gaussian noise value per row of ``dataframe``.

    Parameters
    ----------
    dataframe : sized object
        Only its length is used, to decide how many values to draw.
    scale : float, default 1.6
        Standard deviation of the noise.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Random source to draw from. When omitted, NumPy's global random state
        is used, so results are reproducible only if the caller seeded it.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(dataframe)``.
    """
    size = (len(dataframe),)
    if rng is None:
        return np.random.normal(scale=scale, size=size)
    return rng.normal(scale=scale, size=size)

# Lag/Shifted Features

In [None]:
# Order rows by store, item and date (in place) so that row-wise shifts within a
# store-item group step back through time.
df.sort_values(by=['store', 'item', 'date'], axis=0, inplace=True)

In [None]:

check_df(df)

In [None]:
# Look at the first 10 sales observations:
df["sales"].head(10)

In [None]:
# First lag
df["sales"].shift(1).values[0:10]


In [None]:
# Second lag
df["sales"].shift(2).values[0:10]


In [None]:
# Third lag
df["sales"].shift(3).values[0:10]

In [None]:
# For clarity, put sales and its first four lags side by side in one frame:
pd.DataFrame({"sales": df["sales"].values[0:10],
              "lag1": df["sales"].shift(1).values[0:10],
              "lag2": df["sales"].shift(2).values[0:10],
              "lag3": df["sales"].shift(3).values[0:10],
              "lag4": df["sales"].shift(4).values[0:10]})

In [None]:
# First rows of sales within each store-item group.
df.groupby(["store", "item"])['sales'].head()

In [None]:
# Lag 1 computed separately inside each store-item group.
df.groupby(["store", "item"])['sales'].transform(lambda x: x.shift(1))

In [None]:
def lag_features(dataframe, lags):
    """Add one noisy lagged-sales column per requested lag.

    For every value in ``lags`` a ``sales_lag_<lag>`` column is written: the
    ``sales`` series shifted by that many rows inside each (store, item)
    group, plus noise drawn by ``random_noise``. The frame is modified in
    place and returned.
    """
    group_keys = ["store", "item"]
    for n_back in lags:
        column = 'sales_lag_' + str(n_back)
        shifted = dataframe.groupby(group_keys)['sales'].transform(
            lambda series, k=n_back: series.shift(k))
        dataframe[column] = shifted + random_noise(dataframe)
    return dataframe

In [None]:
# Add noisy lag columns. Every lag is at least 91 rows back — presumably so the
# features stay available across the three-month forecast horizon; confirm.
df = lag_features(df, [91, 98, 105, 112, 119, 126, 182, 364, 456,546, 728,821])
check_df(df)

In [None]:
# Rows with no sales value.
df[df["sales"].isnull()]

# Rolling Mean Features

In [None]:
# First 10 sales observations, for comparison with the rolling means that follow.
df["sales"].head(10)

In [None]:
# Rolling mean over 2 rows.
df["sales"].rolling(window=2).mean().values[0:10]

In [None]:
# Rolling mean over 3 rows.
df["sales"].rolling(window=3).mean().values[0:10]

In [None]:
# Rolling mean over 5 rows.
df["sales"].rolling(window=5).mean().values[0:10]

In [None]:
# Sales next to its 2-, 3- and 5-row rolling means. No shift is applied here, so
# each mean includes the current row's own sales value.
pd.DataFrame({"sales": df["sales"].values[0:10],
              "roll2": df["sales"].rolling(window=2).mean().values[0:10],
              "roll3": df["sales"].rolling(window=3).mean().values[0:10],
              "roll5": df["sales"].rolling(window=5).mean().values[0:10]})

In [None]:
def roll_mean_features(dataframe, windows):
    """Add one noisy rolling-mean column per requested window size.

    For every value in ``windows`` a ``sales_roll_mean_<window>`` column is
    written. Inside each (store, item) group the sales are shifted by one row
    and then averaged with a triangular-weighted rolling window that needs at
    least 10 observations; noise from ``random_noise`` is added to the result.
    The frame is modified in place and returned.
    """
    for span in windows:
        def lagged_triangular_mean(series, width=span):
            # shift(1) keeps the current row's own sales out of its average.
            return series.shift(1).rolling(window=width, min_periods=10, win_type="triang").mean()

        smoothed = dataframe.groupby(["store", "item"])['sales'].transform(lagged_triangular_mean)
        dataframe['sales_roll_mean_' + str(span)] = smoothed + random_noise(dataframe)
    return dataframe

In [None]:
# Add noisy rolling-mean features for windows of 365, 456 and 546 rows per store-item group.
df = roll_mean_features(df, [365, 456,546])

# Exponentially Weighted Mean Features

In [None]:
# Compare a plain 2-row rolling mean with exponentially weighted means of the
# one-step-lagged sales for several alphas. Each column label encodes the alpha
# it was computed with (the alpha=0.1 column was previously mislabelled "ewm02").
pd.DataFrame({"sales": df["sales"].values[0:10],
              "roll2": df["sales"].shift(1).rolling(window=2).mean().values[0:10],
              "ewm099": df["sales"].shift(1).ewm(alpha=0.99).mean().values[0:10],
              "ewm095": df["sales"].shift(1).ewm(alpha=0.95).mean().values[0:10],
              "ewm07": df["sales"].shift(1).ewm(alpha=0.7).mean().values[0:10],
              "ewm01": df["sales"].shift(1).ewm(alpha=0.1).mean().values[0:10]})

In [None]:
def ewm_features(dataframe, alphas, lags):
    """Add exponentially weighted mean columns for every alpha/lag pair.

    Each column is named ``sales_ewm_alpha_<alpha without dot>_lag_<lag>`` and
    holds, per (store, item) group, the exponentially weighted mean (with the
    given alpha) of sales shifted by ``lag`` rows. The frame is modified in
    place and returned.
    """
    for smoothing in alphas:
        alpha_tag = str(smoothing).replace(".", "")
        for offset in lags:
            column = 'sales_ewm_alpha_' + alpha_tag + "_lag_" + str(offset)
            per_group_sales = dataframe.groupby(["store", "item"])['sales']
            dataframe[column] = per_group_sales.transform(
                lambda series: series.shift(offset).ewm(alpha=smoothing).mean())
    return dataframe


In [None]:

# Smoothing factors and row offsets combined into the EWM feature grid
# (one column per alpha/lag pair).
alphas = [0.95, 0.9, 0.8, 0.7, 0.5]
lags = [91, 98, 105, 112, 180, 270, 365,456,546, 728,821]

In [None]:
# Add the EWM columns and re-inspect the widened frame.
df = ewm_features(df, alphas, lags)
check_df(df)

# One-Hot Encoding

In [None]:
# Replace the categorical-like columns with one indicator column per level.
df = pd.get_dummies(df, columns=['store', 'item', 'day_of_week', 'month'])

# Converting sales to log(1+sales)

In [None]:
# Overwrite sales with log1p(sales); predictions are mapped back with expm1 later on.
df['sales'] = np.log1p(df["sales"].values)
check_df(df)

# Model

# Custom Cost Function

# MAE: mean absolute error
# MAPE: mean absolute percentage error
# SMAPE: Symmetric mean absolute percentage error (adjusted MAPE)

In [None]:

def smape(preds, target):
    """Symmetric mean absolute percentage error on a 0-200 scale.

    Inputs are compared position by position. Any array-like is accepted
    (lists, NumPy arrays, pandas Series); pandas index labels are ignored so
    two Series with different indexes are not label-aligned.

    Parameters
    ----------
    preds : array-like
        Predicted values.
    target : array-like
        Actual values, same length as ``preds``.

    Returns
    -------
    float
        ``200 / n * sum(|pred - target| / (|pred| + |target|))`` where ``n`` is
        the number of input pairs. ``nan`` when the input is empty.
    """
    preds = np.asarray(preds, dtype=float)
    target = np.asarray(target, dtype=float)
    n = len(preds)
    if n == 0:
        # Nothing to average over; avoid a 0/0 division warning.
        return np.nan
    # Pairs where both values are zero would produce 0/0. They are left out of
    # the sum but still counted in n, i.e. they contribute zero error.
    keep = ~((preds == 0) & (target == 0))
    preds, target = preds[keep], target[keep]
    num = np.abs(preds - target)
    denom = np.abs(preds) + np.abs(target)
    return (200 * np.sum(num / denom)) / n

def lgbm_smape(preds, train_data):
    """Custom LightGBM evaluation function reporting SMAPE.

    Both the predictions and the labels read from ``train_data`` are passed
    through ``expm1`` before scoring, undoing the log1p transform applied to
    the target. Returns ``(name, value, is_higher_better)`` with the last
    element ``False``.
    """
    actual_log = train_data.get_label()
    score = smape(np.expm1(preds), np.expm1(actual_log))
    metric_name, higher_is_better = 'SMAPE', False
    return metric_name, score, higher_is_better


# Time-Based Validation Sets

In [None]:
# Training set: everything before the start of 2017 (i.e. through the end of 2016).
train = df.loc[(df["date"] < "2017-01-01"), :]

In [None]:
# Validation set: the first three months of 2017.
val = df.loc[(df["date"] >= "2017-01-01") & (df["date"] < "2017-04-01"), :]

In [None]:
# Model inputs: every column except the date, the row id, the target and the year.
cols = [col for col in train.columns if col not in ['date', 'id', "sales", "year"]]

In [None]:
# Split both periods into target (Y) and feature matrix (X).
Y_train = train['sales']
X_train = train[cols]

Y_val = val['sales']
X_val = val[cols]

In [None]:
# Sanity check of the resulting shapes.
Y_train.shape, X_train.shape, Y_val.shape, X_val.shape

# LightGBM Model

In [None]:
# LightGBM settings for the validation run. 'num_boost_round' and
# 'early_stopping_rounds' are also read back out of this dict by the training
# call that uses it.
lgb_params =  { 'metric': 'mae',
                'num_leaves' : 12,
                'max_depth': 10,
                'min_child_samples': 5,
                'learning_rate': 0.03,
                'colsample_bytree': 0.5,
                'verbose': 0,
                'num_boost_round': 2000,
                'early_stopping_rounds': 200,
                'min_child_weight' : 0.1,
                'nthread': -1}

In [None]:
# metric mae: l1, absolute loss, mean_absolute_error, regression_l1
# l2, square loss, mean_squared_error, mse, regression_l2, regression
# rmse, root square loss, root_mean_squared_error, l2_root
# mape, MAPE loss, mean_absolute_percentage_error


In [None]:

# Wrap the training and validation splits; the validation set references the
# training set.
lgbtrain = lgb.Dataset(data=X_train, label=Y_train, feature_name=cols)
lgbval = lgb.Dataset(data=X_val, label=Y_val, reference=lgbtrain, feature_name=cols)

# Round count and early-stopping patience are passed explicitly to lgb.train, so
# they are taken out of the params dict to keep a single source for each.
train_params = {key: value for key, value in lgb_params.items()
                if key not in ('num_boost_round', 'early_stopping_rounds')}

# Early stopping and periodic logging go through callbacks: the
# `early_stopping_rounds=` / `verbose_eval=` keyword arguments are no longer
# accepted by lgb.train in current LightGBM releases.
model = lgb.train(train_params, lgbtrain,
                  valid_sets=[lgbtrain, lgbval],
                  num_boost_round=lgb_params['num_boost_round'],
                  callbacks=[lgb.early_stopping(stopping_rounds=lgb_params['early_stopping_rounds']),
                             lgb.log_evaluation(period=100)],
                  feval=lgbm_smape)
# Predict the validation period using the best iteration recorded on the model.
y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)

In [None]:
# Validation SMAPE, with predictions and labels mapped back from the log1p scale via expm1.
smape(np.expm1(y_pred_val), np.expm1(Y_val))

# Feature importance levels

In [None]:
def plot_lgb_importances(model, plot=False, num=10):
    """Show the most important features of a fitted LightGBM booster.

    Features are ranked by their share of total gain, in percent; the split
    count is reported alongside.

    Parameters
    ----------
    model : object
        Must provide ``feature_name()`` and ``feature_importance(kind)`` for
        ``kind`` in ``'gain'`` and ``'split'`` (as a LightGBM Booster does).
    plot : bool, default False
        When True draw a horizontal bar chart, otherwise print a table.
    num : int, default 10
        Number of top features to show. It applies to both the table and the
        chart (the chart previously always showed 25 regardless of ``num``).

    Returns
    -------
    None
    """
    gain = np.asarray(model.feature_importance('gain'), dtype=float)
    total_gain = gain.sum()
    # With no gain at all, report zero shares instead of dividing by zero.
    gain_share = 100 * gain / total_gain if total_gain else np.zeros_like(gain)
    feat_imp = pd.DataFrame({'feature': model.feature_name(),
                             'split': model.feature_importance('split'),
                             'gain': gain_share}).sort_values('gain', ascending=False)
    top_features = feat_imp.head(num)
    if plot:
        plt.figure(figsize=(10, 10))
        sns.set(font_scale=1)
        sns.barplot(x="gain", y="feature", data=top_features)
        plt.title('feature')
        plt.tight_layout()
        plt.show()
    else:
        print(top_features)

In [None]:
# Bar chart of gain-based feature importances from the helper defined in this notebook.
plot_lgb_importances(model, num=30, plot=True)

In [None]:
# LightGBM's built-in gain importance plot, limited to 20 features.
lgb.plot_importance(model, max_num_features=20, figsize=(10, 10), importance_type="gain")
plt.show()

# Final Model

In [None]:
# Final fit uses every row that has a sales value.
train = df.loc[~df.sales.isna()]
Y_train = train['sales']
X_train = train[cols]

# Rows without a sales value are the ones to predict.
test = df.loc[df.sales.isna()]
X_test = test[cols]

In [None]:
# LightGBM settings for the final fit: no early-stopping entry this time.
# NOTE(review): 'num_boost_round' is also passed as an argument to lgb.train for
# the final model; confirm which of the two values LightGBM actually uses when
# the key is present in this dict as well.
lgb_params = { 'metric': 'mae',
                'num_leaves' : 12,
                'max_depth': 10,
                'min_child_samples': 5,
                'learning_rate': 0.03,
                'colsample_bytree': 0.5,
                'verbose': 0,
                'min_child_weight' : 0.1,
                'nthread': -1,
              "num_boost_round": 2000}


In [None]:
# LightGBM dataset built from all rows that have a sales value.
lgbtrain_all = lgb.Dataset(data=X_train, label=Y_train, feature_name=cols)

In [None]:
# Refit on all labelled rows for the number of rounds that early stopping selected
# on the validation split. If the current `model` carries no positive best
# iteration (e.g. this cell is re-run after `model` was already replaced), fall
# back to the round count configured in lgb_params.
best_rounds = model.best_iteration
if not best_rounds or best_rounds <= 0:
    best_rounds = lgb_params.get("num_boost_round", 100)
# An iteration-count key inside `params` takes precedence over the
# `num_boost_round` argument in LightGBM, so it is removed here; otherwise the
# model would always train for the 2000 rounds stored in lgb_params.
final_params = {key: value for key, value in lgb_params.items()
                if key not in ("num_boost_round", "num_iterations", "early_stopping_rounds")}
model = lgb.train(final_params, lgbtrain_all, num_boost_round=best_rounds)

In [None]:
# Predict the test rows with the refitted model (predictions are on the log1p scale).
test_preds = model.predict(X_test, num_iteration=model.best_iteration)

In [None]:
# y_pred_val and Y_val still come from the earlier validation split, so this
# repeats the validation SMAPE; it does not score the refitted model.
smape(np.expm1(y_pred_val), np.expm1(Y_val))

In [None]:
# Take an independent copy of the id/sales columns so the assignments that follow
# write to a standalone frame rather than to a slice of `test`.
submission_df = test.loc[:, ['id', 'sales']].copy()
# Predictions are on the log1p scale; expm1 maps them back to sales units.
submission_df['sales'] = np.expm1(test_preds)
submission_df['id'] = submission_df.id.astype(int)
submission_df.to_csv('submission.csv', index=False)


In [None]:
# Preview the first 20 rows of the submission frame.
submission_df.head(20)