In [None]:
import time
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import lightgbm as lgb

In [None]:
import os
# Print the full path of every file available under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the train and test CSVs with `date` parsed as a datetime column.
train, test = (
    pd.read_csv(csv_path, parse_dates=['date'])
    for csv_path in (
        '/kaggle/input/demand-forecasting-kernels-only/train.csv',
        '/kaggle/input/demand-forecasting-kernels-only/test.csv',
    )
)
sample_sub = pd.read_csv('/kaggle/input/demand-forecasting-kernels-only/sample_submission.csv')

# Stack train and test so that every feature below is computed on both at once.
# Original row labels are kept, so index labels may repeat between the two parts.
df = pd.concat(objs=[train, test], sort=False)

In [None]:
# Show the first two rows of each frame as a quick sanity check of the load.
tuple(frame.head(2) for frame in (train, test, sample_sub, df))

# Date Features

In [None]:
# Calendar features derived from the parsed `date` column.
df['month'] = df.date.dt.month
# NOTE: despite the column name this holds the calendar quarter (1-4). The name
# and the float dtype are kept so the downstream feature set is unchanged.
df['quantile_of_year'] = df.date.dt.quarter.astype(float)
df['day_of_month'] = df.date.dt.day
# 1 for the first seven days of a month, 0 otherwise. Assigned explicitly so
# that rows outside the first week are 0 instead of being left as NaN.
df['first_week'] = (df['day_of_month'] < 8).astype(int)
df['day_of_year'] = df.date.dt.dayofyear
# `Series.dt.weekofyear` was removed in pandas 2.0; `isocalendar().week` is the
# replacement (ISO-8601 week number). Assumes `date` has no missing values.
df['week_of_year'] = df.date.dt.isocalendar().week.astype(int)
df['day_of_week'] = df.date.dt.dayofweek + 1  # 1 = Monday ... 7 = Sunday
df['year'] = df.date.dt.year
# weekday is 0 (Mon) .. 6 (Sun), so `// 4` flags Friday, Saturday and Sunday.
df["is_wknd"] = df.date.dt.weekday // 4
df['is_month_start'] = df.date.dt.is_month_start.astype(int)
df['is_month_end'] = df.date.dt.is_month_end.astype(int)
df.head(2)

# Lag/Shifted Features

In [None]:
# Order rows chronologically inside each (store, item) series; the shift-based
# features computed below rely on this ordering.
df = df.sort_values(['store', 'item', 'date'])
def random_noise(dataframe, scale=1.6, rng=None):
    """Return zero-mean Gaussian noise with one value per row of `dataframe`.

    Parameters
    ----------
    dataframe : sized object
        Only ``len(dataframe)`` is used, to decide how many samples to draw.
    scale : float, default 1.6
        Standard deviation of the noise.
    rng : numpy.random.Generator, optional
        Generator to draw from. When omitted, NumPy's global random state is
        used, so ``np.random.seed(...)`` controls reproducibility.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(dataframe)``.
    """
    n_rows = len(dataframe)
    if rng is None:
        return np.random.normal(scale=scale, size=n_rows)
    return rng.normal(scale=scale, size=n_rows)
def lag_features(dataframe, lags):
    """Add noisy lagged-sales columns to `dataframe`, one per entry of `lags`.

    For every lag, the `sales` column is shifted by that many rows within each
    (store, item) group and Gaussian noise from `random_noise` is added. The
    result is stored as ``sales_lag_<lag>``. Rows are shifted positionally, so
    the frame should already be ordered by date within each group.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain `store`, `item` and `sales` columns. Modified in place.
    lags : iterable of int
        Shift sizes, in rows.

    Returns
    -------
    pandas.DataFrame
        The same `dataframe` object, with the new columns added.
    """
    # Group once; GroupBy.shift is vectorized and avoids a Python call per group.
    sales_by_series = dataframe.groupby(["store", "item"])['sales']
    for lag in lags:
        dataframe['sales_lag_' + str(lag)] = sales_by_series.shift(lag) + random_noise(dataframe)
    return dataframe
# Lags are expressed in rows (days) within each (store, item) series.
df = lag_features(dataframe=df, lags=[91, 98, 105, 112, 119, 126, 182, 364, 546, 728])

# Rolling Mean Features

In [None]:
def roll_mean_features(dataframe, windows):
    """Add noisy triangular-weighted rolling means of past sales.

    For each window size, `sales` is shifted by one row within every
    (store, item) group, averaged over a triangular rolling window (at least
    10 observations required), and Gaussian noise from `random_noise` is added.
    The result is stored as ``sales_roll_mean_<window>``.

    The frame is modified in place and also returned.
    """
    for window in windows:

        def _lagged_triangular_mean(series, span=window):
            # Shift first so the current row's own sales never enter its window.
            previous = series.shift(1)
            return previous.rolling(window=span, min_periods=10, win_type="triang").mean()

        smoothed = dataframe.groupby(["store", "item"])['sales'].transform(_lagged_triangular_mean)
        dataframe['sales_roll_mean_' + str(window)] = smoothed + random_noise(dataframe)
    return dataframe
# Window sizes are expressed in rows (days) within each (store, item) series.
df = roll_mean_features(dataframe=df, windows=[182, 365, 546])

# Exponentially Weighted Mean Features

In [None]:
def ewm_features(dataframe, alphas, lags):
    """Add exponentially weighted means of lagged sales.

    For every (alpha, lag) pair, `sales` is shifted by `lag` rows within each
    (store, item) group and smoothed with ``ewm(alpha=alpha).mean()``. Columns
    are named ``sales_ewm_alpha_<alpha without the dot>_lag_<lag>`` and are
    created with alpha as the outer loop and lag as the inner loop.

    The frame is modified in place and also returned.
    """
    for alpha, lag in ((a, l) for a in alphas for l in lags):
        alpha_tag = str(alpha).replace(".", "")
        column = "sales_ewm_alpha_" + alpha_tag + "_lag_" + str(lag)

        def _lagged_ewm(series, shift_by=lag, smoothing=alpha):
            return series.shift(shift_by).ewm(alpha=smoothing).mean()

        dataframe[column] = dataframe.groupby(["store", "item"])['sales'].transform(_lagged_ewm)
    return dataframe
# Smoothing factors and lags (in rows) combined into one EWM column per pair.
alphas = [0.95, 0.9, 0.8, 0.7, 0.5]
lags = [91, 98, 105, 112, 180, 270, 365, 546, 728]
df = ewm_features(dataframe=df, alphas=alphas, lags=lags)

# Encoding

In [None]:
# One-hot encode the categorical identifiers and the cyclic calendar fields.
df = pd.get_dummies(data=df, columns=['store', 'item', 'day_of_week', 'month'])

# Scaling

In [None]:
# Model log(1 + sales); predictions are mapped back with expm1 later on.
# Running this cell twice would apply the transform twice.
df['sales'] = np.log1p(df['sales'].to_numpy())

In [None]:
def smape(preds, target):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Pairs where both the prediction and the target are exactly zero contribute
    zero error (instead of 0/0) but still count towards the number of
    observations in the denominator.

    Parameters
    ----------
    preds, target : array-like of numbers
        Predictions and ground truth, of equal length. Lists, NumPy arrays and
        pandas Series are accepted; values are compared positionally.

    Returns
    -------
    float
        ``200 / n * sum(|pred - target| / (|pred| + |target|))``, or NaN when
        the inputs are empty.
    """
    # Coerce to plain float arrays so masking and arithmetic are positional and
    # so that list inputs are compared element-wise rather than as a whole.
    preds = np.asarray(preds, dtype=float)
    target = np.asarray(target, dtype=float)
    n = len(preds)
    if n == 0:
        return float("nan")
    keep = ~((preds == 0) & (target == 0))
    preds, target = preds[keep], target[keep]
    num = np.abs(preds - target)
    denom = np.abs(preds) + np.abs(target)
    return (200 * np.sum(num / denom)) / n
def lgbm_smape(preds, train_data):
    """Custom LightGBM evaluation function reporting SMAPE.

    Both the predictions and the labels read from `train_data.get_label()` are
    passed through ``expm1`` before scoring.

    Returns the ``(name, value, is_higher_better)`` tuple, with
    ``is_higher_better`` set to False.
    """
    actual = np.expm1(train_data.get_label())
    predicted = np.expm1(preds)
    return 'SMAPE', smape(predicted, actual), False

In [None]:
# Time-based split: everything before 2017 is used for fitting, and the first
# three months of 2017 are held out for validation.
train = df[df["date"] < "2017-01-01"]
val = df[(df["date"] >= "2017-01-01") & (df["date"] < "2017-04-01")]

# Every column except identifiers, the target and the raw year is a feature.
cols = [name for name in train.columns if name not in ('date', 'id', "sales", "year")]

X_train, Y_train = train[cols], train['sales']
X_val, Y_val = val[cols], val['sales']

In [None]:
lgb_params = {
    'metric': {'mae'},
    'num_leaves': 10,  # maximum number of leaves per tree
    'learning_rate': 0.02,
    'feature_fraction': 0.8,  # random-subspace idea from random forests: fraction of features sampled
    'max_depth': 5,
    'verbose': 0,
    'num_boost_round': 20000,  # n_estimators, number of boosting iterations
    # Stop when the error has not improved for 200 iterations; otherwise keep
    # going for up to num_boost_round iterations.
    'early_stopping_rounds': 200,
    'nthread': -1,
}

# Wrap the feature matrices for LightGBM; the validation set reuses the
# training set as its reference.
lgbtrain = lgb.Dataset(X_train, label=Y_train, feature_name=cols)
lgbval = lgb.Dataset(X_val, label=Y_val, reference=lgbtrain, feature_name=cols)

In [None]:
# Fit with early stopping on the validation set.
# The `early_stopping_rounds=` and `verbose_eval=` keyword arguments were
# removed from `lgb.train` in LightGBM 4.0; callbacks are the supported way to
# request both behaviours (`log_evaluation` needs LightGBM >= 3.3).
# The loop-control keys are stripped from the params handed to LightGBM so each
# setting is specified exactly once.
train_params = {key: value for key, value in lgb_params.items()
                if key not in ('num_boost_round', 'early_stopping_rounds')}
model = lgb.train(train_params, lgbtrain,
                  num_boost_round=lgb_params['num_boost_round'],
                  valid_sets=[lgbtrain, lgbval],
                  feval=lgbm_smape,
                  callbacks=[
                      lgb.early_stopping(stopping_rounds=lgb_params['early_stopping_rounds']),
                      lgb.log_evaluation(period=200),  # print metrics every 200 rounds
                  ])

In [None]:
# Score the validation period on the original sales scale (undo the log1p).
y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)
smape(preds=np.expm1(y_pred_val), target=np.expm1(Y_val))

In [None]:
# Rows with a known target form the final training set; rows whose `sales` is
# missing are the ones to predict.
train = df[df['sales'].notna()]
X_train, Y_train = train[cols], train['sales']

test = df[df['sales'].isna()]
X_test = test[cols]

In [None]:
# Number of boosting rounds for the final fit, taken from the early-stopped
# validation model. `best_iteration` is only positive on a model trained with
# early stopping, so fall back to the number of trees the booster actually
# holds. This keeps the cell re-runnable after `model` has been replaced below.
best_rounds = model.best_iteration
if not best_rounds or best_rounds <= 0:
    best_rounds = model.current_iteration()

# Same hyper-parameters as the validation run. The number of rounds is passed
# only through `num_boost_round=` so it is not specified twice.
lgb_params = {'metric': {'mae'},
              'num_leaves': 10,
              'learning_rate': 0.02,
              'feature_fraction': 0.8,
              'max_depth': 5,
              'verbose': 0,
              'nthread': -1}

# Refit on every labelled row (no validation set, hence no early stopping).
lgbtrain_all = lgb.Dataset(data=X_train, label=Y_train, feature_name=cols)
model = lgb.train(lgb_params, lgbtrain_all, num_boost_round=best_rounds)

# The refit model has no best iteration of its own; predict with all its trees.
test_preds = model.predict(X_test)

In [None]:
# Build the submission frame directly: integer ids alongside predictions mapped
# back from log space to the original sales scale.
submission_df = pd.DataFrame({
    'id': test['id'].astype(int),
    'sales': np.expm1(test_preds),
})
submission_df.to_csv('submission.csv', index=False)